Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 34
-rw-r--r-- | mm/Makefile | 20
-rw-r--r-- | mm/backing-dev.c | 78
-rw-r--r-- | mm/bootmem.c | 138
-rw-r--r-- | mm/bounce.c | 8
-rw-r--r-- | mm/cleancache.c | 6
-rw-r--r-- | mm/compaction.c | 595
-rw-r--r-- | mm/fadvise.c | 18
-rw-r--r-- | mm/filemap.c | 146
-rw-r--r-- | mm/filemap_xip.c | 10
-rw-r--r-- | mm/frontswap.c | 344
-rw-r--r-- | mm/highmem.c | 12
-rw-r--r-- | mm/huge_memory.c | 29
-rw-r--r-- | mm/hugetlb.c | 234
-rw-r--r-- | mm/hugetlb_cgroup.c | 418
-rw-r--r-- | mm/hwpoison-inject.c | 2
-rw-r--r-- | mm/internal.h | 51
-rw-r--r-- | mm/madvise.c | 29
-rw-r--r-- | mm/memblock.c | 181
-rw-r--r-- | mm/memcontrol.c | 1167
-rw-r--r-- | mm/memory-failure.c | 45
-rw-r--r-- | mm/memory.c | 93
-rw-r--r-- | mm/memory_hotplug.c | 42
-rw-r--r-- | mm/mempolicy.c | 102
-rw-r--r-- | mm/mempool.c | 12
-rw-r--r-- | mm/migrate.c | 106
-rw-r--r-- | mm/mmap.c | 161
-rw-r--r-- | mm/mmu_notifier.c | 45
-rw-r--r-- | mm/mmzone.c | 14
-rw-r--r-- | mm/mremap.c | 28
-rw-r--r-- | mm/nobootmem.c | 151
-rw-r--r-- | mm/nommu.c | 20
-rw-r--r-- | mm/oom_kill.c | 278
-rw-r--r-- | mm/page-writeback.c | 111
-rw-r--r-- | mm/page_alloc.c | 779
-rw-r--r-- | mm/page_cgroup.c | 6
-rw-r--r-- | mm/page_io.c | 157
-rw-r--r-- | mm/page_isolation.c | 108
-rw-r--r-- | mm/pagewalk.c | 1
-rw-r--r-- | mm/percpu-vm.c | 1
-rw-r--r-- | mm/percpu.c | 22
-rw-r--r-- | mm/pgtable-generic.c | 4
-rw-r--r-- | mm/process_vm_access.c | 16
-rw-r--r-- | mm/readahead.c | 40
-rw-r--r-- | mm/rmap.c | 6
-rw-r--r-- | mm/shmem.c | 572
-rw-r--r-- | mm/slab.c | 622
-rw-r--r-- | mm/slab.h | 33
-rw-r--r-- | mm/slab_common.c | 120
-rw-r--r-- | mm/slob.c | 152
-rw-r--r-- | mm/slub.c | 479
-rw-r--r-- | mm/sparse.c | 62
-rw-r--r-- | mm/swap.c | 181
-rw-r--r-- | mm/swap_state.c | 7
-rw-r--r-- | mm/swapfile.c | 244
-rw-r--r-- | mm/thrash.c | 155
-rw-r--r-- | mm/truncate.c | 25
-rw-r--r-- | mm/util.c | 30
-rw-r--r-- | mm/vmalloc.c | 59
-rw-r--r-- | mm/vmscan.c | 936
-rw-r--r-- | mm/vmstat.c | 18
61 files changed, 5833 insertions, 3730 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407f1225..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK | |||
140 | config NO_BOOTMEM | 140 | config NO_BOOTMEM |
141 | boolean | 141 | boolean |
142 | 142 | ||
143 | config MEMORY_ISOLATION | ||
144 | boolean | ||
145 | |||
143 | # eventually, we can have this option just 'select SPARSEMEM' | 146 | # eventually, we can have this option just 'select SPARSEMEM' |
144 | config MEMORY_HOTPLUG | 147 | config MEMORY_HOTPLUG |
145 | bool "Allow for memory hot-add" | 148 | bool "Allow for memory hot-add" |
149 | select MEMORY_ISOLATION | ||
146 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 150 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
147 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 151 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
148 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 152 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -198,7 +202,7 @@ config COMPACTION | |||
198 | config MIGRATION | 202 | config MIGRATION |
199 | bool "Page migration" | 203 | bool "Page migration" |
200 | def_bool y | 204 | def_bool y |
201 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION | 205 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA |
202 | help | 206 | help |
203 | Allows the migration of the physical location of pages of processes | 207 | Allows the migration of the physical location of pages of processes |
204 | while the virtual addresses are not changed. This is useful in | 208 | while the virtual addresses are not changed. This is useful in |
@@ -272,6 +276,7 @@ config MEMORY_FAILURE | |||
272 | depends on MMU | 276 | depends on MMU |
273 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 277 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
274 | bool "Enable recovery from hardware memory errors" | 278 | bool "Enable recovery from hardware memory errors" |
279 | select MEMORY_ISOLATION | ||
275 | help | 280 | help |
276 | Enables code to recover from some memory failures on systems | 281 | Enables code to recover from some memory failures on systems |
277 | with MCA recovery. This allows a system to continue running | 282 | with MCA recovery. This allows a system to continue running |
@@ -349,6 +354,16 @@ choice | |||
349 | benefit. | 354 | benefit. |
350 | endchoice | 355 | endchoice |
351 | 356 | ||
357 | config CROSS_MEMORY_ATTACH | ||
358 | bool "Cross Memory Support" | ||
359 | depends on MMU | ||
360 | default y | ||
361 | help | ||
362 | Enabling this option adds the system calls process_vm_readv and | ||
363 | process_vm_writev which allow a process with the correct privileges | ||
364 | to directly read from or write to another process's address space. | ||
365 | See the man page for more details. | ||
366 | |||
352 | # | 367 | # |
353 | # UP and nommu archs use km based percpu allocator | 368 | # UP and nommu archs use km based percpu allocator |
354 | # | 369 | # |
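The CROSS_MEMORY_ATTACH help text above refers to the process_vm_readv and process_vm_writev system calls. As a rough illustration only (the target PID and remote address below are placeholders, not taken from this patch), a userspace caller might look like:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(void)
{
        char buf[64];
        struct iovec local  = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct iovec remote = { .iov_base = (void *)0x400000,  /* address in the target process */
                                .iov_len  = sizeof(buf) };
        pid_t target = 1234;                                    /* hypothetical PID */

        /* Copy sizeof(buf) bytes from the target's address space into buf. */
        ssize_t n = process_vm_readv(target, &local, 1, &remote, 1, 0);
        if (n < 0)
                perror("process_vm_readv");
        else
                printf("copied %zd bytes\n", n);
        return 0;
}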
@@ -379,3 +394,20 @@ config CLEANCACHE | |||
379 | in a negligible performance hit. | 394 | in a negligible performance hit. |
380 | 395 | ||
381 | If unsure, say Y to enable cleancache | 396 | If unsure, say Y to enable cleancache |
397 | |||
398 | config FRONTSWAP | ||
399 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
400 | depends on SWAP | ||
401 | default n | ||
402 | help | ||
403 | Frontswap is so named because it can be thought of as the opposite | ||
404 | of a "backing" store for a swap device. The data is stored into | ||
405 | "transcendent memory", memory that is not directly accessible or | ||
406 | addressable by the kernel and is of unknown and possibly | ||
407 | time-varying size. When space in transcendent memory is available, | ||
408 | a significant swap I/O reduction may be achieved. When none is | ||
409 | available, all frontswap calls are reduced to a single pointer- | ||
410 | compare-against-NULL resulting in a negligible performance hit | ||
411 | and swap data is stored as normal on the matching swap device. | ||
412 | |||
413 | If unsure, say Y to enable frontswap. | ||
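The frontswap help text above notes that, when no transcendent-memory backend is present, each frontswap hook costs only a pointer comparison against NULL. A minimal sketch of that pattern, using hypothetical names rather than the real frontswap API introduced by mm/frontswap.c:

struct page;

/* Hypothetical backend vtable; stands in for whatever a tmem driver registers. */
struct tmem_backend {
        int (*store)(unsigned type, unsigned long offset, struct page *page);
};

static struct tmem_backend *tmem_backend;       /* NULL until a backend registers */

static inline int frontswap_try_store(unsigned type, unsigned long offset,
                                      struct page *page)
{
        /* With no backend, this NULL check is the entire overhead. */
        if (!tmem_backend)
                return -1;      /* caller writes the page to the swap device as usual */
        return tmem_backend->store(type, offset, page);
}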
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00ef2a0e..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,19 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o \ | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | process_vm_access.o | 9 | |
10 | ifdef CONFIG_CROSS_MEMORY_ATTACH | ||
11 | mmu-$(CONFIG_MMU) += process_vm_access.o | ||
12 | endif | ||
10 | 13 | ||
11 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
12 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
13 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
14 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
15 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
16 | $(mmu-y) | 19 | compaction.o $(mmu-y) |
20 | |||
17 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
18 | 22 | ||
19 | ifdef CONFIG_NO_BOOTMEM | 23 | ifdef CONFIG_NO_BOOTMEM |
@@ -25,14 +29,14 @@ endif | |||
25 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
26 | 30 | ||
27 | obj-$(CONFIG_BOUNCE) += bounce.o | 31 | obj-$(CONFIG_BOUNCE) += bounce.o |
28 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 32 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
33 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
29 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 34 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
30 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 35 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
31 | obj-$(CONFIG_NUMA) += mempolicy.o | 36 | obj-$(CONFIG_NUMA) += mempolicy.o |
32 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 37 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
33 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 38 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
34 | obj-$(CONFIG_SLOB) += slob.o | 39 | obj-$(CONFIG_SLOB) += slob.o |
35 | obj-$(CONFIG_COMPACTION) += compaction.o | ||
36 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 40 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
37 | obj-$(CONFIG_KSM) += ksm.o | 41 | obj-$(CONFIG_KSM) += ksm.o |
38 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 42 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
@@ -45,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
45 | obj-$(CONFIG_MIGRATION) += migrate.o | 49 | obj-$(CONFIG_MIGRATION) += migrate.o |
46 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 50 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
47 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
48 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 52 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o |
53 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | ||
49 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 54 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
50 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 55 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
51 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 56 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
52 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 57 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
53 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 58 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
59 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..b41823cc05e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock); | |||
39 | LIST_HEAD(bdi_list); | 39 | LIST_HEAD(bdi_list); |
40 | LIST_HEAD(bdi_pending_list); | 40 | LIST_HEAD(bdi_pending_list); |
41 | 41 | ||
42 | static struct task_struct *sync_supers_tsk; | ||
43 | static struct timer_list sync_supers_timer; | ||
44 | |||
45 | static int bdi_sync_supers(void *); | ||
46 | static void sync_supers_timer_fn(unsigned long); | ||
47 | |||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | 42 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) |
49 | { | 43 | { |
50 | if (wb1 < wb2) { | 44 | if (wb1 < wb2) { |
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void) | |||
250 | { | 244 | { |
251 | int err; | 245 | int err; |
252 | 246 | ||
253 | sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); | ||
254 | BUG_ON(IS_ERR(sync_supers_tsk)); | ||
255 | |||
256 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | ||
257 | bdi_arm_supers_timer(); | ||
258 | |||
259 | err = bdi_init(&default_backing_dev_info); | 247 | err = bdi_init(&default_backing_dev_info); |
260 | if (!err) | 248 | if (!err) |
261 | bdi_register(&default_backing_dev_info, NULL, "default"); | 249 | bdi_register(&default_backing_dev_info, NULL, "default"); |
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
270 | return wb_has_dirty_io(&bdi->wb); | 258 | return wb_has_dirty_io(&bdi->wb); |
271 | } | 259 | } |
272 | 260 | ||
273 | /* | ||
274 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | ||
275 | * or we risk deadlocking on ->s_umount. The longer term solution would be | ||
276 | * to implement sync_supers_bdi() or similar and simply do it from the | ||
277 | * bdi writeback thread individually. | ||
278 | */ | ||
279 | static int bdi_sync_supers(void *unused) | ||
280 | { | ||
281 | set_user_nice(current, 0); | ||
282 | |||
283 | while (!kthread_should_stop()) { | ||
284 | set_current_state(TASK_INTERRUPTIBLE); | ||
285 | schedule(); | ||
286 | |||
287 | /* | ||
288 | * Do this periodically, like kupdated() did before. | ||
289 | */ | ||
290 | sync_supers(); | ||
291 | } | ||
292 | |||
293 | return 0; | ||
294 | } | ||
295 | |||
296 | void bdi_arm_supers_timer(void) | ||
297 | { | ||
298 | unsigned long next; | ||
299 | |||
300 | if (!dirty_writeback_interval) | ||
301 | return; | ||
302 | |||
303 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | ||
304 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | ||
305 | } | ||
306 | |||
307 | static void sync_supers_timer_fn(unsigned long unused) | ||
308 | { | ||
309 | wake_up_process(sync_supers_tsk); | ||
310 | bdi_arm_supers_timer(); | ||
311 | } | ||
312 | |||
313 | static void wakeup_timer_fn(unsigned long data) | 261 | static void wakeup_timer_fn(unsigned long data) |
314 | { | 262 | { |
315 | struct backing_dev_info *bdi = (struct backing_dev_info *)data; | 263 | struct backing_dev_info *bdi = (struct backing_dev_info *)data; |
@@ -677,7 +625,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
677 | 625 | ||
678 | bdi->min_ratio = 0; | 626 | bdi->min_ratio = 0; |
679 | bdi->max_ratio = 100; | 627 | bdi->max_ratio = 100; |
680 | bdi->max_prop_frac = PROP_FRAC_BASE; | 628 | bdi->max_prop_frac = FPROP_FRAC_BASE; |
681 | spin_lock_init(&bdi->wb_lock); | 629 | spin_lock_init(&bdi->wb_lock); |
682 | INIT_LIST_HEAD(&bdi->bdi_list); | 630 | INIT_LIST_HEAD(&bdi->bdi_list); |
683 | INIT_LIST_HEAD(&bdi->work_list); | 631 | INIT_LIST_HEAD(&bdi->work_list); |
@@ -700,7 +648,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
700 | bdi->write_bandwidth = INIT_BW; | 648 | bdi->write_bandwidth = INIT_BW; |
701 | bdi->avg_write_bandwidth = INIT_BW; | 649 | bdi->avg_write_bandwidth = INIT_BW; |
702 | 650 | ||
703 | err = prop_local_init_percpu(&bdi->completions); | 651 | err = fprop_local_init_percpu(&bdi->completions); |
704 | 652 | ||
705 | if (err) { | 653 | if (err) { |
706 | err: | 654 | err: |
@@ -744,7 +692,7 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
744 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 692 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
745 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 693 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
746 | 694 | ||
747 | prop_local_destroy_percpu(&bdi->completions); | 695 | fprop_local_destroy_percpu(&bdi->completions); |
748 | } | 696 | } |
749 | EXPORT_SYMBOL(bdi_destroy); | 697 | EXPORT_SYMBOL(bdi_destroy); |
750 | 698 | ||
@@ -886,3 +834,23 @@ out: | |||
886 | return ret; | 834 | return ret; |
887 | } | 835 | } |
888 | EXPORT_SYMBOL(wait_iff_congested); | 836 | EXPORT_SYMBOL(wait_iff_congested); |
837 | |||
838 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
839 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
840 | { | ||
841 | char kbuf[] = "0\n"; | ||
842 | |||
843 | if (*ppos) { | ||
844 | *lenp = 0; | ||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
849 | return -EFAULT; | ||
850 | printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", | ||
851 | table->procname); | ||
852 | |||
853 | *lenp = 2; | ||
854 | *ppos += *lenp; | ||
855 | return 2; | ||
856 | } | ||
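pdflush_proc_obsolete() added above is a stub read handler for writeback sysctls that no longer control anything: it returns the string "0\n" and prints a one-time removal warning. A sketch of how such a knob might be wired into a sysctl table (the entry is illustrative, not copied from this series):

/* Hypothetical table entry pointing an obsolete /proc/sys/vm knob at the stub. */
static struct ctl_table obsolete_vm_table[] = {
        {
                .procname       = "nr_pdflush_threads",
                .mode           = 0444,                 /* read-only */
                .proc_handler   = pdflush_proc_obsolete,
        },
        { }                                             /* sentinel entry */
};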
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) | |||
77 | */ | 77 | */ |
78 | static void __init link_bootmem(bootmem_data_t *bdata) | 78 | static void __init link_bootmem(bootmem_data_t *bdata) |
79 | { | 79 | { |
80 | struct list_head *iter; | 80 | bootmem_data_t *ent; |
81 | 81 | ||
82 | list_for_each(iter, &bdata_list) { | 82 | list_for_each_entry(ent, &bdata_list, list) { |
83 | bootmem_data_t *ent; | 83 | if (bdata->node_min_pfn < ent->node_min_pfn) { |
84 | 84 | list_add_tail(&bdata->list, &ent->list); | |
85 | ent = list_entry(iter, bootmem_data_t, list); | 85 | return; |
86 | if (bdata->node_min_pfn < ent->node_min_pfn) | 86 | } |
87 | break; | ||
88 | } | 87 | } |
89 | list_add_tail(&bdata->list, iter); | 88 | |
89 | list_add_tail(&bdata->list, &bdata_list); | ||
90 | } | 90 | } |
91 | 91 | ||
92 | /* | 92 | /* |
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
203 | } else { | 203 | } else { |
204 | unsigned long off = 0; | 204 | unsigned long off = 0; |
205 | 205 | ||
206 | while (vec && off < BITS_PER_LONG) { | 206 | vec >>= start & (BITS_PER_LONG - 1); |
207 | while (vec) { | ||
207 | if (vec & 1) { | 208 | if (vec & 1) { |
208 | page = pfn_to_page(start + off); | 209 | page = pfn_to_page(start + off); |
209 | __free_pages_bootmem(page, 0); | 210 | __free_pages_bootmem(page, 0); |
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, | |||
467 | return ALIGN(base + off, align) - base; | 468 | return ALIGN(base + off, align) - base; |
468 | } | 469 | } |
469 | 470 | ||
470 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | 471 | static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, |
471 | unsigned long size, unsigned long align, | 472 | unsigned long size, unsigned long align, |
472 | unsigned long goal, unsigned long limit) | 473 | unsigned long goal, unsigned long limit) |
473 | { | 474 | { |
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
588 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | 589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, |
589 | goal, limit); | 590 | goal, limit); |
590 | if (p_bdata) | 591 | if (p_bdata) |
591 | return alloc_bootmem_core(p_bdata, size, align, | 592 | return alloc_bootmem_bdata(p_bdata, size, align, |
592 | goal, limit); | 593 | goal, limit); |
593 | } | 594 | } |
594 | #endif | 595 | #endif |
595 | return NULL; | 596 | return NULL; |
596 | } | 597 | } |
597 | 598 | ||
598 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 599 | static void * __init alloc_bootmem_core(unsigned long size, |
599 | unsigned long align, | 600 | unsigned long align, |
600 | unsigned long goal, | 601 | unsigned long goal, |
601 | unsigned long limit) | 602 | unsigned long limit) |
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
603 | bootmem_data_t *bdata; | 604 | bootmem_data_t *bdata; |
604 | void *region; | 605 | void *region; |
605 | 606 | ||
606 | restart: | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); |
608 | if (region) | 608 | if (region) |
609 | return region; | 609 | return region; |
@@ -614,11 +614,25 @@ restart: | |||
614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) | 614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) |
615 | break; | 615 | break; |
616 | 616 | ||
617 | region = alloc_bootmem_core(bdata, size, align, goal, limit); | 617 | region = alloc_bootmem_bdata(bdata, size, align, goal, limit); |
618 | if (region) | 618 | if (region) |
619 | return region; | 619 | return region; |
620 | } | 620 | } |
621 | 621 | ||
622 | return NULL; | ||
623 | } | ||
624 | |||
625 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | ||
626 | unsigned long align, | ||
627 | unsigned long goal, | ||
628 | unsigned long limit) | ||
629 | { | ||
630 | void *ptr; | ||
631 | |||
632 | restart: | ||
633 | ptr = alloc_bootmem_core(size, align, goal, limit); | ||
634 | if (ptr) | ||
635 | return ptr; | ||
622 | if (goal) { | 636 | if (goal) { |
623 | goal = 0; | 637 | goal = 0; |
624 | goto restart; | 638 | goto restart; |
@@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
684 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
685 | } | 699 | } |
686 | 700 | ||
687 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 701 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
688 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
689 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
690 | { | 704 | { |
691 | void *ptr; | 705 | void *ptr; |
692 | 706 | ||
693 | ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); | 707 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
694 | if (ptr) | 710 | if (ptr) |
695 | return ptr; | 711 | return ptr; |
696 | 712 | ||
697 | ptr = alloc_bootmem_core(bdata, size, align, goal, limit); | 713 | /* do not panic in alloc_bootmem_bdata() */ |
714 | if (limit && goal + size > limit) | ||
715 | limit = 0; | ||
716 | |||
717 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); | ||
698 | if (ptr) | 718 | if (ptr) |
699 | return ptr; | 719 | return ptr; |
700 | 720 | ||
701 | return ___alloc_bootmem(size, align, goal, limit); | 721 | ptr = alloc_bootmem_core(size, align, goal, limit); |
722 | if (ptr) | ||
723 | return ptr; | ||
724 | |||
725 | if (goal) { | ||
726 | goal = 0; | ||
727 | goto again; | ||
728 | } | ||
729 | |||
730 | return NULL; | ||
731 | } | ||
732 | |||
733 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
734 | unsigned long align, unsigned long goal) | ||
735 | { | ||
736 | if (WARN_ON_ONCE(slab_is_available())) | ||
737 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
738 | |||
739 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
740 | } | ||
741 | |||
742 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
743 | unsigned long align, unsigned long goal, | ||
744 | unsigned long limit) | ||
745 | { | ||
746 | void *ptr; | ||
747 | |||
748 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
749 | if (ptr) | ||
750 | return ptr; | ||
751 | |||
752 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
753 | panic("Out of memory"); | ||
754 | return NULL; | ||
702 | } | 755 | } |
703 | 756 | ||
704 | /** | 757 | /** |
@@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
722 | if (WARN_ON_ONCE(slab_is_available())) | 775 | if (WARN_ON_ONCE(slab_is_available())) |
723 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 776 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
724 | 777 | ||
725 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 778 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
726 | } | 779 | } |
727 | 780 | ||
728 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 781 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
@@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
743 | unsigned long new_goal; | 796 | unsigned long new_goal; |
744 | 797 | ||
745 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | 798 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; |
746 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | 799 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, |
747 | new_goal, 0); | 800 | new_goal, 0); |
748 | if (ptr) | 801 | if (ptr) |
749 | return ptr; | 802 | return ptr; |
@@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
754 | 807 | ||
755 | } | 808 | } |
756 | 809 | ||
757 | #ifdef CONFIG_SPARSEMEM | ||
758 | /** | ||
759 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
760 | * @size: size of the request in bytes | ||
761 | * @section_nr: sparse map section to allocate from | ||
762 | * | ||
763 | * Return NULL on failure. | ||
764 | */ | ||
765 | void * __init alloc_bootmem_section(unsigned long size, | ||
766 | unsigned long section_nr) | ||
767 | { | ||
768 | bootmem_data_t *bdata; | ||
769 | unsigned long pfn, goal; | ||
770 | |||
771 | pfn = section_nr_to_pfn(section_nr); | ||
772 | goal = pfn << PAGE_SHIFT; | ||
773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | ||
774 | |||
775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); | ||
776 | } | ||
777 | #endif | ||
778 | |||
779 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
780 | unsigned long align, unsigned long goal) | ||
781 | { | ||
782 | void *ptr; | ||
783 | |||
784 | if (WARN_ON_ONCE(slab_is_available())) | ||
785 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
786 | |||
787 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | ||
788 | if (ptr) | ||
789 | return ptr; | ||
790 | |||
791 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | ||
792 | if (ptr) | ||
793 | return ptr; | ||
794 | |||
795 | return __alloc_bootmem_nopanic(size, align, goal); | ||
796 | } | ||
797 | |||
798 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 810 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
799 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 811 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
800 | #endif | 812 | #endif |
@@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
839 | if (WARN_ON_ONCE(slab_is_available())) | 851 | if (WARN_ON_ONCE(slab_is_available())) |
840 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 852 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
841 | 853 | ||
842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 854 | return ___alloc_bootmem_node(pgdat, size, align, |
843 | goal, ARCH_LOW_ADDRESS_LIMIT); | 855 | goal, ARCH_LOW_ADDRESS_LIMIT); |
844 | } | 856 | } |
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@ | |||
24 | 24 | ||
25 | static mempool_t *page_pool, *isa_page_pool; | 25 | static mempool_t *page_pool, *isa_page_pool; |
26 | 26 | ||
27 | #ifdef CONFIG_HIGHMEM | 27 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) |
28 | static __init int init_emergency_pool(void) | 28 | static __init int init_emergency_pool(void) |
29 | { | 29 | { |
30 | #ifndef CONFIG_MEMORY_HOTPLUG | 30 | #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) |
31 | if (max_pfn <= max_low_pfn) | 31 | if (max_pfn <= max_low_pfn) |
32 | return 0; | 32 | return 0; |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | 35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
36 | BUG_ON(!page_pool); | 36 | BUG_ON(!page_pool); |
37 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | 37 | printk("bounce pool size: %d pages\n", POOL_SIZE); |
38 | 38 | ||
39 | return 0; | 39 | return 0; |
40 | } | 40 | } |
41 | 41 | ||
42 | __initcall(init_emergency_pool); | 42 | __initcall(init_emergency_pool); |
43 | #endif | ||
43 | 44 | ||
45 | #ifdef CONFIG_HIGHMEM | ||
44 | /* | 46 | /* |
45 | * highmem version, map in to vec | 47 | * highmem version, map in to vec |
46 | */ | 48 | */ |
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); | |||
80 | static int cleancache_get_key(struct inode *inode, | 80 | static int cleancache_get_key(struct inode *inode, |
81 | struct cleancache_filekey *key) | 81 | struct cleancache_filekey *key) |
82 | { | 82 | { |
83 | int (*fhfn)(struct dentry *, __u32 *fh, int *, int); | 83 | int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); |
84 | int len = 0, maxlen = CLEANCACHE_KEY_MAX; | 84 | int len = 0, maxlen = CLEANCACHE_KEY_MAX; |
85 | struct super_block *sb = inode->i_sb; | 85 | struct super_block *sb = inode->i_sb; |
86 | 86 | ||
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode, | |||
88 | if (sb->s_export_op != NULL) { | 88 | if (sb->s_export_op != NULL) { |
89 | fhfn = sb->s_export_op->encode_fh; | 89 | fhfn = sb->s_export_op->encode_fh; |
90 | if (fhfn) { | 90 | if (fhfn) { |
91 | struct dentry d; | 91 | len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); |
92 | d.d_inode = inode; | ||
93 | len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); | ||
94 | if (len <= 0 || len == 255) | 92 | if (len <= 0 || len == 255) |
95 | return -1; | 93 | return -1; |
96 | if (maxlen > CLEANCACHE_KEY_MAX) | 94 | if (maxlen > CLEANCACHE_KEY_MAX) |
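The cleancache change above tracks an export_operations API change: encode_fh() is now called with the inode (and an optional parent inode) rather than a constructed dentry. A simplified sketch of a filesystem-side implementation under the new prototype (the handle layout and return values are illustrative):

/*
 * Pack inode number and generation into the file handle. Returning 255 when
 * the buffer is too small matches the failure length that
 * cleancache_get_key() checks for above.
 */
static int example_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
                             struct inode *parent)
{
        if (*max_len < 2) {
                *max_len = 2;
                return 255;
        }
        fh[0] = inode->i_ino;
        fh[1] = inode->i_generation;
        *max_len = 2;
        return 1;       /* illustrative fileid type */
}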
diff --git a/mm/compaction.c b/mm/compaction.c
index 74a8c825ff28..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,30 +16,11 @@ | |||
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
20 | |||
19 | #define CREATE_TRACE_POINTS | 21 | #define CREATE_TRACE_POINTS |
20 | #include <trace/events/compaction.h> | 22 | #include <trace/events/compaction.h> |
21 | 23 | ||
22 | /* | ||
23 | * compact_control is used to track pages being migrated and the free pages | ||
24 | * they are being migrated to during memory compaction. The free_pfn starts | ||
25 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | ||
26 | * are moved to the end of a zone during a compaction run and the run | ||
27 | * completes when free_pfn <= migrate_pfn | ||
28 | */ | ||
29 | struct compact_control { | ||
30 | struct list_head freepages; /* List of free pages to migrate to */ | ||
31 | struct list_head migratepages; /* List of pages being migrated */ | ||
32 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
33 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
34 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
36 | bool sync; /* Synchronous migration */ | ||
37 | |||
38 | int order; /* order a direct compactor needs */ | ||
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | ||
40 | struct zone *zone; | ||
41 | }; | ||
42 | |||
43 | static unsigned long release_freepages(struct list_head *freelist) | 24 | static unsigned long release_freepages(struct list_head *freelist) |
44 | { | 25 | { |
45 | struct page *page, *next; | 26 | struct page *page, *next; |
@@ -54,24 +35,76 @@ static unsigned long release_freepages(struct list_head *freelist) | |||
54 | return count; | 35 | return count; |
55 | } | 36 | } |
56 | 37 | ||
57 | /* Isolate free pages onto a private freelist. Must hold zone->lock */ | 38 | static void map_pages(struct list_head *list) |
58 | static unsigned long isolate_freepages_block(struct zone *zone, | ||
59 | unsigned long blockpfn, | ||
60 | struct list_head *freelist) | ||
61 | { | 39 | { |
62 | unsigned long zone_end_pfn, end_pfn; | 40 | struct page *page; |
63 | int nr_scanned = 0, total_isolated = 0; | ||
64 | struct page *cursor; | ||
65 | 41 | ||
66 | /* Get the last PFN we should scan for free pages at */ | 42 | list_for_each_entry(page, list, lru) { |
67 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 43 | arch_alloc_page(page, 0); |
68 | end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); | 44 | kernel_map_pages(page, 1, 1); |
45 | } | ||
46 | } | ||
69 | 47 | ||
70 | /* Find the first usable PFN in the block to initialse page cursor */ | 48 | static inline bool migrate_async_suitable(int migratetype) |
71 | for (; blockpfn < end_pfn; blockpfn++) { | 49 | { |
72 | if (pfn_valid_within(blockpfn)) | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
73 | break; | 51 | } |
52 | |||
53 | /* | ||
54 | * Compaction requires the taking of some coarse locks that are potentially | ||
55 | * very heavily contended. Check if the process needs to be scheduled or | ||
56 | * if the lock is contended. For async compaction, back out in the event | ||
57 | * that contention is severe. For sync compaction, schedule. | ||
58 | * | ||
59 | * Returns true if the lock is held. | ||
60 | * Returns false if the lock is released and compaction should abort | ||
61 | */ | ||
62 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | ||
63 | bool locked, struct compact_control *cc) | ||
64 | { | ||
65 | if (need_resched() || spin_is_contended(lock)) { | ||
66 | if (locked) { | ||
67 | spin_unlock_irqrestore(lock, *flags); | ||
68 | locked = false; | ||
69 | } | ||
70 | |||
71 | /* async aborts if taking too long or contended */ | ||
72 | if (!cc->sync) { | ||
73 | if (cc->contended) | ||
74 | *cc->contended = true; | ||
75 | return false; | ||
76 | } | ||
77 | |||
78 | cond_resched(); | ||
79 | if (fatal_signal_pending(current)) | ||
80 | return false; | ||
74 | } | 81 | } |
82 | |||
83 | if (!locked) | ||
84 | spin_lock_irqsave(lock, *flags); | ||
85 | return true; | ||
86 | } | ||
87 | |||
88 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | ||
89 | unsigned long *flags, struct compact_control *cc) | ||
90 | { | ||
91 | return compact_checklock_irqsave(lock, flags, false, cc); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | ||
96 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | ||
97 | * pages inside of the pageblock (even though it may still end up isolating | ||
98 | * some pages). | ||
99 | */ | ||
100 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | ||
101 | unsigned long end_pfn, | ||
102 | struct list_head *freelist, | ||
103 | bool strict) | ||
104 | { | ||
105 | int nr_scanned = 0, total_isolated = 0; | ||
106 | struct page *cursor; | ||
107 | |||
75 | cursor = pfn_to_page(blockpfn); | 108 | cursor = pfn_to_page(blockpfn); |
76 | 109 | ||
77 | /* Isolate free pages. This assumes the block is valid */ | 110 | /* Isolate free pages. This assumes the block is valid */ |
@@ -79,15 +112,23 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
79 | int isolated, i; | 112 | int isolated, i; |
80 | struct page *page = cursor; | 113 | struct page *page = cursor; |
81 | 114 | ||
82 | if (!pfn_valid_within(blockpfn)) | 115 | if (!pfn_valid_within(blockpfn)) { |
116 | if (strict) | ||
117 | return 0; | ||
83 | continue; | 118 | continue; |
119 | } | ||
84 | nr_scanned++; | 120 | nr_scanned++; |
85 | 121 | ||
86 | if (!PageBuddy(page)) | 122 | if (!PageBuddy(page)) { |
123 | if (strict) | ||
124 | return 0; | ||
87 | continue; | 125 | continue; |
126 | } | ||
88 | 127 | ||
89 | /* Found a free page, break it into order-0 pages */ | 128 | /* Found a free page, break it into order-0 pages */ |
90 | isolated = split_free_page(page); | 129 | isolated = split_free_page(page); |
130 | if (!isolated && strict) | ||
131 | return 0; | ||
91 | total_isolated += isolated; | 132 | total_isolated += isolated; |
92 | for (i = 0; i < isolated; i++) { | 133 | for (i = 0; i < isolated; i++) { |
93 | list_add(&page->lru, freelist); | 134 | list_add(&page->lru, freelist); |
@@ -105,118 +146,75 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
105 | return total_isolated; | 146 | return total_isolated; |
106 | } | 147 | } |
107 | 148 | ||
108 | /* Returns true if the page is within a block suitable for migration to */ | 149 | /** |
109 | static bool suitable_migration_target(struct page *page) | 150 | * isolate_freepages_range() - isolate free pages. |
110 | { | 151 | * @start_pfn: The first PFN to start isolating. |
111 | 152 | * @end_pfn: The one-past-last PFN. | |
112 | int migratetype = get_pageblock_migratetype(page); | 153 | * |
113 | 154 | * Non-free pages, invalid PFNs, or zone boundaries within the | |
114 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 155 | * [start_pfn, end_pfn) range are considered errors, cause function to |
115 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 156 | * undo its actions and return zero. |
116 | return false; | 157 | * |
117 | 158 | * Otherwise, function returns one-past-the-last PFN of isolated page | |
118 | /* If the page is a large free page, then allow migration */ | 159 | * (which may be greater then end_pfn if end fell in a middle of |
119 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 160 | * a free page). |
120 | return true; | ||
121 | |||
122 | /* If the block is MIGRATE_MOVABLE, allow migration */ | ||
123 | if (migratetype == MIGRATE_MOVABLE) | ||
124 | return true; | ||
125 | |||
126 | /* Otherwise skip the block */ | ||
127 | return false; | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Based on information in the current compact_control, find blocks | ||
132 | * suitable for isolating free pages from and then isolate them. | ||
133 | */ | 161 | */ |
134 | static void isolate_freepages(struct zone *zone, | 162 | unsigned long |
135 | struct compact_control *cc) | 163 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) |
136 | { | 164 | { |
137 | struct page *page; | 165 | unsigned long isolated, pfn, block_end_pfn, flags; |
138 | unsigned long high_pfn, low_pfn, pfn; | 166 | struct zone *zone = NULL; |
139 | unsigned long flags; | 167 | LIST_HEAD(freelist); |
140 | int nr_freepages = cc->nr_freepages; | ||
141 | struct list_head *freelist = &cc->freepages; | ||
142 | 168 | ||
143 | /* | 169 | if (pfn_valid(start_pfn)) |
144 | * Initialise the free scanner. The starting point is where we last | 170 | zone = page_zone(pfn_to_page(start_pfn)); |
145 | * scanned from (or the end of the zone if starting). The low point | ||
146 | * is the end of the pageblock the migration scanner is using. | ||
147 | */ | ||
148 | pfn = cc->free_pfn; | ||
149 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | ||
150 | 171 | ||
151 | /* | 172 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
152 | * Take care that if the migration scanner is at the end of the zone | 173 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) |
153 | * that the free scanner does not accidentally move to the next zone | 174 | break; |
154 | * in the next isolation cycle. | ||
155 | */ | ||
156 | high_pfn = min(low_pfn, pfn); | ||
157 | |||
158 | /* | ||
159 | * Isolate free pages until enough are available to migrate the | ||
160 | * pages on cc->migratepages. We stop searching if the migrate | ||
161 | * and free page scanners meet or enough free pages are isolated. | ||
162 | */ | ||
163 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | ||
164 | pfn -= pageblock_nr_pages) { | ||
165 | unsigned long isolated; | ||
166 | |||
167 | if (!pfn_valid(pfn)) | ||
168 | continue; | ||
169 | 175 | ||
170 | /* | 176 | /* |
171 | * Check for overlapping nodes/zones. It's possible on some | 177 | * On subsequent iterations ALIGN() is actually not needed, |
172 | * configurations to have a setup like | 178 | * but we keep it so as not to complicate the code. |
173 | * node0 node1 node0 | ||
174 | * i.e. it's possible that all pages within a zones range of | ||
175 | * pages do not belong to a single zone. | ||
176 | */ | 179 | */ |
177 | page = pfn_to_page(pfn); | 180 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
178 | if (page_zone(page) != zone) | 181 | block_end_pfn = min(block_end_pfn, end_pfn); |
179 | continue; | ||
180 | 182 | ||
181 | /* Check the block is suitable for migration */ | 183 | spin_lock_irqsave(&zone->lock, flags); |
182 | if (!suitable_migration_target(page)) | 184 | isolated = isolate_freepages_block(pfn, block_end_pfn, |
183 | continue; | 185 | &freelist, true); |
186 | spin_unlock_irqrestore(&zone->lock, flags); | ||
184 | 187 | ||
185 | /* | 188 | /* |
186 | * Found a block suitable for isolating free pages from. Now | 189 | * In strict mode, isolate_freepages_block() returns 0 if |
187 | * we disabled interrupts, double check things are ok and | 190 | * there are any holes in the block (ie. invalid PFNs or |
188 | * isolate the pages. This is to minimise the time IRQs | 191 | * non-free pages). |
189 | * are disabled | ||
190 | */ | 192 | */ |
191 | isolated = 0; | 193 | if (!isolated) |
192 | spin_lock_irqsave(&zone->lock, flags); | 194 | break; |
193 | if (suitable_migration_target(page)) { | ||
194 | isolated = isolate_freepages_block(zone, pfn, freelist); | ||
195 | nr_freepages += isolated; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&zone->lock, flags); | ||
198 | 195 | ||
199 | /* | 196 | /* |
200 | * Record the highest PFN we isolated pages from. When next | 197 | * If we managed to isolate pages, it is always (1 << n) * |
201 | * looking for free pages, the search will restart here as | 198 | * pageblock_nr_pages for some non-negative n. (Max order |
202 | * page migration may have returned some pages to the allocator | 199 | * page may span two pageblocks). |
203 | */ | 200 | */ |
204 | if (isolated) | ||
205 | high_pfn = max(high_pfn, pfn); | ||
206 | } | 201 | } |
207 | 202 | ||
208 | /* split_free_page does not map the pages */ | 203 | /* split_free_page does not map the pages */ |
209 | list_for_each_entry(page, freelist, lru) { | 204 | map_pages(&freelist); |
210 | arch_alloc_page(page, 0); | 205 | |
211 | kernel_map_pages(page, 1, 1); | 206 | if (pfn < end_pfn) { |
207 | /* Loop terminated early, cleanup. */ | ||
208 | release_freepages(&freelist); | ||
209 | return 0; | ||
212 | } | 210 | } |
213 | 211 | ||
214 | cc->free_pfn = high_pfn; | 212 | /* We don't use freelists for anything. */ |
215 | cc->nr_freepages = nr_freepages; | 213 | return pfn; |
216 | } | 214 | } |
217 | 215 | ||
218 | /* Update the number of anon and file isolated pages in the zone */ | 216 | /* Update the number of anon and file isolated pages in the zone */ |
219 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 217 | static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) |
220 | { | 218 | { |
221 | struct page *page; | 219 | struct page *page; |
222 | unsigned int count[2] = { 0, }; | 220 | unsigned int count[2] = { 0, }; |
@@ -224,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) | |||
224 | list_for_each_entry(page, &cc->migratepages, lru) | 222 | list_for_each_entry(page, &cc->migratepages, lru) |
225 | count[!!page_is_file_cache(page)]++; | 223 | count[!!page_is_file_cache(page)]++; |
226 | 224 | ||
227 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | 225 | /* If locked we can use the interrupt unsafe versions */ |
228 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | 226 | if (locked) { |
227 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
228 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
229 | } else { | ||
230 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
231 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
232 | } | ||
229 | } | 233 | } |
230 | 234 | ||
231 | /* Similar to reclaim, but different enough that they don't share logic */ | 235 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -243,37 +247,36 @@ static bool too_many_isolated(struct zone *zone) | |||
243 | return isolated > (inactive + active) / 2; | 247 | return isolated > (inactive + active) / 2; |
244 | } | 248 | } |
245 | 249 | ||
246 | /* possible outcome of isolate_migratepages */ | 250 | /** |
247 | typedef enum { | 251 | * isolate_migratepages_range() - isolate all migrate-able pages in range. |
248 | ISOLATE_ABORT, /* Abort compaction now */ | 252 | * @zone: Zone pages are in. |
249 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | 253 | * @cc: Compaction control structure. |
250 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | 254 | * @low_pfn: The first PFN of the range. |
251 | } isolate_migrate_t; | 255 | * @end_pfn: The one-past-the-last PFN of the range. |
252 | 256 | * | |
253 | /* | 257 | * Isolate all pages that can be migrated from the range specified by |
254 | * Isolate all pages that can be migrated from the block pointed to by | 258 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
255 | * the migrate scanner within compact_control. | 259 | pending, otherwise PFN of the first page that was not scanned |
260 | * (which may be less than, equal to or greater than end_pfn). | ||
261 | * | ||
262 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is | ||
263 | * zero. | ||
264 | * | ||
265 | * Apart from cc->migratepages and cc->nr_migratetypes this function | ||
266 | * does not modify any cc's fields, in particular it does not modify | ||
267 | * (or read for that matter) cc->migrate_pfn. | ||
256 | */ | 268 | */ |
257 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 269 | unsigned long |
258 | struct compact_control *cc) | 270 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
271 | unsigned long low_pfn, unsigned long end_pfn) | ||
259 | { | 272 | { |
260 | unsigned long low_pfn, end_pfn; | ||
261 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 273 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
262 | unsigned long nr_scanned = 0, nr_isolated = 0; | 274 | unsigned long nr_scanned = 0, nr_isolated = 0; |
263 | struct list_head *migratelist = &cc->migratepages; | 275 | struct list_head *migratelist = &cc->migratepages; |
264 | isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; | 276 | isolate_mode_t mode = 0; |
265 | 277 | struct lruvec *lruvec; | |
266 | /* Do not scan outside zone boundaries */ | 278 | unsigned long flags; |
267 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 279 | bool locked; |
268 | |||
269 | /* Only scan within a pageblock boundary */ | ||
270 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | ||
271 | |||
272 | /* Do not cross the free scanner or scan within a memory hole */ | ||
273 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | ||
274 | cc->migrate_pfn = end_pfn; | ||
275 | return ISOLATE_NONE; | ||
276 | } | ||
277 | 280 | ||
278 | /* | 281 | /* |
279 | * Ensure that there are not too many pages isolated from the LRU | 282 | * Ensure that there are not too many pages isolated from the LRU |
@@ -283,35 +286,32 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
283 | while (unlikely(too_many_isolated(zone))) { | 286 | while (unlikely(too_many_isolated(zone))) { |
284 | /* async migration should just abort */ | 287 | /* async migration should just abort */ |
285 | if (!cc->sync) | 288 | if (!cc->sync) |
286 | return ISOLATE_ABORT; | 289 | return 0; |
287 | 290 | ||
288 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 291 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
289 | 292 | ||
290 | if (fatal_signal_pending(current)) | 293 | if (fatal_signal_pending(current)) |
291 | return ISOLATE_ABORT; | 294 | return 0; |
292 | } | 295 | } |
293 | 296 | ||
294 | /* Time to isolate some pages for migration */ | 297 | /* Time to isolate some pages for migration */ |
295 | cond_resched(); | 298 | cond_resched(); |
296 | spin_lock_irq(&zone->lru_lock); | 299 | spin_lock_irqsave(&zone->lru_lock, flags); |
300 | locked = true; | ||
297 | for (; low_pfn < end_pfn; low_pfn++) { | 301 | for (; low_pfn < end_pfn; low_pfn++) { |
298 | struct page *page; | 302 | struct page *page; |
299 | bool locked = true; | ||
300 | 303 | ||
301 | /* give a chance to irqs before checking need_resched() */ | 304 | /* give a chance to irqs before checking need_resched() */ |
302 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 305 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
303 | spin_unlock_irq(&zone->lru_lock); | 306 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
304 | locked = false; | 307 | locked = false; |
305 | } | 308 | } |
306 | if (need_resched() || spin_is_contended(&zone->lru_lock)) { | 309 | |
307 | if (locked) | 310 | /* Check if it is ok to still hold the lock */ |
308 | spin_unlock_irq(&zone->lru_lock); | 311 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, |
309 | cond_resched(); | 312 | locked, cc); |
310 | spin_lock_irq(&zone->lru_lock); | 313 | if (!locked) |
311 | if (fatal_signal_pending(current)) | 314 | break; |
312 | break; | ||
313 | } else if (!locked) | ||
314 | spin_lock_irq(&zone->lru_lock); | ||
315 | 315 | ||
316 | /* | 316 | /* |
317 | * migrate_pfn does not necessarily start aligned to a | 317 | * migrate_pfn does not necessarily start aligned to a |
@@ -351,7 +351,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
351 | */ | 351 | */ |
352 | pageblock_nr = low_pfn >> pageblock_order; | 352 | pageblock_nr = low_pfn >> pageblock_order; |
353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
354 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { | 354 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
355 | low_pfn += pageblock_nr_pages; | 355 | low_pfn += pageblock_nr_pages; |
356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; |
357 | last_pageblock_nr = pageblock_nr; | 357 | last_pageblock_nr = pageblock_nr; |
@@ -374,14 +374,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
374 | if (!cc->sync) | 374 | if (!cc->sync) |
375 | mode |= ISOLATE_ASYNC_MIGRATE; | 375 | mode |= ISOLATE_ASYNC_MIGRATE; |
376 | 376 | ||
377 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
378 | |||
377 | /* Try isolate the page */ | 379 | /* Try isolate the page */ |
378 | if (__isolate_lru_page(page, mode, 0) != 0) | 380 | if (__isolate_lru_page(page, mode) != 0) |
379 | continue; | 381 | continue; |
380 | 382 | ||
381 | VM_BUG_ON(PageTransCompound(page)); | 383 | VM_BUG_ON(PageTransCompound(page)); |
382 | 384 | ||
383 | /* Successfully isolated */ | 385 | /* Successfully isolated */ |
384 | del_page_from_lru_list(zone, page, page_lru(page)); | 386 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
385 | list_add(&page->lru, migratelist); | 387 | list_add(&page->lru, migratelist); |
386 | cc->nr_migratepages++; | 388 | cc->nr_migratepages++; |
387 | nr_isolated++; | 389 | nr_isolated++; |
@@ -393,14 +395,167 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
393 | } | 395 | } |
394 | } | 396 | } |
395 | 397 | ||
396 | acct_isolated(zone, cc); | 398 | acct_isolated(zone, locked, cc); |
397 | 399 | ||
398 | spin_unlock_irq(&zone->lru_lock); | 400 | if (locked) |
399 | cc->migrate_pfn = low_pfn; | 401 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
400 | 402 | ||
401 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 403 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
402 | 404 | ||
403 | return ISOLATE_SUCCESS; | 405 | return low_pfn; |
406 | } | ||
407 | |||
408 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | ||
409 | #ifdef CONFIG_COMPACTION | ||
410 | |||
411 | /* Returns true if the page is within a block suitable for migration to */ | ||
412 | static bool suitable_migration_target(struct page *page) | ||
413 | { | ||
414 | |||
415 | int migratetype = get_pageblock_migratetype(page); | ||
416 | |||
417 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
418 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
419 | return false; | ||
420 | |||
421 | /* If the page is a large free page, then allow migration */ | ||
422 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
423 | return true; | ||
424 | |||
425 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
426 | if (migrate_async_suitable(migratetype)) | ||
427 | return true; | ||
428 | |||
429 | /* Otherwise skip the block */ | ||
430 | return false; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
435 | * point for full compaction of a zone. Compaction searches for free pages from | ||
436 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
437 | * page block. | ||
438 | */ | ||
439 | static unsigned long start_free_pfn(struct zone *zone) | ||
440 | { | ||
441 | unsigned long free_pfn; | ||
442 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
443 | free_pfn &= ~(pageblock_nr_pages-1); | ||
444 | return free_pfn; | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * Based on information in the current compact_control, find blocks | ||
449 | * suitable for isolating free pages from and then isolate them. | ||
450 | */ | ||
451 | static void isolate_freepages(struct zone *zone, | ||
452 | struct compact_control *cc) | ||
453 | { | ||
454 | struct page *page; | ||
455 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | ||
456 | unsigned long flags; | ||
457 | int nr_freepages = cc->nr_freepages; | ||
458 | struct list_head *freelist = &cc->freepages; | ||
459 | |||
460 | /* | ||
461 | * Initialise the free scanner. The starting point is where we last | ||
462 | * scanned from (or the end of the zone if starting). The low point | ||
463 | * is the end of the pageblock the migration scanner is using. | ||
464 | */ | ||
465 | pfn = cc->free_pfn; | ||
466 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | ||
467 | |||
468 | /* | ||
469 | * Take care that if the migration scanner is at the end of the zone | ||
470 | * that the free scanner does not accidentally move to the next zone | ||
471 | * in the next isolation cycle. | ||
472 | */ | ||
473 | high_pfn = min(low_pfn, pfn); | ||
474 | |||
475 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
476 | |||
477 | /* | ||
478 | * Isolate free pages until enough are available to migrate the | ||
479 | * pages on cc->migratepages. We stop searching if the migrate | ||
480 | * and free page scanners meet or enough free pages are isolated. | ||
481 | */ | ||
482 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | ||
483 | pfn -= pageblock_nr_pages) { | ||
484 | unsigned long isolated; | ||
485 | |||
486 | if (!pfn_valid(pfn)) | ||
487 | continue; | ||
488 | |||
489 | /* | ||
490 | * Check for overlapping nodes/zones. It's possible on some | ||
491 | * configurations to have a setup like | ||
492 | * node0 node1 node0 | ||
493 | * i.e. it's possible that all pages within a zones range of | ||
494 | * pages do not belong to a single zone. | ||
495 | */ | ||
496 | page = pfn_to_page(pfn); | ||
497 | if (page_zone(page) != zone) | ||
498 | continue; | ||
499 | |||
500 | /* Check the block is suitable for migration */ | ||
501 | if (!suitable_migration_target(page)) | ||
502 | continue; | ||
503 | |||
504 | /* | ||
505 | * Found a block suitable for isolating free pages from. Now | ||
506 | * we disabled interrupts, double check things are ok and | ||
507 | * isolate the pages. This is to minimise the time IRQs | ||
508 | * are disabled | ||
509 | */ | ||
510 | isolated = 0; | ||
511 | |||
512 | /* | ||
513 | * The zone lock must be held to isolate freepages. This | ||
514 | * unfortunately this is a very coarse lock and can be | ||
515 | * heavily contended if there are parallel allocations | ||
516 | * or parallel compactions. For async compaction do not | ||
517 | * spin on the lock | ||
518 | */ | ||
519 | if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) | ||
520 | break; | ||
521 | if (suitable_migration_target(page)) { | ||
522 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | ||
523 | isolated = isolate_freepages_block(pfn, end_pfn, | ||
524 | freelist, false); | ||
525 | nr_freepages += isolated; | ||
526 | } | ||
527 | spin_unlock_irqrestore(&zone->lock, flags); | ||
528 | |||
529 | /* | ||
530 | * Record the highest PFN we isolated pages from. When next | ||
531 | * looking for free pages, the search will restart here as | ||
532 | * page migration may have returned some pages to the allocator | ||
533 | */ | ||
534 | if (isolated) { | ||
535 | high_pfn = max(high_pfn, pfn); | ||
536 | |||
537 | /* | ||
538 | * If the free scanner has wrapped, update | ||
539 | * compact_cached_free_pfn to point to the highest | ||
540 | * pageblock with free pages. This reduces excessive | ||
541 | * scanning of full pageblocks near the end of the | ||
542 | * zone | ||
543 | */ | ||
544 | if (cc->order > 0 && cc->wrapped) | ||
545 | zone->compact_cached_free_pfn = high_pfn; | ||
546 | } | ||
547 | } | ||
548 | |||
549 | /* split_free_page does not map the pages */ | ||
550 | map_pages(freelist); | ||
551 | |||
552 | cc->free_pfn = high_pfn; | ||
553 | cc->nr_freepages = nr_freepages; | ||
554 | |||
555 | /* If compact_cached_free_pfn is reset then set it now */ | ||
556 | if (cc->order > 0 && !cc->wrapped && | ||
557 | zone->compact_cached_free_pfn == start_free_pfn(zone)) | ||
558 | zone->compact_cached_free_pfn = high_pfn; | ||
404 | } | 559 | } |
405 | 560 | ||
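The free scanner above walks pageblocks backwards from cc->free_pfn towards the migration scanner and remembers the highest block that yielded pages. A rough user-space model of that walk may help when reading the hunk; it is illustrative only, with PAGEBLOCK_PAGES and the fake "one quarter free" result standing in for pageblock_nr_pages and isolate_freepages_block():

    #include <stdio.h>

    #define PAGEBLOCK_PAGES 512UL        /* stand-in for pageblock_nr_pages */

    /* Walk pageblocks backwards from free_pfn down to low_pfn, stop once
     * 'needed' pages were found, and report the highest pageblock that
     * yielded anything -- the next scan restarts from there. */
    static unsigned long scan_free_blocks(unsigned long free_pfn,
                                          unsigned long low_pfn,
                                          unsigned long needed)
    {
        unsigned long pfn, found = 0;
        unsigned long high_pfn = low_pfn < free_pfn ? low_pfn : free_pfn;

        for (pfn = free_pfn; pfn > low_pfn && found < needed;
             pfn -= PAGEBLOCK_PAGES) {
            unsigned long isolated = PAGEBLOCK_PAGES / 4; /* pretend result */

            found += isolated;
            if (isolated && pfn > high_pfn)
                high_pfn = pfn;
        }
        return high_pfn;                 /* becomes cc->free_pfn next time */
    }

    int main(void)
    {
        printf("next free scan restarts at pfn %lu\n",
               scan_free_blocks(65536, 4096, 600));
        return 0;
    }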
406 | /* | 561 | /* |
@@ -449,6 +604,44 @@ static void update_nr_listpages(struct compact_control *cc) | |||
449 | cc->nr_freepages = nr_freepages; | 604 | cc->nr_freepages = nr_freepages; |
450 | } | 605 | } |
451 | 606 | ||
607 | /* possible outcome of isolate_migratepages */ | ||
608 | typedef enum { | ||
609 | ISOLATE_ABORT, /* Abort compaction now */ | ||
610 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | ||
611 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | ||
612 | } isolate_migrate_t; | ||
613 | |||
614 | /* | ||
615 | * Isolate all pages that can be migrated from the block pointed to by | ||
616 | * the migrate scanner within compact_control. | ||
617 | */ | ||
618 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | ||
619 | struct compact_control *cc) | ||
620 | { | ||
621 | unsigned long low_pfn, end_pfn; | ||
622 | |||
623 | /* Do not scan outside zone boundaries */ | ||
624 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | ||
625 | |||
626 | /* Only scan within a pageblock boundary */ | ||
627 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | ||
628 | |||
629 | /* Do not cross the free scanner or scan within a memory hole */ | ||
630 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | ||
631 | cc->migrate_pfn = end_pfn; | ||
632 | return ISOLATE_NONE; | ||
633 | } | ||
634 | |||
635 | /* Perform the isolation */ | ||
636 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | ||
637 | if (!low_pfn) | ||
638 | return ISOLATE_ABORT; | ||
639 | |||
640 | cc->migrate_pfn = low_pfn; | ||
641 | |||
642 | return ISOLATE_SUCCESS; | ||
643 | } | ||
644 | |||
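isolate_migratepages() above only ever looks at one pageblock-aligned window per call. A standalone sketch of that window arithmetic follows; ALIGN() is open-coded here and the numbers are illustrative, not kernel constants:

    #include <stdio.h>

    #define PAGEBLOCK_PAGES 512UL

    /* Open-coded equivalent of the kernel's ALIGN(): round x up to the
     * next multiple of a, where a is a power of two. */
    static unsigned long align_up(unsigned long x, unsigned long a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        unsigned long zone_start = 512, migrate_pfn = 1024; /* illustrative */
        unsigned long low_pfn = migrate_pfn > zone_start ? migrate_pfn : zone_start;
        unsigned long end_pfn = align_up(low_pfn + PAGEBLOCK_PAGES, PAGEBLOCK_PAGES);

        /* end_pfn is the first pageblock boundary at or beyond
         * low_pfn + one pageblock, so once migrate_pfn is block-aligned
         * each call covers exactly one pageblock. */
        printf("scan window: [%lu, %lu)\n", low_pfn, end_pfn);
        return 0;
    }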
452 | static int compact_finished(struct zone *zone, | 645 | static int compact_finished(struct zone *zone, |
453 | struct compact_control *cc) | 646 | struct compact_control *cc) |
454 | { | 647 | { |
@@ -458,8 +651,26 @@ static int compact_finished(struct zone *zone, | |||
458 | if (fatal_signal_pending(current)) | 651 | if (fatal_signal_pending(current)) |
459 | return COMPACT_PARTIAL; | 652 | return COMPACT_PARTIAL; |
460 | 653 | ||
461 | /* Compaction run completes if the migrate and free scanner meet */ | 654 | /* |
462 | if (cc->free_pfn <= cc->migrate_pfn) | 655 | * A full (order == -1) compaction run starts at the beginning and |
656 | * end of a zone; it completes when the migrate and free scanner meet. | ||
657 | * A partial (order > 0) compaction can start with the free scanner | ||
658 | * at a random point in the zone, and may have to restart. | ||
659 | */ | ||
660 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
661 | if (cc->order > 0 && !cc->wrapped) { | ||
662 | /* We started partway through; restart at the end. */ | ||
663 | unsigned long free_pfn = start_free_pfn(zone); | ||
664 | zone->compact_cached_free_pfn = free_pfn; | ||
665 | cc->free_pfn = free_pfn; | ||
666 | cc->wrapped = 1; | ||
667 | return COMPACT_CONTINUE; | ||
668 | } | ||
669 | return COMPACT_COMPLETE; | ||
670 | } | ||
671 | |||
672 | /* We wrapped around and ended up where we started. */ | ||
673 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
463 | return COMPACT_COMPLETE; | 674 | return COMPACT_COMPLETE; |
464 | 675 | ||
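The restart rules for a partial run can be hard to follow from the hunk alone, so here is a rough user-space model of the two checks above. Field names mirror compact_control, but this is a sketch, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    enum outcome { CONTINUE, COMPLETE };

    struct cc_model {
        unsigned long free_pfn;        /* free scanner, walks down        */
        unsigned long migrate_pfn;     /* migrate scanner, walks up       */
        unsigned long start_free_pfn;  /* where the free scanner started  */
        unsigned long zone_end;        /* pageblock-aligned end of zone   */
        int order;
        bool wrapped;
    };

    static enum outcome finished(struct cc_model *cc)
    {
        if (cc->free_pfn <= cc->migrate_pfn) {
            if (cc->order > 0 && !cc->wrapped) {
                /* Started mid-zone: wrap the free scanner to the end. */
                cc->free_pfn = cc->zone_end;
                cc->wrapped = true;
                return CONTINUE;
            }
            return COMPLETE;
        }
        /* Wrapped and came back to where the first pass started. */
        if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
            return COMPLETE;
        return CONTINUE;
    }

    int main(void)
    {
        struct cc_model cc = { 2048, 2048, 2048, 8192, 3, false };
        printf("%s\n", finished(&cc) == CONTINUE ? "wrap and continue" : "done");
        return 0;
    }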
465 | /* | 676 | /* |
@@ -557,8 +768,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
557 | 768 | ||
558 | /* Setup to move all movable pages to the end of the zone */ | 769 | /* Setup to move all movable pages to the end of the zone */ |
559 | cc->migrate_pfn = zone->zone_start_pfn; | 770 | cc->migrate_pfn = zone->zone_start_pfn; |
560 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 771 | |
561 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 772 | if (cc->order > 0) { |
773 | /* Incremental compaction. Start where the last one stopped. */ | ||
774 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
775 | cc->start_free_pfn = cc->free_pfn; | ||
776 | } else { | ||
777 | /* Order == -1 starts at the end of the zone. */ | ||
778 | cc->free_pfn = start_free_pfn(zone); | ||
779 | } | ||
562 | 780 | ||
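start_free_pfn() itself is not shown in this hunk; judging by the two lines it replaces, it presumably rounds the end of the zone down to a pageblock boundary, along the lines of this standalone sketch (an assumption, not the kernel definition):

    #include <stdio.h>

    #define PAGEBLOCK_PAGES 512UL

    /* Presumed equivalent of the removed open-coded version: the last
     * pageblock-aligned pfn of a zone, where the free scanner starts. */
    static unsigned long start_free_pfn_model(unsigned long zone_start_pfn,
                                              unsigned long spanned_pages)
    {
        unsigned long free_pfn = zone_start_pfn + spanned_pages;

        return free_pfn & ~(PAGEBLOCK_PAGES - 1);
    }

    int main(void)
    {
        printf("free scanner starts at pfn %lu\n",
               start_free_pfn_model(4096, 100000));
        return 0;
    }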
563 | migrate_prep_local(); | 781 | migrate_prep_local(); |
564 | 782 | ||
@@ -594,8 +812,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
594 | if (err) { | 812 | if (err) { |
595 | putback_lru_pages(&cc->migratepages); | 813 | putback_lru_pages(&cc->migratepages); |
596 | cc->nr_migratepages = 0; | 814 | cc->nr_migratepages = 0; |
815 | if (err == -ENOMEM) { | ||
816 | ret = COMPACT_PARTIAL; | ||
817 | goto out; | ||
818 | } | ||
597 | } | 819 | } |
598 | |||
599 | } | 820 | } |
600 | 821 | ||
601 | out: | 822 | out: |
@@ -608,7 +829,7 @@ out: | |||
608 | 829 | ||
609 | static unsigned long compact_zone_order(struct zone *zone, | 830 | static unsigned long compact_zone_order(struct zone *zone, |
610 | int order, gfp_t gfp_mask, | 831 | int order, gfp_t gfp_mask, |
611 | bool sync) | 832 | bool sync, bool *contended) |
612 | { | 833 | { |
613 | struct compact_control cc = { | 834 | struct compact_control cc = { |
614 | .nr_freepages = 0, | 835 | .nr_freepages = 0, |
@@ -617,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
617 | .migratetype = allocflags_to_migratetype(gfp_mask), | 838 | .migratetype = allocflags_to_migratetype(gfp_mask), |
618 | .zone = zone, | 839 | .zone = zone, |
619 | .sync = sync, | 840 | .sync = sync, |
841 | .contended = contended, | ||
620 | }; | 842 | }; |
621 | INIT_LIST_HEAD(&cc.freepages); | 843 | INIT_LIST_HEAD(&cc.freepages); |
622 | INIT_LIST_HEAD(&cc.migratepages); | 844 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -638,7 +860,7 @@ int sysctl_extfrag_threshold = 500; | |||
638 | */ | 860 | */ |
639 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 861 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
640 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 862 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
641 | bool sync) | 863 | bool sync, bool *contended) |
642 | { | 864 | { |
643 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 865 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
644 | int may_enter_fs = gfp_mask & __GFP_FS; | 866 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -662,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
662 | nodemask) { | 884 | nodemask) { |
663 | int status; | 885 | int status; |
664 | 886 | ||
665 | status = compact_zone_order(zone, order, gfp_mask, sync); | 887 | status = compact_zone_order(zone, order, gfp_mask, sync, |
888 | contended); | ||
666 | rc = max(status, rc); | 889 | rc = max(status, rc); |
667 | 890 | ||
668 | /* If a normal allocation would succeed, stop compacting */ | 891 | /* If a normal allocation would succeed, stop compacting */ |
@@ -698,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
698 | if (cc->order > 0) { | 921 | if (cc->order > 0) { |
699 | int ok = zone_watermark_ok(zone, cc->order, | 922 | int ok = zone_watermark_ok(zone, cc->order, |
700 | low_wmark_pages(zone), 0, 0); | 923 | low_wmark_pages(zone), 0, 0); |
701 | if (ok && cc->order > zone->compact_order_failed) | 924 | if (ok && cc->order >= zone->compact_order_failed) |
702 | zone->compact_order_failed = cc->order + 1; | 925 | zone->compact_order_failed = cc->order + 1; |
703 | /* Currently async compaction is never deferred. */ | 926 | /* Currently async compaction is never deferred. */ |
704 | else if (!ok && cc->sync) | 927 | else if (!ok && cc->sync) |
@@ -795,3 +1018,5 @@ void compaction_unregister_node(struct node *node) | |||
795 | return device_remove_file(&node->dev, &dev_attr_compact); | 1018 | return device_remove_file(&node->dev, &dev_attr_compact); |
796 | } | 1019 | } |
797 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ | 1020 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ |
1021 | |||
1022 | #endif /* CONFIG_COMPACTION */ | ||
diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..9b75a045dbf4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | if (!mapping->a_ops->readpage) { | ||
97 | ret = -EINVAL; | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
102 | start_index = offset >> PAGE_CACHE_SHIFT; | 97 | start_index = offset >> PAGE_CACHE_SHIFT; |
103 | end_index = endbyte >> PAGE_CACHE_SHIFT; | 98 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | nrpages = end_index - start_index + 1; | 101 | nrpages = end_index - start_index + 1; |
107 | if (!nrpages) | 102 | if (!nrpages) |
108 | nrpages = ~0UL; | 103 | nrpages = ~0UL; |
109 | 104 | ||
110 | ret = force_page_cache_readahead(mapping, file, | 105 | /* |
111 | start_index, | 106 | * Ignore return value because fadvise() shall return |
112 | nrpages); | 107 | * success even if the filesystem can't retrieve a hint. |
113 | if (ret > 0) | 108 | */ |
114 | ret = 0; | 109 | force_page_cache_readahead(mapping, file, start_index, |
110 | nrpages); | ||
115 | break; | 111 | break; |
116 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
117 | break; | 113 | break; |
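The WILLNEED branch converts the byte range into page indexes before kicking off readahead; a quick user-space model of that arithmetic follows (a PAGE_SHIFT of 12 is assumed purely for illustration):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed 4 KiB pages, for illustration only */

    int main(void)
    {
        unsigned long long offset = 5000, len = 10000;
        unsigned long long endbyte = offset + len - 1;

        /* First and last partial pages are included in the range. */
        unsigned long start_index = offset >> PAGE_SHIFT;
        unsigned long end_index   = endbyte >> PAGE_SHIFT;
        unsigned long nrpages     = end_index - start_index + 1;

        /* guard against the page count wrapping to zero on huge ranges */
        if (!nrpages)
            nrpages = ~0UL;

        printf("readahead pages %lu..%lu (%lu pages)\n",
               start_index, end_index, nrpages);
        return 0;
    }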
diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b0b14e..384344575c37 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
30 | #include <linux/blkdev.h> | 30 | #include <linux/blkdev.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/syscalls.h> | ||
33 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 34 | #include <linux/memcontrol.h> |
@@ -1413,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1413 | retval = filemap_write_and_wait_range(mapping, pos, | 1412 | retval = filemap_write_and_wait_range(mapping, pos, |
1414 | pos + iov_length(iov, nr_segs) - 1); | 1413 | pos + iov_length(iov, nr_segs) - 1); |
1415 | if (!retval) { | 1414 | if (!retval) { |
1416 | struct blk_plug plug; | ||
1417 | |||
1418 | blk_start_plug(&plug); | ||
1419 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1415 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1420 | iov, pos, nr_segs); | 1416 | iov, pos, nr_segs); |
1421 | blk_finish_plug(&plug); | ||
1422 | } | 1417 | } |
1423 | if (retval > 0) { | 1418 | if (retval > 0) { |
1424 | *ppos = pos + retval; | 1419 | *ppos = pos + retval; |
@@ -1478,44 +1473,6 @@ out: | |||
1478 | } | 1473 | } |
1479 | EXPORT_SYMBOL(generic_file_aio_read); | 1474 | EXPORT_SYMBOL(generic_file_aio_read); |
1480 | 1475 | ||
1481 | static ssize_t | ||
1482 | do_readahead(struct address_space *mapping, struct file *filp, | ||
1483 | pgoff_t index, unsigned long nr) | ||
1484 | { | ||
1485 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
1486 | return -EINVAL; | ||
1487 | |||
1488 | force_page_cache_readahead(mapping, filp, index, nr); | ||
1489 | return 0; | ||
1490 | } | ||
1491 | |||
1492 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
1493 | { | ||
1494 | ssize_t ret; | ||
1495 | struct file *file; | ||
1496 | |||
1497 | ret = -EBADF; | ||
1498 | file = fget(fd); | ||
1499 | if (file) { | ||
1500 | if (file->f_mode & FMODE_READ) { | ||
1501 | struct address_space *mapping = file->f_mapping; | ||
1502 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
1503 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
1504 | unsigned long len = end - start + 1; | ||
1505 | ret = do_readahead(mapping, file, start, len); | ||
1506 | } | ||
1507 | fput(file); | ||
1508 | } | ||
1509 | return ret; | ||
1510 | } | ||
1511 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
1512 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
1513 | { | ||
1514 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
1515 | } | ||
1516 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
1517 | #endif | ||
1518 | |||
1519 | #ifdef CONFIG_MMU | 1476 | #ifdef CONFIG_MMU |
1520 | /** | 1477 | /** |
1521 | * page_cache_read - adds requested page to the page cache if not already there | 1478 | * page_cache_read - adds requested page to the page cache if not already there |
@@ -1751,8 +1708,35 @@ page_not_uptodate: | |||
1751 | } | 1708 | } |
1752 | EXPORT_SYMBOL(filemap_fault); | 1709 | EXPORT_SYMBOL(filemap_fault); |
1753 | 1710 | ||
1711 | int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
1712 | { | ||
1713 | struct page *page = vmf->page; | ||
1714 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | ||
1715 | int ret = VM_FAULT_LOCKED; | ||
1716 | |||
1717 | sb_start_pagefault(inode->i_sb); | ||
1718 | file_update_time(vma->vm_file); | ||
1719 | lock_page(page); | ||
1720 | if (page->mapping != inode->i_mapping) { | ||
1721 | unlock_page(page); | ||
1722 | ret = VM_FAULT_NOPAGE; | ||
1723 | goto out; | ||
1724 | } | ||
1725 | /* | ||
1726 | * We mark the page dirty already here so that when freeze is in | ||
1727 | * progress, we are guaranteed that writeback during freezing will | ||
1728 | * see the dirty page and writeprotect it again. | ||
1729 | */ | ||
1730 | set_page_dirty(page); | ||
1731 | out: | ||
1732 | sb_end_pagefault(inode->i_sb); | ||
1733 | return ret; | ||
1734 | } | ||
1735 | EXPORT_SYMBOL(filemap_page_mkwrite); | ||
1736 | |||
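Filesystems that previously open-coded this dirty-and-recheck sequence can now point their vm_ops at the generic helper, as generic_file_vm_ops does just below. A hedged sketch of what that wiring might look like in an imaginary filesystem; the "examplefs_" names are made up, and only filemap_fault() and filemap_page_mkwrite() are real symbols:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical example: an "examplefs" reusing the new generic
     * page_mkwrite helper alongside the existing generic fault handler. */
    static const struct vm_operations_struct examplefs_file_vm_ops = {
        .fault        = filemap_fault,
        .page_mkwrite = filemap_page_mkwrite,
    };

    static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
    {
        file_accessed(file);
        vma->vm_ops = &examplefs_file_vm_ops;
        return 0;
    }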
1754 | const struct vm_operations_struct generic_file_vm_ops = { | 1737 | const struct vm_operations_struct generic_file_vm_ops = { |
1755 | .fault = filemap_fault, | 1738 | .fault = filemap_fault, |
1739 | .page_mkwrite = filemap_page_mkwrite, | ||
1756 | }; | 1740 | }; |
1757 | 1741 | ||
1758 | /* This is used for a general mmap of a disk file */ | 1742 | /* This is used for a general mmap of a disk file */ |
@@ -1938,71 +1922,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
1938 | } | 1922 | } |
1939 | EXPORT_SYMBOL(read_cache_page); | 1923 | EXPORT_SYMBOL(read_cache_page); |
1940 | 1924 | ||
1941 | /* | ||
1942 | * The logic we want is | ||
1943 | * | ||
1944 | * if suid or (sgid and xgrp) | ||
1945 | * remove privs | ||
1946 | */ | ||
1947 | int should_remove_suid(struct dentry *dentry) | ||
1948 | { | ||
1949 | umode_t mode = dentry->d_inode->i_mode; | ||
1950 | int kill = 0; | ||
1951 | |||
1952 | /* suid always must be killed */ | ||
1953 | if (unlikely(mode & S_ISUID)) | ||
1954 | kill = ATTR_KILL_SUID; | ||
1955 | |||
1956 | /* | ||
1957 | * sgid without any exec bits is just a mandatory locking mark; leave | ||
1958 | * it alone. If some exec bits are set, it's a real sgid; kill it. | ||
1959 | */ | ||
1960 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
1961 | kill |= ATTR_KILL_SGID; | ||
1962 | |||
1963 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) | ||
1964 | return kill; | ||
1965 | |||
1966 | return 0; | ||
1967 | } | ||
1968 | EXPORT_SYMBOL(should_remove_suid); | ||
1969 | |||
1970 | static int __remove_suid(struct dentry *dentry, int kill) | ||
1971 | { | ||
1972 | struct iattr newattrs; | ||
1973 | |||
1974 | newattrs.ia_valid = ATTR_FORCE | kill; | ||
1975 | return notify_change(dentry, &newattrs); | ||
1976 | } | ||
1977 | |||
1978 | int file_remove_suid(struct file *file) | ||
1979 | { | ||
1980 | struct dentry *dentry = file->f_path.dentry; | ||
1981 | struct inode *inode = dentry->d_inode; | ||
1982 | int killsuid; | ||
1983 | int killpriv; | ||
1984 | int error = 0; | ||
1985 | |||
1986 | /* Fast path for nothing security related */ | ||
1987 | if (IS_NOSEC(inode)) | ||
1988 | return 0; | ||
1989 | |||
1990 | killsuid = should_remove_suid(dentry); | ||
1991 | killpriv = security_inode_need_killpriv(dentry); | ||
1992 | |||
1993 | if (killpriv < 0) | ||
1994 | return killpriv; | ||
1995 | if (killpriv) | ||
1996 | error = security_inode_killpriv(dentry); | ||
1997 | if (!error && killsuid) | ||
1998 | error = __remove_suid(dentry, killsuid); | ||
1999 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) | ||
2000 | inode->i_flags |= S_NOSEC; | ||
2001 | |||
2002 | return error; | ||
2003 | } | ||
2004 | EXPORT_SYMBOL(file_remove_suid); | ||
2005 | |||
2006 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1925 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
2007 | const struct iovec *iov, size_t base, size_t bytes) | 1926 | const struct iovec *iov, size_t base, size_t bytes) |
2008 | { | 1927 | { |
@@ -2511,8 +2430,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2511 | count = ocount; | 2430 | count = ocount; |
2512 | pos = *ppos; | 2431 | pos = *ppos; |
2513 | 2432 | ||
2514 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
2515 | |||
2516 | /* We can write back this queue in page reclaim */ | 2433 | /* We can write back this queue in page reclaim */ |
2517 | current->backing_dev_info = mapping->backing_dev_info; | 2434 | current->backing_dev_info = mapping->backing_dev_info; |
2518 | written = 0; | 2435 | written = 0; |
@@ -2528,7 +2445,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2528 | if (err) | 2445 | if (err) |
2529 | goto out; | 2446 | goto out; |
2530 | 2447 | ||
2531 | file_update_time(file); | 2448 | err = file_update_time(file); |
2449 | if (err) | ||
2450 | goto out; | ||
2532 | 2451 | ||
2533 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2452 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
2534 | if (unlikely(file->f_flags & O_DIRECT)) { | 2453 | if (unlikely(file->f_flags & O_DIRECT)) { |
@@ -2604,13 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2604 | { | 2523 | { |
2605 | struct file *file = iocb->ki_filp; | 2524 | struct file *file = iocb->ki_filp; |
2606 | struct inode *inode = file->f_mapping->host; | 2525 | struct inode *inode = file->f_mapping->host; |
2607 | struct blk_plug plug; | ||
2608 | ssize_t ret; | 2526 | ssize_t ret; |
2609 | 2527 | ||
2610 | BUG_ON(iocb->ki_pos != pos); | 2528 | BUG_ON(iocb->ki_pos != pos); |
2611 | 2529 | ||
2530 | sb_start_write(inode->i_sb); | ||
2612 | mutex_lock(&inode->i_mutex); | 2531 | mutex_lock(&inode->i_mutex); |
2613 | blk_start_plug(&plug); | ||
2614 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2532 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
2615 | mutex_unlock(&inode->i_mutex); | 2533 | mutex_unlock(&inode->i_mutex); |
2616 | 2534 | ||
@@ -2621,7 +2539,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2621 | if (err < 0 && ret > 0) | 2539 | if (err < 0 && ret > 0) |
2622 | ret = err; | 2540 | ret = err; |
2623 | } | 2541 | } |
2624 | blk_finish_plug(&plug); | 2542 | sb_end_write(inode->i_sb); |
2625 | return ret; | 2543 | return ret; |
2626 | } | 2544 | } |
2627 | EXPORT_SYMBOL(generic_file_aio_write); | 2545 | EXPORT_SYMBOL(generic_file_aio_write); |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a4eb31132229..13e013b1270c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -304,6 +304,7 @@ out: | |||
304 | 304 | ||
305 | static const struct vm_operations_struct xip_file_vm_ops = { | 305 | static const struct vm_operations_struct xip_file_vm_ops = { |
306 | .fault = xip_file_fault, | 306 | .fault = xip_file_fault, |
307 | .page_mkwrite = filemap_page_mkwrite, | ||
307 | }; | 308 | }; |
308 | 309 | ||
309 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 310 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
401 | loff_t pos; | 402 | loff_t pos; |
402 | ssize_t ret; | 403 | ssize_t ret; |
403 | 404 | ||
405 | sb_start_write(inode->i_sb); | ||
406 | |||
404 | mutex_lock(&inode->i_mutex); | 407 | mutex_lock(&inode->i_mutex); |
405 | 408 | ||
406 | if (!access_ok(VERIFY_READ, buf, len)) { | 409 | if (!access_ok(VERIFY_READ, buf, len)) { |
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
411 | pos = *ppos; | 414 | pos = *ppos; |
412 | count = len; | 415 | count = len; |
413 | 416 | ||
414 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
415 | |||
416 | /* We can write back this queue in page reclaim */ | 417 | /* We can write back this queue in page reclaim */ |
417 | current->backing_dev_info = mapping->backing_dev_info; | 418 | current->backing_dev_info = mapping->backing_dev_info; |
418 | 419 | ||
@@ -426,7 +427,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
426 | if (ret) | 427 | if (ret) |
427 | goto out_backing; | 428 | goto out_backing; |
428 | 429 | ||
429 | file_update_time(filp); | 430 | ret = file_update_time(filp); |
431 | if (ret) | ||
432 | goto out_backing; | ||
430 | 433 | ||
431 | ret = __xip_file_write (filp, buf, count, pos, ppos); | 434 | ret = __xip_file_write (filp, buf, count, pos, ppos); |
432 | 435 | ||
@@ -434,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
434 | current->backing_dev_info = NULL; | 437 | current->backing_dev_info = NULL; |
435 | out_up: | 438 | out_up: |
436 | mutex_unlock(&inode->i_mutex); | 439 | mutex_unlock(&inode->i_mutex); |
440 | sb_end_write(inode->i_sb); | ||
437 | return ret; | 441 | return ret; |
438 | } | 442 | } |
439 | EXPORT_SYMBOL_GPL(xip_file_write); | 443 | EXPORT_SYMBOL_GPL(xip_file_write); |
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..6b3e71a2cd48 --- /dev/null +++ b/mm/frontswap.c | |||
@@ -0,0 +1,344 @@ | |||
1 | /* | ||
2 | * Frontswap frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of frontswap. See | ||
6 | * Documentation/vm/frontswap.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
14 | #include <linux/mman.h> | ||
15 | #include <linux/swap.h> | ||
16 | #include <linux/swapops.h> | ||
17 | #include <linux/security.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/debugfs.h> | ||
20 | #include <linux/frontswap.h> | ||
21 | #include <linux/swapfile.h> | ||
22 | |||
23 | /* | ||
24 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
25 | * to the frontswap "backend" implementation functions. | ||
26 | */ | ||
27 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
28 | |||
29 | /* | ||
30 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
31 | * has not been registered, so is preferred to the slower alternative: a | ||
32 | * function call that checks a non-global. | ||
33 | */ | ||
34 | bool frontswap_enabled __read_mostly; | ||
35 | EXPORT_SYMBOL(frontswap_enabled); | ||
36 | |||
37 | /* | ||
38 | * If enabled, frontswap_store will return failure even on success. As | ||
39 | * a result, the swap subsystem will always write the page to swap, in | ||
40 | * effect converting frontswap into a writethrough cache. In this mode, | ||
41 | * there is no direct reduction in swap writes, but a frontswap backend | ||
42 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
43 | * providing increased control over maximum memory usage due to frontswap. | ||
44 | */ | ||
45 | static bool frontswap_writethrough_enabled __read_mostly; | ||
46 | |||
47 | #ifdef CONFIG_DEBUG_FS | ||
48 | /* | ||
49 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
50 | * properly configured). These are for information only so are not protected | ||
51 | * against increment races. | ||
52 | */ | ||
53 | static u64 frontswap_loads; | ||
54 | static u64 frontswap_succ_stores; | ||
55 | static u64 frontswap_failed_stores; | ||
56 | static u64 frontswap_invalidates; | ||
57 | |||
58 | static inline void inc_frontswap_loads(void) { | ||
59 | frontswap_loads++; | ||
60 | } | ||
61 | static inline void inc_frontswap_succ_stores(void) { | ||
62 | frontswap_succ_stores++; | ||
63 | } | ||
64 | static inline void inc_frontswap_failed_stores(void) { | ||
65 | frontswap_failed_stores++; | ||
66 | } | ||
67 | static inline void inc_frontswap_invalidates(void) { | ||
68 | frontswap_invalidates++; | ||
69 | } | ||
70 | #else | ||
71 | static inline void inc_frontswap_loads(void) { } | ||
72 | static inline void inc_frontswap_succ_stores(void) { } | ||
73 | static inline void inc_frontswap_failed_stores(void) { } | ||
74 | static inline void inc_frontswap_invalidates(void) { } | ||
75 | #endif | ||
76 | /* | ||
77 | * Register operations for frontswap, returning the previous ops, thus | ||
78 | * allowing detection of multiple backends and possible nesting. | ||
79 | */ | ||
80 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
81 | { | ||
82 | struct frontswap_ops old = frontswap_ops; | ||
83 | |||
84 | frontswap_ops = *ops; | ||
85 | frontswap_enabled = true; | ||
86 | return old; | ||
87 | } | ||
88 | EXPORT_SYMBOL(frontswap_register_ops); | ||
89 | |||
90 | /* | ||
91 | * Enable/disable frontswap writethrough (see above). | ||
92 | */ | ||
93 | void frontswap_writethrough(bool enable) | ||
94 | { | ||
95 | frontswap_writethrough_enabled = enable; | ||
96 | } | ||
97 | EXPORT_SYMBOL(frontswap_writethrough); | ||
98 | |||
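Taken together with the writethrough toggle above, a backend's init path might look roughly like this. The "example_" callbacks are hypothetical stand-ins for a real backend; only the frontswap_* calls and struct frontswap_ops are real:

    #include <linux/frontswap.h>
    #include <linux/module.h>

    /* Hypothetical backend callbacks -- a real backend supplies these. */
    static void example_init(unsigned type) { }
    static int example_store(unsigned type, pgoff_t offset, struct page *page)
    {
        return -1;              /* pretend the store was rejected */
    }
    static int example_load(unsigned type, pgoff_t offset, struct page *page)
    {
        return -1;
    }
    static void example_invalidate_page(unsigned type, pgoff_t offset) { }
    static void example_invalidate_area(unsigned type) { }

    static struct frontswap_ops example_ops = {
        .init            = example_init,
        .store           = example_store,
        .load            = example_load,
        .invalidate_page = example_invalidate_page,
        .invalidate_area = example_invalidate_area,
    };

    static int __init example_backend_init(void)
    {
        /* A previously registered backend, if any, is returned here. */
        struct frontswap_ops old = frontswap_register_ops(&example_ops);

        (void)old;
        frontswap_writethrough(false);  /* keep the default behaviour */
        return 0;
    }
    module_init(example_backend_init);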
99 | /* | ||
100 | * Called when a swap device is swapon'd. | ||
101 | */ | ||
102 | void __frontswap_init(unsigned type) | ||
103 | { | ||
104 | struct swap_info_struct *sis = swap_info[type]; | ||
105 | |||
106 | BUG_ON(sis == NULL); | ||
107 | if (sis->frontswap_map == NULL) | ||
108 | return; | ||
109 | frontswap_ops.init(type); | ||
110 | } | ||
111 | EXPORT_SYMBOL(__frontswap_init); | ||
112 | |||
113 | static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) | ||
114 | { | ||
115 | frontswap_clear(sis, offset); | ||
116 | atomic_dec(&sis->frontswap_pages); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * "Store" data from a page to frontswap and associate it with the page's | ||
121 | * swaptype and offset. Page must be locked and in the swap cache. | ||
122 | * If frontswap already contains a page with matching swaptype and | ||
123 | * offset, the frontswap implementation may either overwrite the data and | ||
124 | * return success or invalidate the page from frontswap and return failure. | ||
125 | */ | ||
126 | int __frontswap_store(struct page *page) | ||
127 | { | ||
128 | int ret = -1, dup = 0; | ||
129 | swp_entry_t entry = { .val = page_private(page), }; | ||
130 | int type = swp_type(entry); | ||
131 | struct swap_info_struct *sis = swap_info[type]; | ||
132 | pgoff_t offset = swp_offset(entry); | ||
133 | |||
134 | BUG_ON(!PageLocked(page)); | ||
135 | BUG_ON(sis == NULL); | ||
136 | if (frontswap_test(sis, offset)) | ||
137 | dup = 1; | ||
138 | ret = frontswap_ops.store(type, offset, page); | ||
139 | if (ret == 0) { | ||
140 | frontswap_set(sis, offset); | ||
141 | inc_frontswap_succ_stores(); | ||
142 | if (!dup) | ||
143 | atomic_inc(&sis->frontswap_pages); | ||
144 | } else { | ||
145 | /* | ||
146 | * A failed dup always results in automatic invalidation of | ||
147 | * the (older) page from frontswap. | ||
148 | */ | ||
149 | inc_frontswap_failed_stores(); | ||
150 | if (dup) | ||
151 | __frontswap_clear(sis, offset); | ||
152 | } | ||
153 | if (frontswap_writethrough_enabled) | ||
154 | /* report failure so swap also writes to swap device */ | ||
155 | ret = -1; | ||
156 | return ret; | ||
157 | } | ||
158 | EXPORT_SYMBOL(__frontswap_store); | ||
159 | |||
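The bookkeeping around duplicate stores is the subtle part of __frontswap_store(): a successful overwrite must not bump the page count twice, and a failed overwrite must drop the stale older copy. A standalone model of just that accounting (illustrative, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    static long frontswap_pages;   /* models sis->frontswap_pages  */
    static bool page_present;      /* models the frontswap_map bit */

    /* backend_ok says whether the (hypothetical) backend accepted the page */
    static void store(bool backend_ok)
    {
        bool dup = page_present;

        if (backend_ok) {
            page_present = true;
            if (!dup)
                frontswap_pages++;   /* overwrite: count stays the same */
        } else if (dup) {
            /* failed overwrite invalidates the stale older copy */
            page_present = false;
            frontswap_pages--;
        }
    }

    int main(void)
    {
        store(true);   /* first store      */
        store(true);   /* duplicate store  */
        store(false);  /* failed duplicate */
        printf("pages=%ld present=%d\n", frontswap_pages, page_present);
        return 0;
    }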
160 | /* | ||
161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
162 | * specified when the data was put to frontswap and use it to fill the | ||
163 | * specified page with data. Page must be locked and in the swap cache. | ||
164 | */ | ||
165 | int __frontswap_load(struct page *page) | ||
166 | { | ||
167 | int ret = -1; | ||
168 | swp_entry_t entry = { .val = page_private(page), }; | ||
169 | int type = swp_type(entry); | ||
170 | struct swap_info_struct *sis = swap_info[type]; | ||
171 | pgoff_t offset = swp_offset(entry); | ||
172 | |||
173 | BUG_ON(!PageLocked(page)); | ||
174 | BUG_ON(sis == NULL); | ||
175 | if (frontswap_test(sis, offset)) | ||
176 | ret = frontswap_ops.load(type, offset, page); | ||
177 | if (ret == 0) | ||
178 | inc_frontswap_loads(); | ||
179 | return ret; | ||
180 | } | ||
181 | EXPORT_SYMBOL(__frontswap_load); | ||
182 | |||
183 | /* | ||
184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
185 | * and offset so that a subsequent "get" will fail. | ||
186 | */ | ||
187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
188 | { | ||
189 | struct swap_info_struct *sis = swap_info[type]; | ||
190 | |||
191 | BUG_ON(sis == NULL); | ||
192 | if (frontswap_test(sis, offset)) { | ||
193 | frontswap_ops.invalidate_page(type, offset); | ||
194 | __frontswap_clear(sis, offset); | ||
195 | inc_frontswap_invalidates(); | ||
196 | } | ||
197 | } | ||
198 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
199 | |||
200 | /* | ||
201 | * Invalidate all data from frontswap associated with all offsets for the | ||
202 | * specified swaptype. | ||
203 | */ | ||
204 | void __frontswap_invalidate_area(unsigned type) | ||
205 | { | ||
206 | struct swap_info_struct *sis = swap_info[type]; | ||
207 | |||
208 | BUG_ON(sis == NULL); | ||
209 | if (sis->frontswap_map == NULL) | ||
210 | return; | ||
211 | frontswap_ops.invalidate_area(type); | ||
212 | atomic_set(&sis->frontswap_pages, 0); | ||
213 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
214 | } | ||
215 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
216 | |||
217 | static unsigned long __frontswap_curr_pages(void) | ||
218 | { | ||
219 | int type; | ||
220 | unsigned long totalpages = 0; | ||
221 | struct swap_info_struct *si = NULL; | ||
222 | |||
223 | assert_spin_locked(&swap_lock); | ||
224 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
225 | si = swap_info[type]; | ||
226 | totalpages += atomic_read(&si->frontswap_pages); | ||
227 | } | ||
228 | return totalpages; | ||
229 | } | ||
230 | |||
231 | static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | ||
232 | int *swapid) | ||
233 | { | ||
234 | int ret = -EINVAL; | ||
235 | struct swap_info_struct *si = NULL; | ||
236 | int si_frontswap_pages; | ||
237 | unsigned long total_pages_to_unuse = total; | ||
238 | unsigned long pages = 0, pages_to_unuse = 0; | ||
239 | int type; | ||
240 | |||
241 | assert_spin_locked(&swap_lock); | ||
242 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
243 | si = swap_info[type]; | ||
244 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
245 | if (total_pages_to_unuse < si_frontswap_pages) { | ||
246 | pages = pages_to_unuse = total_pages_to_unuse; | ||
247 | } else { | ||
248 | pages = si_frontswap_pages; | ||
249 | pages_to_unuse = 0; /* unuse all */ | ||
250 | } | ||
251 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
252 | if (security_vm_enough_memory_mm(current->mm, pages)) { | ||
253 | ret = -ENOMEM; | ||
254 | continue; | ||
255 | } | ||
256 | vm_unacct_memory(pages); | ||
257 | *unused = pages_to_unuse; | ||
258 | *swapid = type; | ||
259 | ret = 0; | ||
260 | break; | ||
261 | } | ||
262 | |||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | static int __frontswap_shrink(unsigned long target_pages, | ||
267 | unsigned long *pages_to_unuse, | ||
268 | int *type) | ||
269 | { | ||
270 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
271 | |||
272 | assert_spin_locked(&swap_lock); | ||
273 | |||
274 | total_pages = __frontswap_curr_pages(); | ||
275 | if (total_pages <= target_pages) { | ||
276 | /* Nothing to do */ | ||
277 | *pages_to_unuse = 0; | ||
278 | return 0; | ||
279 | } | ||
280 | total_pages_to_unuse = total_pages - target_pages; | ||
281 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
286 | * under certain circumstances; "shrink" frontswap is essentially a | ||
287 | * "partial swapoff" and works by calling try_to_unuse to unuse | ||
288 | * enough frontswap pages, subject to memory constraints, to | ||
289 | * reduce the number of pages in frontswap to the number given | ||
290 | * in the parameter target_pages. | ||
291 | */ | ||
292 | void frontswap_shrink(unsigned long target_pages) | ||
293 | { | ||
294 | unsigned long pages_to_unuse = 0; | ||
295 | int type, ret; | ||
296 | |||
297 | /* | ||
298 | * we don't want to hold swap_lock while doing a very | ||
299 | * lengthy try_to_unuse, but swap_list may change | ||
300 | * so restart scan from swap_list.head each time | ||
301 | */ | ||
302 | spin_lock(&swap_lock); | ||
303 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | ||
304 | spin_unlock(&swap_lock); | ||
305 | if (ret == 0 && pages_to_unuse) | ||
306 | try_to_unuse(type, true, pages_to_unuse); | ||
307 | return; | ||
308 | } | ||
309 | EXPORT_SYMBOL(frontswap_shrink); | ||
310 | |||
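A backend that wants to bound its footprint could combine the two exported helpers above. A hedged sketch; the 50% policy is made up, and only frontswap_curr_pages() and frontswap_shrink() are real:

    #include <linux/frontswap.h>

    /* Hypothetical pressure handler in a backend: give back half of the
     * pages currently held by frontswap.  The policy is illustrative only. */
    static void example_backend_under_pressure(void)
    {
        unsigned long cur = frontswap_curr_pages();

        if (cur)
            frontswap_shrink(cur / 2);   /* shrink *to* cur/2 pages */
    }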
311 | /* | ||
312 | * Count and return the number of frontswap pages across all | ||
313 | * swap devices. This is exported so that backend drivers can | ||
314 | * determine current usage without reading debugfs. | ||
315 | */ | ||
316 | unsigned long frontswap_curr_pages(void) | ||
317 | { | ||
318 | unsigned long totalpages = 0; | ||
319 | |||
320 | spin_lock(&swap_lock); | ||
321 | totalpages = __frontswap_curr_pages(); | ||
322 | spin_unlock(&swap_lock); | ||
323 | |||
324 | return totalpages; | ||
325 | } | ||
326 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
327 | |||
328 | static int __init init_frontswap(void) | ||
329 | { | ||
330 | #ifdef CONFIG_DEBUG_FS | ||
331 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
332 | if (root == NULL) | ||
333 | return -ENXIO; | ||
334 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
335 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
336 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
337 | &frontswap_failed_stores); | ||
338 | debugfs_create_u64("invalidates", S_IRUGO, | ||
339 | root, &frontswap_invalidates); | ||
340 | #endif | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | module_init(init_frontswap); | ||
diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6250c3..d517cd16a6eb 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) | 94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) |
95 | #endif | 95 | #endif |
96 | 96 | ||
97 | struct page *kmap_to_page(void *vaddr) | ||
98 | { | ||
99 | unsigned long addr = (unsigned long)vaddr; | ||
100 | |||
101 | if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { | ||
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | ||
103 | return pte_page(pkmap_page_table[i]); | ||
104 | } | ||
105 | |||
106 | return virt_to_page(addr); | ||
107 | } | ||
108 | |||
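The pkmap branch of kmap_to_page() is just address arithmetic; a user-space model of the index computation follows, with the base address and PAGE_SHIFT below as illustrative placeholders for PKMAP_ADDR(0) and the real page size:

    #include <stdio.h>

    #define PAGE_SHIFT   12                 /* illustrative: 4 KiB pages  */
    #define PKMAP_BASE   0xff800000UL       /* illustrative pkmap window  */

    int main(void)
    {
        unsigned long vaddr = PKMAP_BASE + 5 * (1UL << PAGE_SHIFT) + 123;
        unsigned long idx   = (vaddr - PKMAP_BASE) >> PAGE_SHIFT;

        /* idx selects the pkmap_page_table[] slot whose pte gives the page */
        printf("vaddr %#lx -> pkmap slot %lu\n", vaddr, idx);
        return 0;
    }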
97 | static void flush_all_zero_pkmaps(void) | 109 | static void flush_all_zero_pkmaps(void) |
98 | { | 110 | { |
99 | int i; | 111 | int i; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb55..57c4b9309015 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
636 | unsigned long haddr, pmd_t *pmd, | 636 | unsigned long haddr, pmd_t *pmd, |
637 | struct page *page) | 637 | struct page *page) |
638 | { | 638 | { |
639 | int ret = 0; | ||
640 | pgtable_t pgtable; | 639 | pgtable_t pgtable; |
641 | 640 | ||
642 | VM_BUG_ON(!PageCompound(page)); | 641 | VM_BUG_ON(!PageCompound(page)); |
643 | pgtable = pte_alloc_one(mm, haddr); | 642 | pgtable = pte_alloc_one(mm, haddr); |
644 | if (unlikely(!pgtable)) { | 643 | if (unlikely(!pgtable)) |
645 | mem_cgroup_uncharge_page(page); | ||
646 | put_page(page); | ||
647 | return VM_FAULT_OOM; | 644 | return VM_FAULT_OOM; |
648 | } | ||
649 | 645 | ||
650 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 646 | clear_huge_page(page, haddr, HPAGE_PMD_NR); |
651 | __SetPageUptodate(page); | 647 | __SetPageUptodate(page); |
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
675 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
676 | } | 672 | } |
677 | 673 | ||
678 | return ret; | 674 | return 0; |
679 | } | 675 | } |
680 | 676 | ||
681 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | 677 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
724 | put_page(page); | 720 | put_page(page); |
725 | goto out; | 721 | goto out; |
726 | } | 722 | } |
723 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | ||
724 | page))) { | ||
725 | mem_cgroup_uncharge_page(page); | ||
726 | put_page(page); | ||
727 | goto out; | ||
728 | } | ||
727 | 729 | ||
728 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | 730 | return 0; |
729 | } | 731 | } |
730 | out: | 732 | out: |
731 | /* | 733 | /* |
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
950 | count_vm_event(THP_FAULT_FALLBACK); | 952 | count_vm_event(THP_FAULT_FALLBACK); |
951 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 953 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
952 | pmd, orig_pmd, page, haddr); | 954 | pmd, orig_pmd, page, haddr); |
955 | if (ret & VM_FAULT_OOM) | ||
956 | split_huge_page(page); | ||
953 | put_page(page); | 957 | put_page(page); |
954 | goto out; | 958 | goto out; |
955 | } | 959 | } |
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
957 | 961 | ||
958 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 962 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
959 | put_page(new_page); | 963 | put_page(new_page); |
964 | split_huge_page(page); | ||
960 | put_page(page); | 965 | put_page(page); |
961 | ret |= VM_FAULT_OOM; | 966 | ret |= VM_FAULT_OOM; |
962 | goto out; | 967 | goto out; |
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
968 | spin_lock(&mm->page_table_lock); | 973 | spin_lock(&mm->page_table_lock); |
969 | put_page(page); | 974 | put_page(page); |
970 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | ||
971 | mem_cgroup_uncharge_page(new_page); | 977 | mem_cgroup_uncharge_page(new_page); |
972 | put_page(new_page); | 978 | put_page(new_page); |
979 | goto out; | ||
973 | } else { | 980 | } else { |
974 | pmd_t entry; | 981 | pmd_t entry; |
975 | VM_BUG_ON(!PageHead(page)); | 982 | VM_BUG_ON(!PageHead(page)); |
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page) | |||
1224 | { | 1231 | { |
1225 | int i; | 1232 | int i; |
1226 | struct zone *zone = page_zone(page); | 1233 | struct zone *zone = page_zone(page); |
1234 | struct lruvec *lruvec; | ||
1227 | int tail_count = 0; | 1235 | int tail_count = 0; |
1228 | 1236 | ||
1229 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1237 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1230 | spin_lock_irq(&zone->lru_lock); | 1238 | spin_lock_irq(&zone->lru_lock); |
1239 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1240 | |||
1231 | compound_lock(page); | 1241 | compound_lock(page); |
1232 | /* complete memcg works before add pages to LRU */ | 1242 | /* complete memcg works before add pages to LRU */ |
1233 | mem_cgroup_split_huge_fixup(page); | 1243 | mem_cgroup_split_huge_fixup(page); |
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page) | |||
1302 | BUG_ON(!PageDirty(page_tail)); | 1312 | BUG_ON(!PageDirty(page_tail)); |
1303 | BUG_ON(!PageSwapBacked(page_tail)); | 1313 | BUG_ON(!PageSwapBacked(page_tail)); |
1304 | 1314 | ||
1305 | 1315 | lru_add_page_tail(page, page_tail, lruvec); | |
1306 | lru_add_page_tail(zone, page, page_tail); | ||
1307 | } | 1316 | } |
1308 | atomic_sub(tail_count, &page->_count); | 1317 | atomic_sub(tail_count, &page->_count); |
1309 | BUG_ON(atomic_read(&page->_count) <= 0); | 1318 | BUG_ON(atomic_read(&page->_count) <= 0); |
1310 | 1319 | ||
1311 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1320 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
1312 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1321 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
1313 | 1322 | ||
1314 | ClearPageCompound(page); | 1323 | ClearPageCompound(page); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b8ce6f450956..bc727122dd44 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,17 +24,20 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <linux/io.h> | 27 | #include <asm/tlb.h> |
28 | 28 | ||
29 | #include <linux/io.h> | ||
29 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | ||
30 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
31 | #include "internal.h" | 34 | #include "internal.h" |
32 | 35 | ||
33 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 37 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
36 | 39 | ||
37 | static int max_hstate; | 40 | int hugetlb_max_hstate __read_mostly; |
38 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
39 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
40 | 43 | ||
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate; | |||
45 | static unsigned long __initdata default_hstate_max_huge_pages; | 48 | static unsigned long __initdata default_hstate_max_huge_pages; |
46 | static unsigned long __initdata default_hstate_size; | 49 | static unsigned long __initdata default_hstate_size; |
47 | 50 | ||
48 | #define for_each_hstate(h) \ | ||
49 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
50 | |||
51 | /* | 51 | /* |
52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
57 | { | 57 | { |
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t) | |||
273 | 273 | ||
274 | /* Locate each segment we overlap with, and count that overlap. */ | 274 | /* Locate each segment we overlap with, and count that overlap. */ |
275 | list_for_each_entry(rg, head, link) { | 275 | list_for_each_entry(rg, head, link) { |
276 | int seg_from; | 276 | long seg_from; |
277 | int seg_to; | 277 | long seg_to; |
278 | 278 | ||
279 | if (rg->to <= f) | 279 | if (rg->to <= f) |
280 | continue; | 280 | continue; |
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src) | |||
509 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 509 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
510 | { | 510 | { |
511 | int nid = page_to_nid(page); | 511 | int nid = page_to_nid(page); |
512 | list_add(&page->lru, &h->hugepage_freelists[nid]); | 512 | list_move(&page->lru, &h->hugepage_freelists[nid]); |
513 | h->free_huge_pages++; | 513 | h->free_huge_pages++; |
514 | h->free_huge_pages_node[nid]++; | 514 | h->free_huge_pages_node[nid]++; |
515 | } | 515 | } |
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
521 | if (list_empty(&h->hugepage_freelists[nid])) | 521 | if (list_empty(&h->hugepage_freelists[nid])) |
522 | return NULL; | 522 | return NULL; |
523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | 523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); |
524 | list_del(&page->lru); | 524 | list_move(&page->lru, &h->hugepage_activelist); |
525 | set_page_refcounted(page); | 525 | set_page_refcounted(page); |
526 | h->free_huge_pages--; | 526 | h->free_huge_pages--; |
527 | h->free_huge_pages_node[nid]--; | 527 | h->free_huge_pages_node[nid]--; |
@@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
532 | struct vm_area_struct *vma, | 532 | struct vm_area_struct *vma, |
533 | unsigned long address, int avoid_reserve) | 533 | unsigned long address, int avoid_reserve) |
534 | { | 534 | { |
535 | struct page *page; | 535 | struct page *page = NULL; |
536 | struct mempolicy *mpol; | 536 | struct mempolicy *mpol; |
537 | nodemask_t *nodemask; | 537 | nodemask_t *nodemask; |
538 | struct zonelist *zonelist; | 538 | struct zonelist *zonelist; |
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
593 | 1 << PG_active | 1 << PG_reserved | | 593 | 1 << PG_active | 1 << PG_reserved | |
594 | 1 << PG_private | 1 << PG_writeback); | 594 | 1 << PG_private | 1 << PG_writeback); |
595 | } | 595 | } |
596 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | ||
596 | set_compound_page_dtor(page, NULL); | 597 | set_compound_page_dtor(page, NULL); |
597 | set_page_refcounted(page); | 598 | set_page_refcounted(page); |
598 | arch_release_hugepage(page); | 599 | arch_release_hugepage(page); |
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page) | |||
625 | page->mapping = NULL; | 626 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 627 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 628 | BUG_ON(page_mapcount(page)); |
628 | INIT_LIST_HEAD(&page->lru); | ||
629 | 629 | ||
630 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
631 | hugetlb_cgroup_uncharge_page(hstate_index(h), | ||
632 | pages_per_huge_page(h), page); | ||
631 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 633 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
634 | /* remove the page from active list */ | ||
635 | list_del(&page->lru); | ||
632 | update_and_free_page(h, page); | 636 | update_and_free_page(h, page); |
633 | h->surplus_huge_pages--; | 637 | h->surplus_huge_pages--; |
634 | h->surplus_huge_pages_node[nid]--; | 638 | h->surplus_huge_pages_node[nid]--; |
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page) | |||
641 | 645 | ||
642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 646 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
643 | { | 647 | { |
648 | INIT_LIST_HEAD(&page->lru); | ||
644 | set_compound_page_dtor(page, free_huge_page); | 649 | set_compound_page_dtor(page, free_huge_page); |
645 | spin_lock(&hugetlb_lock); | 650 | spin_lock(&hugetlb_lock); |
651 | set_hugetlb_cgroup(page, NULL); | ||
646 | h->nr_huge_pages++; | 652 | h->nr_huge_pages++; |
647 | h->nr_huge_pages_node[nid]++; | 653 | h->nr_huge_pages_node[nid]++; |
648 | spin_unlock(&hugetlb_lock); | 654 | spin_unlock(&hugetlb_lock); |
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
889 | 895 | ||
890 | spin_lock(&hugetlb_lock); | 896 | spin_lock(&hugetlb_lock); |
891 | if (page) { | 897 | if (page) { |
898 | INIT_LIST_HEAD(&page->lru); | ||
892 | r_nid = page_to_nid(page); | 899 | r_nid = page_to_nid(page); |
893 | set_compound_page_dtor(page, free_huge_page); | 900 | set_compound_page_dtor(page, free_huge_page); |
901 | set_hugetlb_cgroup(page, NULL); | ||
894 | /* | 902 | /* |
895 | * We incremented the global counters already | 903 | * We incremented the global counters already |
896 | */ | 904 | */ |
@@ -993,7 +1001,6 @@ retry: | |||
993 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1001 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
994 | if ((--needed) < 0) | 1002 | if ((--needed) < 0) |
995 | break; | 1003 | break; |
996 | list_del(&page->lru); | ||
997 | /* | 1004 | /* |
998 | * This page is now managed by the hugetlb allocator and has | 1005 | * This page is now managed by the hugetlb allocator and has |
999 | * no users -- drop the buddy allocator's reference. | 1006 | * no users -- drop the buddy allocator's reference. |
@@ -1008,7 +1015,6 @@ free: | |||
1008 | /* Free unnecessary surplus pages to the buddy allocator */ | 1015 | /* Free unnecessary surplus pages to the buddy allocator */ |
1009 | if (!list_empty(&surplus_list)) { | 1016 | if (!list_empty(&surplus_list)) { |
1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1017 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
1011 | list_del(&page->lru); | ||
1012 | put_page(page); | 1018 | put_page(page); |
1013 | } | 1019 | } |
1014 | } | 1020 | } |
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1112 | struct hstate *h = hstate_vma(vma); | 1118 | struct hstate *h = hstate_vma(vma); |
1113 | struct page *page; | 1119 | struct page *page; |
1114 | long chg; | 1120 | long chg; |
1121 | int ret, idx; | ||
1122 | struct hugetlb_cgroup *h_cg; | ||
1115 | 1123 | ||
1124 | idx = hstate_index(h); | ||
1116 | /* | 1125 | /* |
1117 | * Processes that did not create the mapping will have no | 1126 | * Processes that did not create the mapping will have no |
1118 | * reserves and will not have accounted against subpool | 1127 | * reserves and will not have accounted against subpool |
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1123 | */ | 1132 | */ |
1124 | chg = vma_needs_reservation(h, vma, addr); | 1133 | chg = vma_needs_reservation(h, vma, addr); |
1125 | if (chg < 0) | 1134 | if (chg < 0) |
1126 | return ERR_PTR(-VM_FAULT_OOM); | 1135 | return ERR_PTR(-ENOMEM); |
1127 | if (chg) | 1136 | if (chg) |
1128 | if (hugepage_subpool_get_pages(spool, chg)) | 1137 | if (hugepage_subpool_get_pages(spool, chg)) |
1129 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1138 | return ERR_PTR(-ENOSPC); |
1130 | 1139 | ||
1140 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | ||
1141 | if (ret) { | ||
1142 | hugepage_subpool_put_pages(spool, chg); | ||
1143 | return ERR_PTR(-ENOSPC); | ||
1144 | } | ||
1131 | spin_lock(&hugetlb_lock); | 1145 | spin_lock(&hugetlb_lock); |
1132 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1146 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
1133 | spin_unlock(&hugetlb_lock); | 1147 | if (page) { |
1134 | 1148 | /* update page cgroup details */ | |
1135 | if (!page) { | 1149 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), |
1150 | h_cg, page); | ||
1151 | spin_unlock(&hugetlb_lock); | ||
1152 | } else { | ||
1153 | spin_unlock(&hugetlb_lock); | ||
1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1154 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1137 | if (!page) { | 1155 | if (!page) { |
1156 | hugetlb_cgroup_uncharge_cgroup(idx, | ||
1157 | pages_per_huge_page(h), | ||
1158 | h_cg); | ||
1138 | hugepage_subpool_put_pages(spool, chg); | 1159 | hugepage_subpool_put_pages(spool, chg); |
1139 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1160 | return ERR_PTR(-ENOSPC); |
1140 | } | 1161 | } |
1162 | spin_lock(&hugetlb_lock); | ||
1163 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1164 | h_cg, page); | ||
1165 | list_move(&page->lru, &h->hugepage_activelist); | ||
1166 | spin_unlock(&hugetlb_lock); | ||
1141 | } | 1167 | } |
1142 | 1168 | ||
1143 | set_page_private(page, (unsigned long)spool); | 1169 | set_page_private(page, (unsigned long)spool); |
1144 | 1170 | ||
1145 | vma_commit_reservation(h, vma, addr); | 1171 | vma_commit_reservation(h, vma, addr); |
1146 | |||
1147 | return page; | 1172 | return page; |
1148 | } | 1173 | } |
1149 | 1174 | ||
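The reworked alloc_huge_page() follows a charge-first pattern: reserve from the subpool and the hugetlb cgroup, try the free list, fall back to the buddy allocator, and unwind both reservations in reverse order if everything fails. A rough standalone model of that acquire/rollback ordering (illustrative only, no kernel APIs):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy resource model: each step either succeeds or forces a rollback
     * of everything acquired so far, mirroring the error paths above. */
    static bool get_subpool(void)      { return true;  }
    static void put_subpool(void)      { puts("subpool released"); }
    static bool charge_cgroup(void)    { return true;  }
    static void uncharge_cgroup(void)  { puts("cgroup uncharged"); }
    static bool dequeue_fast(void)     { return false; }  /* free list empty  */
    static bool alloc_fallback(void)   { return false; }  /* buddy also fails */

    static int alloc_model(void)
    {
        if (!get_subpool())
            return -1;
        if (!charge_cgroup()) {
            put_subpool();
            return -1;
        }
        if (dequeue_fast() || alloc_fallback())
            return 0;            /* commit the charge to the page */
        uncharge_cgroup();       /* fallback failed: unwind...    */
        put_subpool();           /* ...in reverse order           */
        return -1;
    }

    int main(void)
    {
        printf("alloc_model() = %d\n", alloc_model());
        return 0;
    }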
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |||
1646 | struct attribute_group *hstate_attr_group) | 1671 | struct attribute_group *hstate_attr_group) |
1647 | { | 1672 | { |
1648 | int retval; | 1673 | int retval; |
1649 | int hi = h - hstates; | 1674 | int hi = hstate_index(h); |
1650 | 1675 | ||
1651 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | 1676 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1652 | if (!hstate_kobjs[hi]) | 1677 | if (!hstate_kobjs[hi]) |
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node) | |||
1741 | if (!nhs->hugepages_kobj) | 1766 | if (!nhs->hugepages_kobj) |
1742 | return; /* no hstate attributes */ | 1767 | return; /* no hstate attributes */ |
1743 | 1768 | ||
1744 | for_each_hstate(h) | 1769 | for_each_hstate(h) { |
1745 | if (nhs->hstate_kobjs[h - hstates]) { | 1770 | int idx = hstate_index(h); |
1746 | kobject_put(nhs->hstate_kobjs[h - hstates]); | 1771 | if (nhs->hstate_kobjs[idx]) { |
1747 | nhs->hstate_kobjs[h - hstates] = NULL; | 1772 | kobject_put(nhs->hstate_kobjs[idx]); |
1773 | nhs->hstate_kobjs[idx] = NULL; | ||
1748 | } | 1774 | } |
1775 | } | ||
1749 | 1776 | ||
1750 | kobject_put(nhs->hugepages_kobj); | 1777 | kobject_put(nhs->hugepages_kobj); |
1751 | nhs->hugepages_kobj = NULL; | 1778 | nhs->hugepages_kobj = NULL; |
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void) | |||
1848 | hugetlb_unregister_all_nodes(); | 1875 | hugetlb_unregister_all_nodes(); |
1849 | 1876 | ||
1850 | for_each_hstate(h) { | 1877 | for_each_hstate(h) { |
1851 | kobject_put(hstate_kobjs[h - hstates]); | 1878 | kobject_put(hstate_kobjs[hstate_index(h)]); |
1852 | } | 1879 | } |
1853 | 1880 | ||
1854 | kobject_put(hugepages_kobj); | 1881 | kobject_put(hugepages_kobj); |
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void) | |||
1869 | if (!size_to_hstate(default_hstate_size)) | 1896 | if (!size_to_hstate(default_hstate_size)) |
1870 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | 1897 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
1871 | } | 1898 | } |
1872 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | 1899 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
1873 | if (default_hstate_max_huge_pages) | 1900 | if (default_hstate_max_huge_pages) |
1874 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1901 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1875 | 1902 | ||
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1897 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); |
1898 | return; | 1925 | return; |
1899 | } | 1926 | } |
1900 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
1901 | BUG_ON(order == 0); | 1928 | BUG_ON(order == 0); |
1902 | h = &hstates[max_hstate++]; | 1929 | h = &hstates[hugetlb_max_hstate++]; |
1903 | h->order = order; | 1930 | h->order = order; |
1904 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | 1931 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); |
1905 | h->nr_huge_pages = 0; | 1932 | h->nr_huge_pages = 0; |
1906 | h->free_huge_pages = 0; | 1933 | h->free_huge_pages = 0; |
1907 | for (i = 0; i < MAX_NUMNODES; ++i) | 1934 | for (i = 0; i < MAX_NUMNODES; ++i) |
1908 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1935 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1936 | INIT_LIST_HEAD(&h->hugepage_activelist); | ||
1909 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1937 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1910 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1938 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1911 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1939 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1912 | huge_page_size(h)/1024); | 1940 | huge_page_size(h)/1024); |
1941 | /* | ||
1942 | * Add cgroup control files only if the huge page consists | ||
1943 | * of more than two normal pages. This is because we use | ||
1944 | * page[2].lru.next for storing cgroup details. | ||
1945 | */ | ||
1946 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1947 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1913 | 1948 | ||
1914 | parsed_hstate = h; | 1949 | parsed_hstate = h; |
1915 | } | 1950 | } |
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1920 | static unsigned long *last_mhp; | 1955 | static unsigned long *last_mhp; |
1921 | 1956 | ||
1922 | /* | 1957 | /* |
1923 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | 1958 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
1924 | * so this hugepages= parameter goes to the "default hstate". | 1959 | * so this hugepages= parameter goes to the "default hstate". |
1925 | */ | 1960 | */ |
1926 | if (!max_hstate) | 1961 | if (!hugetlb_max_hstate) |
1927 | mhp = &default_hstate_max_huge_pages; | 1962 | mhp = &default_hstate_max_huge_pages; |
1928 | else | 1963 | else |
1929 | mhp = &parsed_hstate->max_huge_pages; | 1964 | mhp = &parsed_hstate->max_huge_pages; |
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1942 | * But we need to allocate >= MAX_ORDER hstates here early to still | 1977 | * But we need to allocate >= MAX_ORDER hstates here early to still |
1943 | * use the bootmem allocator. | 1978 | * use the bootmem allocator. |
1944 | */ | 1979 | */ |
1945 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | 1980 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) |
1946 | hugetlb_hstate_alloc_pages(parsed_hstate); | 1981 | hugetlb_hstate_alloc_pages(parsed_hstate); |
1947 | 1982 | ||
1948 | last_mhp = mhp; | 1983 | last_mhp = mhp; |
@@ -2157,6 +2192,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2157 | kref_get(&reservations->refs); | 2192 | kref_get(&reservations->refs); |
2158 | } | 2193 | } |
2159 | 2194 | ||
2195 | static void resv_map_put(struct vm_area_struct *vma) | ||
2196 | { | ||
2197 | struct resv_map *reservations = vma_resv_map(vma); | ||
2198 | |||
2199 | if (!reservations) | ||
2200 | return; | ||
2201 | kref_put(&reservations->refs, resv_map_release); | ||
2202 | } | ||
2203 | |||
2160 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 2204 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
2161 | { | 2205 | { |
2162 | struct hstate *h = hstate_vma(vma); | 2206 | struct hstate *h = hstate_vma(vma); |
@@ -2173,7 +2217,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2173 | reserve = (end - start) - | 2217 | reserve = (end - start) - |
2174 | region_count(&reservations->regions, start, end); | 2218 | region_count(&reservations->regions, start, end); |
2175 | 2219 | ||
2176 | kref_put(&reservations->refs, resv_map_release); | 2220 | resv_map_put(vma); |
2177 | 2221 | ||
2178 | if (reserve) { | 2222 | if (reserve) { |
2179 | hugetlb_acct_memory(h, -reserve); | 2223 | hugetlb_acct_memory(h, -reserve); |
@@ -2213,6 +2257,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | |||
2213 | } | 2257 | } |
2214 | entry = pte_mkyoung(entry); | 2258 | entry = pte_mkyoung(entry); |
2215 | entry = pte_mkhuge(entry); | 2259 | entry = pte_mkhuge(entry); |
2260 | entry = arch_make_huge_pte(entry, vma, page, writable); | ||
2216 | 2261 | ||
2217 | return entry; | 2262 | return entry; |
2218 | } | 2263 | } |
@@ -2298,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2298 | return 0; | 2343 | return 0; |
2299 | } | 2344 | } |
2300 | 2345 | ||
2301 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2346 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2302 | unsigned long end, struct page *ref_page) | 2347 | unsigned long start, unsigned long end, |
2348 | struct page *ref_page) | ||
2303 | { | 2349 | { |
2350 | int force_flush = 0; | ||
2304 | struct mm_struct *mm = vma->vm_mm; | 2351 | struct mm_struct *mm = vma->vm_mm; |
2305 | unsigned long address; | 2352 | unsigned long address; |
2306 | pte_t *ptep; | 2353 | pte_t *ptep; |
2307 | pte_t pte; | 2354 | pte_t pte; |
2308 | struct page *page; | 2355 | struct page *page; |
2309 | struct page *tmp; | ||
2310 | struct hstate *h = hstate_vma(vma); | 2356 | struct hstate *h = hstate_vma(vma); |
2311 | unsigned long sz = huge_page_size(h); | 2357 | unsigned long sz = huge_page_size(h); |
2312 | 2358 | ||
2313 | /* | ||
2314 | * A page gathering list, protected by per file i_mmap_mutex. The | ||
2315 | * lock is used to avoid list corruption from multiple unmapping | ||
2316 | * of the same page since we are using page->lru. | ||
2317 | */ | ||
2318 | LIST_HEAD(page_list); | ||
2319 | |||
2320 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2359 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2321 | BUG_ON(start & ~huge_page_mask(h)); | 2360 | BUG_ON(start & ~huge_page_mask(h)); |
2322 | BUG_ON(end & ~huge_page_mask(h)); | 2361 | BUG_ON(end & ~huge_page_mask(h)); |
2323 | 2362 | ||
2363 | tlb_start_vma(tlb, vma); | ||
2324 | mmu_notifier_invalidate_range_start(mm, start, end); | 2364 | mmu_notifier_invalidate_range_start(mm, start, end); |
2365 | again: | ||
2325 | spin_lock(&mm->page_table_lock); | 2366 | spin_lock(&mm->page_table_lock); |
2326 | for (address = start; address < end; address += sz) { | 2367 | for (address = start; address < end; address += sz) { |
2327 | ptep = huge_pte_offset(mm, address); | 2368 | ptep = huge_pte_offset(mm, address); |
@@ -2360,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2360 | } | 2401 | } |
2361 | 2402 | ||
2362 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2403 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2404 | tlb_remove_tlb_entry(tlb, ptep, address); | ||
2363 | if (pte_dirty(pte)) | 2405 | if (pte_dirty(pte)) |
2364 | set_page_dirty(page); | 2406 | set_page_dirty(page); |
2365 | list_add(&page->lru, &page_list); | ||
2366 | 2407 | ||
2408 | page_remove_rmap(page); | ||
2409 | force_flush = !__tlb_remove_page(tlb, page); | ||
2410 | if (force_flush) | ||
2411 | break; | ||
2367 | /* Bail out after unmapping reference page if supplied */ | 2412 | /* Bail out after unmapping reference page if supplied */ |
2368 | if (ref_page) | 2413 | if (ref_page) |
2369 | break; | 2414 | break; |
2370 | } | 2415 | } |
2371 | flush_tlb_range(vma, start, end); | ||
2372 | spin_unlock(&mm->page_table_lock); | 2416 | spin_unlock(&mm->page_table_lock); |
2373 | mmu_notifier_invalidate_range_end(mm, start, end); | 2417 | /* |
2374 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2418 | * mmu_gather ran out of room to batch pages, so we break out of |
2375 | page_remove_rmap(page); | 2419 | * the PTE lock to avoid doing the potentially expensive TLB invalidate |
2376 | list_del(&page->lru); | 2420 | * and page-free while holding it. |
2377 | put_page(page); | 2421 | */ |
2422 | if (force_flush) { | ||
2423 | force_flush = 0; | ||
2424 | tlb_flush_mmu(tlb); | ||
2425 | if (address < end && !ref_page) | ||
2426 | goto again; | ||
2378 | } | 2427 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
2429 | tlb_end_vma(tlb, vma); | ||
2430 | } | ||
2431 | |||
2432 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
2433 | struct vm_area_struct *vma, unsigned long start, | ||
2434 | unsigned long end, struct page *ref_page) | ||
2435 | { | ||
2436 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | ||
2437 | |||
2438 | /* | ||
2439 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2440 | * test will fail on a vma being torn down, and not grab a page table | ||
2441 | * on its way out. We're lucky that the flag has such an appropriate | ||
2442 | * name, and can in fact be safely cleared here. We could clear it | ||
2443 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2444 | * is to clear it before releasing the i_mmap_mutex. This works | ||
2445 | * because in the context this is called, the VMA is about to be | ||
2446 | * destroyed and the i_mmap_mutex is held. | ||
2447 | */ | ||
2448 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2379 | } | 2449 | } |
2380 | 2450 | ||
2381 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2451 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2382 | unsigned long end, struct page *ref_page) | 2452 | unsigned long end, struct page *ref_page) |
2383 | { | 2453 | { |
2384 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2454 | struct mm_struct *mm; |
2385 | __unmap_hugepage_range(vma, start, end, ref_page); | 2455 | struct mmu_gather tlb; |
2386 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2456 | |
2457 | mm = vma->vm_mm; | ||
2458 | |||
2459 | tlb_gather_mmu(&tlb, mm, 0); | ||
2460 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | ||
2461 | tlb_finish_mmu(&tlb, start, end); | ||
2387 | } | 2462 | } |
2388 | 2463 | ||
2389 | /* | 2464 | /* |
@@ -2428,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2428 | * from the time of fork. This would look like data corruption | 2503 | * from the time of fork. This would look like data corruption |
2429 | */ | 2504 | */ |
2430 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2505 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
2431 | __unmap_hugepage_range(iter_vma, | 2506 | unmap_hugepage_range(iter_vma, address, |
2432 | address, address + huge_page_size(h), | 2507 | address + huge_page_size(h), page); |
2433 | page); | ||
2434 | } | 2508 | } |
2435 | mutex_unlock(&mapping->i_mmap_mutex); | 2509 | mutex_unlock(&mapping->i_mmap_mutex); |
2436 | 2510 | ||
@@ -2486,6 +2560,7 @@ retry_avoidcopy: | |||
2486 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2560 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2487 | 2561 | ||
2488 | if (IS_ERR(new_page)) { | 2562 | if (IS_ERR(new_page)) { |
2563 | long err = PTR_ERR(new_page); | ||
2489 | page_cache_release(old_page); | 2564 | page_cache_release(old_page); |
2490 | 2565 | ||
2491 | /* | 2566 | /* |
@@ -2498,7 +2573,6 @@ retry_avoidcopy: | |||
2498 | if (outside_reserve) { | 2573 | if (outside_reserve) { |
2499 | BUG_ON(huge_pte_none(pte)); | 2574 | BUG_ON(huge_pte_none(pte)); |
2500 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2575 | if (unmap_ref_private(mm, vma, old_page, address)) { |
2501 | BUG_ON(page_count(old_page) != 1); | ||
2502 | BUG_ON(huge_pte_none(pte)); | 2576 | BUG_ON(huge_pte_none(pte)); |
2503 | spin_lock(&mm->page_table_lock); | 2577 | spin_lock(&mm->page_table_lock); |
2504 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2578 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
@@ -2515,7 +2589,10 @@ retry_avoidcopy: | |||
2515 | 2589 | ||
2516 | /* Caller expects lock to be held */ | 2590 | /* Caller expects lock to be held */ |
2517 | spin_lock(&mm->page_table_lock); | 2591 | spin_lock(&mm->page_table_lock); |
2518 | return -PTR_ERR(new_page); | 2592 | if (err == -ENOMEM) |
2593 | return VM_FAULT_OOM; | ||
2594 | else | ||
2595 | return VM_FAULT_SIGBUS; | ||
2519 | } | 2596 | } |
2520 | 2597 | ||
2521 | /* | 2598 | /* |
@@ -2633,7 +2710,11 @@ retry: | |||
2633 | goto out; | 2710 | goto out; |
2634 | page = alloc_huge_page(vma, address, 0); | 2711 | page = alloc_huge_page(vma, address, 0); |
2635 | if (IS_ERR(page)) { | 2712 | if (IS_ERR(page)) { |
2636 | ret = -PTR_ERR(page); | 2713 | ret = PTR_ERR(page); |
2714 | if (ret == -ENOMEM) | ||
2715 | ret = VM_FAULT_OOM; | ||
2716 | else | ||
2717 | ret = VM_FAULT_SIGBUS; | ||
2637 | goto out; | 2718 | goto out; |
2638 | } | 2719 | } |
2639 | clear_huge_page(page, address, pages_per_huge_page(h)); | 2720 | clear_huge_page(page, address, pages_per_huge_page(h)); |
@@ -2670,7 +2751,7 @@ retry: | |||
2670 | */ | 2751 | */ |
2671 | if (unlikely(PageHWPoison(page))) { | 2752 | if (unlikely(PageHWPoison(page))) { |
2672 | ret = VM_FAULT_HWPOISON | | 2753 | ret = VM_FAULT_HWPOISON | |
2673 | VM_FAULT_SET_HINDEX(h - hstates); | 2754 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2674 | goto backout_unlocked; | 2755 | goto backout_unlocked; |
2675 | } | 2756 | } |
2676 | } | 2757 | } |
@@ -2743,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2743 | return 0; | 2824 | return 0; |
2744 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2825 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2745 | return VM_FAULT_HWPOISON_LARGE | | 2826 | return VM_FAULT_HWPOISON_LARGE | |
2746 | VM_FAULT_SET_HINDEX(h - hstates); | 2827 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2747 | } | 2828 | } |
2748 | 2829 | ||
2749 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2830 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2791,6 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2791 | * so no worry about deadlock. | 2872 | * so no worry about deadlock. |
2792 | */ | 2873 | */ |
2793 | page = pte_page(entry); | 2874 | page = pte_page(entry); |
2875 | get_page(page); | ||
2794 | if (page != pagecache_page) | 2876 | if (page != pagecache_page) |
2795 | lock_page(page); | 2877 | lock_page(page); |
2796 | 2878 | ||
@@ -2822,6 +2904,7 @@ out_page_table_lock: | |||
2822 | } | 2904 | } |
2823 | if (page != pagecache_page) | 2905 | if (page != pagecache_page) |
2824 | unlock_page(page); | 2906 | unlock_page(page); |
2907 | put_page(page); | ||
2825 | 2908 | ||
2826 | out_mutex: | 2909 | out_mutex: |
2827 | mutex_unlock(&hugetlb_instantiation_mutex); | 2910 | mutex_unlock(&hugetlb_instantiation_mutex); |
@@ -2948,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2948 | } | 3031 | } |
2949 | } | 3032 | } |
2950 | spin_unlock(&mm->page_table_lock); | 3033 | spin_unlock(&mm->page_table_lock); |
2951 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3034 | /* |
2952 | 3035 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
3036 | * may have cleared our pud entry and done put_page on the page table: | ||
3037 | * once we release i_mmap_mutex, another task can do the final put_page | ||
3038 | * and that page table be reused and filled with junk. | ||
3039 | */ | ||
2953 | flush_tlb_range(vma, start, end); | 3040 | flush_tlb_range(vma, start, end); |
3041 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2954 | } | 3042 | } |
2955 | 3043 | ||
2956 | int hugetlb_reserve_pages(struct inode *inode, | 3044 | int hugetlb_reserve_pages(struct inode *inode, |
@@ -2989,12 +3077,16 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2989 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); | 3077 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
2990 | } | 3078 | } |
2991 | 3079 | ||
2992 | if (chg < 0) | 3080 | if (chg < 0) { |
2993 | return chg; | 3081 | ret = chg; |
3082 | goto out_err; | ||
3083 | } | ||
2994 | 3084 | ||
2995 | /* There must be enough pages in the subpool for the mapping */ | 3085 | /* There must be enough pages in the subpool for the mapping */ |
2996 | if (hugepage_subpool_get_pages(spool, chg)) | 3086 | if (hugepage_subpool_get_pages(spool, chg)) { |
2997 | return -ENOSPC; | 3087 | ret = -ENOSPC; |
3088 | goto out_err; | ||
3089 | } | ||
2998 | 3090 | ||
2999 | /* | 3091 | /* |
3000 | * Check enough hugepages are available for the reservation. | 3092 | * Check enough hugepages are available for the reservation. |
@@ -3003,7 +3095,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3003 | ret = hugetlb_acct_memory(h, chg); | 3095 | ret = hugetlb_acct_memory(h, chg); |
3004 | if (ret < 0) { | 3096 | if (ret < 0) { |
3005 | hugepage_subpool_put_pages(spool, chg); | 3097 | hugepage_subpool_put_pages(spool, chg); |
3006 | return ret; | 3098 | goto out_err; |
3007 | } | 3099 | } |
3008 | 3100 | ||
3009 | /* | 3101 | /* |
@@ -3020,6 +3112,10 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3020 | if (!vma || vma->vm_flags & VM_MAYSHARE) | 3112 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
3021 | region_add(&inode->i_mapping->private_list, from, to); | 3113 | region_add(&inode->i_mapping->private_list, from, to); |
3022 | return 0; | 3114 | return 0; |
3115 | out_err: | ||
3116 | if (vma) | ||
3117 | resv_map_put(vma); | ||
3118 | return ret; | ||
3023 | } | 3119 | } |
3024 | 3120 | ||
3025 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 3121 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000000..a3f358fb8a0c --- /dev/null +++ b/mm/hugetlb_cgroup.c | |||
@@ -0,0 +1,418 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2012 | ||
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/cgroup.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/hugetlb.h> | ||
19 | #include <linux/hugetlb_cgroup.h> | ||
20 | |||
21 | struct hugetlb_cgroup { | ||
22 | struct cgroup_subsys_state css; | ||
23 | /* | ||
24 | * the counter to account for hugepages from hugetlb. | ||
25 | */ | ||
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | ||
27 | }; | ||
28 | |||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
30 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | ||
31 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
32 | |||
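A quick note on the MEMFILE_* helpers just defined: each cftype's ->private field packs both the hstate index and the res_counter attribute into one integer, index in the upper 16 bits and attribute in the lower 16. The stand-alone snippet below shows the round trip; the numeric attribute value is a stand-in chosen for the demo, not the real RES_LIMIT constant:

#include <stdio.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

int main(void)
{
        int idx = 1;    /* e.g. the second configured hstate */
        int attr = 2;   /* stand-in for an attribute such as RES_LIMIT */
        int priv = MEMFILE_PRIVATE(idx, attr);

        printf("private=0x%x -> idx=%d attr=%d\n",
               priv, MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
        return 0;
}

Packing both values lets a single read/write/trigger callback serve every per-hstate file, as hugetlb_cgroup_read() and friends do further down.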
33 | struct cgroup_subsys hugetlb_subsys __read_mostly; | ||
34 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; | ||
35 | |||
36 | static inline | ||
37 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | ||
38 | { | ||
39 | return container_of(s, struct hugetlb_cgroup, css); | ||
40 | } | ||
41 | |||
42 | static inline | ||
43 | struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) | ||
44 | { | ||
45 | return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, | ||
46 | hugetlb_subsys_id)); | ||
47 | } | ||
48 | |||
49 | static inline | ||
50 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | ||
51 | { | ||
52 | return hugetlb_cgroup_from_css(task_subsys_state(task, | ||
53 | hugetlb_subsys_id)); | ||
54 | } | ||
55 | |||
56 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | ||
57 | { | ||
58 | return (h_cg == root_h_cgroup); | ||
59 | } | ||
60 | |||
61 | static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) | ||
62 | { | ||
63 | if (!cg->parent) | ||
64 | return NULL; | ||
65 | return hugetlb_cgroup_from_cgroup(cg->parent); | ||
66 | } | ||
67 | |||
68 | static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | ||
69 | { | ||
70 | int idx; | ||
71 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); | ||
72 | |||
73 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | ||
74 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | ||
75 | return true; | ||
76 | } | ||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | ||
81 | { | ||
82 | int idx; | ||
83 | struct cgroup *parent_cgroup; | ||
84 | struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; | ||
85 | |||
86 | h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); | ||
87 | if (!h_cgroup) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | |||
90 | parent_cgroup = cgroup->parent; | ||
91 | if (parent_cgroup) { | ||
92 | parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); | ||
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
94 | res_counter_init(&h_cgroup->hugepage[idx], | ||
95 | &parent_h_cgroup->hugepage[idx]); | ||
96 | } else { | ||
97 | root_h_cgroup = h_cgroup; | ||
98 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
99 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | ||
100 | } | ||
101 | return &h_cgroup->css; | ||
102 | } | ||
103 | |||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | ||
105 | { | ||
106 | struct hugetlb_cgroup *h_cgroup; | ||
107 | |||
108 | h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); | ||
109 | kfree(h_cgroup); | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Should be called with hugetlb_lock held. | ||
115 | * Since we are holding hugetlb_lock, pages cannot get moved from | ||
116 | * the active list or uncharged from the cgroup, so no need to get a | ||
117 | * page reference or test whether the page is active. This function | ||
118 | * cannot fail. | ||
119 | */ | ||
120 | static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, | ||
121 | struct page *page) | ||
122 | { | ||
123 | int csize; | ||
124 | struct res_counter *counter; | ||
125 | struct res_counter *fail_res; | ||
126 | struct hugetlb_cgroup *page_hcg; | ||
127 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
128 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); | ||
129 | |||
130 | page_hcg = hugetlb_cgroup_from_page(page); | ||
131 | /* | ||
132 | * We can have pages on the active list without any cgroup, | ||
133 | * i.e., a hugepage with fewer than 3 pages. We can safely | ||
134 | * ignore those pages. | ||
135 | */ | ||
136 | if (!page_hcg || page_hcg != h_cg) | ||
137 | goto out; | ||
138 | |||
139 | csize = PAGE_SIZE << compound_order(page); | ||
140 | if (!parent) { | ||
141 | parent = root_h_cgroup; | ||
142 | /* root has no limit */ | ||
143 | res_counter_charge_nofail(&parent->hugepage[idx], | ||
144 | csize, &fail_res); | ||
145 | } | ||
146 | counter = &h_cg->hugepage[idx]; | ||
147 | res_counter_uncharge_until(counter, counter->parent, csize); | ||
148 | |||
149 | set_hugetlb_cgroup(page, parent); | ||
150 | out: | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | ||
156 | * the parent cgroup. | ||
157 | */ | ||
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | ||
159 | { | ||
160 | struct hstate *h; | ||
161 | struct page *page; | ||
162 | int ret = 0, idx = 0; | ||
163 | |||
164 | do { | ||
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | ||
171 | spin_lock(&hugetlb_lock); | ||
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | ||
173 | hugetlb_cgroup_move_parent(idx, cgroup, page); | ||
174 | |||
175 | spin_unlock(&hugetlb_lock); | ||
176 | idx++; | ||
177 | } | ||
178 | cond_resched(); | ||
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | ||
180 | out: | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
185 | struct hugetlb_cgroup **ptr) | ||
186 | { | ||
187 | int ret = 0; | ||
188 | struct res_counter *fail_res; | ||
189 | struct hugetlb_cgroup *h_cg = NULL; | ||
190 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
191 | |||
192 | if (hugetlb_cgroup_disabled()) | ||
193 | goto done; | ||
194 | /* | ||
195 | * We don't charge any cgroup if the compound page has fewer | ||
196 | * than 3 pages. | ||
197 | */ | ||
198 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
199 | goto done; | ||
200 | again: | ||
201 | rcu_read_lock(); | ||
202 | h_cg = hugetlb_cgroup_from_task(current); | ||
203 | if (!css_tryget(&h_cg->css)) { | ||
204 | rcu_read_unlock(); | ||
205 | goto again; | ||
206 | } | ||
207 | rcu_read_unlock(); | ||
208 | |||
209 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | ||
210 | css_put(&h_cg->css); | ||
211 | done: | ||
212 | *ptr = h_cg; | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* Should be called with hugetlb_lock held */ | ||
217 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
218 | struct hugetlb_cgroup *h_cg, | ||
219 | struct page *page) | ||
220 | { | ||
221 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
222 | return; | ||
223 | |||
224 | set_hugetlb_cgroup(page, h_cg); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Should be called with hugetlb_lock held | ||
230 | */ | ||
231 | void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
232 | struct page *page) | ||
233 | { | ||
234 | struct hugetlb_cgroup *h_cg; | ||
235 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
236 | |||
237 | if (hugetlb_cgroup_disabled()) | ||
238 | return; | ||
239 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | ||
240 | h_cg = hugetlb_cgroup_from_page(page); | ||
241 | if (unlikely(!h_cg)) | ||
242 | return; | ||
243 | set_hugetlb_cgroup(page, NULL); | ||
244 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
245 | return; | ||
246 | } | ||
247 | |||
248 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
249 | struct hugetlb_cgroup *h_cg) | ||
250 | { | ||
251 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
252 | |||
253 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
254 | return; | ||
255 | |||
256 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
257 | return; | ||
258 | |||
259 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, | ||
264 | struct file *file, char __user *buf, | ||
265 | size_t nbytes, loff_t *ppos) | ||
266 | { | ||
267 | u64 val; | ||
268 | char str[64]; | ||
269 | int idx, name, len; | ||
270 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
271 | |||
272 | idx = MEMFILE_IDX(cft->private); | ||
273 | name = MEMFILE_ATTR(cft->private); | ||
274 | |||
275 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | ||
276 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
277 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
278 | } | ||
279 | |||
280 | static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, | ||
281 | const char *buffer) | ||
282 | { | ||
283 | int idx, name, ret; | ||
284 | unsigned long long val; | ||
285 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
286 | |||
287 | idx = MEMFILE_IDX(cft->private); | ||
288 | name = MEMFILE_ATTR(cft->private); | ||
289 | |||
290 | switch (name) { | ||
291 | case RES_LIMIT: | ||
292 | if (hugetlb_cgroup_is_root(h_cg)) { | ||
293 | /* Can't set limit on root */ | ||
294 | ret = -EINVAL; | ||
295 | break; | ||
296 | } | ||
297 | /* This function does all the necessary parsing...reuse it */ | ||
298 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
299 | if (ret) | ||
300 | break; | ||
301 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
302 | break; | ||
303 | default: | ||
304 | ret = -EINVAL; | ||
305 | break; | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) | ||
311 | { | ||
312 | int idx, name, ret = 0; | ||
313 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
314 | |||
315 | idx = MEMFILE_IDX(event); | ||
316 | name = MEMFILE_ATTR(event); | ||
317 | |||
318 | switch (name) { | ||
319 | case RES_MAX_USAGE: | ||
320 | res_counter_reset_max(&h_cg->hugepage[idx]); | ||
321 | break; | ||
322 | case RES_FAILCNT: | ||
323 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | ||
324 | break; | ||
325 | default: | ||
326 | ret = -EINVAL; | ||
327 | break; | ||
328 | } | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | ||
333 | { | ||
334 | if (hsize >= (1UL << 30)) | ||
335 | snprintf(buf, size, "%luGB", hsize >> 30); | ||
336 | else if (hsize >= (1UL << 20)) | ||
337 | snprintf(buf, size, "%luMB", hsize >> 20); | ||
338 | else | ||
339 | snprintf(buf, size, "%luKB", hsize >> 10); | ||
340 | return buf; | ||
341 | } | ||
342 | |||
343 | int __init hugetlb_cgroup_file_init(int idx) | ||
344 | { | ||
345 | char buf[32]; | ||
346 | struct cftype *cft; | ||
347 | struct hstate *h = &hstates[idx]; | ||
348 | |||
349 | /* format the size */ | ||
350 | mem_fmt(buf, 32, huge_page_size(h)); | ||
351 | |||
352 | /* Add the limit file */ | ||
353 | cft = &h->cgroup_files[0]; | ||
354 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | ||
355 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | ||
356 | cft->read = hugetlb_cgroup_read; | ||
357 | cft->write_string = hugetlb_cgroup_write; | ||
358 | |||
359 | /* Add the usage file */ | ||
360 | cft = &h->cgroup_files[1]; | ||
361 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | ||
362 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | ||
363 | cft->read = hugetlb_cgroup_read; | ||
364 | |||
365 | /* Add the MAX usage file */ | ||
366 | cft = &h->cgroup_files[2]; | ||
367 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | ||
368 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | ||
369 | cft->trigger = hugetlb_cgroup_reset; | ||
370 | cft->read = hugetlb_cgroup_read; | ||
371 | |||
372 | /* Add the failcnt file */ | ||
373 | cft = &h->cgroup_files[3]; | ||
374 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | ||
375 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | ||
376 | cft->trigger = hugetlb_cgroup_reset; | ||
377 | cft->read = hugetlb_cgroup_read; | ||
378 | |||
379 | /* NULL terminate the last cft */ | ||
380 | cft = &h->cgroup_files[4]; | ||
381 | memset(cft, 0, sizeof(*cft)); | ||
382 | |||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | ||
390 | * when we migrate hugepages | ||
391 | */ | ||
392 | void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | ||
393 | { | ||
394 | struct hugetlb_cgroup *h_cg; | ||
395 | struct hstate *h = page_hstate(oldhpage); | ||
396 | |||
397 | if (hugetlb_cgroup_disabled()) | ||
398 | return; | ||
399 | |||
400 | VM_BUG_ON(!PageHuge(oldhpage)); | ||
401 | spin_lock(&hugetlb_lock); | ||
402 | h_cg = hugetlb_cgroup_from_page(oldhpage); | ||
403 | set_hugetlb_cgroup(oldhpage, NULL); | ||
404 | |||
405 | /* move the h_cg details to new cgroup */ | ||
406 | set_hugetlb_cgroup(newhpage, h_cg); | ||
407 | list_move(&newhpage->lru, &h->hugepage_activelist); | ||
408 | spin_unlock(&hugetlb_lock); | ||
409 | return; | ||
410 | } | ||
411 | |||
412 | struct cgroup_subsys hugetlb_subsys = { | ||
413 | .name = "hugetlb", | ||
414 | .create = hugetlb_cgroup_create, | ||
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | ||
416 | .destroy = hugetlb_cgroup_destroy, | ||
417 | .subsys_id = hugetlb_subsys_id, | ||
418 | }; | ||
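From user space, the files created by hugetlb_cgroup_file_init() appear under the hugetlb controller with the subsystem prefix, e.g. hugetlb.2MB.limit_in_bytes. A minimal sketch of driving them follows; the mount point, group name and "2MB" size string are assumptions that depend on the local cgroup layout and the configured huge page sizes:

#include <stdio.h>

/* Assumed location of a hugetlb cgroup named "demo"; adjust as needed. */
#define GRP "/sys/fs/cgroup/hugetlb/demo"

int main(void)
{
        char buf[64];
        FILE *f;

        /* Cap the group at 1GB worth of 2MB huge pages (memparse suffixes work). */
        f = fopen(GRP "/hugetlb.2MB.limit_in_bytes", "w");
        if (!f) { perror("limit"); return 1; }
        fputs("1G\n", f);
        fclose(f);

        /* Read back the current usage in bytes. */
        f = fopen(GRP "/hugetlb.2MB.usage_in_bytes", "r");
        if (!f) { perror("usage"); return 1; }
        if (fgets(buf, sizeof(buf), f))
                printf("usage: %s", buf);
        fclose(f);
        return 0;
}

The max_usage and failcnt files are wired to the reset trigger, so writing a value (e.g. 0) to them clears the respective counter.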
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb983ba..3a61efc518d5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -123,7 +123,7 @@ static int pfn_inject_init(void) | |||
123 | if (!dentry) | 123 | if (!dentry) |
124 | goto fail; | 124 | goto fail; |
125 | 125 | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 126 | #ifdef CONFIG_MEMCG_SWAP |
127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
128 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
129 | if (!dentry) | 129 | if (!dentry) |
diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..b8c91b342e24 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -100,6 +100,46 @@ extern void prep_compound_page(struct page *page, unsigned long order); | |||
100 | extern bool is_free_buddy_page(struct page *page); | 100 | extern bool is_free_buddy_page(struct page *page); |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
104 | |||
105 | /* | ||
106 | * in mm/compaction.c | ||
107 | */ | ||
108 | /* | ||
109 | * compact_control is used to track pages being migrated and the free pages | ||
110 | * they are being migrated to during memory compaction. The free_pfn starts | ||
111 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | ||
112 | * are moved to the end of a zone during a compaction run and the run | ||
113 | * completes when free_pfn <= migrate_pfn | ||
114 | */ | ||
115 | struct compact_control { | ||
116 | struct list_head freepages; /* List of free pages to migrate to */ | ||
117 | struct list_head migratepages; /* List of pages being migrated */ | ||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
120 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
123 | bool sync; /* Synchronous migration */ | ||
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental, once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
129 | |||
130 | int order; /* order a direct compactor needs */ | ||
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | ||
132 | struct zone *zone; | ||
133 | bool *contended; /* True if a lock was contended */ | ||
134 | }; | ||
135 | |||
136 | unsigned long | ||
137 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | ||
138 | unsigned long | ||
139 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
140 | unsigned long low_pfn, unsigned long end_pfn); | ||
141 | |||
142 | #endif | ||
103 | 143 | ||
104 | /* | 144 | /* |
105 | * function for dealing with page's order in buddy system. | 145 | * function for dealing with page's order in buddy system. |
@@ -131,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
131 | * to determine if it's being mapped into a LOCKED vma. | 171 | * to determine if it's being mapped into a LOCKED vma. |
132 | * If so, mark page as mlocked. | 172 | * If so, mark page as mlocked. |
133 | */ | 173 | */ |
134 | static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
175 | struct page *page) | ||
135 | { | 176 | { |
136 | VM_BUG_ON(PageLRU(page)); | 177 | VM_BUG_ON(PageLRU(page)); |
137 | 178 | ||
@@ -189,7 +230,7 @@ extern unsigned long vma_address(struct page *page, | |||
189 | struct vm_area_struct *vma); | 230 | struct vm_area_struct *vma); |
190 | #endif | 231 | #endif |
191 | #else /* !CONFIG_MMU */ | 232 | #else /* !CONFIG_MMU */ |
192 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 233 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) |
193 | { | 234 | { |
194 | return 0; | 235 | return 0; |
195 | } | 236 | } |
@@ -309,3 +350,9 @@ extern u64 hwpoison_filter_flags_mask; | |||
309 | extern u64 hwpoison_filter_flags_value; | 350 | extern u64 hwpoison_filter_flags_value; |
310 | extern u64 hwpoison_filter_memcg; | 351 | extern u64 hwpoison_filter_memcg; |
311 | extern u32 hwpoison_filter_enable; | 352 | extern u32 hwpoison_filter_enable; |
353 | |||
354 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | ||
355 | unsigned long, unsigned long, | ||
356 | unsigned long, unsigned long); | ||
357 | |||
358 | extern void set_pageblock_order(void); | ||
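The compact_control structure and the isolate_freepages_range()/isolate_migratepages_range() declarations moved into mm/internal.h above are the building blocks CMA-style callers use. The sketch below is a rough, hedged illustration of that usage; the helper name and field choices such as order = -1 are assumptions about the usual setup, not a verbatim copy of any in-tree caller:

#include <linux/list.h>
#include <linux/mmzone.h>
#include "internal.h"

/* Hypothetical helper: isolate movable pages in one pfn window. */
static unsigned long isolate_window(struct zone *zone,
                                    unsigned long low_pfn,
                                    unsigned long end_pfn)
{
        struct compact_control cc = {
                .zone = zone,
                .sync = true,
                .order = -1,    /* assumed "no particular order" convention */
        };

        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);

        /*
         * Isolated pages end up on cc.migratepages; the return value is
         * the pfn the scan stopped at (0 is treated as failure).
         */
        return isolate_migratepages_range(zone, &cc, low_pfn, end_pfn);
}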
diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b6674..14d260fa0d17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -11,8 +11,11 @@ | |||
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | 12 | #include <linux/page-isolation.h> |
13 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
14 | #include <linux/falloc.h> | ||
14 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
15 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | ||
18 | #include <linux/file.h> | ||
16 | 19 | ||
17 | /* | 20 | /* |
18 | * Any behaviour which results in changes to the vma->vm_flags needs to | 21 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
200 | struct vm_area_struct **prev, | 203 | struct vm_area_struct **prev, |
201 | unsigned long start, unsigned long end) | 204 | unsigned long start, unsigned long end) |
202 | { | 205 | { |
203 | struct address_space *mapping; | 206 | loff_t offset; |
204 | loff_t offset, endoff; | ||
205 | int error; | 207 | int error; |
208 | struct file *f; | ||
206 | 209 | ||
207 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 210 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
208 | 211 | ||
209 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 212 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
210 | return -EINVAL; | 213 | return -EINVAL; |
211 | 214 | ||
212 | if (!vma->vm_file || !vma->vm_file->f_mapping | 215 | f = vma->vm_file; |
213 | || !vma->vm_file->f_mapping->host) { | 216 | |
217 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
214 | return -EINVAL; | 218 | return -EINVAL; |
215 | } | 219 | } |
216 | 220 | ||
217 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) | 221 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) |
218 | return -EACCES; | 222 | return -EACCES; |
219 | 223 | ||
220 | mapping = vma->vm_file->f_mapping; | ||
221 | |||
222 | offset = (loff_t)(start - vma->vm_start) | 224 | offset = (loff_t)(start - vma->vm_start) |
223 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
224 | endoff = (loff_t)(end - vma->vm_start - 1) | ||
225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
226 | 226 | ||
227 | /* vmtruncate_range needs to take i_mutex */ | 227 | /* |
228 | * Filesystem's fallocate may need to take i_mutex. We need to | ||
229 | * explicitly grab a reference because the vma (and hence the | ||
230 | * vma's reference to the file) can go away as soon as we drop | ||
231 | * mmap_sem. | ||
232 | */ | ||
233 | get_file(f); | ||
228 | up_read(¤t->mm->mmap_sem); | 234 | up_read(¤t->mm->mmap_sem); |
229 | error = vmtruncate_range(mapping->host, offset, endoff); | 235 | error = do_fallocate(f, |
236 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | ||
237 | offset, end - start); | ||
238 | fput(f); | ||
230 | down_read(¤t->mm->mmap_sem); | 239 | down_read(¤t->mm->mmap_sem); |
231 | return error; | 240 | return error; |
232 | } | 241 | } |
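With madvise_remove() now routed through do_fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE), MADV_REMOVE works on any mapping whose filesystem implements hole punching instead of relying on vmtruncate_range(). A small user-space demonstration (the tmpfs path is an assumption; any hole-punch capable filesystem will do):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/dev/shm/madv-remove-demo";  /* assumed tmpfs mount */
        size_t len = 4 * 1024 * 1024;
        int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0 || ftruncate(fd, len) < 0) {
                perror("setup");
                return 1;
        }

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0xaa, len);

        /* Punch out the middle half of the file; the backing pages are
         * freed and later reads of that range see zeroes again. */
        if (madvise(p + len / 4, len / 2, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");

        munmap(p, len);
        close(fd);
        unlink(path);
        return 0;
}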
diff --git a/mm/memblock.c b/mm/memblock.c index 99f285599501..4d9393c7edc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { | |||
37 | 37 | ||
38 | int memblock_debug __initdata_memblock; | 38 | int memblock_debug __initdata_memblock; |
39 | static int memblock_can_resize __initdata_memblock; | 39 | static int memblock_can_resize __initdata_memblock; |
40 | static int memblock_memory_in_slab __initdata_memblock = 0; | ||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | ||
40 | 42 | ||
41 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
42 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static inline const char *memblock_type_name(struct memblock_type *type) |
@@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
141 | MAX_NUMNODES); | 143 | MAX_NUMNODES); |
142 | } | 144 | } |
143 | 145 | ||
144 | /* | ||
145 | * Free memblock.reserved.regions | ||
146 | */ | ||
147 | int __init_memblock memblock_free_reserved_regions(void) | ||
148 | { | ||
149 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
150 | return 0; | ||
151 | |||
152 | return memblock_free(__pa(memblock.reserved.regions), | ||
153 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Reserve memblock.reserved.regions | ||
158 | */ | ||
159 | int __init_memblock memblock_reserve_reserved_regions(void) | ||
160 | { | ||
161 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
162 | return 0; | ||
163 | |||
164 | return memblock_reserve(__pa(memblock.reserved.regions), | ||
165 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
166 | } | ||
167 | |||
168 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 146 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
169 | { | 147 | { |
170 | type->total_size -= type->regions[r].size; | 148 | type->total_size -= type->regions[r].size; |
@@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
182 | } | 160 | } |
183 | } | 161 | } |
184 | 162 | ||
185 | static int __init_memblock memblock_double_array(struct memblock_type *type) | 163 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
164 | phys_addr_t *addr) | ||
165 | { | ||
166 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
167 | return 0; | ||
168 | |||
169 | *addr = __pa(memblock.reserved.regions); | ||
170 | |||
171 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
172 | memblock.reserved.max); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * memblock_double_array - double the size of the memblock regions array | ||
177 | * @type: memblock type of the regions array being doubled | ||
178 | * @new_area_start: starting address of memory range to avoid overlap with | ||
179 | * @new_area_size: size of memory range to avoid overlap with | ||
180 | * | ||
181 | * Double the size of the @type regions array. If memblock is being used to | ||
182 | * allocate memory for a new reserved regions array and there is a previously | ||
183 | * allocated memory range [@new_area_start,@new_area_start+@new_area_size] | ||
184 | * waiting to be reserved, ensure the memory used by the new array does | ||
185 | * not overlap. | ||
186 | * | ||
187 | * RETURNS: | ||
188 | * 0 on success, -1 on failure. | ||
189 | */ | ||
190 | static int __init_memblock memblock_double_array(struct memblock_type *type, | ||
191 | phys_addr_t new_area_start, | ||
192 | phys_addr_t new_area_size) | ||
186 | { | 193 | { |
187 | struct memblock_region *new_array, *old_array; | 194 | struct memblock_region *new_array, *old_array; |
195 | phys_addr_t old_alloc_size, new_alloc_size; | ||
188 | phys_addr_t old_size, new_size, addr; | 196 | phys_addr_t old_size, new_size, addr; |
189 | int use_slab = slab_is_available(); | 197 | int use_slab = slab_is_available(); |
198 | int *in_slab; | ||
190 | 199 | ||
191 | /* We don't allow resizing until we know about the reserved regions | 200 | /* We don't allow resizing until we know about the reserved regions |
192 | * of memory that aren't suitable for allocation | 201 | * of memory that aren't suitable for allocation |
@@ -197,36 +206,62 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
197 | /* Calculate new doubled size */ | 206 | /* Calculate new doubled size */ |
198 | old_size = type->max * sizeof(struct memblock_region); | 207 | old_size = type->max * sizeof(struct memblock_region); |
199 | new_size = old_size << 1; | 208 | new_size = old_size << 1; |
209 | /* | ||
210 | * We need to allocate the new array aligned to PAGE_SIZE, | ||
211 | * so we can free it completely later. | ||
212 | */ | ||
213 | old_alloc_size = PAGE_ALIGN(old_size); | ||
214 | new_alloc_size = PAGE_ALIGN(new_size); | ||
215 | |||
216 | /* Retrieve the slab flag */ | ||
217 | if (type == &memblock.memory) | ||
218 | in_slab = &memblock_memory_in_slab; | ||
219 | else | ||
220 | in_slab = &memblock_reserved_in_slab; | ||
200 | 221 | ||
201 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
202 | * | 223 | * |
203 | * WARNING: We assume that either slab_is_available() and we use it or | 224 | * WARNING: We assume that either slab_is_available() and we use it or |
204 | * we use MEMBLOCK for allocations. That means that this is unsafe to use | 225 | * we use MEMBLOCK for allocations. That means that this is unsafe to |
205 | * when bootmem is currently active (unless bootmem itself is implemented | 226 | * use when bootmem is currently active (unless bootmem itself is |
206 | * on top of MEMBLOCK which isn't the case yet) | 227 | * implemented on top of MEMBLOCK which isn't the case yet) |
207 | * | 228 | * |
208 | * This should however not be an issue for now, as we currently only | 229 | * This should however not be an issue for now, as we currently only |
209 | * call into MEMBLOCK while it's still active, or much later when slab is | 230 | * call into MEMBLOCK while it's still active, or much later when slab |
210 | * active for memory hotplug operations | 231 | * is active for memory hotplug operations |
211 | */ | 232 | */ |
212 | if (use_slab) { | 233 | if (use_slab) { |
213 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
214 | addr = new_array ? __pa(new_array) : 0; | 235 | addr = new_array ? __pa(new_array) : 0; |
215 | } else | 236 | } else { |
216 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 237 | /* only exclude range when trying to double reserved.regions */ |
238 | if (type != &memblock.reserved) | ||
239 | new_area_start = new_area_size = 0; | ||
240 | |||
241 | addr = memblock_find_in_range(new_area_start + new_area_size, | ||
242 | memblock.current_limit, | ||
243 | new_alloc_size, PAGE_SIZE); | ||
244 | if (!addr && new_area_size) | ||
245 | addr = memblock_find_in_range(0, | ||
246 | min(new_area_start, memblock.current_limit), | ||
247 | new_alloc_size, PAGE_SIZE); | ||
248 | |||
249 | new_array = addr ? __va(addr) : 0; | ||
250 | } | ||
217 | if (!addr) { | 251 | if (!addr) { |
218 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", | 252 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", |
219 | memblock_type_name(type), type->max, type->max * 2); | 253 | memblock_type_name(type), type->max, type->max * 2); |
220 | return -1; | 254 | return -1; |
221 | } | 255 | } |
222 | new_array = __va(addr); | ||
223 | 256 | ||
224 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", |
225 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, |
259 | (u64)addr + new_size - 1); | ||
226 | 260 | ||
227 | /* Found space, we now need to move the array over before | 261 | /* |
228 | * we add the reserved region since it may be our reserved | 262 | * Found space, we now need to move the array over before we add the |
229 | * array itself that is full. | 263 | * reserved region since it may be our reserved array itself that is |
264 | * full. | ||
230 | */ | 265 | */ |
231 | memcpy(new_array, type->regions, old_size); | 266 | memcpy(new_array, type->regions, old_size); |
232 | memset(new_array + type->max, 0, old_size); | 267 | memset(new_array + type->max, 0, old_size); |
@@ -234,21 +269,22 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
234 | type->regions = new_array; | 269 | type->regions = new_array; |
235 | type->max <<= 1; | 270 | type->max <<= 1; |
236 | 271 | ||
237 | /* If we use SLAB that's it, we are done */ | 272 | /* Free old array. We needn't free it if the array is the static one */ |
238 | if (use_slab) | 273 | if (*in_slab) |
239 | return 0; | 274 | kfree(old_array); |
240 | 275 | else if (old_array != memblock_memory_init_regions && | |
241 | /* Add the new reserved region now. Should not fail ! */ | 276 | old_array != memblock_reserved_init_regions) |
242 | BUG_ON(memblock_reserve(addr, new_size)); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
243 | 278 | ||
244 | /* If the array wasn't our static init one, then free it. We only do | 279 | /* |
245 | BUG_ON(memblock_reserve(addr, new_size)); | 280 | * Reserve the new array if it comes from memblock. Otherwise, we |
246 | * to use kfree or free_bootmem_pages(). Shouldn't be a big deal | 281 | * needn't do it |
247 | * anyways | ||
248 | */ | 282 | */ |
249 | if (old_array != memblock_memory_init_regions && | 283 | if (!use_slab) |
250 | old_array != memblock_reserved_init_regions) | 284 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
251 | memblock_free(__pa(old_array), old_size); | 285 | |
286 | /* Update slab flag */ | ||
287 | *in_slab = use_slab; | ||
252 | 288 | ||
253 | return 0; | 289 | return 0; |
254 | } | 290 | } |
@@ -330,6 +366,9 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, | |||
330 | phys_addr_t end = base + memblock_cap_size(base, &size); | 366 | phys_addr_t end = base + memblock_cap_size(base, &size); |
331 | int i, nr_new; | 367 | int i, nr_new; |
332 | 368 | ||
369 | if (!size) | ||
370 | return 0; | ||
371 | |||
333 | /* special case for empty array */ | 372 | /* special case for empty array */ |
334 | if (type->regions[0].size == 0) { | 373 | if (type->regions[0].size == 0) { |
335 | WARN_ON(type->cnt != 1 || type->total_size); | 374 | WARN_ON(type->cnt != 1 || type->total_size); |
@@ -384,7 +423,7 @@ repeat: | |||
384 | */ | 423 | */ |
385 | if (!insert) { | 424 | if (!insert) { |
386 | while (type->cnt + nr_new > type->max) | 425 | while (type->cnt + nr_new > type->max) |
387 | if (memblock_double_array(type) < 0) | 426 | if (memblock_double_array(type, obase, size) < 0) |
388 | return -ENOMEM; | 427 | return -ENOMEM; |
389 | insert = true; | 428 | insert = true; |
390 | goto repeat; | 429 | goto repeat; |
@@ -430,9 +469,12 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
430 | 469 | ||
431 | *start_rgn = *end_rgn = 0; | 470 | *start_rgn = *end_rgn = 0; |
432 | 471 | ||
472 | if (!size) | ||
473 | return 0; | ||
474 | |||
433 | /* we'll create at most two more regions */ | 475 | /* we'll create at most two more regions */ |
434 | while (type->cnt + 2 > type->max) | 476 | while (type->cnt + 2 > type->max) |
435 | if (memblock_double_array(type) < 0) | 477 | if (memblock_double_array(type, base, size) < 0) |
436 | return -ENOMEM; | 478 | return -ENOMEM; |
437 | 479 | ||
438 | for (i = 0; i < type->cnt; i++) { | 480 | for (i = 0; i < type->cnt; i++) { |
@@ -514,7 +556,6 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
514 | (unsigned long long)base, | 556 | (unsigned long long)base, |
515 | (unsigned long long)base + size, | 557 | (unsigned long long)base + size, |
516 | (void *)_RET_IP_); | 558 | (void *)_RET_IP_); |
517 | BUG_ON(0 == size); | ||
518 | 559 | ||
519 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); | 560 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); |
520 | } | 561 | } |
@@ -523,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
523 | * __next_free_mem_range - next function for for_each_free_mem_range() | 564 | * __next_free_mem_range - next function for for_each_free_mem_range() |
524 | * @idx: pointer to u64 loop variable | 565 | * @idx: pointer to u64 loop variable |
525 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 566 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
526 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 567 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
527 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 568 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
528 | * @p_nid: ptr to int for nid of the range, can be %NULL | 569 | * @out_nid: ptr to int for nid of the range, can be %NULL |
529 | * | 570 | * |
530 | * Find the first free area from *@idx which matches @nid, fill the out | 571 | * Find the first free area from *@idx which matches @nid, fill the out |
531 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 572 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
@@ -599,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
599 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 640 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
600 | * @idx: pointer to u64 loop variable | 641 | * @idx: pointer to u64 loop variable |
601 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 642 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
602 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 643 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
603 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 644 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
604 | * @p_nid: ptr to int for nid of the range, can be %NULL | 645 | * @out_nid: ptr to int for nid of the range, can be %NULL |
605 | * | 646 | * |
606 | * Reverse of __next_free_mem_range(). | 647 | * Reverse of __next_free_mem_range(). |
607 | */ | 648 | */ |
@@ -850,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
850 | return memblock_search(&memblock.memory, addr) != -1; | 891 | return memblock_search(&memblock.memory, addr) != -1; |
851 | } | 892 | } |
852 | 893 | ||
894 | /** | ||
895 | * memblock_is_region_memory - check if a region is a subset of memory | ||
896 | * @base: base of region to check | ||
897 | * @size: size of region to check | ||
898 | * | ||
899 | * Check if the region [@base, @base+@size) is a subset of a memory block. | ||
900 | * | ||
901 | * RETURNS: | ||
902 | * 0 if false, non-zero if true | ||
903 | */ | ||
853 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 904 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
854 | { | 905 | { |
855 | int idx = memblock_search(&memblock.memory, base); | 906 | int idx = memblock_search(&memblock.memory, base); |
@@ -862,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
862 | memblock.memory.regions[idx].size) >= end; | 913 | memblock.memory.regions[idx].size) >= end; |
863 | } | 914 | } |
864 | 915 | ||
916 | /** | ||
917 | * memblock_is_region_reserved - check if a region intersects reserved memory | ||
918 | * @base: base of region to check | ||
919 | * @size: size of region to check | ||
920 | * | ||
921 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | ||
922 | * | ||
923 | * RETURNS: | ||
924 | * 0 if false, non-zero if true | ||
925 | */ | ||
865 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 926 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
866 | { | 927 | { |
867 | memblock_cap_size(base, &size); | 928 | memblock_cap_size(base, &size); |
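The kernel-doc added above documents two read-only queries: memblock_is_region_memory() answers whether [base, base+size) lies entirely inside known RAM, while memblock_is_region_reserved() answers whether it intersects anything already reserved. A hedged sketch of how an early-boot caller might combine them before carving out a window follows; the function name and the base/size values are invented for the example, and only the three memblock calls come from the API shown in this diff.

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/memblock.h>

    /* Illustrative only: reserve a fixed 16 MiB window at 64 MiB, but only
     * if it is fully covered by RAM and not already claimed by someone else. */
    static int __init reserve_example_window(void)
    {
            phys_addr_t base = 64ULL << 20;     /* 64 MiB, example value */
            phys_addr_t size = 16ULL << 20;     /* 16 MiB, example value */

            if (!memblock_is_region_memory(base, size))
                    return -EINVAL;             /* not (entirely) memory */
            if (memblock_is_region_reserved(base, size))
                    return -EBUSY;              /* intersects a reservation */

            return memblock_reserve(base, size);
    }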
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7d698df4a067..795e525afaba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -59,21 +59,21 @@ | |||
59 | 59 | ||
60 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 60 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_MEMCG_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
66 | int do_swap_account __read_mostly; | 66 | int do_swap_account __read_mostly; |
67 | 67 | ||
68 | /* for remembering the boot option */ | 68 | /* for remembering the boot option */ |
69 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 69 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
70 | static int really_do_swap_account __initdata = 1; | 70 | static int really_do_swap_account __initdata = 1; |
71 | #else | 71 | #else |
72 | static int really_do_swap_account __initdata = 0; | 72 | static int really_do_swap_account __initdata = 0; |
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | #else | 75 | #else |
76 | #define do_swap_account (0) | 76 | #define do_swap_account 0 |
77 | #endif | 77 | #endif |
78 | 78 | ||
79 | 79 | ||
@@ -87,19 +87,32 @@ enum mem_cgroup_stat_index { | |||
87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | ||
92 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
93 | }; | 92 | }; |
94 | 93 | ||
94 | static const char * const mem_cgroup_stat_names[] = { | ||
95 | "cache", | ||
96 | "rss", | ||
97 | "mapped_file", | ||
98 | "swap", | ||
99 | }; | ||
100 | |||
95 | enum mem_cgroup_events_index { | 101 | enum mem_cgroup_events_index { |
96 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | 102 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ |
97 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | 103 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ |
98 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | ||
99 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | 104 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ |
100 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | 105 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ |
101 | MEM_CGROUP_EVENTS_NSTATS, | 106 | MEM_CGROUP_EVENTS_NSTATS, |
102 | }; | 107 | }; |
108 | |||
109 | static const char * const mem_cgroup_events_names[] = { | ||
110 | "pgpgin", | ||
111 | "pgpgout", | ||
112 | "pgfault", | ||
113 | "pgmajfault", | ||
114 | }; | ||
115 | |||
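The two name tables just added are meant to stay index-for-index in sync with MEM_CGROUP_STAT_* and MEM_CGROUP_EVENTS_*, so reporting code can pair count[i] with its label. A minimal sketch of that pattern as it might appear further down in this file (the function below is illustrative, not part of the patch; mem_cgroup_read_stat() is the per-cpu summing helper this file already references):

    /* Illustrative dump loop over the parallel enum/name arrays. */
    static void memcg_stat_names_sketch(struct seq_file *m, struct mem_cgroup *memcg)
    {
            int i;

            BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
                         MEM_CGROUP_STAT_NSTATS);

            for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
                    seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
                               mem_cgroup_read_stat(memcg, i));
    }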
103 | /* | 116 | /* |
104 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 117 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
105 | * it will be incremented by the number of pages. This counter is used for | 118 | * it will be incremented by the number of pages. This counter is used for |
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target { | |||
112 | MEM_CGROUP_TARGET_NUMAINFO, | 125 | MEM_CGROUP_TARGET_NUMAINFO, |
113 | MEM_CGROUP_NTARGETS, | 126 | MEM_CGROUP_NTARGETS, |
114 | }; | 127 | }; |
115 | #define THRESHOLDS_EVENTS_TARGET (128) | 128 | #define THRESHOLDS_EVENTS_TARGET 128 |
116 | #define SOFTLIMIT_EVENTS_TARGET (1024) | 129 | #define SOFTLIMIT_EVENTS_TARGET 1024 |
117 | #define NUMAINFO_EVENTS_TARGET (1024) | 130 | #define NUMAINFO_EVENTS_TARGET 1024 |
118 | 131 | ||
119 | struct mem_cgroup_stat_cpu { | 132 | struct mem_cgroup_stat_cpu { |
120 | long count[MEM_CGROUP_STAT_NSTATS]; | 133 | long count[MEM_CGROUP_STAT_NSTATS]; |
121 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 134 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; |
135 | unsigned long nr_page_events; | ||
122 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 136 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
123 | }; | 137 | }; |
124 | 138 | ||
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone { | |||
138 | 152 | ||
139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 153 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
140 | 154 | ||
141 | struct zone_reclaim_stat reclaim_stat; | ||
142 | struct rb_node tree_node; /* RB tree node */ | 155 | struct rb_node tree_node; /* RB tree node */ |
143 | unsigned long long usage_in_excess;/* Set to the value by which */ | 156 | unsigned long long usage_in_excess;/* Set to the value by which */ |
144 | /* the soft limit is exceeded*/ | 157 | /* the soft limit is exceeded*/ |
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold { | |||
182 | 195 | ||
183 | /* For threshold */ | 196 | /* For threshold */ |
184 | struct mem_cgroup_threshold_ary { | 197 | struct mem_cgroup_threshold_ary { |
185 | /* An array index points to threshold just below usage. */ | 198 | /* An array index points to threshold just below or equal to usage. */ |
186 | int current_threshold; | 199 | int current_threshold; |
187 | /* Size of entries[] */ | 200 | /* Size of entries[] */ |
188 | unsigned int size; | 201 | unsigned int size; |
@@ -245,8 +258,8 @@ struct mem_cgroup { | |||
245 | */ | 258 | */ |
246 | struct rcu_head rcu_freeing; | 259 | struct rcu_head rcu_freeing; |
247 | /* | 260 | /* |
248 | * But when using vfree(), that cannot be done at | 261 | * We also need some space for a worker in deferred freeing. |
249 | * interrupt time, so we must then queue the work. | 262 | * By the time we call it, rcu_freeing is no longer in use. |
250 | */ | 263 | */ |
251 | struct work_struct work_freeing; | 264 | struct work_struct work_freeing; |
252 | }; | 265 | }; |
@@ -305,7 +318,7 @@ struct mem_cgroup { | |||
305 | /* | 318 | /* |
306 | * percpu counter. | 319 | * percpu counter. |
307 | */ | 320 | */ |
308 | struct mem_cgroup_stat_cpu *stat; | 321 | struct mem_cgroup_stat_cpu __percpu *stat; |
309 | /* | 322 | /* |
310 | * used when a cpu is offlined or other synchronizations | 323 | * used when a cpu is offlined or other synchronizations |
311 | * See mem_cgroup_read_stat(). | 324 | * See mem_cgroup_read_stat(). |
@@ -360,14 +373,12 @@ static bool move_file(void) | |||
360 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 373 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
361 | * limit reclaim to prevent infinite loops, if they ever occur. | 374 | * limit reclaim to prevent infinite loops, if they ever occur. |
362 | */ | 375 | */ |
363 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | 376 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
364 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | 377 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 |
365 | 378 | ||
366 | enum charge_type { | 379 | enum charge_type { |
367 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
368 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 381 | MEM_CGROUP_CHARGE_TYPE_ANON, |
369 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
370 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
371 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 382 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
372 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 383 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
373 | NR_CHARGE_TYPE, | 384 | NR_CHARGE_TYPE, |
@@ -377,8 +388,8 @@ enum charge_type { | |||
377 | #define _MEM (0) | 388 | #define _MEM (0) |
378 | #define _MEMSWAP (1) | 389 | #define _MEMSWAP (1) |
379 | #define _OOM_TYPE (2) | 390 | #define _OOM_TYPE (2) |
380 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 391 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
381 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 392 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
382 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 393 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
383 | /* Used for OOM notifier */ | 394 | /* Used for OOM notifier */ |
384 | #define OOM_CONTROL (0) | 395 | #define OOM_CONTROL (0) |
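MEMFILE_PRIVATE() packs a counter type (_MEM, _MEMSWAP, _OOM_TYPE) and an attribute index into the single private value carried by a cgroup control file; MEMFILE_TYPE() and MEMFILE_ATTR() undo the packing. The parentheses dropped by the patch are redundant because << and >> bind tighter than | and &. A standalone round-trip check of the encoding (plain userspace C, values picked only for the demo):

    #include <assert.h>
    #include <stdio.h>

    #define _MEM                    0
    #define _MEMSWAP                1
    #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
    #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
    #define MEMFILE_ATTR(val)       ((val) & 0xffff)

    int main(void)
    {
            int priv = MEMFILE_PRIVATE(_MEMSWAP, 7);    /* type 1, attribute 7 */

            assert(MEMFILE_TYPE(priv) == _MEMSWAP);
            assert(MEMFILE_ATTR(priv) == 7);
            printf("packed=0x%x type=%d attr=%d\n",
                   priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
            return 0;
    }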
@@ -394,8 +405,14 @@ enum charge_type { | |||
394 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 405 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
395 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 406 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
396 | 407 | ||
408 | static inline | ||
409 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
410 | { | ||
411 | return container_of(s, struct mem_cgroup, css); | ||
412 | } | ||
413 | |||
397 | /* Writing them here to avoid exposing memcg's inner layout */ | 414 | /* Writing them here to avoid exposing memcg's inner layout */ |
398 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 415 | #ifdef CONFIG_MEMCG_KMEM |
399 | #include <net/sock.h> | 416 | #include <net/sock.h> |
400 | #include <net/ip.h> | 417 | #include <net/ip.h> |
401 | 418 | ||
@@ -404,6 +421,7 @@ void sock_update_memcg(struct sock *sk) | |||
404 | { | 421 | { |
405 | if (mem_cgroup_sockets_enabled) { | 422 | if (mem_cgroup_sockets_enabled) { |
406 | struct mem_cgroup *memcg; | 423 | struct mem_cgroup *memcg; |
424 | struct cg_proto *cg_proto; | ||
407 | 425 | ||
408 | BUG_ON(!sk->sk_prot->proto_cgroup); | 426 | BUG_ON(!sk->sk_prot->proto_cgroup); |
409 | 427 | ||
@@ -423,9 +441,10 @@ void sock_update_memcg(struct sock *sk) | |||
423 | 441 | ||
424 | rcu_read_lock(); | 442 | rcu_read_lock(); |
425 | memcg = mem_cgroup_from_task(current); | 443 | memcg = mem_cgroup_from_task(current); |
426 | if (!mem_cgroup_is_root(memcg)) { | 444 | cg_proto = sk->sk_prot->proto_cgroup(memcg); |
445 | if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { | ||
427 | mem_cgroup_get(memcg); | 446 | mem_cgroup_get(memcg); |
428 | sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); | 447 | sk->sk_cgrp = cg_proto; |
429 | } | 448 | } |
430 | rcu_read_unlock(); | 449 | rcu_read_unlock(); |
431 | } | 450 | } |
@@ -452,7 +471,20 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
452 | } | 471 | } |
453 | EXPORT_SYMBOL(tcp_proto_cgroup); | 472 | EXPORT_SYMBOL(tcp_proto_cgroup); |
454 | #endif /* CONFIG_INET */ | 473 | #endif /* CONFIG_INET */ |
455 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 474 | #endif /* CONFIG_MEMCG_KMEM */ |
475 | |||
476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | ||
477 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
478 | { | ||
479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | ||
480 | return; | ||
481 | static_key_slow_dec(&memcg_socket_limit_enabled); | ||
482 | } | ||
483 | #else | ||
484 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
485 | { | ||
486 | } | ||
487 | #endif | ||
456 | 488 | ||
457 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 489 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
458 | 490 | ||
@@ -675,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
675 | bool charge) | 707 | bool charge) |
676 | { | 708 | { |
677 | int val = (charge) ? 1 : -1; | 709 | int val = (charge) ? 1 : -1; |
678 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 710 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
679 | } | 711 | } |
680 | 712 | ||
681 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 713 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
@@ -718,12 +750,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
718 | nr_pages = -nr_pages; /* for event */ | 750 | nr_pages = -nr_pages; /* for event */ |
719 | } | 751 | } |
720 | 752 | ||
721 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); | 753 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
722 | 754 | ||
723 | preempt_enable(); | 755 | preempt_enable(); |
724 | } | 756 | } |
725 | 757 | ||
726 | unsigned long | 758 | unsigned long |
759 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | ||
760 | { | ||
761 | struct mem_cgroup_per_zone *mz; | ||
762 | |||
763 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
764 | return mz->lru_size[lru]; | ||
765 | } | ||
766 | |||
767 | static unsigned long | ||
727 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, | 768 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, |
728 | unsigned int lru_mask) | 769 | unsigned int lru_mask) |
729 | { | 770 | { |
@@ -770,7 +811,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
770 | { | 811 | { |
771 | unsigned long val, next; | 812 | unsigned long val, next; |
772 | 813 | ||
773 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 814 | val = __this_cpu_read(memcg->stat->nr_page_events); |
774 | next = __this_cpu_read(memcg->stat->targets[target]); | 815 | next = __this_cpu_read(memcg->stat->targets[target]); |
775 | /* from time_after() in jiffies.h */ | 816 | /* from time_after() in jiffies.h */ |
776 | if ((long)next - (long)val < 0) { | 817 | if ((long)next - (long)val < 0) { |
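The comparison above is the wrap-safe idiom borrowed from time_after(): both counters are unsigned and free to wrap, but the signed difference stays meaningful as long as the target and the counter are within LONG_MAX of each other. A small userspace demonstration (values invented; it assumes the usual two's-complement conversion the kernel also relies on):

    #include <limits.h>
    #include <stdio.h>

    /* True once the counter 'val' has gone past the target 'next',
     * even if the unsigned counter wrapped around in between. */
    static int target_passed(unsigned long val, unsigned long next)
    {
            return (long)next - (long)val < 0;
    }

    int main(void)
    {
            unsigned long next = 5;                 /* target sits just past the wrap */
            unsigned long val  = ULONG_MAX - 2;     /* counter about to wrap */

            printf("%d\n", target_passed(val, next));   /* 0: not reached yet */
            val += 10;                                  /* wraps around to 7 */
            printf("%d\n", target_passed(val, next));   /* 1: target passed */
            return 0;
    }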
@@ -827,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
827 | 868 | ||
828 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 869 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
829 | { | 870 | { |
830 | return container_of(cgroup_subsys_state(cont, | 871 | return mem_cgroup_from_css( |
831 | mem_cgroup_subsys_id), struct mem_cgroup, | 872 | cgroup_subsys_state(cont, mem_cgroup_subsys_id)); |
832 | css); | ||
833 | } | 873 | } |
834 | 874 | ||
835 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 875 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
@@ -842,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
842 | if (unlikely(!p)) | 882 | if (unlikely(!p)) |
843 | return NULL; | 883 | return NULL; |
844 | 884 | ||
845 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 885 | return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); |
846 | struct mem_cgroup, css); | ||
847 | } | 886 | } |
848 | 887 | ||
849 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 888 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
@@ -929,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
929 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | 968 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); |
930 | if (css) { | 969 | if (css) { |
931 | if (css == &root->css || css_tryget(css)) | 970 | if (css == &root->css || css_tryget(css)) |
932 | memcg = container_of(css, | 971 | memcg = mem_cgroup_from_css(css); |
933 | struct mem_cgroup, css); | ||
934 | } else | 972 | } else |
935 | id = 0; | 973 | id = 0; |
936 | rcu_read_unlock(); | 974 | rcu_read_unlock(); |
@@ -1013,7 +1051,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); | |||
1013 | /** | 1051 | /** |
1014 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1052 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
1015 | * @zone: zone of the wanted lruvec | 1053 | * @zone: zone of the wanted lruvec |
1016 | * @mem: memcg of the wanted lruvec | 1054 | * @memcg: memcg of the wanted lruvec |
1017 | * | 1055 | * |
1018 | * Returns the lru list vector holding pages for the given @zone and | 1056 | * Returns the lru list vector holding pages for the given @zone and |
1019 | * @mem. This can be the global zone lruvec, if the memory controller | 1057 | * @mem. This can be the global zone lruvec, if the memory controller |
@@ -1046,19 +1084,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | |||
1046 | */ | 1084 | */ |
1047 | 1085 | ||
1048 | /** | 1086 | /** |
1049 | * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec | 1087 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page |
1050 | * @zone: zone of the page | ||
1051 | * @page: the page | 1088 | * @page: the page |
1052 | * @lru: current lru | 1089 | * @zone: zone of the page |
1053 | * | ||
1054 | * This function accounts for @page being added to @lru, and returns | ||
1055 | * the lruvec for the given @zone and the memcg @page is charged to. | ||
1056 | * | ||
1057 | * The callsite is then responsible for physically linking the page to | ||
1058 | * the returned lruvec->lists[@lru]. | ||
1059 | */ | 1090 | */ |
1060 | struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | 1091 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) |
1061 | enum lru_list lru) | ||
1062 | { | 1092 | { |
1063 | struct mem_cgroup_per_zone *mz; | 1093 | struct mem_cgroup_per_zone *mz; |
1064 | struct mem_cgroup *memcg; | 1094 | struct mem_cgroup *memcg; |
@@ -1071,7 +1101,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1071 | memcg = pc->mem_cgroup; | 1101 | memcg = pc->mem_cgroup; |
1072 | 1102 | ||
1073 | /* | 1103 | /* |
1074 | * Surreptitiously switch any uncharged page to root: | 1104 | * Surreptitiously switch any uncharged offlist page to root: |
1075 | * an uncharged page off lru does nothing to secure | 1105 | * an uncharged page off lru does nothing to secure |
1076 | * its former mem_cgroup from sudden removal. | 1106 | * its former mem_cgroup from sudden removal. |
1077 | * | 1107 | * |
@@ -1079,85 +1109,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1079 | * under page_cgroup lock: between them, they make all uses | 1109 | * under page_cgroup lock: between them, they make all uses |
1080 | * of pc->mem_cgroup safe. | 1110 | * of pc->mem_cgroup safe. |
1081 | */ | 1111 | */ |
1082 | if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) | 1112 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) |
1083 | pc->mem_cgroup = memcg = root_mem_cgroup; | 1113 | pc->mem_cgroup = memcg = root_mem_cgroup; |
1084 | 1114 | ||
1085 | mz = page_cgroup_zoneinfo(memcg, page); | 1115 | mz = page_cgroup_zoneinfo(memcg, page); |
1086 | /* compound_order() is stabilized through lru_lock */ | ||
1087 | mz->lru_size[lru] += 1 << compound_order(page); | ||
1088 | return &mz->lruvec; | 1116 | return &mz->lruvec; |
1089 | } | 1117 | } |
1090 | 1118 | ||
1091 | /** | 1119 | /** |
1092 | * mem_cgroup_lru_del_list - account for removing an lru page | 1120 | * mem_cgroup_update_lru_size - account for adding or removing an lru page |
1093 | * @page: the page | 1121 | * @lruvec: mem_cgroup per zone lru vector |
1094 | * @lru: target lru | 1122 | * @lru: index of lru list the page is sitting on |
1123 | * @nr_pages: positive when adding or negative when removing | ||
1095 | * | 1124 | * |
1096 | * This function accounts for @page being removed from @lru. | 1125 | * This function must be called when a page is added to or removed from an |
1097 | * | 1126 | * lru list. |
1098 | * The callsite is then responsible for physically unlinking | ||
1099 | * @page->lru. | ||
1100 | */ | 1127 | */ |
1101 | void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | 1128 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
1129 | int nr_pages) | ||
1102 | { | 1130 | { |
1103 | struct mem_cgroup_per_zone *mz; | 1131 | struct mem_cgroup_per_zone *mz; |
1104 | struct mem_cgroup *memcg; | 1132 | unsigned long *lru_size; |
1105 | struct page_cgroup *pc; | ||
1106 | 1133 | ||
1107 | if (mem_cgroup_disabled()) | 1134 | if (mem_cgroup_disabled()) |
1108 | return; | 1135 | return; |
1109 | 1136 | ||
1110 | pc = lookup_page_cgroup(page); | 1137 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); |
1111 | memcg = pc->mem_cgroup; | 1138 | lru_size = mz->lru_size + lru; |
1112 | VM_BUG_ON(!memcg); | 1139 | *lru_size += nr_pages; |
1113 | mz = page_cgroup_zoneinfo(memcg, page); | 1140 | VM_BUG_ON((long)(*lru_size) < 0); |
1114 | /* huge page split is done under lru_lock. so, we have no races. */ | ||
1115 | VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); | ||
1116 | mz->lru_size[lru] -= 1 << compound_order(page); | ||
1117 | } | ||
1118 | |||
1119 | void mem_cgroup_lru_del(struct page *page) | ||
1120 | { | ||
1121 | mem_cgroup_lru_del_list(page, page_lru(page)); | ||
1122 | } | ||
1123 | |||
1124 | /** | ||
1125 | * mem_cgroup_lru_move_lists - account for moving a page between lrus | ||
1126 | * @zone: zone of the page | ||
1127 | * @page: the page | ||
1128 | * @from: current lru | ||
1129 | * @to: target lru | ||
1130 | * | ||
1131 | * This function accounts for @page being moved between the lrus @from | ||
1132 | * and @to, and returns the lruvec for the given @zone and the memcg | ||
1133 | * @page is charged to. | ||
1134 | * | ||
1135 | * The callsite is then responsible for physically relinking | ||
1136 | * @page->lru to the returned lruvec->lists[@to]. | ||
1137 | */ | ||
1138 | struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, | ||
1139 | struct page *page, | ||
1140 | enum lru_list from, | ||
1141 | enum lru_list to) | ||
1142 | { | ||
1143 | /* XXX: Optimize this, especially for @from == @to */ | ||
1144 | mem_cgroup_lru_del_list(page, from); | ||
1145 | return mem_cgroup_lru_add_list(zone, page, to); | ||
1146 | } | 1141 | } |
1147 | 1142 | ||
1148 | /* | 1143 | /* |
1149 | * Checks whether given mem is same or in the root_mem_cgroup's | 1144 | * Checks whether given mem is same or in the root_mem_cgroup's |
1150 | * hierarchy subtree | 1145 | * hierarchy subtree |
1151 | */ | 1146 | */ |
1147 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1148 | struct mem_cgroup *memcg) | ||
1149 | { | ||
1150 | if (root_memcg == memcg) | ||
1151 | return true; | ||
1152 | if (!root_memcg->use_hierarchy || !memcg) | ||
1153 | return false; | ||
1154 | return css_is_ancestor(&memcg->css, &root_memcg->css); | ||
1155 | } | ||
1156 | |||
1152 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1157 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1153 | struct mem_cgroup *memcg) | 1158 | struct mem_cgroup *memcg) |
1154 | { | 1159 | { |
1155 | if (root_memcg != memcg) { | 1160 | bool ret; |
1156 | return (root_memcg->use_hierarchy && | ||
1157 | css_is_ancestor(&memcg->css, &root_memcg->css)); | ||
1158 | } | ||
1159 | 1161 | ||
1160 | return true; | 1162 | rcu_read_lock(); |
1163 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | ||
1164 | rcu_read_unlock(); | ||
1165 | return ret; | ||
1161 | } | 1166 | } |
1162 | 1167 | ||
1163 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | 1168 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) |
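With the add/del/move helpers above collapsed into mem_cgroup_update_lru_size(), the per-lru page counts are now maintained by whoever physically links or unlinks the page: look the lruvec up once, then adjust the size by the (possibly compound) page count, positive on add and negative on removal. A hedged sketch of such a caller, modeled on the generic LRU helpers rather than copied from this patch:

    /* Illustrative: link a page into its LRU list and account its size. */
    static void lru_add_sketch(struct page *page, struct zone *zone,
                               enum lru_list lru)
    {
            struct lruvec *lruvec = mem_cgroup_page_lruvec(page, zone);
            int nr_pages = hpage_nr_pages(page);    /* 1, or the THP page count */

            mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
            list_add(&page->lru, &lruvec->lists[lru]);
    }

Removal is the mirror image: pass -hpage_nr_pages(page) after unlinking, which is what the VM_BUG_ON on a negative *lru_size is there to catch.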
@@ -1195,19 +1200,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | |||
1195 | return ret; | 1200 | return ret; |
1196 | } | 1201 | } |
1197 | 1202 | ||
1198 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) | 1203 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) |
1199 | { | 1204 | { |
1200 | unsigned long inactive_ratio; | 1205 | unsigned long inactive_ratio; |
1201 | int nid = zone_to_nid(zone); | ||
1202 | int zid = zone_idx(zone); | ||
1203 | unsigned long inactive; | 1206 | unsigned long inactive; |
1204 | unsigned long active; | 1207 | unsigned long active; |
1205 | unsigned long gb; | 1208 | unsigned long gb; |
1206 | 1209 | ||
1207 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | 1210 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); |
1208 | BIT(LRU_INACTIVE_ANON)); | 1211 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); |
1209 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1210 | BIT(LRU_ACTIVE_ANON)); | ||
1211 | 1212 | ||
1212 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1213 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1213 | if (gb) | 1214 | if (gb) |
@@ -1218,55 +1219,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) | |||
1218 | return inactive * inactive_ratio < active; | 1219 | return inactive * inactive_ratio < active; |
1219 | } | 1220 | } |
1220 | 1221 | ||
1221 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) | 1222 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) |
1222 | { | 1223 | { |
1223 | unsigned long active; | 1224 | unsigned long active; |
1224 | unsigned long inactive; | 1225 | unsigned long inactive; |
1225 | int zid = zone_idx(zone); | ||
1226 | int nid = zone_to_nid(zone); | ||
1227 | 1226 | ||
1228 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | 1227 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1229 | BIT(LRU_INACTIVE_FILE)); | 1228 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); |
1230 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1231 | BIT(LRU_ACTIVE_FILE)); | ||
1232 | 1229 | ||
1233 | return (active > inactive); | 1230 | return (active > inactive); |
1234 | } | 1231 | } |
1235 | 1232 | ||
1236 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | ||
1237 | struct zone *zone) | ||
1238 | { | ||
1239 | int nid = zone_to_nid(zone); | ||
1240 | int zid = zone_idx(zone); | ||
1241 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
1242 | |||
1243 | return &mz->reclaim_stat; | ||
1244 | } | ||
1245 | |||
1246 | struct zone_reclaim_stat * | ||
1247 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | ||
1248 | { | ||
1249 | struct page_cgroup *pc; | ||
1250 | struct mem_cgroup_per_zone *mz; | ||
1251 | |||
1252 | if (mem_cgroup_disabled()) | ||
1253 | return NULL; | ||
1254 | |||
1255 | pc = lookup_page_cgroup(page); | ||
1256 | if (!PageCgroupUsed(pc)) | ||
1257 | return NULL; | ||
1258 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1259 | smp_rmb(); | ||
1260 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1261 | return &mz->reclaim_stat; | ||
1262 | } | ||
1263 | |||
1264 | #define mem_cgroup_from_res_counter(counter, member) \ | 1233 | #define mem_cgroup_from_res_counter(counter, member) \ |
1265 | container_of(counter, struct mem_cgroup, member) | 1234 | container_of(counter, struct mem_cgroup, member) |
1266 | 1235 | ||
1267 | /** | 1236 | /** |
1268 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1237 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1269 | * @mem: the memory cgroup | 1238 | * @memcg: the memory cgroup |
1270 | * | 1239 | * |
1271 | * Returns the maximum amount of memory @mem can be charged with, in | 1240 | * Returns the maximum amount of memory @mem can be charged with, in |
1272 | * pages. | 1241 | * pages. |
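The `>> (30 - PAGE_SHIFT)` shift used in the inactive/active checks above converts a page count into whole GiB: with 4 KiB pages PAGE_SHIFT is 12, the shift is 18, and 2^18 = 262144 pages make 1 GiB. A tiny standalone check of that arithmetic (PAGE_SHIFT hard-coded to 12 for the example):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* 4 KiB pages, example value */

    int main(void)
    {
            unsigned long pages = 300000;   /* roughly 1.14 GiB of 4 KiB pages */
            unsigned long gb = pages >> (30 - PAGE_SHIFT);

            printf("%lu pages -> %lu GiB\n", pages, gb);    /* prints "... 1 GiB" */
            return 0;
    }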
@@ -1486,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1486 | /* | 1455 | /* |
1487 | * Return the memory (and swap, if configured) limit for a memcg. | 1456 | * Return the memory (and swap, if configured) limit for a memcg. |
1488 | */ | 1457 | */ |
1489 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1458 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1490 | { | 1459 | { |
1491 | u64 limit; | 1460 | u64 limit; |
1492 | u64 memsw; | 1461 | u64 memsw; |
@@ -1502,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1502 | return min(limit, memsw); | 1471 | return min(limit, memsw); |
1503 | } | 1472 | } |
1504 | 1473 | ||
1474 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
1475 | int order) | ||
1476 | { | ||
1477 | struct mem_cgroup *iter; | ||
1478 | unsigned long chosen_points = 0; | ||
1479 | unsigned long totalpages; | ||
1480 | unsigned int points = 0; | ||
1481 | struct task_struct *chosen = NULL; | ||
1482 | |||
1483 | /* | ||
1484 | * If current has a pending SIGKILL, then automatically select it. The | ||
1485 | * goal is to allow it to allocate so that it may quickly exit and free | ||
1486 | * its memory. | ||
1487 | */ | ||
1488 | if (fatal_signal_pending(current)) { | ||
1489 | set_thread_flag(TIF_MEMDIE); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
1494 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1496 | struct cgroup *cgroup = iter->css.cgroup; | ||
1497 | struct cgroup_iter it; | ||
1498 | struct task_struct *task; | ||
1499 | |||
1500 | cgroup_iter_start(cgroup, &it); | ||
1501 | while ((task = cgroup_iter_next(cgroup, &it))) { | ||
1502 | switch (oom_scan_process_thread(task, totalpages, NULL, | ||
1503 | false)) { | ||
1504 | case OOM_SCAN_SELECT: | ||
1505 | if (chosen) | ||
1506 | put_task_struct(chosen); | ||
1507 | chosen = task; | ||
1508 | chosen_points = ULONG_MAX; | ||
1509 | get_task_struct(chosen); | ||
1510 | /* fall through */ | ||
1511 | case OOM_SCAN_CONTINUE: | ||
1512 | continue; | ||
1513 | case OOM_SCAN_ABORT: | ||
1514 | cgroup_iter_end(cgroup, &it); | ||
1515 | mem_cgroup_iter_break(memcg, iter); | ||
1516 | if (chosen) | ||
1517 | put_task_struct(chosen); | ||
1518 | return; | ||
1519 | case OOM_SCAN_OK: | ||
1520 | break; | ||
1521 | }; | ||
1522 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1523 | if (points > chosen_points) { | ||
1524 | if (chosen) | ||
1525 | put_task_struct(chosen); | ||
1526 | chosen = task; | ||
1527 | chosen_points = points; | ||
1528 | get_task_struct(chosen); | ||
1529 | } | ||
1530 | } | ||
1531 | cgroup_iter_end(cgroup, &it); | ||
1532 | } | ||
1533 | |||
1534 | if (!chosen) | ||
1535 | return; | ||
1536 | points = chosen_points * 1000 / totalpages; | ||
1537 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | ||
1538 | NULL, "Memory cgroup out of memory"); | ||
1539 | } | ||
1540 | |||
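The final points calculation rescales the badness value, which oom_badness() reports in units of pages, onto a 0..1000 range relative to the memcg limit before passing it to oom_kill_process(). A worked example of that normalization with invented numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long chosen_points = 52000;    /* badness in pages, example */
            unsigned long totalpages    = 262144;   /* 1 GiB limit in 4 KiB pages */
            unsigned long points = chosen_points * 1000 / totalpages;

            printf("normalized score: %lu/1000\n", points); /* prints 198/1000 */
            return 0;
    }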
1505 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | 1541 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1506 | gfp_t gfp_mask, | 1542 | gfp_t gfp_mask, |
1507 | unsigned long flags) | 1543 | unsigned long flags) |
@@ -1540,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1540 | 1576 | ||
1541 | /** | 1577 | /** |
1542 | * test_mem_cgroup_node_reclaimable | 1578 | * test_mem_cgroup_node_reclaimable |
1543 | * @mem: the target memcg | 1579 | * @memcg: the target memcg |
1544 | * @nid: the node ID to be checked. | 1580 | * @nid: the node ID to be checked. |
1545 | * @noswap: specify true here if the user wants file only information. | 1581 | * @noswap: specify true here if the user wants file only information. |
1546 | * | 1582 | * |
@@ -1634,7 +1670,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1634 | * unused nodes. But scan_nodes is lazily updated and may not contain | 1670 | * unused nodes. But scan_nodes is lazily updated and may not contain |
1635 | * enough new information. We need to do double check. | 1671 | * enough new information. We need to do double check. |
1636 | */ | 1672 | */ |
1637 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1673 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1638 | { | 1674 | { |
1639 | int nid; | 1675 | int nid; |
1640 | 1676 | ||
@@ -1669,7 +1705,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1669 | return 0; | 1705 | return 0; |
1670 | } | 1706 | } |
1671 | 1707 | ||
1672 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1708 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1673 | { | 1709 | { |
1674 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | 1710 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1675 | } | 1711 | } |
@@ -1843,7 +1879,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1843 | /* | 1879 | /* |
1844 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1880 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1845 | */ | 1881 | */ |
1846 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 1882 | static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, |
1883 | int order) | ||
1847 | { | 1884 | { |
1848 | struct oom_wait_info owait; | 1885 | struct oom_wait_info owait; |
1849 | bool locked, need_to_kill; | 1886 | bool locked, need_to_kill; |
@@ -1930,7 +1967,7 @@ again: | |||
1930 | return; | 1967 | return; |
1931 | /* | 1968 | /* |
1932 | * If this memory cgroup is not under account moving, we don't | 1969 | * If this memory cgroup is not under account moving, we don't |
1933 | * need to take move_lock_page_cgroup(). Because we already hold | 1970 | * need to take move_lock_mem_cgroup(). Because we already hold |
1934 | * rcu_read_lock(), any calls to move_account will be delayed until | 1971 | * rcu_read_lock(), any calls to move_account will be delayed until |
1935 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 1972 | * rcu_read_unlock() if mem_cgroup_stolen() == true. |
1936 | */ | 1973 | */ |
@@ -1952,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
1952 | /* | 1989 | /* |
1953 | * It's guaranteed that pc->mem_cgroup never changes while | 1990 | * It's guaranteed that pc->mem_cgroup never changes while |
1954 | * lock is held because a routine modifies pc->mem_cgroup | 1991 | * lock is held because a routine modifies pc->mem_cgroup |
1955 | * should take move_lock_page_cgroup(). | 1992 | * should take move_lock_mem_cgroup(). |
1956 | */ | 1993 | */ |
1957 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | 1994 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); |
1958 | } | 1995 | } |
@@ -1992,7 +2029,7 @@ struct memcg_stock_pcp { | |||
1992 | unsigned int nr_pages; | 2029 | unsigned int nr_pages; |
1993 | struct work_struct work; | 2030 | struct work_struct work; |
1994 | unsigned long flags; | 2031 | unsigned long flags; |
1995 | #define FLUSHING_CACHED_CHARGE (0) | 2032 | #define FLUSHING_CACHED_CHARGE 0 |
1996 | }; | 2033 | }; |
1997 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2034 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1998 | static DEFINE_MUTEX(percpu_charge_mutex); | 2035 | static DEFINE_MUTEX(percpu_charge_mutex); |
@@ -2139,7 +2176,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | |||
2139 | int i; | 2176 | int i; |
2140 | 2177 | ||
2141 | spin_lock(&memcg->pcp_counter_lock); | 2178 | spin_lock(&memcg->pcp_counter_lock); |
2142 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 2179 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
2143 | long x = per_cpu(memcg->stat->count[i], cpu); | 2180 | long x = per_cpu(memcg->stat->count[i], cpu); |
2144 | 2181 | ||
2145 | per_cpu(memcg->stat->count[i], cpu) = 0; | 2182 | per_cpu(memcg->stat->count[i], cpu) = 0; |
@@ -2165,7 +2202,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2165 | if (action == CPU_ONLINE) | 2202 | if (action == CPU_ONLINE) |
2166 | return NOTIFY_OK; | 2203 | return NOTIFY_OK; |
2167 | 2204 | ||
2168 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2205 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) |
2169 | return NOTIFY_OK; | 2206 | return NOTIFY_OK; |
2170 | 2207 | ||
2171 | for_each_mem_cgroup(iter) | 2208 | for_each_mem_cgroup(iter) |
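The one-operator change above is a genuine bug fix rather than a cleanup: with != on both sides, the OR of the two tests is true for every possible action (any value differs from at least one of the two constants), so the callback always returned early and the per-cpu counters were never drained when a CPU died. De Morgan gives the intended guard: !(A || B) is !A && !B. A compact userspace check of the two predicates (the enum values are stand-ins for the real hotplug constants):

    #include <assert.h>

    enum { CPU_ONLINE = 1, CPU_DEAD = 2, CPU_DEAD_FROZEN = 3 }; /* demo values */

    int main(void)
    {
            int action;

            for (action = CPU_ONLINE; action <= CPU_DEAD_FROZEN; action++) {
                    int buggy = (action != CPU_DEAD) || (action != CPU_DEAD_FROZEN);
                    int fixed = (action != CPU_DEAD) && (action != CPU_DEAD_FROZEN);

                    assert(buggy);                           /* true for every action */
                    assert(fixed == (action == CPU_ONLINE)); /* early-out only here */
            }
            return 0;
    }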
@@ -2299,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2299 | * We always charge the cgroup the mm_struct belongs to. | 2336 | * We always charge the cgroup the mm_struct belongs to. |
2300 | * The mm_struct's mem_cgroup changes on task migration if the | 2337 | * The mm_struct's mem_cgroup changes on task migration if the |
2301 | * thread group leader migrates. It's possible that mm is not | 2338 | * thread group leader migrates. It's possible that mm is not |
2302 | * set, if so charge the init_mm (happens for pagecache usage). | 2339 | * set, if so charge the root memcg (happens for pagecache usage). |
2303 | */ | 2340 | */ |
2304 | if (!*ptr && !mm) | 2341 | if (!*ptr && !mm) |
2305 | *ptr = root_mem_cgroup; | 2342 | *ptr = root_mem_cgroup; |
@@ -2427,6 +2464,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, | |||
2427 | } | 2464 | } |
2428 | 2465 | ||
2429 | /* | 2466 | /* |
2467 | * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup. | ||
2468 | * This is useful when moving usage to parent cgroup. | ||
2469 | */ | ||
2470 | static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | ||
2471 | unsigned int nr_pages) | ||
2472 | { | ||
2473 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2474 | |||
2475 | if (mem_cgroup_is_root(memcg)) | ||
2476 | return; | ||
2477 | |||
2478 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | ||
2479 | if (do_swap_account) | ||
2480 | res_counter_uncharge_until(&memcg->memsw, | ||
2481 | memcg->memsw.parent, bytes); | ||
2482 | } | ||
2483 | |||
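The new helper leans on res_counter_uncharge_until(), which uncharges every level of the res_counter hierarchy starting at the given counter and stopping before the given ancestor; passing the immediate parent therefore drops only this memcg's own usage, which is exactly what moving a charge up to the parent needs. A toy userspace model of that walk (the struct and numbers are invented; the real res_counter also takes locks and checks for underflow):

    #include <stdio.h>

    struct counter {
            const char *name;
            long usage;
            struct counter *parent;
    };

    /* Model of the uncharge-until walk: drop 'val' from every level,
     * starting at 'c' and stopping before 'top'. */
    static void uncharge_until(struct counter *c, struct counter *top, long val)
    {
            for (; c != top; c = c->parent)
                    c->usage -= val;
    }

    int main(void)
    {
            struct counter root   = { "root",   300, NULL    };
            struct counter parent = { "parent", 300, &root   };
            struct counter child  = { "child",  100, &parent };

            /* Reparent 100 pages worth of charge: only the child loses it. */
            uncharge_until(&child, child.parent, 100);

            printf("child=%ld parent=%ld root=%ld\n",
                   child.usage, parent.usage, root.usage);  /* 0 300 300 */
            return 0;
    }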
2484 | /* | ||
2430 | * A helper function to get mem_cgroup from ID. must be called under | 2485 | * A helper function to get mem_cgroup from ID. must be called under |
2431 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2486 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
2432 | * it's concern. (dropping refcnt from swap can be called against removed | 2487 | * it's concern. (dropping refcnt from swap can be called against removed |
@@ -2442,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2442 | css = css_lookup(&mem_cgroup_subsys, id); | 2497 | css = css_lookup(&mem_cgroup_subsys, id); |
2443 | if (!css) | 2498 | if (!css) |
2444 | return NULL; | 2499 | return NULL; |
2445 | return container_of(css, struct mem_cgroup, css); | 2500 | return mem_cgroup_from_css(css); |
2446 | } | 2501 | } |
2447 | 2502 | ||
2448 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2503 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2476,20 +2531,17 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2476 | static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | 2531 | static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, |
2477 | struct page *page, | 2532 | struct page *page, |
2478 | unsigned int nr_pages, | 2533 | unsigned int nr_pages, |
2479 | struct page_cgroup *pc, | ||
2480 | enum charge_type ctype, | 2534 | enum charge_type ctype, |
2481 | bool lrucare) | 2535 | bool lrucare) |
2482 | { | 2536 | { |
2537 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2483 | struct zone *uninitialized_var(zone); | 2538 | struct zone *uninitialized_var(zone); |
2539 | struct lruvec *lruvec; | ||
2484 | bool was_on_lru = false; | 2540 | bool was_on_lru = false; |
2485 | bool anon; | 2541 | bool anon; |
2486 | 2542 | ||
2487 | lock_page_cgroup(pc); | 2543 | lock_page_cgroup(pc); |
2488 | if (unlikely(PageCgroupUsed(pc))) { | 2544 | VM_BUG_ON(PageCgroupUsed(pc)); |
2489 | unlock_page_cgroup(pc); | ||
2490 | __mem_cgroup_cancel_charge(memcg, nr_pages); | ||
2491 | return; | ||
2492 | } | ||
2493 | /* | 2545 | /* |
2494 | * we don't need page_cgroup_lock about tail pages, because they are not | 2546 | * we don't need page_cgroup_lock about tail pages, because they are not |
2495 | * accessed by any other context at this point. | 2547 | * accessed by any other context at this point. |
@@ -2503,8 +2555,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2503 | zone = page_zone(page); | 2555 | zone = page_zone(page); |
2504 | spin_lock_irq(&zone->lru_lock); | 2556 | spin_lock_irq(&zone->lru_lock); |
2505 | if (PageLRU(page)) { | 2557 | if (PageLRU(page)) { |
2558 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | ||
2506 | ClearPageLRU(page); | 2559 | ClearPageLRU(page); |
2507 | del_page_from_lru_list(zone, page, page_lru(page)); | 2560 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
2508 | was_on_lru = true; | 2561 | was_on_lru = true; |
2509 | } | 2562 | } |
2510 | } | 2563 | } |
@@ -2522,14 +2575,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2522 | 2575 | ||
2523 | if (lrucare) { | 2576 | if (lrucare) { |
2524 | if (was_on_lru) { | 2577 | if (was_on_lru) { |
2578 | lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); | ||
2525 | VM_BUG_ON(PageLRU(page)); | 2579 | VM_BUG_ON(PageLRU(page)); |
2526 | SetPageLRU(page); | 2580 | SetPageLRU(page); |
2527 | add_page_to_lru_list(zone, page, page_lru(page)); | 2581 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2528 | } | 2582 | } |
2529 | spin_unlock_irq(&zone->lru_lock); | 2583 | spin_unlock_irq(&zone->lru_lock); |
2530 | } | 2584 | } |
2531 | 2585 | ||
2532 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2586 | if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) |
2533 | anon = true; | 2587 | anon = true; |
2534 | else | 2588 | else |
2535 | anon = false; | 2589 | anon = false; |
@@ -2547,7 +2601,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2547 | 2601 | ||
2548 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2602 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2549 | 2603 | ||
2550 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) | 2604 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) |
2551 | /* | 2605 | /* |
2552 | * Because tail pages are not marked as "used", set it. We're under | 2606 | * Because tail pages are not marked as "used", set it. We're under |
2553 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2607 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2578,23 +2632,19 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
2578 | * @pc: page_cgroup of the page. | 2632 | * @pc: page_cgroup of the page. |
2579 | * @from: mem_cgroup which the page is moved from. | 2633 | * @from: mem_cgroup which the page is moved from. |
2580 | * @to: mem_cgroup which the page is moved to. @from != @to. | 2634 | * @to: mem_cgroup which the page is moved to. @from != @to. |
2581 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
2582 | * | 2635 | * |
2583 | * The caller must confirm following. | 2636 | * The caller must confirm following. |
2584 | * - page is not on LRU (isolate_page() is useful.) | 2637 | * - page is not on LRU (isolate_page() is useful.) |
2585 | * - compound_lock is held when nr_pages > 1 | 2638 | * - compound_lock is held when nr_pages > 1 |
2586 | * | 2639 | * |
2587 | * This function doesn't do "charge" nor css_get to new cgroup. It should be | 2640 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" |
2588 | * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is | 2641 | * from old cgroup. |
2589 | * true, this function does "uncharge" from old cgroup, but it doesn't if | ||
2590 | * @uncharge is false, so a caller should do "uncharge". | ||
2591 | */ | 2642 | */ |
2592 | static int mem_cgroup_move_account(struct page *page, | 2643 | static int mem_cgroup_move_account(struct page *page, |
2593 | unsigned int nr_pages, | 2644 | unsigned int nr_pages, |
2594 | struct page_cgroup *pc, | 2645 | struct page_cgroup *pc, |
2595 | struct mem_cgroup *from, | 2646 | struct mem_cgroup *from, |
2596 | struct mem_cgroup *to, | 2647 | struct mem_cgroup *to) |
2597 | bool uncharge) | ||
2598 | { | 2648 | { |
2599 | unsigned long flags; | 2649 | unsigned long flags; |
2600 | int ret; | 2650 | int ret; |
@@ -2628,9 +2678,6 @@ static int mem_cgroup_move_account(struct page *page, | |||
2628 | preempt_enable(); | 2678 | preempt_enable(); |
2629 | } | 2679 | } |
2630 | mem_cgroup_charge_statistics(from, anon, -nr_pages); | 2680 | mem_cgroup_charge_statistics(from, anon, -nr_pages); |
2631 | if (uncharge) | ||
2632 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
2633 | __mem_cgroup_cancel_charge(from, nr_pages); | ||
2634 | 2681 | ||
2635 | /* caller should have done css_get */ | 2682 | /* caller should have done css_get */ |
2636 | pc->mem_cgroup = to; | 2683 | pc->mem_cgroup = to; |
@@ -2661,18 +2708,15 @@ out: | |||
2661 | 2708 | ||
2662 | static int mem_cgroup_move_parent(struct page *page, | 2709 | static int mem_cgroup_move_parent(struct page *page, |
2663 | struct page_cgroup *pc, | 2710 | struct page_cgroup *pc, |
2664 | struct mem_cgroup *child, | 2711 | struct mem_cgroup *child) |
2665 | gfp_t gfp_mask) | ||
2666 | { | 2712 | { |
2667 | struct cgroup *cg = child->css.cgroup; | ||
2668 | struct cgroup *pcg = cg->parent; | ||
2669 | struct mem_cgroup *parent; | 2713 | struct mem_cgroup *parent; |
2670 | unsigned int nr_pages; | 2714 | unsigned int nr_pages; |
2671 | unsigned long uninitialized_var(flags); | 2715 | unsigned long uninitialized_var(flags); |
2672 | int ret; | 2716 | int ret; |
2673 | 2717 | ||
2674 | /* Is ROOT ? */ | 2718 | /* Is ROOT ? */ |
2675 | if (!pcg) | 2719 | if (mem_cgroup_is_root(child)) |
2676 | return -EINVAL; | 2720 | return -EINVAL; |
2677 | 2721 | ||
2678 | ret = -EBUSY; | 2722 | ret = -EBUSY; |
@@ -2683,21 +2727,23 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2683 | 2727 | ||
2684 | nr_pages = hpage_nr_pages(page); | 2728 | nr_pages = hpage_nr_pages(page); |
2685 | 2729 | ||
2686 | parent = mem_cgroup_from_cont(pcg); | 2730 | parent = parent_mem_cgroup(child); |
2687 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2731 | /* |
2688 | if (ret) | 2732 | * If no parent, move charges to root cgroup. |
2689 | goto put_back; | 2733 | */ |
2734 | if (!parent) | ||
2735 | parent = root_mem_cgroup; | ||
2690 | 2736 | ||
2691 | if (nr_pages > 1) | 2737 | if (nr_pages > 1) |
2692 | flags = compound_lock_irqsave(page); | 2738 | flags = compound_lock_irqsave(page); |
2693 | 2739 | ||
2694 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); | 2740 | ret = mem_cgroup_move_account(page, nr_pages, |
2695 | if (ret) | 2741 | pc, child, parent); |
2696 | __mem_cgroup_cancel_charge(parent, nr_pages); | 2742 | if (!ret) |
2743 | __mem_cgroup_cancel_local_charge(child, nr_pages); | ||
2697 | 2744 | ||
2698 | if (nr_pages > 1) | 2745 | if (nr_pages > 1) |
2699 | compound_unlock_irqrestore(page, flags); | 2746 | compound_unlock_irqrestore(page, flags); |
2700 | put_back: | ||
2701 | putback_lru_page(page); | 2747 | putback_lru_page(page); |
2702 | put: | 2748 | put: |
2703 | put_page(page); | 2749 | put_page(page); |
@@ -2716,7 +2762,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2716 | { | 2762 | { |
2717 | struct mem_cgroup *memcg = NULL; | 2763 | struct mem_cgroup *memcg = NULL; |
2718 | unsigned int nr_pages = 1; | 2764 | unsigned int nr_pages = 1; |
2719 | struct page_cgroup *pc; | ||
2720 | bool oom = true; | 2765 | bool oom = true; |
2721 | int ret; | 2766 | int ret; |
2722 | 2767 | ||
@@ -2730,11 +2775,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2730 | oom = false; | 2775 | oom = false; |
2731 | } | 2776 | } |
2732 | 2777 | ||
2733 | pc = lookup_page_cgroup(page); | ||
2734 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 2778 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2735 | if (ret == -ENOMEM) | 2779 | if (ret == -ENOMEM) |
2736 | return ret; | 2780 | return ret; |
2737 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); | 2781 | __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); |
2738 | return 0; | 2782 | return 0; |
2739 | } | 2783 | } |
2740 | 2784 | ||
@@ -2747,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2747 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 2791 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2748 | VM_BUG_ON(!mm); | 2792 | VM_BUG_ON(!mm); |
2749 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2793 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2750 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2794 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2751 | } | ||
2752 | |||
2753 | static void | ||
2754 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
2755 | enum charge_type ctype); | ||
2756 | |||
2757 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
2758 | gfp_t gfp_mask) | ||
2759 | { | ||
2760 | struct mem_cgroup *memcg = NULL; | ||
2761 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2762 | int ret; | ||
2763 | |||
2764 | if (mem_cgroup_disabled()) | ||
2765 | return 0; | ||
2766 | if (PageCompound(page)) | ||
2767 | return 0; | ||
2768 | |||
2769 | if (unlikely(!mm)) | ||
2770 | mm = &init_mm; | ||
2771 | if (!page_is_file_cache(page)) | ||
2772 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2773 | |||
2774 | if (!PageSwapCache(page)) | ||
2775 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2776 | else { /* page is swapcache/shmem */ | ||
2777 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | ||
2778 | if (!ret) | ||
2779 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2780 | } | ||
2781 | return ret; | ||
2782 | } | 2795 | } |
2783 | 2796 | ||
2784 | /* | 2797 | /* |
@@ -2787,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2787 | * struct page_cgroup is acquired. This refcnt will be consumed by | 2800 | * struct page_cgroup is acquired. This refcnt will be consumed by |
2788 | * "commit()" or removed by "cancel()" | 2801 | * "commit()" or removed by "cancel()" |
2789 | */ | 2802 | */ |
2790 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2803 | static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2791 | struct page *page, | 2804 | struct page *page, |
2792 | gfp_t mask, struct mem_cgroup **memcgp) | 2805 | gfp_t mask, |
2806 | struct mem_cgroup **memcgp) | ||
2793 | { | 2807 | { |
2794 | struct mem_cgroup *memcg; | 2808 | struct mem_cgroup *memcg; |
2809 | struct page_cgroup *pc; | ||
2795 | int ret; | 2810 | int ret; |
2796 | 2811 | ||
2797 | *memcgp = NULL; | 2812 | pc = lookup_page_cgroup(page); |
2798 | |||
2799 | if (mem_cgroup_disabled()) | ||
2800 | return 0; | ||
2801 | |||
2802 | if (!do_swap_account) | ||
2803 | goto charge_cur_mm; | ||
2804 | /* | 2813 | /* |
2805 | * A racing thread's fault, or swapoff, may have already updated | 2814 | * Every swap fault against a single page tries to charge the |
2806 | * the pte, and even removed page from swap cache: in those cases | 2815 | * page, bail as early as possible. shmem_unuse() encounters |
2807 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2816 | * already charged pages, too. The USED bit is protected by |
2808 | * KSM case which does need to charge the page. | 2817 | * the page lock, which serializes swap cache removal, which |
2818 | * in turn serializes uncharging. | ||
2809 | */ | 2819 | */ |
2810 | if (!PageSwapCache(page)) | 2820 | if (PageCgroupUsed(pc)) |
2821 | return 0; | ||
2822 | if (!do_swap_account) | ||
2811 | goto charge_cur_mm; | 2823 | goto charge_cur_mm; |
2812 | memcg = try_get_mem_cgroup_from_page(page); | 2824 | memcg = try_get_mem_cgroup_from_page(page); |
2813 | if (!memcg) | 2825 | if (!memcg) |
@@ -2819,28 +2831,55 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2819 | ret = 0; | 2831 | ret = 0; |
2820 | return ret; | 2832 | return ret; |
2821 | charge_cur_mm: | 2833 | charge_cur_mm: |
2822 | if (unlikely(!mm)) | ||
2823 | mm = &init_mm; | ||
2824 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2834 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2825 | if (ret == -EINTR) | 2835 | if (ret == -EINTR) |
2826 | ret = 0; | 2836 | ret = 0; |
2827 | return ret; | 2837 | return ret; |
2828 | } | 2838 | } |
2829 | 2839 | ||
2840 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | ||
2841 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | ||
2842 | { | ||
2843 | *memcgp = NULL; | ||
2844 | if (mem_cgroup_disabled()) | ||
2845 | return 0; | ||
2846 | /* | ||
2847 | * A racing thread's fault, or swapoff, may have already | ||
2848 | * updated the pte, and even removed page from swap cache: in | ||
2849 | * those cases unuse_pte()'s pte_same() test will fail; but | ||
2850 | * there's also a KSM case which does need to charge the page. | ||
2851 | */ | ||
2852 | if (!PageSwapCache(page)) { | ||
2853 | int ret; | ||
2854 | |||
2855 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | ||
2856 | if (ret == -EINTR) | ||
2857 | ret = 0; | ||
2858 | return ret; | ||
2859 | } | ||
2860 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | ||
2861 | } | ||
2862 | |||
2863 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | ||
2864 | { | ||
2865 | if (mem_cgroup_disabled()) | ||
2866 | return; | ||
2867 | if (!memcg) | ||
2868 | return; | ||
2869 | __mem_cgroup_cancel_charge(memcg, 1); | ||
2870 | } | ||
2871 | |||
2830 | static void | 2872 | static void |
2831 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2873 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2832 | enum charge_type ctype) | 2874 | enum charge_type ctype) |
2833 | { | 2875 | { |
2834 | struct page_cgroup *pc; | ||
2835 | |||
2836 | if (mem_cgroup_disabled()) | 2876 | if (mem_cgroup_disabled()) |
2837 | return; | 2877 | return; |
2838 | if (!memcg) | 2878 | if (!memcg) |
2839 | return; | 2879 | return; |
2840 | cgroup_exclude_rmdir(&memcg->css); | 2880 | cgroup_exclude_rmdir(&memcg->css); |
2841 | 2881 | ||
2842 | pc = lookup_page_cgroup(page); | 2882 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); |
2843 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); | ||
2844 | /* | 2883 | /* |
2845 | * Now swap is on-memory. This means this page may be | 2884 | * Now swap is on-memory. This means this page may be |
2846 | * counted both as mem and swap....double count. | 2885 | * counted both as mem and swap....double count. |
@@ -2850,24 +2889,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2850 | */ | 2889 | */ |
2851 | if (do_swap_account && PageSwapCache(page)) { | 2890 | if (do_swap_account && PageSwapCache(page)) { |
2852 | swp_entry_t ent = {.val = page_private(page)}; | 2891 | swp_entry_t ent = {.val = page_private(page)}; |
2853 | struct mem_cgroup *swap_memcg; | 2892 | mem_cgroup_uncharge_swap(ent); |
2854 | unsigned short id; | ||
2855 | |||
2856 | id = swap_cgroup_record(ent, 0); | ||
2857 | rcu_read_lock(); | ||
2858 | swap_memcg = mem_cgroup_lookup(id); | ||
2859 | if (swap_memcg) { | ||
2860 | /* | ||
2861 | * This recorded memcg can be obsolete one. So, avoid | ||
2862 | * calling css_tryget | ||
2863 | */ | ||
2864 | if (!mem_cgroup_is_root(swap_memcg)) | ||
2865 | res_counter_uncharge(&swap_memcg->memsw, | ||
2866 | PAGE_SIZE); | ||
2867 | mem_cgroup_swap_statistics(swap_memcg, false); | ||
2868 | mem_cgroup_put(swap_memcg); | ||
2869 | } | ||
2870 | rcu_read_unlock(); | ||
2871 | } | 2893 | } |
2872 | /* | 2894 | /* |
2873 | * At swapin, we may charge account against cgroup which has no tasks. | 2895 | * At swapin, we may charge account against cgroup which has no tasks. |
@@ -2881,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
2881 | struct mem_cgroup *memcg) | 2903 | struct mem_cgroup *memcg) |
2882 | { | 2904 | { |
2883 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2905 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2884 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2906 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2885 | } | 2907 | } |
2886 | 2908 | ||
2887 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2909 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2910 | gfp_t gfp_mask) | ||
2888 | { | 2911 | { |
2912 | struct mem_cgroup *memcg = NULL; | ||
2913 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2914 | int ret; | ||
2915 | |||
2889 | if (mem_cgroup_disabled()) | 2916 | if (mem_cgroup_disabled()) |
2890 | return; | 2917 | return 0; |
2891 | if (!memcg) | 2918 | if (PageCompound(page)) |
2892 | return; | 2919 | return 0; |
2893 | __mem_cgroup_cancel_charge(memcg, 1); | 2920 | |
2921 | if (!PageSwapCache(page)) | ||
2922 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2923 | else { /* page is swapcache/shmem */ | ||
2924 | ret = __mem_cgroup_try_charge_swapin(mm, page, | ||
2925 | gfp_mask, &memcg); | ||
2926 | if (!ret) | ||
2927 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2928 | } | ||
2929 | return ret; | ||
2894 | } | 2930 | } |
2895 | 2931 | ||
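The new mem_cgroup_cache_charge() above folds the swapcache/shmem special case into the charge path itself, so a page-cache insertion site only needs one charge call plus a matching uncharge on its error path. A minimal sketch of such a caller follows; demo_add_to_cache() and insert_into_mapping() are illustrative placeholders and not part of this patch, and only the two memcg calls come from the interface shown in this diff.

#include <linux/memcontrol.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Placeholder for whatever actually inserts the page into the mapping. */
extern int insert_into_mapping(struct page *page, struct address_space *mapping,
			       pgoff_t offset);

/* Illustrative caller, not a function from this patch. */
static int demo_add_to_cache(struct page *page, struct address_space *mapping,
			     pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	/* One call now covers plain file cache, swapcache and shmem pages. */
	error = mem_cgroup_cache_charge(page, current->mm, gfp_mask);
	if (error)
		return error;

	error = insert_into_mapping(page, mapping, offset);
	if (error)
		mem_cgroup_uncharge_cache_page(page);	/* roll the charge back */

	return error;
}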
2896 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 2932 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -2950,7 +2986,8 @@ direct_uncharge: | |||
2950 | * uncharge if !page_mapped(page) | 2986 | * uncharge if !page_mapped(page) |
2951 | */ | 2987 | */ |
2952 | static struct mem_cgroup * | 2988 | static struct mem_cgroup * |
2953 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2989 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, |
2990 | bool end_migration) | ||
2954 | { | 2991 | { |
2955 | struct mem_cgroup *memcg = NULL; | 2992 | struct mem_cgroup *memcg = NULL; |
2956 | unsigned int nr_pages = 1; | 2993 | unsigned int nr_pages = 1; |
@@ -2960,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2960 | if (mem_cgroup_disabled()) | 2997 | if (mem_cgroup_disabled()) |
2961 | return NULL; | 2998 | return NULL; |
2962 | 2999 | ||
2963 | if (PageSwapCache(page)) | 3000 | VM_BUG_ON(PageSwapCache(page)); |
2964 | return NULL; | ||
2965 | 3001 | ||
2966 | if (PageTransHuge(page)) { | 3002 | if (PageTransHuge(page)) { |
2967 | nr_pages <<= compound_order(page); | 3003 | nr_pages <<= compound_order(page); |
@@ -2984,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2984 | anon = PageAnon(page); | 3020 | anon = PageAnon(page); |
2985 | 3021 | ||
2986 | switch (ctype) { | 3022 | switch (ctype) { |
2987 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 3023 | case MEM_CGROUP_CHARGE_TYPE_ANON: |
2988 | /* | 3024 | /* |
2989 | * Generally PageAnon tells if it's the anon statistics to be | 3025 | * Generally PageAnon tells if it's the anon statistics to be |
2990 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | 3026 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is |
@@ -2994,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2994 | /* fallthrough */ | 3030 | /* fallthrough */ |
2995 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 3031 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2996 | /* See mem_cgroup_prepare_migration() */ | 3032 | /* See mem_cgroup_prepare_migration() */ |
2997 | if (page_mapped(page) || PageCgroupMigration(pc)) | 3033 | if (page_mapped(page)) |
3034 | goto unlock_out; | ||
3035 | /* | ||
3036 | * Pages under migration may not be uncharged. But | ||
3037 | * end_migration() /must/ be the one uncharging the | ||
3038 | * unused post-migration page and so it has to call | ||
3039 | * here with the migration bit still set. See the | ||
3040 | * res_counter handling below. | ||
3041 | */ | ||
3042 | if (!end_migration && PageCgroupMigration(pc)) | ||
2998 | goto unlock_out; | 3043 | goto unlock_out; |
2999 | break; | 3044 | break; |
3000 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 3045 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -3028,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3028 | mem_cgroup_swap_statistics(memcg, true); | 3073 | mem_cgroup_swap_statistics(memcg, true); |
3029 | mem_cgroup_get(memcg); | 3074 | mem_cgroup_get(memcg); |
3030 | } | 3075 | } |
3031 | if (!mem_cgroup_is_root(memcg)) | 3076 | /* |
3077 | * Migration does not charge the res_counter for the | ||
3078 | * replacement page, so leave it alone when phasing out the | ||
3079 | * page that is unused after the migration. | ||
3080 | */ | ||
3081 | if (!end_migration && !mem_cgroup_is_root(memcg)) | ||
3032 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3082 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
3033 | 3083 | ||
3034 | return memcg; | 3084 | return memcg; |
@@ -3044,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3044 | if (page_mapped(page)) | 3094 | if (page_mapped(page)) |
3045 | return; | 3095 | return; |
3046 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3096 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3047 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 3097 | if (PageSwapCache(page)) |
3098 | return; | ||
3099 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3048 | } | 3100 | } |
3049 | 3101 | ||
3050 | void mem_cgroup_uncharge_cache_page(struct page *page) | 3102 | void mem_cgroup_uncharge_cache_page(struct page *page) |
3051 | { | 3103 | { |
3052 | VM_BUG_ON(page_mapped(page)); | 3104 | VM_BUG_ON(page_mapped(page)); |
3053 | VM_BUG_ON(page->mapping); | 3105 | VM_BUG_ON(page->mapping); |
3054 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 3106 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
3055 | } | 3107 | } |
3056 | 3108 | ||
3057 | /* | 3109 | /* |
@@ -3115,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3115 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 3167 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
3116 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 3168 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
3117 | 3169 | ||
3118 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 3170 | memcg = __mem_cgroup_uncharge_common(page, ctype, false); |
3119 | 3171 | ||
3120 | /* | 3172 | /* |
3121 | * record memcg information, if swapout && memcg != NULL, | 3173 | * record memcg information, if swapout && memcg != NULL, |
@@ -3126,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3126 | } | 3178 | } |
3127 | #endif | 3179 | #endif |
3128 | 3180 | ||
3129 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3181 | #ifdef CONFIG_MEMCG_SWAP |
3130 | /* | 3182 | /* |
3131 | * called from swap_entry_free(). remove record in swap_cgroup and | 3183 | * called from swap_entry_free(). remove record in swap_cgroup and |
3132 | * uncharge "memsw" account. | 3184 | * uncharge "memsw" account. |
@@ -3160,7 +3212,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3160 | * @entry: swap entry to be moved | 3212 | * @entry: swap entry to be moved |
3161 | * @from: mem_cgroup which the entry is moved from | 3213 | * @from: mem_cgroup which the entry is moved from |
3162 | * @to: mem_cgroup which the entry is moved to | 3214 | * @to: mem_cgroup which the entry is moved to |
3163 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
3164 | * | 3215 | * |
3165 | * It succeeds only when the swap_cgroup's record for this entry is the same | 3216 | * It succeeds only when the swap_cgroup's record for this entry is the same |
3166 | * as the mem_cgroup's id of @from. | 3217 | * as the mem_cgroup's id of @from. |
@@ -3171,7 +3222,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3171 | * both res and memsw, and called css_get(). | 3222 | * both res and memsw, and called css_get(). |
3172 | */ | 3223 | */ |
3173 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3224 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
3174 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3225 | struct mem_cgroup *from, struct mem_cgroup *to) |
3175 | { | 3226 | { |
3176 | unsigned short old_id, new_id; | 3227 | unsigned short old_id, new_id; |
3177 | 3228 | ||
@@ -3190,24 +3241,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3190 | * swap-in, the refcount of @to might be decreased to 0. | 3241 | * swap-in, the refcount of @to might be decreased to 0. |
3191 | */ | 3242 | */ |
3192 | mem_cgroup_get(to); | 3243 | mem_cgroup_get(to); |
3193 | if (need_fixup) { | ||
3194 | if (!mem_cgroup_is_root(from)) | ||
3195 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
3196 | mem_cgroup_put(from); | ||
3197 | /* | ||
3198 | * we charged both to->res and to->memsw, so we should | ||
3199 | * uncharge to->res. | ||
3200 | */ | ||
3201 | if (!mem_cgroup_is_root(to)) | ||
3202 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
3203 | } | ||
3204 | return 0; | 3244 | return 0; |
3205 | } | 3245 | } |
3206 | return -EINVAL; | 3246 | return -EINVAL; |
3207 | } | 3247 | } |
3208 | #else | 3248 | #else |
3209 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 3249 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, |
3210 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3250 | struct mem_cgroup *from, struct mem_cgroup *to) |
3211 | { | 3251 | { |
3212 | return -EINVAL; | 3252 | return -EINVAL; |
3213 | } | 3253 | } |
@@ -3217,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3217 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 3257 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
3218 | * page belongs to. | 3258 | * page belongs to. |
3219 | */ | 3259 | */ |
3220 | int mem_cgroup_prepare_migration(struct page *page, | 3260 | void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
3221 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) | 3261 | struct mem_cgroup **memcgp) |
3222 | { | 3262 | { |
3223 | struct mem_cgroup *memcg = NULL; | 3263 | struct mem_cgroup *memcg = NULL; |
3224 | struct page_cgroup *pc; | 3264 | struct page_cgroup *pc; |
3225 | enum charge_type ctype; | 3265 | enum charge_type ctype; |
3226 | int ret = 0; | ||
3227 | 3266 | ||
3228 | *memcgp = NULL; | 3267 | *memcgp = NULL; |
3229 | 3268 | ||
3230 | VM_BUG_ON(PageTransHuge(page)); | 3269 | VM_BUG_ON(PageTransHuge(page)); |
3231 | if (mem_cgroup_disabled()) | 3270 | if (mem_cgroup_disabled()) |
3232 | return 0; | 3271 | return; |
3233 | 3272 | ||
3234 | pc = lookup_page_cgroup(page); | 3273 | pc = lookup_page_cgroup(page); |
3235 | lock_page_cgroup(pc); | 3274 | lock_page_cgroup(pc); |
@@ -3274,39 +3313,25 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3274 | * we return here. | 3313 | * we return here. |
3275 | */ | 3314 | */ |
3276 | if (!memcg) | 3315 | if (!memcg) |
3277 | return 0; | 3316 | return; |
3278 | 3317 | ||
3279 | *memcgp = memcg; | 3318 | *memcgp = memcg; |
3280 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | ||
3281 | css_put(&memcg->css);/* drop extra refcnt */ | ||
3282 | if (ret) { | ||
3283 | if (PageAnon(page)) { | ||
3284 | lock_page_cgroup(pc); | ||
3285 | ClearPageCgroupMigration(pc); | ||
3286 | unlock_page_cgroup(pc); | ||
3287 | /* | ||
3288 | * The old page may be fully unmapped while we kept it. | ||
3289 | */ | ||
3290 | mem_cgroup_uncharge_page(page); | ||
3291 | } | ||
3292 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3293 | return -ENOMEM; | ||
3294 | } | ||
3295 | /* | 3319 | /* |
3296 | * We charge new page before it's used/mapped. So, even if unlock_page() | 3320 | * We charge new page before it's used/mapped. So, even if unlock_page() |
3297 | * is called before end_migration, we can catch all events on this new | 3321 | * is called before end_migration, we can catch all events on this new |
3298 | * page. In the case new page is migrated but not remapped, new page's | 3322 | * page. In the case new page is migrated but not remapped, new page's |
3299 | * mapcount will be finally 0 and we call uncharge in end_migration(). | 3323 | * mapcount will be finally 0 and we call uncharge in end_migration(). |
3300 | */ | 3324 | */ |
3301 | pc = lookup_page_cgroup(newpage); | ||
3302 | if (PageAnon(page)) | 3325 | if (PageAnon(page)) |
3303 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3326 | ctype = MEM_CGROUP_CHARGE_TYPE_ANON; |
3304 | else if (page_is_file_cache(page)) | ||
3305 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3306 | else | 3327 | else |
3307 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3328 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3308 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); | 3329 | /* |
3309 | return ret; | 3330 | * The page is committed to the memcg, but it's not actually |
3331 | * charged to the res_counter since we plan on replacing the | ||
3332 | * old one and only one page is going to be left afterwards. | ||
3333 | */ | ||
3334 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | ||
3310 | } | 3335 | } |
3311 | 3336 | ||
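mem_cgroup_prepare_migration() is now void: the replacement page is committed to the memcg without touching the res_counter, and mem_cgroup_end_migration() settles the account by uncharging whichever page ends up unused. The sketch below shows how a migration path would bracket the copy under that contract; copy_and_remap() is a placeholder, and the trailing arguments of mem_cgroup_end_migration() are assumed from its use elsewhere in the tree rather than shown in this hunk.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Placeholder for the unmap/copy/remap work of the migration core. */
extern int copy_and_remap(struct page *page, struct page *newpage);

static int demo_migrate_one(struct page *page, struct page *newpage)
{
	struct mem_cgroup *memcg = NULL;
	int rc;

	/* Cannot fail any more, so there is no error path to unwind here. */
	mem_cgroup_prepare_migration(page, newpage, &memcg);

	rc = copy_and_remap(page, newpage);

	/* Uncharges the old page on success, the new page on failure. */
	if (memcg)
		mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
	return rc;
}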
3312 | /* remove redundant charge if migration failed*/ | 3337 | /* remove redundant charge if migration failed*/ |
@@ -3328,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3328 | used = newpage; | 3353 | used = newpage; |
3329 | unused = oldpage; | 3354 | unused = oldpage; |
3330 | } | 3355 | } |
3356 | anon = PageAnon(used); | ||
3357 | __mem_cgroup_uncharge_common(unused, | ||
3358 | anon ? MEM_CGROUP_CHARGE_TYPE_ANON | ||
3359 | : MEM_CGROUP_CHARGE_TYPE_CACHE, | ||
3360 | true); | ||
3361 | css_put(&memcg->css); | ||
3331 | /* | 3362 | /* |
3332 | * We disallowed uncharge of pages under migration because mapcount | 3363 | * We disallowed uncharge of pages under migration because mapcount |
3333 | * of the page goes down to zero, temporarily. | 3364 | * of the page goes down to zero, temporarily. |
@@ -3337,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3337 | lock_page_cgroup(pc); | 3368 | lock_page_cgroup(pc); |
3338 | ClearPageCgroupMigration(pc); | 3369 | ClearPageCgroupMigration(pc); |
3339 | unlock_page_cgroup(pc); | 3370 | unlock_page_cgroup(pc); |
3340 | anon = PageAnon(used); | ||
3341 | __mem_cgroup_uncharge_common(unused, | ||
3342 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3343 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3344 | 3371 | ||
3345 | /* | 3372 | /* |
3346 | * If a page is a file cache, radix-tree replacement is very atomic | 3373 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3369,7 +3396,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3369 | void mem_cgroup_replace_page_cache(struct page *oldpage, | 3396 | void mem_cgroup_replace_page_cache(struct page *oldpage, |
3370 | struct page *newpage) | 3397 | struct page *newpage) |
3371 | { | 3398 | { |
3372 | struct mem_cgroup *memcg; | 3399 | struct mem_cgroup *memcg = NULL; |
3373 | struct page_cgroup *pc; | 3400 | struct page_cgroup *pc; |
3374 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3401 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3375 | 3402 | ||
@@ -3379,20 +3406,25 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3379 | pc = lookup_page_cgroup(oldpage); | 3406 | pc = lookup_page_cgroup(oldpage); |
3380 | /* fix accounting on old pages */ | 3407 | /* fix accounting on old pages */ |
3381 | lock_page_cgroup(pc); | 3408 | lock_page_cgroup(pc); |
3382 | memcg = pc->mem_cgroup; | 3409 | if (PageCgroupUsed(pc)) { |
3383 | mem_cgroup_charge_statistics(memcg, false, -1); | 3410 | memcg = pc->mem_cgroup; |
3384 | ClearPageCgroupUsed(pc); | 3411 | mem_cgroup_charge_statistics(memcg, false, -1); |
3412 | ClearPageCgroupUsed(pc); | ||
3413 | } | ||
3385 | unlock_page_cgroup(pc); | 3414 | unlock_page_cgroup(pc); |
3386 | 3415 | ||
3387 | if (PageSwapBacked(oldpage)) | 3416 | /* |
3388 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3417 | * When called from shmem_replace_page(), in some cases the |
3389 | 3418 | * oldpage has already been charged, and in some cases not. | |
3419 | */ | ||
3420 | if (!memcg) | ||
3421 | return; | ||
3390 | /* | 3422 | /* |
3391 | * Even if newpage->mapping was NULL before starting replacement, | 3423 | * Even if newpage->mapping was NULL before starting replacement, |
3392 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3424 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
3393 | * LRU while we overwrite pc->mem_cgroup. | 3425 | * LRU while we overwrite pc->mem_cgroup. |
3394 | */ | 3426 | */ |
3395 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); | 3427 | __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); |
3396 | } | 3428 | } |
3397 | 3429 | ||
3398 | #ifdef CONFIG_DEBUG_VM | 3430 | #ifdef CONFIG_DEBUG_VM |
@@ -3461,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3461 | /* | 3493 | /* |
3462 | * Rather than hide all in some function, I do this in | 3494 | * Rather than hide all in some function, I do this in |
3463 | * open coded manner. You see what this really does. | 3495 | * open coded manner. You see what this really does. |
3464 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3496 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3465 | */ | 3497 | */ |
3466 | mutex_lock(&set_limit_mutex); | 3498 | mutex_lock(&set_limit_mutex); |
3467 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3499 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3522,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3522 | /* | 3554 | /* |
3523 | * Rather than hide all in some function, I do this in | 3555 | * Rather than hide all in some function, I do this in |
3524 | * open coded manner. You see what this really does. | 3556 | * open coded manner. You see what this really does. |
3525 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3557 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3526 | */ | 3558 | */ |
3527 | mutex_lock(&set_limit_mutex); | 3559 | mutex_lock(&set_limit_mutex); |
3528 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3560 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3654,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3654 | } | 3686 | } |
3655 | 3687 | ||
3656 | /* | 3688 | /* |
3657 | * This routine traverse page_cgroup in given list and drop them all. | 3689 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3658 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3690 | * reclaim the pages themselves - it just removes the page_cgroups. |
3691 | * Returns true if some page_cgroups were not freed, indicating that the caller | ||
3692 | * must retry this operation. | ||
3659 | */ | 3693 | */ |
3660 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3694 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3661 | int node, int zid, enum lru_list lru) | 3695 | int node, int zid, enum lru_list lru) |
3662 | { | 3696 | { |
3663 | struct mem_cgroup_per_zone *mz; | 3697 | struct mem_cgroup_per_zone *mz; |
@@ -3665,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3665 | struct list_head *list; | 3699 | struct list_head *list; |
3666 | struct page *busy; | 3700 | struct page *busy; |
3667 | struct zone *zone; | 3701 | struct zone *zone; |
3668 | int ret = 0; | ||
3669 | 3702 | ||
3670 | zone = &NODE_DATA(node)->node_zones[zid]; | 3703 | zone = &NODE_DATA(node)->node_zones[zid]; |
3671 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3704 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
@@ -3679,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3679 | struct page_cgroup *pc; | 3712 | struct page_cgroup *pc; |
3680 | struct page *page; | 3713 | struct page *page; |
3681 | 3714 | ||
3682 | ret = 0; | ||
3683 | spin_lock_irqsave(&zone->lru_lock, flags); | 3715 | spin_lock_irqsave(&zone->lru_lock, flags); |
3684 | if (list_empty(list)) { | 3716 | if (list_empty(list)) { |
3685 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3717 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -3696,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3696 | 3728 | ||
3697 | pc = lookup_page_cgroup(page); | 3729 | pc = lookup_page_cgroup(page); |
3698 | 3730 | ||
3699 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3731 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3700 | if (ret == -ENOMEM || ret == -EINTR) | ||
3701 | break; | ||
3702 | |||
3703 | if (ret == -EBUSY || ret == -EINVAL) { | ||
3704 | /* found lock contention or "pc" is obsolete. */ | 3732 | /* found lock contention or "pc" is obsolete. */ |
3705 | busy = page; | 3733 | busy = page; |
3706 | cond_resched(); | 3734 | cond_resched(); |
3707 | } else | 3735 | } else |
3708 | busy = NULL; | 3736 | busy = NULL; |
3709 | } | 3737 | } |
3710 | 3738 | return !list_empty(list); | |
3711 | if (!ret && !list_empty(list)) | ||
3712 | return -EBUSY; | ||
3713 | return ret; | ||
3714 | } | 3739 | } |
3715 | 3740 | ||
3716 | /* | 3741 | /* |
@@ -3735,9 +3760,6 @@ move_account: | |||
3735 | ret = -EBUSY; | 3760 | ret = -EBUSY; |
3736 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3761 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3737 | goto out; | 3762 | goto out; |
3738 | ret = -EINTR; | ||
3739 | if (signal_pending(current)) | ||
3740 | goto out; | ||
3741 | /* This is for making all *used* pages to be on LRU. */ | 3763 | /* This is for making all *used* pages to be on LRU. */ |
3742 | lru_add_drain_all(); | 3764 | lru_add_drain_all(); |
3743 | drain_all_stock_sync(memcg); | 3765 | drain_all_stock_sync(memcg); |
@@ -3758,12 +3780,9 @@ move_account: | |||
3758 | } | 3780 | } |
3759 | mem_cgroup_end_move(memcg); | 3781 | mem_cgroup_end_move(memcg); |
3760 | memcg_oom_recover(memcg); | 3782 | memcg_oom_recover(memcg); |
3761 | /* it seems parent cgroup doesn't have enough mem */ | ||
3762 | if (ret == -ENOMEM) | ||
3763 | goto try_to_free; | ||
3764 | cond_resched(); | 3783 | cond_resched(); |
3765 | /* "ret" should also be checked to ensure all lists are empty. */ | 3784 | /* "ret" should also be checked to ensure all lists are empty. */ |
3766 | } while (memcg->res.usage > 0 || ret); | 3785 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); |
3767 | out: | 3786 | out: |
3768 | css_put(&memcg->css); | 3787 | css_put(&memcg->css); |
3769 | return ret; | 3788 | return ret; |
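Two related cleanups meet here: mem_cgroup_force_empty_list() now simply reports "retry needed" as a bool, and the emptiness test reads the usage through res_counter_read_u64() instead of peeking at memcg->res.usage directly. The accessor takes the counter's spinlock on 32-bit builds, so the 64-bit usage value cannot be observed half-updated. A minimal illustration of the resulting wait-until-empty shape, with the node/zone/LRU walk reduced to a placeholder:

#include <linux/res_counter.h>
#include <linux/sched.h>

/* Placeholder: walks every node/zone/LRU, returns true if anything was left. */
extern bool demo_empty_all_lists(struct mem_cgroup *memcg);

static void demo_wait_until_empty(struct mem_cgroup *memcg)
{
	bool retry;

	do {
		retry = demo_empty_all_lists(memcg);
		cond_resched();
	} while (retry || res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
}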
@@ -3778,7 +3797,7 @@ try_to_free: | |||
3778 | lru_add_drain_all(); | 3797 | lru_add_drain_all(); |
3779 | /* try to free all pages in this cgroup */ | 3798 | /* try to free all pages in this cgroup */ |
3780 | shrink = 1; | 3799 | shrink = 1; |
3781 | while (nr_retries && memcg->res.usage > 0) { | 3800 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { |
3782 | int progress; | 3801 | int progress; |
3783 | 3802 | ||
3784 | if (signal_pending(current)) { | 3803 | if (signal_pending(current)) { |
@@ -3799,7 +3818,7 @@ try_to_free: | |||
3799 | goto move_account; | 3818 | goto move_account; |
3800 | } | 3819 | } |
3801 | 3820 | ||
3802 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 3821 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3803 | { | 3822 | { |
3804 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 3823 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); |
3805 | } | 3824 | } |
@@ -3822,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3822 | parent_memcg = mem_cgroup_from_cont(parent); | 3841 | parent_memcg = mem_cgroup_from_cont(parent); |
3823 | 3842 | ||
3824 | cgroup_lock(); | 3843 | cgroup_lock(); |
3844 | |||
3845 | if (memcg->use_hierarchy == val) | ||
3846 | goto out; | ||
3847 | |||
3825 | /* | 3848 | /* |
3826 | * If parent's use_hierarchy is set, we can't make any modifications | 3849 | * If parent's use_hierarchy is set, we can't make any modifications |
3827 | * in the child subtrees. If it is unset, then the change can | 3850 | * in the child subtrees. If it is unset, then the change can |
@@ -3838,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3838 | retval = -EBUSY; | 3861 | retval = -EBUSY; |
3839 | } else | 3862 | } else |
3840 | retval = -EINVAL; | 3863 | retval = -EINVAL; |
3864 | |||
3865 | out: | ||
3841 | cgroup_unlock(); | 3866 | cgroup_unlock(); |
3842 | 3867 | ||
3843 | return retval; | 3868 | return retval; |
@@ -3874,19 +3899,26 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3874 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 3899 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3875 | 3900 | ||
3876 | if (swap) | 3901 | if (swap) |
3877 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | 3902 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3878 | 3903 | ||
3879 | return val << PAGE_SHIFT; | 3904 | return val << PAGE_SHIFT; |
3880 | } | 3905 | } |
3881 | 3906 | ||
3882 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 3907 | static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, |
3908 | struct file *file, char __user *buf, | ||
3909 | size_t nbytes, loff_t *ppos) | ||
3883 | { | 3910 | { |
3884 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 3911 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3912 | char str[64]; | ||
3885 | u64 val; | 3913 | u64 val; |
3886 | int type, name; | 3914 | int type, name, len; |
3887 | 3915 | ||
3888 | type = MEMFILE_TYPE(cft->private); | 3916 | type = MEMFILE_TYPE(cft->private); |
3889 | name = MEMFILE_ATTR(cft->private); | 3917 | name = MEMFILE_ATTR(cft->private); |
3918 | |||
3919 | if (!do_swap_account && type == _MEMSWAP) | ||
3920 | return -EOPNOTSUPP; | ||
3921 | |||
3890 | switch (type) { | 3922 | switch (type) { |
3891 | case _MEM: | 3923 | case _MEM: |
3892 | if (name == RES_USAGE) | 3924 | if (name == RES_USAGE) |
@@ -3903,7 +3935,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3903 | default: | 3935 | default: |
3904 | BUG(); | 3936 | BUG(); |
3905 | } | 3937 | } |
3906 | return val; | 3938 | |
3939 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
3940 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
3907 | } | 3941 | } |
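The read side of the memory.* files switches from the cgroup core's .read_u64 callback to a raw .read handler: the value is formatted into an on-stack buffer with scnprintf() and handed to simple_read_from_buffer(), which copies at most nbytes, honours and advances *ppos, and returns the number of bytes copied. A u64-returning callback has no way to report an error, and this rework is what lets the memsw.* files fail cleanly with -EOPNOTSUPP when swap accounting is disabled (see the checks added below). The same pattern in isolation, with an illustrative helper name:

#include <linux/fs.h>
#include <linux/kernel.h>

/* Illustrative helper, not part of this patch. */
static ssize_t demo_show_u64(u64 val, char __user *buf,
			     size_t nbytes, loff_t *ppos)
{
	char str[64];
	int len;

	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}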
3908 | /* | 3942 | /* |
3909 | * The user of this function is... | 3943 | * The user of this function is... |
@@ -3919,6 +3953,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3919 | 3953 | ||
3920 | type = MEMFILE_TYPE(cft->private); | 3954 | type = MEMFILE_TYPE(cft->private); |
3921 | name = MEMFILE_ATTR(cft->private); | 3955 | name = MEMFILE_ATTR(cft->private); |
3956 | |||
3957 | if (!do_swap_account && type == _MEMSWAP) | ||
3958 | return -EOPNOTSUPP; | ||
3959 | |||
3922 | switch (name) { | 3960 | switch (name) { |
3923 | case RES_LIMIT: | 3961 | case RES_LIMIT: |
3924 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 3962 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
@@ -3984,12 +4022,15 @@ out: | |||
3984 | 4022 | ||
3985 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 4023 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
3986 | { | 4024 | { |
3987 | struct mem_cgroup *memcg; | 4025 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3988 | int type, name; | 4026 | int type, name; |
3989 | 4027 | ||
3990 | memcg = mem_cgroup_from_cont(cont); | ||
3991 | type = MEMFILE_TYPE(event); | 4028 | type = MEMFILE_TYPE(event); |
3992 | name = MEMFILE_ATTR(event); | 4029 | name = MEMFILE_ATTR(event); |
4030 | |||
4031 | if (!do_swap_account && type == _MEMSWAP) | ||
4032 | return -EOPNOTSUPP; | ||
4033 | |||
3993 | switch (name) { | 4034 | switch (name) { |
3994 | case RES_MAX_USAGE: | 4035 | case RES_MAX_USAGE: |
3995 | if (type == _MEM) | 4036 | if (type == _MEM) |
@@ -4041,103 +4082,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4041 | } | 4082 | } |
4042 | #endif | 4083 | #endif |
4043 | 4084 | ||
4044 | |||
4045 | /* For read statistics */ | ||
4046 | enum { | ||
4047 | MCS_CACHE, | ||
4048 | MCS_RSS, | ||
4049 | MCS_FILE_MAPPED, | ||
4050 | MCS_PGPGIN, | ||
4051 | MCS_PGPGOUT, | ||
4052 | MCS_SWAP, | ||
4053 | MCS_PGFAULT, | ||
4054 | MCS_PGMAJFAULT, | ||
4055 | MCS_INACTIVE_ANON, | ||
4056 | MCS_ACTIVE_ANON, | ||
4057 | MCS_INACTIVE_FILE, | ||
4058 | MCS_ACTIVE_FILE, | ||
4059 | MCS_UNEVICTABLE, | ||
4060 | NR_MCS_STAT, | ||
4061 | }; | ||
4062 | |||
4063 | struct mcs_total_stat { | ||
4064 | s64 stat[NR_MCS_STAT]; | ||
4065 | }; | ||
4066 | |||
4067 | struct { | ||
4068 | char *local_name; | ||
4069 | char *total_name; | ||
4070 | } memcg_stat_strings[NR_MCS_STAT] = { | ||
4071 | {"cache", "total_cache"}, | ||
4072 | {"rss", "total_rss"}, | ||
4073 | {"mapped_file", "total_mapped_file"}, | ||
4074 | {"pgpgin", "total_pgpgin"}, | ||
4075 | {"pgpgout", "total_pgpgout"}, | ||
4076 | {"swap", "total_swap"}, | ||
4077 | {"pgfault", "total_pgfault"}, | ||
4078 | {"pgmajfault", "total_pgmajfault"}, | ||
4079 | {"inactive_anon", "total_inactive_anon"}, | ||
4080 | {"active_anon", "total_active_anon"}, | ||
4081 | {"inactive_file", "total_inactive_file"}, | ||
4082 | {"active_file", "total_active_file"}, | ||
4083 | {"unevictable", "total_unevictable"} | ||
4084 | }; | ||
4085 | |||
4086 | |||
4087 | static void | ||
4088 | mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) | ||
4089 | { | ||
4090 | s64 val; | ||
4091 | |||
4092 | /* per cpu stat */ | ||
4093 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4094 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | ||
4095 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4096 | s->stat[MCS_RSS] += val * PAGE_SIZE; | ||
4097 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | ||
4098 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | ||
4099 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); | ||
4100 | s->stat[MCS_PGPGIN] += val; | ||
4101 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); | ||
4102 | s->stat[MCS_PGPGOUT] += val; | ||
4103 | if (do_swap_account) { | ||
4104 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | ||
4105 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
4106 | } | ||
4107 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); | ||
4108 | s->stat[MCS_PGFAULT] += val; | ||
4109 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); | ||
4110 | s->stat[MCS_PGMAJFAULT] += val; | ||
4111 | |||
4112 | /* per zone stat */ | ||
4113 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); | ||
4114 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | ||
4115 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); | ||
4116 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | ||
4117 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); | ||
4118 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | ||
4119 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); | ||
4120 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | ||
4121 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | ||
4122 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | ||
4123 | } | ||
4124 | |||
4125 | static void | ||
4126 | mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) | ||
4127 | { | ||
4128 | struct mem_cgroup *iter; | ||
4129 | |||
4130 | for_each_mem_cgroup_tree(iter, memcg) | ||
4131 | mem_cgroup_get_local_stat(iter, s); | ||
4132 | } | ||
4133 | |||
4134 | #ifdef CONFIG_NUMA | 4085 | #ifdef CONFIG_NUMA |
4135 | static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | 4086 | static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4087 | struct seq_file *m) | ||
4136 | { | 4088 | { |
4137 | int nid; | 4089 | int nid; |
4138 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 4090 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; |
4139 | unsigned long node_nr; | 4091 | unsigned long node_nr; |
4140 | struct cgroup *cont = m->private; | ||
4141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4092 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4142 | 4093 | ||
4143 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 4094 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
@@ -4178,64 +4129,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4178 | } | 4129 | } |
4179 | #endif /* CONFIG_NUMA */ | 4130 | #endif /* CONFIG_NUMA */ |
4180 | 4131 | ||
4181 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4132 | static const char * const mem_cgroup_lru_names[] = { |
4182 | struct cgroup_map_cb *cb) | 4133 | "inactive_anon", |
4183 | { | 4134 | "active_anon", |
4184 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4135 | "inactive_file", |
4185 | struct mcs_total_stat mystat; | 4136 | "active_file", |
4186 | int i; | 4137 | "unevictable", |
4138 | }; | ||
4187 | 4139 | ||
4188 | memset(&mystat, 0, sizeof(mystat)); | 4140 | static inline void mem_cgroup_lru_names_not_uptodate(void) |
4189 | mem_cgroup_get_local_stat(memcg, &mystat); | 4141 | { |
4142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | ||
4143 | } | ||
4190 | 4144 | ||
4145 | static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, | ||
4146 | struct seq_file *m) | ||
4147 | { | ||
4148 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
4149 | struct mem_cgroup *mi; | ||
4150 | unsigned int i; | ||
4191 | 4151 | ||
4192 | for (i = 0; i < NR_MCS_STAT; i++) { | 4152 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4193 | if (i == MCS_SWAP && !do_swap_account) | 4153 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4194 | continue; | 4154 | continue; |
4195 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 4155 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4156 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | ||
4196 | } | 4157 | } |
4197 | 4158 | ||
4159 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) | ||
4160 | seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], | ||
4161 | mem_cgroup_read_events(memcg, i)); | ||
4162 | |||
4163 | for (i = 0; i < NR_LRU_LISTS; i++) | ||
4164 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], | ||
4165 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | ||
4166 | |||
4198 | /* Hierarchical information */ | 4167 | /* Hierarchical information */ |
4199 | { | 4168 | { |
4200 | unsigned long long limit, memsw_limit; | 4169 | unsigned long long limit, memsw_limit; |
4201 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); | 4170 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); |
4202 | cb->fill(cb, "hierarchical_memory_limit", limit); | 4171 | seq_printf(m, "hierarchical_memory_limit %llu\n", limit); |
4203 | if (do_swap_account) | 4172 | if (do_swap_account) |
4204 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 4173 | seq_printf(m, "hierarchical_memsw_limit %llu\n", |
4174 | memsw_limit); | ||
4205 | } | 4175 | } |
4206 | 4176 | ||
4207 | memset(&mystat, 0, sizeof(mystat)); | 4177 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4208 | mem_cgroup_get_total_stat(memcg, &mystat); | 4178 | long long val = 0; |
4209 | for (i = 0; i < NR_MCS_STAT; i++) { | 4179 | |
4210 | if (i == MCS_SWAP && !do_swap_account) | 4180 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4211 | continue; | 4181 | continue; |
4212 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 4182 | for_each_mem_cgroup_tree(mi, memcg) |
4183 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | ||
4184 | seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); | ||
4185 | } | ||
4186 | |||
4187 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | ||
4188 | unsigned long long val = 0; | ||
4189 | |||
4190 | for_each_mem_cgroup_tree(mi, memcg) | ||
4191 | val += mem_cgroup_read_events(mi, i); | ||
4192 | seq_printf(m, "total_%s %llu\n", | ||
4193 | mem_cgroup_events_names[i], val); | ||
4194 | } | ||
4195 | |||
4196 | for (i = 0; i < NR_LRU_LISTS; i++) { | ||
4197 | unsigned long long val = 0; | ||
4198 | |||
4199 | for_each_mem_cgroup_tree(mi, memcg) | ||
4200 | val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; | ||
4201 | seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); | ||
4213 | } | 4202 | } |
4214 | 4203 | ||
4215 | #ifdef CONFIG_DEBUG_VM | 4204 | #ifdef CONFIG_DEBUG_VM |
4216 | { | 4205 | { |
4217 | int nid, zid; | 4206 | int nid, zid; |
4218 | struct mem_cgroup_per_zone *mz; | 4207 | struct mem_cgroup_per_zone *mz; |
4208 | struct zone_reclaim_stat *rstat; | ||
4219 | unsigned long recent_rotated[2] = {0, 0}; | 4209 | unsigned long recent_rotated[2] = {0, 0}; |
4220 | unsigned long recent_scanned[2] = {0, 0}; | 4210 | unsigned long recent_scanned[2] = {0, 0}; |
4221 | 4211 | ||
4222 | for_each_online_node(nid) | 4212 | for_each_online_node(nid) |
4223 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4213 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4224 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 4214 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4215 | rstat = &mz->lruvec.reclaim_stat; | ||
4225 | 4216 | ||
4226 | recent_rotated[0] += | 4217 | recent_rotated[0] += rstat->recent_rotated[0]; |
4227 | mz->reclaim_stat.recent_rotated[0]; | 4218 | recent_rotated[1] += rstat->recent_rotated[1]; |
4228 | recent_rotated[1] += | 4219 | recent_scanned[0] += rstat->recent_scanned[0]; |
4229 | mz->reclaim_stat.recent_rotated[1]; | 4220 | recent_scanned[1] += rstat->recent_scanned[1]; |
4230 | recent_scanned[0] += | ||
4231 | mz->reclaim_stat.recent_scanned[0]; | ||
4232 | recent_scanned[1] += | ||
4233 | mz->reclaim_stat.recent_scanned[1]; | ||
4234 | } | 4221 | } |
4235 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | 4222 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); |
4236 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | 4223 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); |
4237 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); | 4224 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); |
4238 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); | 4225 | seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); |
4239 | } | 4226 | } |
4240 | #endif | 4227 | #endif |
4241 | 4228 | ||
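memory.stat moves from the cgroup map callback (cb->fill) to a plain seq_file writer: per-memcg counters, events and LRU sizes are printed from parallel name tables, and the total_* lines are produced by summing the same counters over the hierarchy with for_each_mem_cgroup_tree(). The BUILD_BUG_ON in mem_cgroup_lru_names_not_uptodate() keeps the LRU name table in step with NR_LRU_LISTS. The seq_file half of that pattern, reduced to its essentials (the arrays below are illustrative stand-ins, not the real tables):

#include <linux/seq_file.h>
#include <linux/kernel.h>

static const char * const demo_stat_names[] = { "cache", "rss", "swap" };

static int demo_stat_show(struct seq_file *m, void *unused)
{
	unsigned long demo_stat_values[ARRAY_SIZE(demo_stat_names)] = { 0 };
	unsigned int i;

	/* One "name value" line per counter; seq_file handles the buffering. */
	for (i = 0; i < ARRAY_SIZE(demo_stat_names); i++)
		seq_printf(m, "%s %lu\n", demo_stat_names[i],
			   demo_stat_values[i]);
	return 0;
}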
@@ -4297,7 +4284,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
4297 | usage = mem_cgroup_usage(memcg, swap); | 4284 | usage = mem_cgroup_usage(memcg, swap); |
4298 | 4285 | ||
4299 | /* | 4286 | /* |
4300 | * current_threshold points to threshold just below usage. | 4287 | * current_threshold points to threshold just below or equal to usage. |
4301 | * If it's not true, a threshold was crossed after last | 4288 | * If it's not true, a threshold was crossed after last |
4302 | * call of __mem_cgroup_threshold(). | 4289 | * call of __mem_cgroup_threshold(). |
4303 | */ | 4290 | */ |
@@ -4423,14 +4410,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | |||
4423 | /* Find current threshold */ | 4410 | /* Find current threshold */ |
4424 | new->current_threshold = -1; | 4411 | new->current_threshold = -1; |
4425 | for (i = 0; i < size; i++) { | 4412 | for (i = 0; i < size; i++) { |
4426 | if (new->entries[i].threshold < usage) { | 4413 | if (new->entries[i].threshold <= usage) { |
4427 | /* | 4414 | /* |
4428 | * new->current_threshold will not be used until | 4415 | * new->current_threshold will not be used until |
4429 | * rcu_assign_pointer(), so it's safe to increment | 4416 | * rcu_assign_pointer(), so it's safe to increment |
4430 | * it here. | 4417 | * it here. |
4431 | */ | 4418 | */ |
4432 | ++new->current_threshold; | 4419 | ++new->current_threshold; |
4433 | } | 4420 | } else |
4421 | break; | ||
4434 | } | 4422 | } |
4435 | 4423 | ||
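Changing the comparison from "<" to "<=" makes current_threshold index the largest registered threshold that usage has already reached, so a threshold exactly equal to the current usage counts as crossed. A small, self-contained worked example of that indexing (the values are made up):

#include <stdio.h>

int main(void)
{
	unsigned long long thresholds[] = { 4 << 20, 8 << 20, 16 << 20 };
	unsigned long long usage = 8 << 20;	/* exactly on a threshold */
	int i, current_threshold = -1;

	for (i = 0; i < 3; i++) {
		if (thresholds[i] <= usage)
			++current_threshold;
		else
			break;
	}
	/* Prints 1: the 8M threshold counts as already crossed. */
	printf("current_threshold = %d\n", current_threshold);
	return 0;
}

With the old "<" test the same scan stops counting at index 0, leaving the 8M threshold treated as still pending even though usage already sits on it.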
4436 | /* Free old spare buffer and save old primary buffer as spare */ | 4424 | /* Free old spare buffer and save old primary buffer as spare */ |
@@ -4499,7 +4487,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4499 | continue; | 4487 | continue; |
4500 | 4488 | ||
4501 | new->entries[j] = thresholds->primary->entries[i]; | 4489 | new->entries[j] = thresholds->primary->entries[i]; |
4502 | if (new->entries[j].threshold < usage) { | 4490 | if (new->entries[j].threshold <= usage) { |
4503 | /* | 4491 | /* |
4504 | * new->current_threshold will not be used | 4492 | * new->current_threshold will not be used |
4505 | * until rcu_assign_pointer(), so it's safe to increment | 4493 | * until rcu_assign_pointer(), so it's safe to increment |
@@ -4513,6 +4501,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4513 | swap_buffers: | 4501 | swap_buffers: |
4514 | /* Swap primary and spare array */ | 4502 | /* Swap primary and spare array */ |
4515 | thresholds->spare = thresholds->primary; | 4503 | thresholds->spare = thresholds->primary; |
4504 | /* If all events are unregistered, free the spare array */ | ||
4505 | if (!new) { | ||
4506 | kfree(thresholds->spare); | ||
4507 | thresholds->spare = NULL; | ||
4508 | } | ||
4509 | |||
4516 | rcu_assign_pointer(thresholds->primary, new); | 4510 | rcu_assign_pointer(thresholds->primary, new); |
4517 | 4511 | ||
4518 | /* To be sure that nobody uses thresholds */ | 4512 | /* To be sure that nobody uses thresholds */ |
@@ -4607,46 +4601,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4607 | return 0; | 4601 | return 0; |
4608 | } | 4602 | } |
4609 | 4603 | ||
4610 | #ifdef CONFIG_NUMA | 4604 | #ifdef CONFIG_MEMCG_KMEM |
4611 | static const struct file_operations mem_control_numa_stat_file_operations = { | 4605 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4612 | .read = seq_read, | ||
4613 | .llseek = seq_lseek, | ||
4614 | .release = single_release, | ||
4615 | }; | ||
4616 | |||
4617 | static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | ||
4618 | { | 4606 | { |
4619 | struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; | 4607 | return mem_cgroup_sockets_init(memcg, ss); |
4620 | |||
4621 | file->f_op = &mem_control_numa_stat_file_operations; | ||
4622 | return single_open(file, mem_control_numa_stat_show, cont); | ||
4623 | } | ||
4624 | #endif /* CONFIG_NUMA */ | ||
4625 | |||
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | ||
4627 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4628 | { | ||
4629 | /* | ||
4630 | * Part of this would be better living in a separate allocation | ||
4631 | * function, leaving us with just the cgroup tree population work. | ||
4632 | * We, however, depend on state such as network's proto_list that | ||
4633 | * is only initialized after cgroup creation. I found the less | ||
4634 | * cumbersome way to deal with it to defer it all to populate time | ||
4635 | */ | ||
4636 | return mem_cgroup_sockets_init(cont, ss); | ||
4637 | }; | 4608 | }; |
4638 | 4609 | ||
4639 | static void kmem_cgroup_destroy(struct cgroup *cont) | 4610 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4640 | { | 4611 | { |
4641 | mem_cgroup_sockets_destroy(cont); | 4612 | mem_cgroup_sockets_destroy(memcg); |
4642 | } | 4613 | } |
4643 | #else | 4614 | #else |
4644 | static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) | 4615 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4645 | { | 4616 | { |
4646 | return 0; | 4617 | return 0; |
4647 | } | 4618 | } |
4648 | 4619 | ||
4649 | static void kmem_cgroup_destroy(struct cgroup *cont) | 4620 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4650 | { | 4621 | { |
4651 | } | 4622 | } |
4652 | #endif | 4623 | #endif |
@@ -4655,7 +4626,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4655 | { | 4626 | { |
4656 | .name = "usage_in_bytes", | 4627 | .name = "usage_in_bytes", |
4657 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4628 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
4658 | .read_u64 = mem_cgroup_read, | 4629 | .read = mem_cgroup_read, |
4659 | .register_event = mem_cgroup_usage_register_event, | 4630 | .register_event = mem_cgroup_usage_register_event, |
4660 | .unregister_event = mem_cgroup_usage_unregister_event, | 4631 | .unregister_event = mem_cgroup_usage_unregister_event, |
4661 | }, | 4632 | }, |
@@ -4663,29 +4634,29 @@ static struct cftype mem_cgroup_files[] = { | |||
4663 | .name = "max_usage_in_bytes", | 4634 | .name = "max_usage_in_bytes", |
4664 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 4635 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
4665 | .trigger = mem_cgroup_reset, | 4636 | .trigger = mem_cgroup_reset, |
4666 | .read_u64 = mem_cgroup_read, | 4637 | .read = mem_cgroup_read, |
4667 | }, | 4638 | }, |
4668 | { | 4639 | { |
4669 | .name = "limit_in_bytes", | 4640 | .name = "limit_in_bytes", |
4670 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 4641 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
4671 | .write_string = mem_cgroup_write, | 4642 | .write_string = mem_cgroup_write, |
4672 | .read_u64 = mem_cgroup_read, | 4643 | .read = mem_cgroup_read, |
4673 | }, | 4644 | }, |
4674 | { | 4645 | { |
4675 | .name = "soft_limit_in_bytes", | 4646 | .name = "soft_limit_in_bytes", |
4676 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 4647 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
4677 | .write_string = mem_cgroup_write, | 4648 | .write_string = mem_cgroup_write, |
4678 | .read_u64 = mem_cgroup_read, | 4649 | .read = mem_cgroup_read, |
4679 | }, | 4650 | }, |
4680 | { | 4651 | { |
4681 | .name = "failcnt", | 4652 | .name = "failcnt", |
4682 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 4653 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
4683 | .trigger = mem_cgroup_reset, | 4654 | .trigger = mem_cgroup_reset, |
4684 | .read_u64 = mem_cgroup_read, | 4655 | .read = mem_cgroup_read, |
4685 | }, | 4656 | }, |
4686 | { | 4657 | { |
4687 | .name = "stat", | 4658 | .name = "stat", |
4688 | .read_map = mem_control_stat_show, | 4659 | .read_seq_string = memcg_stat_show, |
4689 | }, | 4660 | }, |
4690 | { | 4661 | { |
4691 | .name = "force_empty", | 4662 | .name = "force_empty", |
@@ -4717,18 +4688,14 @@ static struct cftype mem_cgroup_files[] = { | |||
4717 | #ifdef CONFIG_NUMA | 4688 | #ifdef CONFIG_NUMA |
4718 | { | 4689 | { |
4719 | .name = "numa_stat", | 4690 | .name = "numa_stat", |
4720 | .open = mem_control_numa_stat_open, | 4691 | .read_seq_string = memcg_numa_stat_show, |
4721 | .mode = S_IRUGO, | ||
4722 | }, | 4692 | }, |
4723 | #endif | 4693 | #endif |
4724 | }; | 4694 | #ifdef CONFIG_MEMCG_SWAP |
4725 | |||
4726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
4727 | static struct cftype memsw_cgroup_files[] = { | ||
4728 | { | 4695 | { |
4729 | .name = "memsw.usage_in_bytes", | 4696 | .name = "memsw.usage_in_bytes", |
4730 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4697 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
4731 | .read_u64 = mem_cgroup_read, | 4698 | .read = mem_cgroup_read, |
4732 | .register_event = mem_cgroup_usage_register_event, | 4699 | .register_event = mem_cgroup_usage_register_event, |
4733 | .unregister_event = mem_cgroup_usage_unregister_event, | 4700 | .unregister_event = mem_cgroup_usage_unregister_event, |
4734 | }, | 4701 | }, |
@@ -4736,41 +4703,28 @@ static struct cftype memsw_cgroup_files[] = { | |||
4736 | .name = "memsw.max_usage_in_bytes", | 4703 | .name = "memsw.max_usage_in_bytes", |
4737 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 4704 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
4738 | .trigger = mem_cgroup_reset, | 4705 | .trigger = mem_cgroup_reset, |
4739 | .read_u64 = mem_cgroup_read, | 4706 | .read = mem_cgroup_read, |
4740 | }, | 4707 | }, |
4741 | { | 4708 | { |
4742 | .name = "memsw.limit_in_bytes", | 4709 | .name = "memsw.limit_in_bytes", |
4743 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 4710 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
4744 | .write_string = mem_cgroup_write, | 4711 | .write_string = mem_cgroup_write, |
4745 | .read_u64 = mem_cgroup_read, | 4712 | .read = mem_cgroup_read, |
4746 | }, | 4713 | }, |
4747 | { | 4714 | { |
4748 | .name = "memsw.failcnt", | 4715 | .name = "memsw.failcnt", |
4749 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 4716 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
4750 | .trigger = mem_cgroup_reset, | 4717 | .trigger = mem_cgroup_reset, |
4751 | .read_u64 = mem_cgroup_read, | 4718 | .read = mem_cgroup_read, |
4752 | }, | 4719 | }, |
4753 | }; | ||
4754 | |||
4755 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4756 | { | ||
4757 | if (!do_swap_account) | ||
4758 | return 0; | ||
4759 | return cgroup_add_files(cont, ss, memsw_cgroup_files, | ||
4760 | ARRAY_SIZE(memsw_cgroup_files)); | ||
4761 | }; | ||
4762 | #else | ||
4763 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
4764 | { | ||
4765 | return 0; | ||
4766 | } | ||
4767 | #endif | 4720 | #endif |
4721 | { }, /* terminate */ | ||
4722 | }; | ||
4768 | 4723 | ||
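With the per-type registration helpers gone, all control files live in one cftype array: the memsw.* entries are only compiled in under CONFIG_MEMCG_SWAP, the run-time do_swap_account check has moved into the handlers as the -EOPNOTSUPP returns earlier in this diff, and the empty "{ }" entry terminates the array so the cgroup core can walk it without a separate count. The shape of such a table in miniature, using identifiers from this file; the registration hook that consumes it sits outside this hunk:

#include <linux/cgroup.h>

static struct cftype demo_memcg_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read = mem_cgroup_read,
	},
#ifdef CONFIG_MEMCG_SWAP
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read = mem_cgroup_read,
	},
#endif
	{ },	/* empty entry terminates the array */
};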
4769 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4724 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4770 | { | 4725 | { |
4771 | struct mem_cgroup_per_node *pn; | 4726 | struct mem_cgroup_per_node *pn; |
4772 | struct mem_cgroup_per_zone *mz; | 4727 | struct mem_cgroup_per_zone *mz; |
4773 | enum lru_list lru; | ||
4774 | int zone, tmp = node; | 4728 | int zone, tmp = node; |
4775 | /* | 4729 | /* |
4776 | * This routine is called against possible nodes. | 4730 | * This routine is called against possible nodes. |
@@ -4788,8 +4742,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4788 | 4742 | ||
4789 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4743 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4790 | mz = &pn->zoneinfo[zone]; | 4744 | mz = &pn->zoneinfo[zone]; |
4791 | for_each_lru(lru) | 4745 | lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); |
4792 | INIT_LIST_HEAD(&mz->lruvec.lists[lru]); | ||
4793 | mz->usage_in_excess = 0; | 4746 | mz->usage_in_excess = 0; |
4794 | mz->on_tree = false; | 4747 | mz->on_tree = false; |
4795 | mz->memcg = memcg; | 4748 | mz->memcg = memcg; |
@@ -4832,23 +4785,40 @@ out_free: | |||
4832 | } | 4785 | } |
4833 | 4786 | ||
4834 | /* | 4787 | /* |
4835 | * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, | 4788 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, |
4836 | * but in process context. The work_freeing structure is overlaid | 4789 | * but in process context. The work_freeing structure is overlaid |
4837 | * on the rcu_freeing structure, which itself is overlaid on memsw. | 4790 | * on the rcu_freeing structure, which itself is overlaid on memsw. |
4838 | */ | 4791 | */ |
4839 | static void vfree_work(struct work_struct *work) | 4792 | static void free_work(struct work_struct *work) |
4840 | { | 4793 | { |
4841 | struct mem_cgroup *memcg; | 4794 | struct mem_cgroup *memcg; |
4795 | int size = sizeof(struct mem_cgroup); | ||
4842 | 4796 | ||
4843 | memcg = container_of(work, struct mem_cgroup, work_freeing); | 4797 | memcg = container_of(work, struct mem_cgroup, work_freeing); |
4844 | vfree(memcg); | 4798 | /* |
4799 | * We need to make sure that (at least for now), the jump label | ||
4800 | * destruction code runs outside of the cgroup lock. This is because | ||
4801 | * get_online_cpus(), which is called from the static_branch update, | ||
4802 | * can't be called inside the cgroup_lock. cpusets are the ones | ||
4803 | * enforcing this dependency, so if they ever change, we might as well. | ||
4804 | * | ||
4805 | * schedule_work() will guarantee this happens. Be careful if you need | ||
4806 | * to move this code around, and make sure it is outside | ||
4807 | * the cgroup_lock. | ||
4808 | */ | ||
4809 | disarm_sock_keys(memcg); | ||
4810 | if (size < PAGE_SIZE) | ||
4811 | kfree(memcg); | ||
4812 | else | ||
4813 | vfree(memcg); | ||
4845 | } | 4814 | } |
4846 | static void vfree_rcu(struct rcu_head *rcu_head) | 4815 | |
4816 | static void free_rcu(struct rcu_head *rcu_head) | ||
4847 | { | 4817 | { |
4848 | struct mem_cgroup *memcg; | 4818 | struct mem_cgroup *memcg; |
4849 | 4819 | ||
4850 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | 4820 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); |
4851 | INIT_WORK(&memcg->work_freeing, vfree_work); | 4821 | INIT_WORK(&memcg->work_freeing, free_work); |
4852 | schedule_work(&memcg->work_freeing); | 4822 | schedule_work(&memcg->work_freeing); |
4853 | } | 4823 | } |
4854 | 4824 | ||
@@ -4874,10 +4844,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4874 | free_mem_cgroup_per_zone_info(memcg, node); | 4844 | free_mem_cgroup_per_zone_info(memcg, node); |
4875 | 4845 | ||
4876 | free_percpu(memcg->stat); | 4846 | free_percpu(memcg->stat); |
4877 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4847 | call_rcu(&memcg->rcu_freeing, free_rcu); |
4878 | kfree_rcu(memcg, rcu_freeing); | ||
4879 | else | ||
4880 | call_rcu(&memcg->rcu_freeing, vfree_rcu); | ||
4881 | } | 4848 | } |
4882 | 4849 | ||
4883 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 4850 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
@@ -4911,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4911 | } | 4878 | } |
4912 | EXPORT_SYMBOL(parent_mem_cgroup); | 4879 | EXPORT_SYMBOL(parent_mem_cgroup); |
4913 | 4880 | ||
4914 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4881 | #ifdef CONFIG_MEMCG_SWAP |
4915 | static void __init enable_swap_cgroup(void) | 4882 | static void __init enable_swap_cgroup(void) |
4916 | { | 4883 | { |
4917 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4884 | if (!mem_cgroup_disabled() && really_do_swap_account) |
@@ -5016,6 +4983,17 @@ mem_cgroup_create(struct cgroup *cont) | |||
5016 | memcg->move_charge_at_immigrate = 0; | 4983 | memcg->move_charge_at_immigrate = 0; |
5017 | mutex_init(&memcg->thresholds_lock); | 4984 | mutex_init(&memcg->thresholds_lock); |
5018 | spin_lock_init(&memcg->move_lock); | 4985 | spin_lock_init(&memcg->move_lock); |
4986 | |||
4987 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | ||
4988 | if (error) { | ||
4989 | /* | ||
4990 | * We call put now because our (and parent's) refcnts | ||
4991 | * are already in place. mem_cgroup_put() will internally | ||
4992 | * call __mem_cgroup_free, so return directly | ||
4993 | */ | ||
4994 | mem_cgroup_put(memcg); | ||
4995 | return ERR_PTR(error); | ||
4996 | } | ||
5019 | return &memcg->css; | 4997 | return &memcg->css; |
5020 | free_out: | 4998 | free_out: |
5021 | __mem_cgroup_free(memcg); | 4999 | __mem_cgroup_free(memcg); |
@@ -5033,28 +5011,11 @@ static void mem_cgroup_destroy(struct cgroup *cont) | |||
5033 | { | 5011 | { |
5034 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5012 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5035 | 5013 | ||
5036 | kmem_cgroup_destroy(cont); | 5014 | kmem_cgroup_destroy(memcg); |
5037 | 5015 | ||
5038 | mem_cgroup_put(memcg); | 5016 | mem_cgroup_put(memcg); |
5039 | } | 5017 | } |
5040 | 5018 | ||
5041 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | ||
5042 | struct cgroup *cont) | ||
5043 | { | ||
5044 | int ret; | ||
5045 | |||
5046 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, | ||
5047 | ARRAY_SIZE(mem_cgroup_files)); | ||
5048 | |||
5049 | if (!ret) | ||
5050 | ret = register_memsw_files(cont, ss); | ||
5051 | |||
5052 | if (!ret) | ||
5053 | ret = register_kmem_files(cont, ss); | ||
5054 | |||
5055 | return ret; | ||
5056 | } | ||
5057 | |||
5058 | #ifdef CONFIG_MMU | 5019 | #ifdef CONFIG_MMU |
5059 | /* Handlers for move charge at task migration. */ | 5020 | /* Handlers for move charge at task migration. */ |
5060 | #define PRECHARGE_COUNT_AT_ONCE 256 | 5021 | #define PRECHARGE_COUNT_AT_ONCE 256 |
@@ -5147,7 +5108,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5147 | return NULL; | 5108 | return NULL; |
5148 | if (PageAnon(page)) { | 5109 | if (PageAnon(page)) { |
5149 | /* we don't move shared anon */ | 5110 | /* we don't move shared anon */ |
5150 | if (!move_anon() || page_mapcount(page) > 2) | 5111 | if (!move_anon()) |
5151 | return NULL; | 5112 | return NULL; |
5152 | } else if (!move_file()) | 5113 | } else if (!move_file()) |
5153 | /* we ignore mapcount for file pages */ | 5114 | /* we ignore mapcount for file pages */ |
@@ -5158,32 +5119,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5158 | return page; | 5119 | return page; |
5159 | } | 5120 | } |
5160 | 5121 | ||
5122 | #ifdef CONFIG_SWAP | ||
5161 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 5123 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
5162 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5124 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5163 | { | 5125 | { |
5164 | int usage_count; | ||
5165 | struct page *page = NULL; | 5126 | struct page *page = NULL; |
5166 | swp_entry_t ent = pte_to_swp_entry(ptent); | 5127 | swp_entry_t ent = pte_to_swp_entry(ptent); |
5167 | 5128 | ||
5168 | if (!move_anon() || non_swap_entry(ent)) | 5129 | if (!move_anon() || non_swap_entry(ent)) |
5169 | return NULL; | 5130 | return NULL; |
5170 | usage_count = mem_cgroup_count_swap_user(ent, &page); | 5131 | /* |
5171 | if (usage_count > 1) { /* we don't move shared anon */ | 5132 | * Because lookup_swap_cache() updates some statistics counter, |
5172 | if (page) | 5133 | * we call find_get_page() with swapper_space directly. |
5173 | put_page(page); | 5134 | */ |
5174 | return NULL; | 5135 | page = find_get_page(&swapper_space, ent.val); |
5175 | } | ||
5176 | if (do_swap_account) | 5136 | if (do_swap_account) |
5177 | entry->val = ent.val; | 5137 | entry->val = ent.val; |
5178 | 5138 | ||
5179 | return page; | 5139 | return page; |
5180 | } | 5140 | } |
5141 | #else | ||
5142 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
5143 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
5144 | { | ||
5145 | return NULL; | ||
5146 | } | ||
5147 | #endif | ||
5181 | 5148 | ||
5182 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 5149 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, |
5183 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5150 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5184 | { | 5151 | { |
5185 | struct page *page = NULL; | 5152 | struct page *page = NULL; |
5186 | struct inode *inode; | ||
5187 | struct address_space *mapping; | 5153 | struct address_space *mapping; |
5188 | pgoff_t pgoff; | 5154 | pgoff_t pgoff; |
5189 | 5155 | ||
@@ -5192,7 +5158,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5192 | if (!move_file()) | 5158 | if (!move_file()) |
5193 | return NULL; | 5159 | return NULL; |
5194 | 5160 | ||
5195 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
5196 | mapping = vma->vm_file->f_mapping; | 5161 | mapping = vma->vm_file->f_mapping; |
5197 | if (pte_none(ptent)) | 5162 | if (pte_none(ptent)) |
5198 | pgoff = linear_page_index(vma, addr); | 5163 | pgoff = linear_page_index(vma, addr); |
@@ -5481,7 +5446,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5481 | * part of thp split is not executed yet. | 5446 | * part of thp split is not executed yet. |
5482 | */ | 5447 | */ |
5483 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | 5448 | if (pmd_trans_huge_lock(pmd, vma) == 1) { |
5484 | if (!mc.precharge) { | 5449 | if (mc.precharge < HPAGE_PMD_NR) { |
5485 | spin_unlock(&vma->vm_mm->page_table_lock); | 5450 | spin_unlock(&vma->vm_mm->page_table_lock); |
5486 | return 0; | 5451 | return 0; |
5487 | } | 5452 | } |
@@ -5491,8 +5456,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5491 | if (!isolate_lru_page(page)) { | 5456 | if (!isolate_lru_page(page)) { |
5492 | pc = lookup_page_cgroup(page); | 5457 | pc = lookup_page_cgroup(page); |
5493 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 5458 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, |
5494 | pc, mc.from, mc.to, | 5459 | pc, mc.from, mc.to)) { |
5495 | false)) { | ||
5496 | mc.precharge -= HPAGE_PMD_NR; | 5460 | mc.precharge -= HPAGE_PMD_NR; |
5497 | mc.moved_charge += HPAGE_PMD_NR; | 5461 | mc.moved_charge += HPAGE_PMD_NR; |
5498 | } | 5462 | } |
@@ -5522,7 +5486,7 @@ retry: | |||
5522 | goto put; | 5486 | goto put; |
5523 | pc = lookup_page_cgroup(page); | 5487 | pc = lookup_page_cgroup(page); |
5524 | if (!mem_cgroup_move_account(page, 1, pc, | 5488 | if (!mem_cgroup_move_account(page, 1, pc, |
5525 | mc.from, mc.to, false)) { | 5489 | mc.from, mc.to)) { |
5526 | mc.precharge--; | 5490 | mc.precharge--; |
5527 | /* we uncharge from mc.from later. */ | 5491 | /* we uncharge from mc.from later. */ |
5528 | mc.moved_charge++; | 5492 | mc.moved_charge++; |
@@ -5533,8 +5497,7 @@ put: /* get_mctgt_type() gets the page */ | |||
5533 | break; | 5497 | break; |
5534 | case MC_TARGET_SWAP: | 5498 | case MC_TARGET_SWAP: |
5535 | ent = target.ent; | 5499 | ent = target.ent; |
5536 | if (!mem_cgroup_move_swap_account(ent, | 5500 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { |
5537 | mc.from, mc.to, false)) { | ||
5538 | mc.precharge--; | 5501 | mc.precharge--; |
5539 | /* we fixup refcnts and charges later. */ | 5502 | /* we fixup refcnts and charges later. */ |
5540 | mc.moved_swap++; | 5503 | mc.moved_swap++; |
@@ -5610,7 +5573,6 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
5610 | if (mm) { | 5573 | if (mm) { |
5611 | if (mc.to) | 5574 | if (mc.to) |
5612 | mem_cgroup_move_charge(mm); | 5575 | mem_cgroup_move_charge(mm); |
5613 | put_swap_token(mm); | ||
5614 | mmput(mm); | 5576 | mmput(mm); |
5615 | } | 5577 | } |
5616 | if (mc.to) | 5578 | if (mc.to) |
@@ -5638,15 +5600,16 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5638 | .create = mem_cgroup_create, | 5600 | .create = mem_cgroup_create, |
5639 | .pre_destroy = mem_cgroup_pre_destroy, | 5601 | .pre_destroy = mem_cgroup_pre_destroy, |
5640 | .destroy = mem_cgroup_destroy, | 5602 | .destroy = mem_cgroup_destroy, |
5641 | .populate = mem_cgroup_populate, | ||
5642 | .can_attach = mem_cgroup_can_attach, | 5603 | .can_attach = mem_cgroup_can_attach, |
5643 | .cancel_attach = mem_cgroup_cancel_attach, | 5604 | .cancel_attach = mem_cgroup_cancel_attach, |
5644 | .attach = mem_cgroup_move_task, | 5605 | .attach = mem_cgroup_move_task, |
5606 | .base_cftypes = mem_cgroup_files, | ||
5645 | .early_init = 0, | 5607 | .early_init = 0, |
5646 | .use_id = 1, | 5608 | .use_id = 1, |
5609 | .__DEPRECATED_clear_css_refs = true, | ||
5647 | }; | 5610 | }; |
5648 | 5611 | ||
5649 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5612 | #ifdef CONFIG_MEMCG_SWAP |
5650 | static int __init enable_swap_account(char *s) | 5613 | static int __init enable_swap_account(char *s) |
5651 | { | 5614 | { |
5652 | /* consider enabled if no parameter or 1 is given */ | 5615 | /* consider enabled if no parameter or 1 is given */ |
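
The mm/memcontrol.c hunks above collapse the old kfree_rcu()/vfree_rcu() split into a single free_rcu() callback that defers the actual release to a workqueue, because vfree() may sleep while RCU callbacks run in atomic context. A minimal sketch of that rcu -> work -> kfree()/vfree() chain, using an illustrative struct big_object rather than the real struct mem_cgroup:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

/* Illustrative object; the real code frees struct mem_cgroup this way. */
struct big_object {
        struct rcu_head rcu_freeing;
        struct work_struct work_freeing;
};

static void big_object_free_work(struct work_struct *work)
{
        struct big_object *obj =
                container_of(work, struct big_object, work_freeing);

        /* May sleep here: small objects came from kmalloc, big ones from vmalloc. */
        if (sizeof(*obj) < PAGE_SIZE)
                kfree(obj);
        else
                vfree(obj);
}

static void big_object_free_rcu(struct rcu_head *rcu_head)
{
        struct big_object *obj =
                container_of(rcu_head, struct big_object, rcu_freeing);

        /* RCU callbacks run in atomic context, so defer the real free. */
        INIT_WORK(&obj->work_freeing, big_object_free_work);
        schedule_work(&obj->work_freeing);
}

static void big_object_release(struct big_object *obj)
{
        /* Wait one grace period, then free via the work item above. */
        call_rcu(&obj->rcu_freeing, big_object_free_rcu);
}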
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97cc2733551a..a6e2141a6610 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) | |||
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
345 | * Also when FAIL is set do a force kill because something went | 345 | * Also when FAIL is set do a force kill because something went |
346 | * wrong earlier. | 346 | * wrong earlier. |
347 | */ | 347 | */ |
348 | static void kill_procs(struct list_head *to_kill, int doit, int trapno, | 348 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, |
349 | int fail, struct page *page, unsigned long pfn, | 349 | int fail, struct page *page, unsigned long pfn, |
350 | int flags) | 350 | int flags) |
351 | { | 351 | { |
352 | struct to_kill *tk, *next; | 352 | struct to_kill *tk, *next; |
353 | 353 | ||
354 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 354 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
355 | if (doit) { | 355 | if (forcekill) { |
356 | /* | 356 | /* |
357 | * In case something went wrong with munmapping | 357 | * In case something went wrong with munmapping |
358 | * make sure the process doesn't catch the | 358 | * make sure the process doesn't catch the |
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
858 | struct address_space *mapping; | 858 | struct address_space *mapping; |
859 | LIST_HEAD(tokill); | 859 | LIST_HEAD(tokill); |
860 | int ret; | 860 | int ret; |
861 | int kill = 1; | 861 | int kill = 1, forcekill; |
862 | struct page *hpage = compound_head(p); | 862 | struct page *hpage = compound_head(p); |
863 | struct page *ppage; | 863 | struct page *ppage; |
864 | 864 | ||
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
888 | * be called inside page lock (it's recommended but not enforced). | 888 | * be called inside page lock (it's recommended but not enforced). |
889 | */ | 889 | */ |
890 | mapping = page_mapping(hpage); | 890 | mapping = page_mapping(hpage); |
891 | if (!PageDirty(hpage) && mapping && | 891 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && |
892 | mapping_cap_writeback_dirty(mapping)) { | 892 | mapping_cap_writeback_dirty(mapping)) { |
893 | if (page_mkclean(hpage)) { | 893 | if (page_mkclean(hpage)) { |
894 | SetPageDirty(hpage); | 894 | SetPageDirty(hpage); |
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
965 | * Now that the dirty bit has been propagated to the | 965 | * Now that the dirty bit has been propagated to the |
966 | * struct page and all unmaps done we can decide if | 966 | * struct page and all unmaps done we can decide if |
967 | * killing is needed or not. Only kill when the page | 967 | * killing is needed or not. Only kill when the page |
968 | * was dirty, otherwise the tokill list is merely | 968 | * was dirty or the process is not restartable, |
969 | * otherwise the tokill list is merely | ||
969 | * freed. When there was a problem unmapping earlier | 970 | * freed. When there was a problem unmapping earlier |
970 | * use a more force-full uncatchable kill to prevent | 971 | * use a more force-full uncatchable kill to prevent |
971 | * any accesses to the poisoned memory. | 972 | * any accesses to the poisoned memory. |
972 | */ | 973 | */ |
973 | kill_procs(&tokill, !!PageDirty(ppage), trapno, | 974 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); |
975 | kill_procs(&tokill, forcekill, trapno, | ||
974 | ret != SWAP_SUCCESS, p, pfn, flags); | 976 | ret != SWAP_SUCCESS, p, pfn, flags); |
975 | 977 | ||
976 | return ret; | 978 | return ret; |
@@ -1388,23 +1390,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1388 | */ | 1390 | */ |
1389 | if (!get_page_unless_zero(compound_head(p))) { | 1391 | if (!get_page_unless_zero(compound_head(p))) { |
1390 | if (PageHuge(p)) { | 1392 | if (PageHuge(p)) { |
1391 | pr_info("get_any_page: %#lx free huge page\n", pfn); | 1393 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1392 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1394 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); |
1393 | } else if (is_free_buddy_page(p)) { | 1395 | } else if (is_free_buddy_page(p)) { |
1394 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | 1396 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1395 | /* Set hwpoison bit while page is still isolated */ | 1397 | /* Set hwpoison bit while page is still isolated */ |
1396 | SetPageHWPoison(p); | 1398 | SetPageHWPoison(p); |
1397 | ret = 0; | 1399 | ret = 0; |
1398 | } else { | 1400 | } else { |
1399 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1401 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
1400 | pfn, p->flags); | 1402 | __func__, pfn, p->flags); |
1401 | ret = -EIO; | 1403 | ret = -EIO; |
1402 | } | 1404 | } |
1403 | } else { | 1405 | } else { |
1404 | /* Not a free page */ | 1406 | /* Not a free page */ |
1405 | ret = 1; | 1407 | ret = 1; |
1406 | } | 1408 | } |
1407 | unset_migratetype_isolate(p); | 1409 | unset_migratetype_isolate(p, MIGRATE_MOVABLE); |
1408 | unlock_memory_hotplug(); | 1410 | unlock_memory_hotplug(); |
1409 | return ret; | 1411 | return ret; |
1410 | } | 1412 | } |
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1414 | int ret; | 1416 | int ret; |
1415 | unsigned long pfn = page_to_pfn(page); | 1417 | unsigned long pfn = page_to_pfn(page); |
1416 | struct page *hpage = compound_head(page); | 1418 | struct page *hpage = compound_head(page); |
1417 | LIST_HEAD(pagelist); | ||
1418 | 1419 | ||
1419 | ret = get_any_page(page, pfn, flags); | 1420 | ret = get_any_page(page, pfn, flags); |
1420 | if (ret < 0) | 1421 | if (ret < 0) |
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1429 | } | 1430 | } |
1430 | 1431 | ||
1431 | /* Keep page count to indicate a given hugepage is isolated. */ | 1432 | /* Keep page count to indicate a given hugepage is isolated. */ |
1432 | 1433 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | |
1433 | list_add(&hpage->lru, &pagelist); | 1434 | MIGRATE_SYNC); |
1434 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, | 1435 | put_page(hpage); |
1435 | true); | ||
1436 | if (ret) { | 1436 | if (ret) { |
1437 | struct page *page1, *page2; | ||
1438 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1439 | put_page(page1); | ||
1440 | |||
1441 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1442 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1443 | if (ret > 0) | ||
1444 | ret = -EIO; | ||
1445 | return ret; | 1439 | return ret; |
1446 | } | 1440 | } |
1447 | done: | 1441 | done: |
1448 | if (!PageHWPoison(hpage)) | 1442 | if (!PageHWPoison(hpage)) |
1449 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); | 1443 | atomic_long_add(1 << compound_trans_order(hpage), |
1444 | &mce_bad_pages); | ||
1450 | set_page_hwpoison_huge_page(hpage); | 1445 | set_page_hwpoison_huge_page(hpage); |
1451 | dequeue_hwpoisoned_huge_page(hpage); | 1446 | dequeue_hwpoisoned_huge_page(hpage); |
1452 | /* keep elevated page count for bad page */ | 1447 | /* keep elevated page count for bad page */ |
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1561 | page_is_file_cache(page)); | 1556 | page_is_file_cache(page)); |
1562 | list_add(&page->lru, &pagelist); | 1557 | list_add(&page->lru, &pagelist); |
1563 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1558 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1564 | 0, MIGRATE_SYNC); | 1559 | false, MIGRATE_SYNC); |
1565 | if (ret) { | 1560 | if (ret) { |
1566 | putback_lru_pages(&pagelist); | 1561 | putback_lru_pages(&pagelist); |
1567 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1562 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
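
The memory-failure hunks add an MF_MUST_KILL flag and rename the ambiguous "doit" parameter to "forcekill". The decision itself reduces to one line; the helper below is purely illustrative (it does not exist in the kernel) and only restates how the patched hwpoison_user_mappings() now decides to force-kill:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Illustrative helper: force-kill when the page was dirty or the caller insists. */
static bool hwpoison_should_forcekill(struct page *ppage, int flags)
{
        return PageDirty(ppage) || (flags & MF_MUST_KILL);
}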
diff --git a/mm/memory.c b/mm/memory.c index 6105f475fa86..57361708d1a5 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | |||
206 | tlb->mm = mm; | 206 | tlb->mm = mm; |
207 | 207 | ||
208 | tlb->fullmm = fullmm; | 208 | tlb->fullmm = fullmm; |
209 | tlb->start = -1UL; | ||
210 | tlb->end = 0; | ||
209 | tlb->need_flush = 0; | 211 | tlb->need_flush = 0; |
210 | tlb->fast_mode = (num_possible_cpus() == 1); | 212 | tlb->fast_mode = (num_possible_cpus() == 1); |
211 | tlb->local.next = NULL; | 213 | tlb->local.next = NULL; |
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e | |||
248 | { | 250 | { |
249 | struct mmu_gather_batch *batch, *next; | 251 | struct mmu_gather_batch *batch, *next; |
250 | 252 | ||
253 | tlb->start = start; | ||
254 | tlb->end = end; | ||
251 | tlb_flush_mmu(tlb); | 255 | tlb_flush_mmu(tlb); |
252 | 256 | ||
253 | /* keep the page table cache within bounds */ | 257 | /* keep the page table cache within bounds */ |
@@ -1204,6 +1208,11 @@ again: | |||
1204 | */ | 1208 | */ |
1205 | if (force_flush) { | 1209 | if (force_flush) { |
1206 | force_flush = 0; | 1210 | force_flush = 0; |
1211 | |||
1212 | #ifdef HAVE_GENERIC_MMU_GATHER | ||
1213 | tlb->start = addr; | ||
1214 | tlb->end = end; | ||
1215 | #endif | ||
1207 | tlb_flush_mmu(tlb); | 1216 | tlb_flush_mmu(tlb); |
1208 | if (addr != end) | 1217 | if (addr != end) |
1209 | goto again; | 1218 | goto again; |
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1225 | next = pmd_addr_end(addr, end); | 1234 | next = pmd_addr_end(addr, end); |
1226 | if (pmd_trans_huge(*pmd)) { | 1235 | if (pmd_trans_huge(*pmd)) { |
1227 | if (next - addr != HPAGE_PMD_SIZE) { | 1236 | if (next - addr != HPAGE_PMD_SIZE) { |
1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1237 | #ifdef CONFIG_DEBUG_VM |
1238 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | ||
1239 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | ||
1240 | __func__, addr, end, | ||
1241 | vma->vm_start, | ||
1242 | vma->vm_end); | ||
1243 | BUG(); | ||
1244 | } | ||
1245 | #endif | ||
1229 | split_huge_page_pmd(vma->vm_mm, pmd); | 1246 | split_huge_page_pmd(vma->vm_mm, pmd); |
1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1247 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1231 | goto next; | 1248 | goto next; |
@@ -1295,7 +1312,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
1295 | 1312 | ||
1296 | static void unmap_single_vma(struct mmu_gather *tlb, | 1313 | static void unmap_single_vma(struct mmu_gather *tlb, |
1297 | struct vm_area_struct *vma, unsigned long start_addr, | 1314 | struct vm_area_struct *vma, unsigned long start_addr, |
1298 | unsigned long end_addr, unsigned long *nr_accounted, | 1315 | unsigned long end_addr, |
1299 | struct zap_details *details) | 1316 | struct zap_details *details) |
1300 | { | 1317 | { |
1301 | unsigned long start = max(vma->vm_start, start_addr); | 1318 | unsigned long start = max(vma->vm_start, start_addr); |
@@ -1307,8 +1324,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1307 | if (end <= vma->vm_start) | 1324 | if (end <= vma->vm_start) |
1308 | return; | 1325 | return; |
1309 | 1326 | ||
1310 | if (vma->vm_flags & VM_ACCOUNT) | 1327 | if (vma->vm_file) |
1311 | *nr_accounted += (end - start) >> PAGE_SHIFT; | 1328 | uprobe_munmap(vma, start, end); |
1312 | 1329 | ||
1313 | if (unlikely(is_pfn_mapping(vma))) | 1330 | if (unlikely(is_pfn_mapping(vma))) |
1314 | untrack_pfn_vma(vma, 0, 0); | 1331 | untrack_pfn_vma(vma, 0, 0); |
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1326 | * Since no pte has actually been setup, it is | 1343 | * Since no pte has actually been setup, it is |
1327 | * safe to do nothing in this case. | 1344 | * safe to do nothing in this case. |
1328 | */ | 1345 | */ |
1329 | if (vma->vm_file) | 1346 | if (vma->vm_file) { |
1330 | unmap_hugepage_range(vma, start, end, NULL); | 1347 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1348 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | ||
1349 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
1350 | } | ||
1331 | } else | 1351 | } else |
1332 | unmap_page_range(tlb, vma, start, end, details); | 1352 | unmap_page_range(tlb, vma, start, end, details); |
1333 | } | 1353 | } |
@@ -1339,8 +1359,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1339 | * @vma: the starting vma | 1359 | * @vma: the starting vma |
1340 | * @start_addr: virtual address at which to start unmapping | 1360 | * @start_addr: virtual address at which to start unmapping |
1341 | * @end_addr: virtual address at which to end unmapping | 1361 | * @end_addr: virtual address at which to end unmapping |
1342 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here | ||
1343 | * @details: details of nonlinear truncation or shared cache invalidation | ||
1344 | * | 1362 | * |
1345 | * Unmap all pages in the vma list. | 1363 | * Unmap all pages in the vma list. |
1346 | * | 1364 | * |
@@ -1355,40 +1373,40 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1355 | */ | 1373 | */ |
1356 | void unmap_vmas(struct mmu_gather *tlb, | 1374 | void unmap_vmas(struct mmu_gather *tlb, |
1357 | struct vm_area_struct *vma, unsigned long start_addr, | 1375 | struct vm_area_struct *vma, unsigned long start_addr, |
1358 | unsigned long end_addr, unsigned long *nr_accounted, | 1376 | unsigned long end_addr) |
1359 | struct zap_details *details) | ||
1360 | { | 1377 | { |
1361 | struct mm_struct *mm = vma->vm_mm; | 1378 | struct mm_struct *mm = vma->vm_mm; |
1362 | 1379 | ||
1363 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1380 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
1364 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) | 1381 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) |
1365 | unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, | 1382 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); |
1366 | details); | ||
1367 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1383 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1368 | } | 1384 | } |
1369 | 1385 | ||
1370 | /** | 1386 | /** |
1371 | * zap_page_range - remove user pages in a given range | 1387 | * zap_page_range - remove user pages in a given range |
1372 | * @vma: vm_area_struct holding the applicable pages | 1388 | * @vma: vm_area_struct holding the applicable pages |
1373 | * @address: starting address of pages to zap | 1389 | * @start: starting address of pages to zap |
1374 | * @size: number of bytes to zap | 1390 | * @size: number of bytes to zap |
1375 | * @details: details of nonlinear truncation or shared cache invalidation | 1391 | * @details: details of nonlinear truncation or shared cache invalidation |
1376 | * | 1392 | * |
1377 | * Caller must protect the VMA list | 1393 | * Caller must protect the VMA list |
1378 | */ | 1394 | */ |
1379 | void zap_page_range(struct vm_area_struct *vma, unsigned long address, | 1395 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, |
1380 | unsigned long size, struct zap_details *details) | 1396 | unsigned long size, struct zap_details *details) |
1381 | { | 1397 | { |
1382 | struct mm_struct *mm = vma->vm_mm; | 1398 | struct mm_struct *mm = vma->vm_mm; |
1383 | struct mmu_gather tlb; | 1399 | struct mmu_gather tlb; |
1384 | unsigned long end = address + size; | 1400 | unsigned long end = start + size; |
1385 | unsigned long nr_accounted = 0; | ||
1386 | 1401 | ||
1387 | lru_add_drain(); | 1402 | lru_add_drain(); |
1388 | tlb_gather_mmu(&tlb, mm, 0); | 1403 | tlb_gather_mmu(&tlb, mm, 0); |
1389 | update_hiwater_rss(mm); | 1404 | update_hiwater_rss(mm); |
1390 | unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 1405 | mmu_notifier_invalidate_range_start(mm, start, end); |
1391 | tlb_finish_mmu(&tlb, address, end); | 1406 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) |
1407 | unmap_single_vma(&tlb, vma, start, end, details); | ||
1408 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1409 | tlb_finish_mmu(&tlb, start, end); | ||
1392 | } | 1410 | } |
1393 | 1411 | ||
1394 | /** | 1412 | /** |
@@ -1406,13 +1424,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr | |||
1406 | struct mm_struct *mm = vma->vm_mm; | 1424 | struct mm_struct *mm = vma->vm_mm; |
1407 | struct mmu_gather tlb; | 1425 | struct mmu_gather tlb; |
1408 | unsigned long end = address + size; | 1426 | unsigned long end = address + size; |
1409 | unsigned long nr_accounted = 0; | ||
1410 | 1427 | ||
1411 | lru_add_drain(); | 1428 | lru_add_drain(); |
1412 | tlb_gather_mmu(&tlb, mm, 0); | 1429 | tlb_gather_mmu(&tlb, mm, 0); |
1413 | update_hiwater_rss(mm); | 1430 | update_hiwater_rss(mm); |
1414 | mmu_notifier_invalidate_range_start(mm, address, end); | 1431 | mmu_notifier_invalidate_range_start(mm, address, end); |
1415 | unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); | 1432 | unmap_single_vma(&tlb, vma, address, end, details); |
1416 | mmu_notifier_invalidate_range_end(mm, address, end); | 1433 | mmu_notifier_invalidate_range_end(mm, address, end); |
1417 | tlb_finish_mmu(&tlb, address, end); | 1434 | tlb_finish_mmu(&tlb, address, end); |
1418 | } | 1435 | } |
@@ -2633,6 +2650,9 @@ reuse: | |||
2633 | if (!page_mkwrite) { | 2650 | if (!page_mkwrite) { |
2634 | wait_on_page_locked(dirty_page); | 2651 | wait_on_page_locked(dirty_page); |
2635 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2652 | set_page_dirty_balance(dirty_page, page_mkwrite); |
2653 | /* file_update_time outside page_lock */ | ||
2654 | if (vma->vm_file) | ||
2655 | file_update_time(vma->vm_file); | ||
2636 | } | 2656 | } |
2637 | put_page(dirty_page); | 2657 | put_page(dirty_page); |
2638 | if (page_mkwrite) { | 2658 | if (page_mkwrite) { |
@@ -2650,10 +2670,6 @@ reuse: | |||
2650 | } | 2670 | } |
2651 | } | 2671 | } |
2652 | 2672 | ||
2653 | /* file_update_time outside page_lock */ | ||
2654 | if (vma->vm_file) | ||
2655 | file_update_time(vma->vm_file); | ||
2656 | |||
2657 | return ret; | 2673 | return ret; |
2658 | } | 2674 | } |
2659 | 2675 | ||
@@ -2911,7 +2927,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2911 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2927 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2912 | page = lookup_swap_cache(entry); | 2928 | page = lookup_swap_cache(entry); |
2913 | if (!page) { | 2929 | if (!page) { |
2914 | grab_swap_token(mm); /* Contend for token _before_ read-in */ | ||
2915 | page = swapin_readahead(entry, | 2930 | page = swapin_readahead(entry, |
2916 | GFP_HIGHUSER_MOVABLE, vma, address); | 2931 | GFP_HIGHUSER_MOVABLE, vma, address); |
2917 | if (!page) { | 2932 | if (!page) { |
@@ -2941,6 +2956,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2941 | } | 2956 | } |
2942 | 2957 | ||
2943 | locked = lock_page_or_retry(page, mm, flags); | 2958 | locked = lock_page_or_retry(page, mm, flags); |
2959 | |||
2944 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2960 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2945 | if (!locked) { | 2961 | if (!locked) { |
2946 | ret |= VM_FAULT_RETRY; | 2962 | ret |= VM_FAULT_RETRY; |
@@ -3322,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3322 | 3338 | ||
3323 | if (dirty_page) { | 3339 | if (dirty_page) { |
3324 | struct address_space *mapping = page->mapping; | 3340 | struct address_space *mapping = page->mapping; |
3341 | int dirtied = 0; | ||
3325 | 3342 | ||
3326 | if (set_page_dirty(dirty_page)) | 3343 | if (set_page_dirty(dirty_page)) |
3327 | page_mkwrite = 1; | 3344 | dirtied = 1; |
3328 | unlock_page(dirty_page); | 3345 | unlock_page(dirty_page); |
3329 | put_page(dirty_page); | 3346 | put_page(dirty_page); |
3330 | if (page_mkwrite && mapping) { | 3347 | if ((dirtied || page_mkwrite) && mapping) { |
3331 | /* | 3348 | /* |
3332 | * Some device drivers do not set page.mapping but still | 3349 | * Some device drivers do not set page.mapping but still |
3333 | * dirty their pages | 3350 | * dirty their pages |
@@ -3336,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3336 | } | 3353 | } |
3337 | 3354 | ||
3338 | /* file_update_time outside page_lock */ | 3355 | /* file_update_time outside page_lock */ |
3339 | if (vma->vm_file) | 3356 | if (vma->vm_file && !page_mkwrite) |
3340 | file_update_time(vma->vm_file); | 3357 | file_update_time(vma->vm_file); |
3341 | } else { | 3358 | } else { |
3342 | unlock_page(vmf.page); | 3359 | unlock_page(vmf.page); |
@@ -3489,6 +3506,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3489 | if (unlikely(is_vm_hugetlb_page(vma))) | 3506 | if (unlikely(is_vm_hugetlb_page(vma))) |
3490 | return hugetlb_fault(mm, vma, address, flags); | 3507 | return hugetlb_fault(mm, vma, address, flags); |
3491 | 3508 | ||
3509 | retry: | ||
3492 | pgd = pgd_offset(mm, address); | 3510 | pgd = pgd_offset(mm, address); |
3493 | pud = pud_alloc(mm, pgd, address); | 3511 | pud = pud_alloc(mm, pgd, address); |
3494 | if (!pud) | 3512 | if (!pud) |
@@ -3502,13 +3520,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3502 | pmd, flags); | 3520 | pmd, flags); |
3503 | } else { | 3521 | } else { |
3504 | pmd_t orig_pmd = *pmd; | 3522 | pmd_t orig_pmd = *pmd; |
3523 | int ret; | ||
3524 | |||
3505 | barrier(); | 3525 | barrier(); |
3506 | if (pmd_trans_huge(orig_pmd)) { | 3526 | if (pmd_trans_huge(orig_pmd)) { |
3507 | if (flags & FAULT_FLAG_WRITE && | 3527 | if (flags & FAULT_FLAG_WRITE && |
3508 | !pmd_write(orig_pmd) && | 3528 | !pmd_write(orig_pmd) && |
3509 | !pmd_trans_splitting(orig_pmd)) | 3529 | !pmd_trans_splitting(orig_pmd)) { |
3510 | return do_huge_pmd_wp_page(mm, vma, address, | 3530 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3511 | pmd, orig_pmd); | 3531 | orig_pmd); |
3532 | /* | ||
3533 | * If COW results in an oom, the huge pmd will | ||
3534 | * have been split, so retry the fault on the | ||
3535 | * pte for a smaller charge. | ||
3536 | */ | ||
3537 | if (unlikely(ret & VM_FAULT_OOM)) | ||
3538 | goto retry; | ||
3539 | return ret; | ||
3540 | } | ||
3512 | return 0; | 3541 | return 0; |
3513 | } | 3542 | } |
3514 | } | 3543 | } |
@@ -3912,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3912 | free_page((unsigned long)buf); | 3941 | free_page((unsigned long)buf); |
3913 | } | 3942 | } |
3914 | } | 3943 | } |
3915 | up_read(¤t->mm->mmap_sem); | 3944 | up_read(&mm->mmap_sem); |
3916 | } | 3945 | } |
3917 | 3946 | ||
3918 | #ifdef CONFIG_PROVE_LOCKING | 3947 | #ifdef CONFIG_PROVE_LOCKING |
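
In the mm/memory.c hunks, zap_page_range() no longer threads a nr_accounted counter through unmap_vmas(); it walks the vma list itself and brackets the walk with the mmu-notifier calls. From a caller's point of view nothing changes, so a driver-style invocation still looks like this minimal sketch (the vma, range and mmap_sem locking are assumed to be handled by the caller):

#include <linux/mm.h>

/*
 * Zap all user pages in [start, start + len) of one mapping.
 * Caller holds mmap_sem and guarantees the range lies inside @vma.
 */
static void drop_user_range(struct vm_area_struct *vma,
                            unsigned long start, unsigned long len)
{
        zap_page_range(vma, start, len, NULL);
}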
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..3ad25f9d1fc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) | |||
74 | res->end = start + size - 1; | 74 | res->end = start + size - 1; |
75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
76 | if (request_resource(&iomem_resource, res) < 0) { | 76 | if (request_resource(&iomem_resource, res) < 0) { |
77 | printk("System RAM resource %llx - %llx cannot be added\n", | 77 | printk("System RAM resource %pR cannot be added\n", res); |
78 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
79 | kfree(res); | 78 | kfree(res); |
80 | res = NULL; | 79 | res = NULL; |
81 | } | 80 | } |
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
502 | online_pages_range); | 501 | online_pages_range); |
503 | if (ret) { | 502 | if (ret) { |
504 | mutex_unlock(&zonelists_mutex); | 503 | mutex_unlock(&zonelists_mutex); |
505 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 504 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
506 | nr_pages, pfn); | 505 | (unsigned long long) pfn << PAGE_SHIFT, |
506 | (((unsigned long long) pfn + nr_pages) | ||
507 | << PAGE_SHIFT) - 1); | ||
507 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 508 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
508 | unlock_memory_hotplug(); | 509 | unlock_memory_hotplug(); |
509 | return ret; | 510 | return ret; |
@@ -511,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
511 | 512 | ||
512 | zone->present_pages += onlined_pages; | 513 | zone->present_pages += onlined_pages; |
513 | zone->zone_pgdat->node_present_pages += onlined_pages; | 514 | zone->zone_pgdat->node_present_pages += onlined_pages; |
514 | if (need_zonelists_rebuild) | 515 | if (onlined_pages) { |
515 | build_all_zonelists(zone); | 516 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
516 | else | 517 | if (need_zonelists_rebuild) |
517 | zone_pcp_update(zone); | 518 | build_all_zonelists(NULL, zone); |
519 | else | ||
520 | zone_pcp_update(zone); | ||
521 | } | ||
518 | 522 | ||
519 | mutex_unlock(&zonelists_mutex); | 523 | mutex_unlock(&zonelists_mutex); |
520 | 524 | ||
521 | init_per_zone_wmark_min(); | 525 | init_per_zone_wmark_min(); |
522 | 526 | ||
523 | if (onlined_pages) { | 527 | if (onlined_pages) |
524 | kswapd_run(zone_to_nid(zone)); | 528 | kswapd_run(zone_to_nid(zone)); |
525 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
526 | } | ||
527 | 529 | ||
528 | vm_total_pages = nr_free_pagecache_pages(); | 530 | vm_total_pages = nr_free_pagecache_pages(); |
529 | 531 | ||
@@ -561,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
561 | * to access not-initialized zonelist, build here. | 563 | * to access not-initialized zonelist, build here. |
562 | */ | 564 | */ |
563 | mutex_lock(&zonelists_mutex); | 565 | mutex_lock(&zonelists_mutex); |
564 | build_all_zonelists(NULL); | 566 | build_all_zonelists(pgdat, NULL); |
565 | mutex_unlock(&zonelists_mutex); | 567 | mutex_unlock(&zonelists_mutex); |
566 | 568 | ||
567 | return pgdat; | 569 | return pgdat; |
@@ -617,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
617 | pgdat = hotadd_new_pgdat(nid, start); | 619 | pgdat = hotadd_new_pgdat(nid, start); |
618 | ret = -ENOMEM; | 620 | ret = -ENOMEM; |
619 | if (!pgdat) | 621 | if (!pgdat) |
620 | goto out; | 622 | goto error; |
621 | new_pgdat = 1; | 623 | new_pgdat = 1; |
622 | } | 624 | } |
623 | 625 | ||
@@ -891,7 +893,7 @@ static int __ref offline_pages(unsigned long start_pfn, | |||
891 | nr_pages = end_pfn - start_pfn; | 893 | nr_pages = end_pfn - start_pfn; |
892 | 894 | ||
893 | /* set above range as isolated */ | 895 | /* set above range as isolated */ |
894 | ret = start_isolate_page_range(start_pfn, end_pfn); | 896 | ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
895 | if (ret) | 897 | if (ret) |
896 | goto out; | 898 | goto out; |
897 | 899 | ||
@@ -956,7 +958,7 @@ repeat: | |||
956 | We cannot do rollback at this point. */ | 958 | We cannot do rollback at this point. */ |
957 | offline_isolated_pages(start_pfn, end_pfn); | 959 | offline_isolated_pages(start_pfn, end_pfn); |
958 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 960 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
959 | undo_isolate_page_range(start_pfn, end_pfn); | 961 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
960 | /* removal success */ | 962 | /* removal success */ |
961 | zone->present_pages -= offlined_pages; | 963 | zone->present_pages -= offlined_pages; |
962 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 964 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
@@ -964,6 +966,9 @@ repeat: | |||
964 | 966 | ||
965 | init_per_zone_wmark_min(); | 967 | init_per_zone_wmark_min(); |
966 | 968 | ||
969 | if (!populated_zone(zone)) | ||
970 | zone_pcp_reset(zone); | ||
971 | |||
967 | if (!node_present_pages(node)) { | 972 | if (!node_present_pages(node)) { |
968 | node_clear_state(node, N_HIGH_MEMORY); | 973 | node_clear_state(node, N_HIGH_MEMORY); |
969 | kswapd_stop(node); | 974 | kswapd_stop(node); |
@@ -977,11 +982,12 @@ repeat: | |||
977 | return 0; | 982 | return 0; |
978 | 983 | ||
979 | failed_removal: | 984 | failed_removal: |
980 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | 985 | printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", |
981 | start_pfn, end_pfn); | 986 | (unsigned long long) start_pfn << PAGE_SHIFT, |
987 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); | ||
982 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 988 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
983 | /* pushback to free area */ | 989 | /* pushback to free area */ |
984 | undo_isolate_page_range(start_pfn, end_pfn); | 990 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
985 | 991 | ||
986 | out: | 992 | out: |
987 | unlock_memory_hotplug(); | 993 | unlock_memory_hotplug(); |
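
The memory-hotplug hunks above pass an explicit migratetype to start_isolate_page_range()/undo_isolate_page_range(). A sketch of the isolate/undo pairing as offline_pages() now uses it; the migration and offlining step in the middle is elided:

#include <linux/mmzone.h>
#include <linux/page-isolation.h>

static int offline_range_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        /* Mark the range isolated so the allocator stays away from it. */
        ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        if (ret)
                return ret;

        /* ... drain per-cpu lists, migrate pages away, offline them ... */

        /* Restore the pageblocks to MIGRATE_MOVABLE when done (or on failure). */
        undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        return 0;
}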
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cfb6c8678754..bd92431d4c49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, | |||
390 | { | 390 | { |
391 | if (!pol) | 391 | if (!pol) |
392 | return; | 392 | return; |
393 | if (!mpol_store_user_nodemask(pol) && step == 0 && | 393 | if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && |
394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
395 | return; | 395 | return; |
396 | 396 | ||
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* Apply policy to a single VMA */ | ||
611 | static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | ||
612 | { | ||
613 | int err = 0; | ||
614 | struct mempolicy *old = vma->vm_policy; | ||
615 | |||
616 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
617 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
618 | vma->vm_ops, vma->vm_file, | ||
619 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
620 | |||
621 | if (vma->vm_ops && vma->vm_ops->set_policy) | ||
622 | err = vma->vm_ops->set_policy(vma, new); | ||
623 | if (!err) { | ||
624 | mpol_get(new); | ||
625 | vma->vm_policy = new; | ||
626 | mpol_put(old); | ||
627 | } | ||
628 | return err; | ||
629 | } | ||
630 | |||
631 | /* Step 2: apply policy to a range and do splits. */ | 610 | /* Step 2: apply policy to a range and do splits. */ |
632 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 611 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
633 | unsigned long end, struct mempolicy *new_pol) | 612 | unsigned long end, struct mempolicy *new_pol) |
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
676 | if (err) | 655 | if (err) |
677 | goto out; | 656 | goto out; |
678 | } | 657 | } |
679 | err = policy_vma(vma, new_pol); | 658 | |
680 | if (err) | 659 | /* |
681 | goto out; | 660 | * Apply policy to a single VMA. The reference counting of |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
682 | } | 675 | } |
683 | 676 | ||
684 | out: | 677 | out: |
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
957 | * | 950 | * |
958 | * Returns the number of page that could not be moved. | 951 | * Returns the number of page that could not be moved. |
959 | */ | 952 | */ |
960 | int do_migrate_pages(struct mm_struct *mm, | 953 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
961 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 954 | const nodemask_t *to, int flags) |
962 | { | 955 | { |
963 | int busy = 0; | 956 | int busy = 0; |
964 | int err; | 957 | int err; |
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
970 | 963 | ||
971 | down_read(&mm->mmap_sem); | 964 | down_read(&mm->mmap_sem); |
972 | 965 | ||
973 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | 966 | err = migrate_vmas(mm, from, to, flags); |
974 | if (err) | 967 | if (err) |
975 | goto out; | 968 | goto out; |
976 | 969 | ||
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm, | |||
1005 | * moved to an empty node, then there is nothing left worth migrating. | 998 | * moved to an empty node, then there is nothing left worth migrating. |
1006 | */ | 999 | */ |
1007 | 1000 | ||
1008 | tmp = *from_nodes; | 1001 | tmp = *from; |
1009 | while (!nodes_empty(tmp)) { | 1002 | while (!nodes_empty(tmp)) { |
1010 | int s,d; | 1003 | int s,d; |
1011 | int source = -1; | 1004 | int source = -1; |
1012 | int dest = 0; | 1005 | int dest = 0; |
1013 | 1006 | ||
1014 | for_each_node_mask(s, tmp) { | 1007 | for_each_node_mask(s, tmp) { |
1015 | d = node_remap(s, *from_nodes, *to_nodes); | 1008 | |
1009 | /* | ||
1010 | * do_migrate_pages() tries to maintain the relative | ||
1011 | * node relationship of the pages established between | ||
1012 | * threads and memory areas. | ||
1013 | * | ||
1014 | * However if the number of source nodes is not equal to | ||
1015 | * the number of destination nodes we can not preserve | ||
1016 | * this node relative relationship. In that case, skip | ||
1017 | * copying memory from a node that is in the destination | ||
1018 | * mask. | ||
1019 | * | ||
1020 | * Example: [2,3,4] -> [3,4,5] moves everything. | ||
1021 | * [0-7] - > [3,4,5] moves only 0,1,2,6,7. | ||
1022 | */ | ||
1023 | |||
1024 | if ((nodes_weight(*from) != nodes_weight(*to)) && | ||
1025 | (node_isset(s, *to))) | ||
1026 | continue; | ||
1027 | |||
1028 | d = node_remap(s, *from, *to); | ||
1016 | if (s == d) | 1029 | if (s == d) |
1017 | continue; | 1030 | continue; |
1018 | 1031 | ||
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
1072 | { | 1085 | { |
1073 | } | 1086 | } |
1074 | 1087 | ||
1075 | int do_migrate_pages(struct mm_struct *mm, | 1088 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
1076 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 1089 | const nodemask_t *to, int flags) |
1077 | { | 1090 | { |
1078 | return -ENOSYS; | 1091 | return -ENOSYS; |
1079 | } | 1092 | } |
@@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1164 | if (!list_empty(&pagelist)) { | 1177 | if (!list_empty(&pagelist)) { |
1165 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1166 | (unsigned long)vma, | 1179 | (unsigned long)vma, |
1167 | false, true); | 1180 | false, MIGRATE_SYNC); |
1168 | if (nr_failed) | 1181 | if (nr_failed) |
1169 | putback_lru_pages(&pagelist); | 1182 | putback_lru_pages(&pagelist); |
1170 | } | 1183 | } |
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1334 | * userid as the target process. | 1347 | * userid as the target process. |
1335 | */ | 1348 | */ |
1336 | tcred = __task_cred(task); | 1349 | tcred = __task_cred(task); |
1337 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1350 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && |
1338 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1351 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && |
1339 | !capable(CAP_SYS_NICE)) { | 1352 | !capable(CAP_SYS_NICE)) { |
1340 | rcu_read_unlock(); | 1353 | rcu_read_unlock(); |
1341 | err = -EPERM; | 1354 | err = -EPERM; |
@@ -1361,11 +1374,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1361 | 1374 | ||
1362 | mm = get_task_mm(task); | 1375 | mm = get_task_mm(task); |
1363 | put_task_struct(task); | 1376 | put_task_struct(task); |
1364 | if (mm) | 1377 | |
1365 | err = do_migrate_pages(mm, old, new, | 1378 | if (!mm) { |
1366 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
1367 | else | ||
1368 | err = -EINVAL; | 1379 | err = -EINVAL; |
1380 | goto out; | ||
1381 | } | ||
1382 | |||
1383 | err = do_migrate_pages(mm, old, new, | ||
1384 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
1369 | 1385 | ||
1370 | mmput(mm); | 1386 | mmput(mm); |
1371 | out: | 1387 | out: |
@@ -1586,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1586 | * task can change it's policy. The system default policy requires no | 1602 | * task can change it's policy. The system default policy requires no |
1587 | * such protection. | 1603 | * such protection. |
1588 | */ | 1604 | */ |
1589 | unsigned slab_node(struct mempolicy *policy) | 1605 | unsigned slab_node(void) |
1590 | { | 1606 | { |
1607 | struct mempolicy *policy; | ||
1608 | |||
1609 | if (in_interrupt()) | ||
1610 | return numa_node_id(); | ||
1611 | |||
1612 | policy = current->mempolicy; | ||
1591 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1613 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1592 | return numa_node_id(); | 1614 | return numa_node_id(); |
1593 | 1615 | ||
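
The do_migrate_pages() hunk explains why, when the source and destination masks differ in weight, nodes that appear in both are skipped. A worked sketch of just that selection step, using the example from the comment ([0-7] -> [3,4,5]); in the real code migrate_to_node() would run where the pr_debug() sits:

#include <linux/kernel.h>
#include <linux/nodemask.h>

static void pick_migration_sources(const nodemask_t *from, const nodemask_t *to)
{
        int s, d;

        for_each_node_mask(s, *from) {
                /*
                 * With from = 0-7 and to = 3-5 the weights differ, so
                 * sources 3, 4 and 5 are skipped: they are also
                 * destinations, and draining them would defeat the move.
                 */
                if ((nodes_weight(*from) != nodes_weight(*to)) &&
                    node_isset(s, *to))
                        continue;

                d = node_remap(s, *from, *to);
                if (s == d)
                        continue;

                pr_debug("would migrate node %d -> node %d\n", s, d);
        }
}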
diff --git a/mm/mempool.c b/mm/mempool.c index d9049811f352..54990476c049 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy); | |||
63 | mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, | 63 | mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, |
64 | mempool_free_t *free_fn, void *pool_data) | 64 | mempool_free_t *free_fn, void *pool_data) |
65 | { | 65 | { |
66 | return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); | 66 | return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, |
67 | GFP_KERNEL, NUMA_NO_NODE); | ||
67 | } | 68 | } |
68 | EXPORT_SYMBOL(mempool_create); | 69 | EXPORT_SYMBOL(mempool_create); |
69 | 70 | ||
70 | mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | 71 | mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, |
71 | mempool_free_t *free_fn, void *pool_data, int node_id) | 72 | mempool_free_t *free_fn, void *pool_data, |
73 | gfp_t gfp_mask, int node_id) | ||
72 | { | 74 | { |
73 | mempool_t *pool; | 75 | mempool_t *pool; |
74 | pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); | 76 | pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); |
75 | if (!pool) | 77 | if (!pool) |
76 | return NULL; | 78 | return NULL; |
77 | pool->elements = kmalloc_node(min_nr * sizeof(void *), | 79 | pool->elements = kmalloc_node(min_nr * sizeof(void *), |
78 | GFP_KERNEL, node_id); | 80 | gfp_mask, node_id); |
79 | if (!pool->elements) { | 81 | if (!pool->elements) { |
80 | kfree(pool); | 82 | kfree(pool); |
81 | return NULL; | 83 | return NULL; |
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
93 | while (pool->curr_nr < pool->min_nr) { | 95 | while (pool->curr_nr < pool->min_nr) { |
94 | void *element; | 96 | void *element; |
95 | 97 | ||
96 | element = pool->alloc(GFP_KERNEL, pool->pool_data); | 98 | element = pool->alloc(gfp_mask, pool->pool_data); |
97 | if (unlikely(!element)) { | 99 | if (unlikely(!element)) { |
98 | mempool_destroy(pool); | 100 | mempool_destroy(pool); |
99 | return NULL; | 101 | return NULL; |
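
mempool_create_node() gains a gfp_mask argument that is used both for the pool header and for pre-filling the elements, with mempool_create() forwarding GFP_KERNEL and NUMA_NO_NODE. A usage sketch; the slab cache name ("demo_obj"), object size and element count are illustrative:

#include <linux/mempool.h>
#include <linux/slab.h>

static mempool_t *demo_pool_create(int nid)
{
        struct kmem_cache *cache;
        mempool_t *pool;

        cache = kmem_cache_create("demo_obj", 128, 0, 0, NULL);
        if (!cache)
                return NULL;

        /* Pre-fill 16 elements on node @nid; GFP_KERNEL may sleep here. */
        pool = mempool_create_node(16, mempool_alloc_slab, mempool_free_slab,
                                   cache, GFP_KERNEL, nid);
        if (!pool)
                kmem_cache_destroy(cache);
        return pool;
}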
diff --git a/mm/migrate.c b/mm/migrate.c index 51c08a0c6f68..77ed2d773705 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | ||
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -436,7 +437,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
436 | * is actually a signal that all of the page has become dirty. | 437 | * is actually a signal that all of the page has become dirty. |
437 | * Whereas only part of our page may be dirty. | 438 | * Whereas only part of our page may be dirty. |
438 | */ | 439 | */ |
439 | __set_page_dirty_nobuffers(newpage); | 440 | if (PageSwapBacked(page)) |
441 | SetPageDirty(newpage); | ||
442 | else | ||
443 | __set_page_dirty_nobuffers(newpage); | ||
440 | } | 444 | } |
441 | 445 | ||
442 | mlock_migrate_page(newpage, page); | 446 | mlock_migrate_page(newpage, page); |
@@ -679,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
679 | { | 683 | { |
680 | int rc = -EAGAIN; | 684 | int rc = -EAGAIN; |
681 | int remap_swapcache = 1; | 685 | int remap_swapcache = 1; |
682 | int charge = 0; | ||
683 | struct mem_cgroup *mem; | 686 | struct mem_cgroup *mem; |
684 | struct anon_vma *anon_vma = NULL; | 687 | struct anon_vma *anon_vma = NULL; |
685 | 688 | ||
@@ -721,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
721 | } | 724 | } |
722 | 725 | ||
723 | /* charge against new page */ | 726 | /* charge against new page */ |
724 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); | 727 | mem_cgroup_prepare_migration(page, newpage, &mem); |
725 | if (charge == -ENOMEM) { | ||
726 | rc = -ENOMEM; | ||
727 | goto unlock; | ||
728 | } | ||
729 | BUG_ON(charge); | ||
730 | 728 | ||
731 | if (PageWriteback(page)) { | 729 | if (PageWriteback(page)) { |
732 | /* | 730 | /* |
@@ -816,8 +814,7 @@ skip_unmap: | |||
816 | put_anon_vma(anon_vma); | 814 | put_anon_vma(anon_vma); |
817 | 815 | ||
818 | uncharge: | 816 | uncharge: |
819 | if (!charge) | 817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
820 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | ||
821 | unlock: | 818 | unlock: |
822 | unlock_page(page); | 819 | unlock_page(page); |
823 | out: | 820 | out: |
@@ -928,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
928 | 925 | ||
929 | if (anon_vma) | 926 | if (anon_vma) |
930 | put_anon_vma(anon_vma); | 927 | put_anon_vma(anon_vma); |
931 | unlock_page(hpage); | ||
932 | 928 | ||
933 | out: | 929 | if (!rc) |
934 | if (rc != -EAGAIN) { | 930 | hugetlb_cgroup_migrate(hpage, new_hpage); |
935 | list_del(&hpage->lru); | ||
936 | put_page(hpage); | ||
937 | } | ||
938 | 931 | ||
932 | unlock_page(hpage); | ||
933 | out: | ||
939 | put_page(new_hpage); | 934 | put_page(new_hpage); |
940 | |||
941 | if (result) { | 935 | if (result) { |
942 | if (rc) | 936 | if (rc) |
943 | *result = rc; | 937 | *result = rc; |
@@ -1013,48 +1007,32 @@ out: | |||
1013 | return nr_failed + retry; | 1007 | return nr_failed + retry; |
1014 | } | 1008 | } |
1015 | 1009 | ||
1016 | int migrate_huge_pages(struct list_head *from, | 1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1017 | new_page_t get_new_page, unsigned long private, bool offlining, | 1011 | unsigned long private, bool offlining, |
1018 | enum migrate_mode mode) | 1012 | enum migrate_mode mode) |
1019 | { | 1013 | { |
1020 | int retry = 1; | 1014 | int pass, rc; |
1021 | int nr_failed = 0; | 1015 | |
1022 | int pass = 0; | 1016 | for (pass = 0; pass < 10; pass++) { |
1023 | struct page *page; | 1017 | rc = unmap_and_move_huge_page(get_new_page, |
1024 | struct page *page2; | 1018 | private, hpage, pass > 2, offlining, |
1025 | int rc; | 1019 | mode); |
1026 | 1020 | switch (rc) { | |
1027 | for (pass = 0; pass < 10 && retry; pass++) { | 1021 | case -ENOMEM: |
1028 | retry = 0; | 1022 | goto out; |
1029 | 1023 | case -EAGAIN: | |
1030 | list_for_each_entry_safe(page, page2, from, lru) { | 1024 | /* try again */ |
1031 | cond_resched(); | 1025 | cond_resched(); |
1032 | 1026 | break; | |
1033 | rc = unmap_and_move_huge_page(get_new_page, | 1027 | case 0: |
1034 | private, page, pass > 2, offlining, | 1028 | goto out; |
1035 | mode); | 1029 | default: |
1036 | 1030 | rc = -EIO; | |
1037 | switch(rc) { | 1031 | goto out; |
1038 | case -ENOMEM: | ||
1039 | goto out; | ||
1040 | case -EAGAIN: | ||
1041 | retry++; | ||
1042 | break; | ||
1043 | case 0: | ||
1044 | break; | ||
1045 | default: | ||
1046 | /* Permanent failure */ | ||
1047 | nr_failed++; | ||
1048 | break; | ||
1049 | } | ||
1050 | } | 1032 | } |
1051 | } | 1033 | } |
1052 | rc = 0; | ||
1053 | out: | 1034 | out: |
1054 | if (rc) | 1035 | return rc; |
1055 | return rc; | ||
1056 | |||
1057 | return nr_failed + retry; | ||
1058 | } | 1036 | } |
1059 | 1037 | ||
1060 | #ifdef CONFIG_NUMA | 1038 | #ifdef CONFIG_NUMA |
@@ -1371,8 +1349,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1371 | * userid as the target process. | 1349 | * userid as the target process. |
1372 | */ | 1350 | */ |
1373 | tcred = __task_cred(task); | 1351 | tcred = __task_cred(task); |
1374 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1352 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && |
1375 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1353 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && |
1376 | !capable(CAP_SYS_NICE)) { | 1354 | !capable(CAP_SYS_NICE)) { |
1377 | rcu_read_unlock(); | 1355 | rcu_read_unlock(); |
1378 | err = -EPERM; | 1356 | err = -EPERM; |
@@ -1388,14 +1366,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1388 | mm = get_task_mm(task); | 1366 | mm = get_task_mm(task); |
1389 | put_task_struct(task); | 1367 | put_task_struct(task); |
1390 | 1368 | ||
1391 | if (mm) { | 1369 | if (!mm) |
1392 | if (nodes) | 1370 | return -EINVAL; |
1393 | err = do_pages_move(mm, task_nodes, nr_pages, pages, | 1371 | |
1394 | nodes, status, flags); | 1372 | if (nodes) |
1395 | else | 1373 | err = do_pages_move(mm, task_nodes, nr_pages, pages, |
1396 | err = do_pages_stat(mm, nr_pages, pages, status); | 1374 | nodes, status, flags); |
1397 | } else | 1375 | else |
1398 | err = -EINVAL; | 1376 | err = do_pages_stat(mm, nr_pages, pages, status); |
1399 | 1377 | ||
1400 | mmput(mm); | 1378 | mmput(mm); |
1401 | return err; | 1379 | return err; |
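
mm/migrate.c replaces the list-based migrate_huge_pages() with migrate_huge_page(), which retries a single hugepage on -EAGAIN for up to ten passes and leaves the reference drop to the caller. A caller-side sketch in the style of the soft_offline_huge_page() hunk above; get_new_page is the caller's allocation callback and the private value (MPOL_MF_MOVE_ALL) is only illustrative:

#include <linux/migrate.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>

static int soft_offline_sketch(struct page *hpage, new_page_t get_new_page)
{
        int ret;

        ret = migrate_huge_page(hpage, get_new_page, MPOL_MF_MOVE_ALL,
                                false, MIGRATE_SYNC);
        /* The caller, not the migration core, drops its own reference. */
        put_page(hpage);
        return ret;
}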
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/perf_event.h> | 30 | #include <linux/perf_event.h> |
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | ||
33 | 34 | ||
34 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
35 | #include <asm/cacheflush.h> | 36 | #include <asm/cacheflush.h> |
@@ -240,6 +241,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
240 | return next; | 241 | return next; |
241 | } | 242 | } |
242 | 243 | ||
244 | static unsigned long do_brk(unsigned long addr, unsigned long len); | ||
245 | |||
243 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 246 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
244 | { | 247 | { |
245 | unsigned long rlim, retval; | 248 | unsigned long rlim, retval; |
@@ -544,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
544 | 547 | ||
545 | if (file) { | 548 | if (file) { |
546 | mapping = file->f_mapping; | 549 | mapping = file->f_mapping; |
547 | if (!(vma->vm_flags & VM_NONLINEAR)) | 550 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
548 | root = &mapping->i_mmap; | 551 | root = &mapping->i_mmap; |
552 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
553 | |||
554 | if (adjust_next) | ||
555 | uprobe_munmap(next, next->vm_start, | ||
556 | next->vm_end); | ||
557 | } | ||
558 | |||
549 | mutex_lock(&mapping->i_mmap_mutex); | 559 | mutex_lock(&mapping->i_mmap_mutex); |
550 | if (insert) { | 560 | if (insert) { |
551 | /* | 561 | /* |
@@ -615,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end); | |||
615 | if (mapping) | 625 | if (mapping) |
616 | mutex_unlock(&mapping->i_mmap_mutex); | 626 | mutex_unlock(&mapping->i_mmap_mutex); |
617 | 627 | ||
628 | if (root) { | ||
629 | uprobe_mmap(vma); | ||
630 | |||
631 | if (adjust_next) | ||
632 | uprobe_mmap(next); | ||
633 | } | ||
634 | |||
618 | if (remove_next) { | 635 | if (remove_next) { |
619 | if (file) { | 636 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | ||
620 | fput(file); | 638 | fput(file); |
621 | if (next->vm_flags & VM_EXECUTABLE) | 639 | if (next->vm_flags & VM_EXECUTABLE) |
622 | removed_exe_file_vma(mm); | 640 | removed_exe_file_vma(mm); |
@@ -636,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | goto again; | 654 | goto again; |
637 | } | 655 | } |
638 | } | 656 | } |
657 | if (insert && file) | ||
658 | uprobe_mmap(insert); | ||
639 | 659 | ||
640 | validate_mm(mm); | 660 | validate_mm(mm); |
641 | 661 | ||
@@ -923,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
923 | const unsigned long stack_flags | 943 | const unsigned long stack_flags |
924 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
925 | 945 | ||
946 | mm->total_vm += pages; | ||
947 | |||
926 | if (file) { | 948 | if (file) { |
927 | mm->shared_vm += pages; | 949 | mm->shared_vm += pages; |
928 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 950 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
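mm->total_vm is now bumped inside vm_stat_account() itself, so call sites elsewhere in this series (mmap_region, move_vma, mremap, acct_stack_growth) drop their separate increment. Caller-side, the change looks like this:

	/* before: two steps at every call site */
	mm->total_vm += len >> PAGE_SHIFT;
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);

	/* after: one call; vm_stat_account() updates total_vm internally */
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);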
@@ -958,8 +980,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
958 | struct mm_struct * mm = current->mm; | 980 | struct mm_struct * mm = current->mm; |
959 | struct inode *inode; | 981 | struct inode *inode; |
960 | vm_flags_t vm_flags; | 982 | vm_flags_t vm_flags; |
961 | int error; | ||
962 | unsigned long reqprot = prot; | ||
963 | 983 | ||
964 | /* | 984 | /* |
965 | * Does the application expect PROT_READ to imply PROT_EXEC? | 985 | * Does the application expect PROT_READ to imply PROT_EXEC? |
@@ -1081,13 +1101,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1081 | } | 1101 | } |
1082 | } | 1102 | } |
1083 | 1103 | ||
1084 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | ||
1085 | if (error) | ||
1086 | return error; | ||
1087 | |||
1088 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1104 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
1089 | } | 1105 | } |
1090 | EXPORT_SYMBOL(do_mmap_pgoff); | ||
1091 | 1106 | ||
1092 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1107 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1093 | unsigned long, prot, unsigned long, flags, | 1108 | unsigned long, prot, unsigned long, flags, |
@@ -1120,10 +1135,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1120 | 1135 | ||
1121 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1136 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1122 | 1137 | ||
1123 | down_write(¤t->mm->mmap_sem); | 1138 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1124 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1125 | up_write(¤t->mm->mmap_sem); | ||
1126 | |||
1127 | if (file) | 1139 | if (file) |
1128 | fput(file); | 1140 | fput(file); |
1129 | out: | 1141 | out: |
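The open-coded mmap_sem dance around do_mmap_pgoff() is replaced with a call to vm_mmap_pgoff(), whose body is not part of this hunk. A plausible sketch, assuming the helper simply wraps the old sequence (the real one lives elsewhere, mm/util.c in this series, and may also perform security checks):

	unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
				    unsigned long len, unsigned long prot,
				    unsigned long flag, unsigned long pgoff)
	{
		unsigned long ret;
		struct mm_struct *mm = current->mm;

		down_write(&mm->mmap_sem);
		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
		up_write(&mm->mmap_sem);
		return ret;
	}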
@@ -1337,13 +1349,16 @@ munmap_back: | |||
1337 | out: | 1349 | out: |
1338 | perf_event_mmap(vma); | 1350 | perf_event_mmap(vma); |
1339 | 1351 | ||
1340 | mm->total_vm += len >> PAGE_SHIFT; | ||
1341 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1352 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1342 | if (vm_flags & VM_LOCKED) { | 1353 | if (vm_flags & VM_LOCKED) { |
1343 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1354 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
1344 | mm->locked_vm += (len >> PAGE_SHIFT); | 1355 | mm->locked_vm += (len >> PAGE_SHIFT); |
1345 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1356 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1346 | make_pages_present(addr, addr + len); | 1357 | make_pages_present(addr, addr + len); |
1358 | |||
1359 | if (file) | ||
1360 | uprobe_mmap(vma); | ||
1361 | |||
1347 | return addr; | 1362 | return addr; |
1348 | 1363 | ||
1349 | unmap_and_free_vma: | 1364 | unmap_and_free_vma: |
@@ -1579,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1579 | if (addr & ~PAGE_MASK) | 1594 | if (addr & ~PAGE_MASK) |
1580 | return -EINVAL; | 1595 | return -EINVAL; |
1581 | 1596 | ||
1582 | return arch_rebalance_pgtables(addr, len); | 1597 | addr = arch_rebalance_pgtables(addr, len); |
1598 | error = security_mmap_addr(addr); | ||
1599 | return error ? error : addr; | ||
1583 | } | 1600 | } |
1584 | 1601 | ||
1585 | EXPORT_SYMBOL(get_unmapped_area); | 1602 | EXPORT_SYMBOL(get_unmapped_area); |
@@ -1589,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1589 | { | 1606 | { |
1590 | struct vm_area_struct *vma = NULL; | 1607 | struct vm_area_struct *vma = NULL; |
1591 | 1608 | ||
1592 | if (mm) { | 1609 | if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ |
1593 | /* Check the cache first. */ | 1610 | return NULL; |
1594 | /* (Cache hit rate is typically around 35%.) */ | 1611 | |
1595 | vma = mm->mmap_cache; | 1612 | /* Check the cache first. */ |
1596 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1613 | /* (Cache hit rate is typically around 35%.) */ |
1597 | struct rb_node * rb_node; | 1614 | vma = mm->mmap_cache; |
1598 | 1615 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | |
1599 | rb_node = mm->mm_rb.rb_node; | 1616 | struct rb_node *rb_node; |
1600 | vma = NULL; | 1617 | |
1601 | 1618 | rb_node = mm->mm_rb.rb_node; | |
1602 | while (rb_node) { | 1619 | vma = NULL; |
1603 | struct vm_area_struct * vma_tmp; | 1620 | |
1604 | 1621 | while (rb_node) { | |
1605 | vma_tmp = rb_entry(rb_node, | 1622 | struct vm_area_struct *vma_tmp; |
1606 | struct vm_area_struct, vm_rb); | 1623 | |
1607 | 1624 | vma_tmp = rb_entry(rb_node, | |
1608 | if (vma_tmp->vm_end > addr) { | 1625 | struct vm_area_struct, vm_rb); |
1609 | vma = vma_tmp; | 1626 | |
1610 | if (vma_tmp->vm_start <= addr) | 1627 | if (vma_tmp->vm_end > addr) { |
1611 | break; | 1628 | vma = vma_tmp; |
1612 | rb_node = rb_node->rb_left; | 1629 | if (vma_tmp->vm_start <= addr) |
1613 | } else | 1630 | break; |
1614 | rb_node = rb_node->rb_right; | 1631 | rb_node = rb_node->rb_left; |
1615 | } | 1632 | } else |
1616 | if (vma) | 1633 | rb_node = rb_node->rb_right; |
1617 | mm->mmap_cache = vma; | ||
1618 | } | 1634 | } |
1635 | if (vma) | ||
1636 | mm->mmap_cache = vma; | ||
1619 | } | 1637 | } |
1620 | return vma; | 1638 | return vma; |
1621 | } | 1639 | } |
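find_vma() keeps its contract, returning the first VMA with vm_end > addr or NULL; a NULL mm now produces a one-off warning (slated for removal in 3.6) before returning NULL. The usual calling pattern is unchanged:

	struct vm_area_struct *vma;

	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr) {
		/* addr falls inside vma */
	} else {
		/* no mapping at addr (vma, if any, starts above it) */
	}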
@@ -1689,7 +1707,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1689 | return -ENOMEM; | 1707 | return -ENOMEM; |
1690 | 1708 | ||
1691 | /* Ok, everything looks good - let it rip */ | 1709 | /* Ok, everything looks good - let it rip */ |
1692 | mm->total_vm += grow; | ||
1693 | if (vma->vm_flags & VM_LOCKED) | 1710 | if (vma->vm_flags & VM_LOCKED) |
1694 | mm->locked_vm += grow; | 1711 | mm->locked_vm += grow; |
1695 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1712 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
@@ -1768,7 +1785,7 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1768 | return -ENOMEM; | 1785 | return -ENOMEM; |
1769 | 1786 | ||
1770 | address &= PAGE_MASK; | 1787 | address &= PAGE_MASK; |
1771 | error = security_file_mmap(NULL, 0, 0, 0, address, 1); | 1788 | error = security_mmap_addr(address); |
1772 | if (error) | 1789 | if (error) |
1773 | return error; | 1790 | return error; |
1774 | 1791 | ||
@@ -1862,15 +1879,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1862 | */ | 1879 | */ |
1863 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1880 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1864 | { | 1881 | { |
1882 | unsigned long nr_accounted = 0; | ||
1883 | |||
1865 | /* Update high watermark before we lower total_vm */ | 1884 | /* Update high watermark before we lower total_vm */ |
1866 | update_hiwater_vm(mm); | 1885 | update_hiwater_vm(mm); |
1867 | do { | 1886 | do { |
1868 | long nrpages = vma_pages(vma); | 1887 | long nrpages = vma_pages(vma); |
1869 | 1888 | ||
1870 | mm->total_vm -= nrpages; | 1889 | if (vma->vm_flags & VM_ACCOUNT) |
1890 | nr_accounted += nrpages; | ||
1871 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1891 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1872 | vma = remove_vma(vma); | 1892 | vma = remove_vma(vma); |
1873 | } while (vma); | 1893 | } while (vma); |
1894 | vm_unacct_memory(nr_accounted); | ||
1874 | validate_mm(mm); | 1895 | validate_mm(mm); |
1875 | } | 1896 | } |
1876 | 1897 | ||
@@ -1885,13 +1906,11 @@ static void unmap_region(struct mm_struct *mm, | |||
1885 | { | 1906 | { |
1886 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 1907 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; |
1887 | struct mmu_gather tlb; | 1908 | struct mmu_gather tlb; |
1888 | unsigned long nr_accounted = 0; | ||
1889 | 1909 | ||
1890 | lru_add_drain(); | 1910 | lru_add_drain(); |
1891 | tlb_gather_mmu(&tlb, mm, 0); | 1911 | tlb_gather_mmu(&tlb, mm, 0); |
1892 | update_hiwater_rss(mm); | 1912 | update_hiwater_rss(mm); |
1893 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | 1913 | unmap_vmas(&tlb, vma, start, end); |
1894 | vm_unacct_memory(nr_accounted); | ||
1895 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, | 1914 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
1896 | next ? next->vm_start : 0); | 1915 | next ? next->vm_start : 0); |
1897 | tlb_finish_mmu(&tlb, start, end); | 1916 | tlb_finish_mmu(&tlb, start, end); |
@@ -2106,20 +2125,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2106 | return 0; | 2125 | return 0; |
2107 | } | 2126 | } |
2108 | 2127 | ||
2109 | EXPORT_SYMBOL(do_munmap); | 2128 | int vm_munmap(unsigned long start, size_t len) |
2110 | |||
2111 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | ||
2112 | { | 2129 | { |
2113 | int ret; | 2130 | int ret; |
2114 | struct mm_struct *mm = current->mm; | 2131 | struct mm_struct *mm = current->mm; |
2115 | 2132 | ||
2116 | profile_munmap(addr); | ||
2117 | |||
2118 | down_write(&mm->mmap_sem); | 2133 | down_write(&mm->mmap_sem); |
2119 | ret = do_munmap(mm, addr, len); | 2134 | ret = do_munmap(mm, start, len); |
2120 | up_write(&mm->mmap_sem); | 2135 | up_write(&mm->mmap_sem); |
2121 | return ret; | 2136 | return ret; |
2122 | } | 2137 | } |
2138 | EXPORT_SYMBOL(vm_munmap); | ||
2139 | |||
2140 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | ||
2141 | { | ||
2142 | profile_munmap(addr); | ||
2143 | return vm_munmap(addr, len); | ||
2144 | } | ||
2123 | 2145 | ||
2124 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2146 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
2125 | { | 2147 | { |
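do_munmap() loses its EXPORT_SYMBOL; modules and other kernel code are expected to use the new vm_munmap(), which takes mmap_sem itself. For a caller the conversion is mechanical:

	/* before: callers locked mmap_sem around the low-level helper */
	down_write(&current->mm->mmap_sem);
	do_munmap(current->mm, addr, len);
	up_write(&current->mm->mmap_sem);

	/* after: */
	vm_munmap(addr, len);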
@@ -2136,7 +2158,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) | |||
2136 | * anonymous maps. eventually we may be able to do some | 2158 | * anonymous maps. eventually we may be able to do some |
2137 | * brk-specific accounting here. | 2159 | * brk-specific accounting here. |
2138 | */ | 2160 | */ |
2139 | unsigned long do_brk(unsigned long addr, unsigned long len) | 2161 | static unsigned long do_brk(unsigned long addr, unsigned long len) |
2140 | { | 2162 | { |
2141 | struct mm_struct * mm = current->mm; | 2163 | struct mm_struct * mm = current->mm; |
2142 | struct vm_area_struct * vma, * prev; | 2164 | struct vm_area_struct * vma, * prev; |
@@ -2149,10 +2171,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2149 | if (!len) | 2171 | if (!len) |
2150 | return addr; | 2172 | return addr; |
2151 | 2173 | ||
2152 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | ||
2153 | if (error) | ||
2154 | return error; | ||
2155 | |||
2156 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2174 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2157 | 2175 | ||
2158 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | 2176 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
@@ -2232,7 +2250,17 @@ out: | |||
2232 | return addr; | 2250 | return addr; |
2233 | } | 2251 | } |
2234 | 2252 | ||
2235 | EXPORT_SYMBOL(do_brk); | 2253 | unsigned long vm_brk(unsigned long addr, unsigned long len) |
2254 | { | ||
2255 | struct mm_struct *mm = current->mm; | ||
2256 | unsigned long ret; | ||
2257 | |||
2258 | down_write(&mm->mmap_sem); | ||
2259 | ret = do_brk(addr, len); | ||
2260 | up_write(&mm->mmap_sem); | ||
2261 | return ret; | ||
2262 | } | ||
2263 | EXPORT_SYMBOL(vm_brk); | ||
2236 | 2264 | ||
2237 | /* Release all mmaps. */ | 2265 | /* Release all mmaps. */ |
2238 | void exit_mmap(struct mm_struct *mm) | 2266 | void exit_mmap(struct mm_struct *mm) |
@@ -2264,8 +2292,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2264 | tlb_gather_mmu(&tlb, mm, 1); | 2292 | tlb_gather_mmu(&tlb, mm, 1); |
2265 | /* update_hiwater_rss(mm) here? but nobody should be looking */ | 2293 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2266 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2294 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2267 | unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2295 | unmap_vmas(&tlb, vma, 0, -1); |
2268 | vm_unacct_memory(nr_accounted); | ||
2269 | 2296 | ||
2270 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2297 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
2271 | tlb_finish_mmu(&tlb, 0, -1); | 2298 | tlb_finish_mmu(&tlb, 0, -1); |
@@ -2274,10 +2301,14 @@ void exit_mmap(struct mm_struct *mm) | |||
2274 | * Walk the list again, actually closing and freeing it, | 2301 | * Walk the list again, actually closing and freeing it, |
2275 | * with preemption enabled, without holding any MM locks. | 2302 | * with preemption enabled, without holding any MM locks. |
2276 | */ | 2303 | */ |
2277 | while (vma) | 2304 | while (vma) { |
2305 | if (vma->vm_flags & VM_ACCOUNT) | ||
2306 | nr_accounted += vma_pages(vma); | ||
2278 | vma = remove_vma(vma); | 2307 | vma = remove_vma(vma); |
2308 | } | ||
2309 | vm_unacct_memory(nr_accounted); | ||
2279 | 2310 | ||
2280 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 2311 | WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
2281 | } | 2312 | } |
2282 | 2313 | ||
2283 | /* Insert vm structure into process list sorted by address | 2314 | /* Insert vm structure into process list sorted by address |
@@ -2311,6 +2342,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2311 | if ((vma->vm_flags & VM_ACCOUNT) && | 2342 | if ((vma->vm_flags & VM_ACCOUNT) && |
2312 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2343 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2313 | return -ENOMEM; | 2344 | return -ENOMEM; |
2345 | |||
2314 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2346 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2315 | return 0; | 2347 | return 0; |
2316 | } | 2348 | } |
@@ -2380,6 +2412,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2380 | new_vma->vm_pgoff = pgoff; | 2412 | new_vma->vm_pgoff = pgoff; |
2381 | if (new_vma->vm_file) { | 2413 | if (new_vma->vm_file) { |
2382 | get_file(new_vma->vm_file); | 2414 | get_file(new_vma->vm_file); |
2415 | |||
2383 | if (vma->vm_flags & VM_EXECUTABLE) | 2416 | if (vma->vm_flags & VM_EXECUTABLE) |
2384 | added_exe_file_vma(mm); | 2417 | added_exe_file_vma(mm); |
2385 | } | 2418 | } |
@@ -2484,10 +2517,6 @@ int install_special_mapping(struct mm_struct *mm, | |||
2484 | vma->vm_ops = &special_mapping_vmops; | 2517 | vma->vm_ops = &special_mapping_vmops; |
2485 | vma->vm_private_data = pages; | 2518 | vma->vm_private_data = pages; |
2486 | 2519 | ||
2487 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); | ||
2488 | if (ret) | ||
2489 | goto out; | ||
2490 | |||
2491 | ret = insert_vm_struct(mm, vma); | 2520 | ret = insert_vm_struct(mm, vma); |
2492 | if (ret) | 2521 | if (ret) |
2493 | goto out; | 2522 | goto out; |
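All the address-only LSM checks in this file (and in mremap.c and nommu.c below) move from security_file_mmap() with dummy file/prot arguments to the dedicated security_mmap_addr() hook. The pattern, taken from the hunks above:

	/* before */
	error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
	if (error)
		return error;

	/* after */
	error = security_mmap_addr(addr);
	if (error)
		return error;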
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait any running method to finish, of course including | 310 | * Wait any running method to finish, of course including |
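For a driver, the reordering above only matters internally: ->release is still invoked under rcu_read_lock() before the mm's pages are freed, so the callback must tear down all secondary mappings there. A hedged sketch of the driver side, where my_flush_all_sptes() is an illustrative placeholder:

	#include <linux/mmu_notifier.h>

	static void my_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* stop creating secondary PTEs and tear down the existing ones */
		my_flush_all_sptes(mn);		/* hypothetical driver helper */
	}

	static const struct mmu_notifier_ops my_mn_ops = {
		.release = my_mn_release,
	};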
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn, | |||
86 | return 1; | 86 | return 1; |
87 | } | 87 | } |
88 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 88 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
89 | |||
90 | void lruvec_init(struct lruvec *lruvec, struct zone *zone) | ||
91 | { | ||
92 | enum lru_list lru; | ||
93 | |||
94 | memset(lruvec, 0, sizeof(struct lruvec)); | ||
95 | |||
96 | for_each_lru(lru) | ||
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | ||
98 | |||
99 | #ifdef CONFIG_MEMCG | ||
100 | lruvec->zone = zone; | ||
101 | #endif | ||
102 | } | ||
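lruvec_init() gives callers one place to zero a lruvec and initialize its per-LRU list heads. A usage sketch, assuming a zone with an embedded lruvec as in the core allocator:

	enum lru_list lru;

	/* during zone setup (sketch) */
	lruvec_init(&zone->lruvec, zone);

	/* afterwards every LRU list is empty */
	for_each_lru(lru)
		BUG_ON(!list_empty(&zone->lruvec.lists[lru]));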
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
260 | * If this were a serious issue, we'd add a flag to do_munmap(). | 260 | * If this were a serious issue, we'd add a flag to do_munmap(). |
261 | */ | 261 | */ |
262 | hiwater_vm = mm->hiwater_vm; | 262 | hiwater_vm = mm->hiwater_vm; |
263 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
264 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 263 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
265 | 264 | ||
266 | if (do_munmap(mm, old_addr, old_len) < 0) { | 265 | if (do_munmap(mm, old_addr, old_len) < 0) { |
@@ -371,10 +370,6 @@ static unsigned long mremap_to(unsigned long addr, | |||
371 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | 370 | if ((addr <= new_addr) && (addr+old_len) > new_addr) |
372 | goto out; | 371 | goto out; |
373 | 372 | ||
374 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
375 | if (ret) | ||
376 | goto out; | ||
377 | |||
378 | ret = do_munmap(mm, new_addr, new_len); | 373 | ret = do_munmap(mm, new_addr, new_len); |
379 | if (ret) | 374 | if (ret) |
380 | goto out; | 375 | goto out; |
@@ -432,15 +427,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | |||
432 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | 427 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise |
433 | * This option implies MREMAP_MAYMOVE. | 428 | * This option implies MREMAP_MAYMOVE. |
434 | */ | 429 | */ |
435 | unsigned long do_mremap(unsigned long addr, | 430 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, |
436 | unsigned long old_len, unsigned long new_len, | 431 | unsigned long, new_len, unsigned long, flags, |
437 | unsigned long flags, unsigned long new_addr) | 432 | unsigned long, new_addr) |
438 | { | 433 | { |
439 | struct mm_struct *mm = current->mm; | 434 | struct mm_struct *mm = current->mm; |
440 | struct vm_area_struct *vma; | 435 | struct vm_area_struct *vma; |
441 | unsigned long ret = -EINVAL; | 436 | unsigned long ret = -EINVAL; |
442 | unsigned long charged = 0; | 437 | unsigned long charged = 0; |
443 | 438 | ||
439 | down_write(¤t->mm->mmap_sem); | ||
440 | |||
444 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) | 441 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) |
445 | goto out; | 442 | goto out; |
446 | 443 | ||
@@ -499,7 +496,6 @@ unsigned long do_mremap(unsigned long addr, | |||
499 | goto out; | 496 | goto out; |
500 | } | 497 | } |
501 | 498 | ||
502 | mm->total_vm += pages; | ||
503 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
504 | if (vma->vm_flags & VM_LOCKED) { | 500 | if (vma->vm_flags & VM_LOCKED) { |
505 | mm->locked_vm += pages; | 501 | mm->locked_vm += pages; |
@@ -530,25 +526,11 @@ unsigned long do_mremap(unsigned long addr, | |||
530 | goto out; | 526 | goto out; |
531 | } | 527 | } |
532 | 528 | ||
533 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
534 | if (ret) | ||
535 | goto out; | ||
536 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 529 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
537 | } | 530 | } |
538 | out: | 531 | out: |
539 | if (ret & ~PAGE_MASK) | 532 | if (ret & ~PAGE_MASK) |
540 | vm_unacct_memory(charged); | 533 | vm_unacct_memory(charged); |
541 | return ret; | ||
542 | } | ||
543 | |||
544 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | ||
545 | unsigned long, new_len, unsigned long, flags, | ||
546 | unsigned long, new_addr) | ||
547 | { | ||
548 | unsigned long ret; | ||
549 | |||
550 | down_write(¤t->mm->mmap_sem); | ||
551 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
552 | up_write(¤t->mm->mmap_sem); | 534 | up_write(¤t->mm->mmap_sem); |
553 | return ret; | 535 | return ret; |
554 | } | 536 | } |
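Folding do_mremap() into the syscall body changes nothing for userspace; mremap(2) behaves as before, only the kernel entry point now takes mmap_sem directly. For reference, a small self-contained user of the syscall this hunk reworks (sizes assume a 4 KiB page):

	#define _GNU_SOURCE
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t old_len = 4096, new_len = 8192;
		char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		memset(p, 0, old_len);

		/* grow the mapping; the kernel may move it (MREMAP_MAYMOVE) */
		p = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
		if (p == MAP_FAILED)
			return 1;

		munmap(p, new_len);
		return 0;
	}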
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 24f0fc1a56d6..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,8 +82,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
82 | 82 | ||
83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | 83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) |
84 | { | 84 | { |
85 | int i; | 85 | unsigned long i, start_aligned, end_aligned; |
86 | unsigned long start_aligned, end_aligned; | ||
87 | int order = ilog2(BITS_PER_LONG); | 86 | int order = ilog2(BITS_PER_LONG); |
88 | 87 | ||
89 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | 88 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); |
@@ -106,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) | |||
106 | __free_pages_bootmem(pfn_to_page(i), 0); | 105 | __free_pages_bootmem(pfn_to_page(i), 0); |
107 | } | 106 | } |
108 | 107 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | ||
109 | phys_addr_t end) | ||
110 | { | ||
111 | unsigned long start_pfn = PFN_UP(start); | ||
112 | unsigned long end_pfn = min_t(unsigned long, | ||
113 | PFN_DOWN(end), max_low_pfn); | ||
114 | |||
115 | if (start_pfn > end_pfn) | ||
116 | return 0; | ||
117 | |||
118 | __free_pages_memory(start_pfn, end_pfn); | ||
119 | |||
120 | return end_pfn - start_pfn; | ||
121 | } | ||
122 | |||
109 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | unsigned long __init free_low_memory_core_early(int nodeid) |
110 | { | 124 | { |
111 | unsigned long count = 0; | 125 | unsigned long count = 0; |
112 | phys_addr_t start, end; | 126 | phys_addr_t start, end, size; |
113 | u64 i; | 127 | u64 i; |
114 | 128 | ||
115 | /* free reserved array temporarily so that it's treated as free area */ | 129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
116 | memblock_free_reserved_regions(); | 130 | count += __free_memory_core(start, end); |
117 | 131 | ||
118 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 132 | /* free range that is used for reserved array if we allocate it */ |
119 | unsigned long start_pfn = PFN_UP(start); | 133 | size = get_allocated_memblock_reserved_regions_info(&start); |
120 | unsigned long end_pfn = min_t(unsigned long, | 134 | if (size) |
121 | PFN_DOWN(end), max_low_pfn); | 135 | count += __free_memory_core(start, start + size); |
122 | if (start_pfn < end_pfn) { | ||
123 | __free_pages_memory(start_pfn, end_pfn); | ||
124 | count += end_pfn - start_pfn; | ||
125 | } | ||
126 | } | ||
127 | 136 | ||
128 | /* put region array back? */ | ||
129 | memblock_reserve_reserved_regions(); | ||
130 | return count; | 137 | return count; |
131 | } | 138 | } |
132 | 139 | ||
@@ -275,6 +282,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
275 | return ___alloc_bootmem(size, align, goal, limit); | 282 | return ___alloc_bootmem(size, align, goal, limit); |
276 | } | 283 | } |
277 | 284 | ||
285 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | ||
286 | unsigned long size, | ||
287 | unsigned long align, | ||
288 | unsigned long goal, | ||
289 | unsigned long limit) | ||
290 | { | ||
291 | void *ptr; | ||
292 | |||
293 | again: | ||
294 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
295 | goal, limit); | ||
296 | if (ptr) | ||
297 | return ptr; | ||
298 | |||
299 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
300 | goal, limit); | ||
301 | if (ptr) | ||
302 | return ptr; | ||
303 | |||
304 | if (goal) { | ||
305 | goal = 0; | ||
306 | goto again; | ||
307 | } | ||
308 | |||
309 | return NULL; | ||
310 | } | ||
311 | |||
312 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
313 | unsigned long align, unsigned long goal) | ||
314 | { | ||
315 | if (WARN_ON_ONCE(slab_is_available())) | ||
316 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
317 | |||
318 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
319 | } | ||
320 | |||
321 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
322 | unsigned long align, unsigned long goal, | ||
323 | unsigned long limit) | ||
324 | { | ||
325 | void *ptr; | ||
326 | |||
327 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); | ||
328 | if (ptr) | ||
329 | return ptr; | ||
330 | |||
331 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
332 | panic("Out of memory"); | ||
333 | return NULL; | ||
334 | } | ||
335 | |||
278 | /** | 336 | /** |
279 | * __alloc_bootmem_node - allocate boot memory from a specific node | 337 | * __alloc_bootmem_node - allocate boot memory from a specific node |
280 | * @pgdat: node to allocate from | 338 | * @pgdat: node to allocate from |
@@ -293,18 +351,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
293 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 351 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
294 | unsigned long align, unsigned long goal) | 352 | unsigned long align, unsigned long goal) |
295 | { | 353 | { |
296 | void *ptr; | ||
297 | |||
298 | if (WARN_ON_ONCE(slab_is_available())) | 354 | if (WARN_ON_ONCE(slab_is_available())) |
299 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 355 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
300 | 356 | ||
301 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 357 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
302 | goal, -1ULL); | ||
303 | if (ptr) | ||
304 | return ptr; | ||
305 | |||
306 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
307 | goal, -1ULL); | ||
308 | } | 358 | } |
309 | 359 | ||
310 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 360 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
@@ -313,44 +363,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
313 | return __alloc_bootmem_node(pgdat, size, align, goal); | 363 | return __alloc_bootmem_node(pgdat, size, align, goal); |
314 | } | 364 | } |
315 | 365 | ||
316 | #ifdef CONFIG_SPARSEMEM | ||
317 | /** | ||
318 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
319 | * @size: size of the request in bytes | ||
320 | * @section_nr: sparse map section to allocate from | ||
321 | * | ||
322 | * Return NULL on failure. | ||
323 | */ | ||
324 | void * __init alloc_bootmem_section(unsigned long size, | ||
325 | unsigned long section_nr) | ||
326 | { | ||
327 | unsigned long pfn, goal, limit; | ||
328 | |||
329 | pfn = section_nr_to_pfn(section_nr); | ||
330 | goal = pfn << PAGE_SHIFT; | ||
331 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
332 | |||
333 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
334 | SMP_CACHE_BYTES, goal, limit); | ||
335 | } | ||
336 | #endif | ||
337 | |||
338 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
339 | unsigned long align, unsigned long goal) | ||
340 | { | ||
341 | void *ptr; | ||
342 | |||
343 | if (WARN_ON_ONCE(slab_is_available())) | ||
344 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
345 | |||
346 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
347 | goal, -1ULL); | ||
348 | if (ptr) | ||
349 | return ptr; | ||
350 | |||
351 | return __alloc_bootmem_nopanic(size, align, goal); | ||
352 | } | ||
353 | |||
354 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 366 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
355 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 367 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
356 | #endif | 368 | #endif |
@@ -392,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
392 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 404 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
393 | unsigned long align, unsigned long goal) | 405 | unsigned long align, unsigned long goal) |
394 | { | 406 | { |
395 | void *ptr; | ||
396 | |||
397 | if (WARN_ON_ONCE(slab_is_available())) | 407 | if (WARN_ON_ONCE(slab_is_available())) |
398 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 408 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
399 | 409 | ||
400 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 410 | return ___alloc_bootmem_node(pgdat, size, align, goal, |
401 | goal, ARCH_LOW_ADDRESS_LIMIT); | 411 | ARCH_LOW_ADDRESS_LIMIT); |
402 | if (ptr) | ||
403 | return ptr; | ||
404 | |||
405 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
406 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
407 | } | 412 | } |
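With the duplicated node allocators folded into ___alloc_bootmem_node() and ___alloc_bootmem_node_nopanic(), the behavioural split is simply panic vs. NULL on failure. A hedged caller-side sketch (the table and warning text are illustrative):

	void *table;

	/* optional per-node table: tolerate failure instead of panicking */
	table = __alloc_bootmem_node_nopanic(pgdat, size, SMP_CACHE_BYTES, 0);
	if (!table)
		pr_warn("node %d: table allocation failed, feature disabled\n",
			pgdat->node_id);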
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e170fceb4..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file, | |||
889 | unsigned long *_capabilities) | 889 | unsigned long *_capabilities) |
890 | { | 890 | { |
891 | unsigned long capabilities, rlen; | 891 | unsigned long capabilities, rlen; |
892 | unsigned long reqprot = prot; | ||
893 | int ret; | 892 | int ret; |
894 | 893 | ||
895 | /* do the simple checks first */ | 894 | /* do the simple checks first */ |
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file, | |||
1047 | } | 1046 | } |
1048 | 1047 | ||
1049 | /* allow the security API to have its say */ | 1048 | /* allow the security API to have its say */ |
1050 | ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1049 | ret = security_mmap_addr(addr); |
1051 | if (ret < 0) | 1050 | if (ret < 0) |
1052 | return ret; | 1051 | return ret; |
1053 | 1052 | ||
@@ -1470,7 +1469,6 @@ error_getting_region: | |||
1470 | show_free_areas(0); | 1469 | show_free_areas(0); |
1471 | return -ENOMEM; | 1470 | return -ENOMEM; |
1472 | } | 1471 | } |
1473 | EXPORT_SYMBOL(do_mmap_pgoff); | ||
1474 | 1472 | ||
1475 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1473 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1476 | unsigned long, prot, unsigned long, flags, | 1474 | unsigned long, prot, unsigned long, flags, |
@@ -1488,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1488 | 1486 | ||
1489 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1490 | 1488 | ||
1491 | down_write(¤t->mm->mmap_sem); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1492 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1493 | up_write(¤t->mm->mmap_sem); | ||
1494 | 1490 | ||
1495 | if (file) | 1491 | if (file) |
1496 | fput(file); | 1492 | fput(file); |
@@ -1709,16 +1705,22 @@ erase_whole_vma: | |||
1709 | } | 1705 | } |
1710 | EXPORT_SYMBOL(do_munmap); | 1706 | EXPORT_SYMBOL(do_munmap); |
1711 | 1707 | ||
1712 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | 1708 | int vm_munmap(unsigned long addr, size_t len) |
1713 | { | 1709 | { |
1714 | int ret; | ||
1715 | struct mm_struct *mm = current->mm; | 1710 | struct mm_struct *mm = current->mm; |
1711 | int ret; | ||
1716 | 1712 | ||
1717 | down_write(&mm->mmap_sem); | 1713 | down_write(&mm->mmap_sem); |
1718 | ret = do_munmap(mm, addr, len); | 1714 | ret = do_munmap(mm, addr, len); |
1719 | up_write(&mm->mmap_sem); | 1715 | up_write(&mm->mmap_sem); |
1720 | return ret; | 1716 | return ret; |
1721 | } | 1717 | } |
1718 | EXPORT_SYMBOL(vm_munmap); | ||
1719 | |||
1720 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | ||
1721 | { | ||
1722 | return vm_munmap(addr, len); | ||
1723 | } | ||
1722 | 1724 | ||
1723 | /* | 1725 | /* |
1724 | * release all the mappings made in a process's VM space | 1726 | * release all the mappings made in a process's VM space |
@@ -1744,7 +1746,7 @@ void exit_mmap(struct mm_struct *mm) | |||
1744 | kleave(""); | 1746 | kleave(""); |
1745 | } | 1747 | } |
1746 | 1748 | ||
1747 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1749 | unsigned long vm_brk(unsigned long addr, unsigned long len) |
1748 | { | 1750 | { |
1749 | return -ENOMEM; | 1751 | return -ENOMEM; |
1750 | } | 1752 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
180 | * predictable as possible. The goal is to return the highest value for the | 180 | * predictable as possible. The goal is to return the highest value for the |
181 | * task consuming the most memory to avoid subsequent oom failures. | 181 | * task consuming the most memory to avoid subsequent oom failures. |
182 | */ | 182 | */ |
183 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | long points; | 186 | long points; |
187 | long adj; | ||
187 | 188 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 189 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 190 | return 0; |
@@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
192 | if (!p) | 193 | if (!p) |
193 | return 0; | 194 | return 0; |
194 | 195 | ||
195 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 196 | adj = p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | ||
196 | task_unlock(p); | 198 | task_unlock(p); |
197 | return 0; | 199 | return 0; |
198 | } | 200 | } |
199 | 201 | ||
200 | /* | 202 | /* |
201 | * The memory controller may have a limit of 0 bytes, so avoid a divide | ||
202 | * by zero, if necessary. | ||
203 | */ | ||
204 | if (!totalpages) | ||
205 | totalpages = 1; | ||
206 | |||
207 | /* | ||
208 | * The baseline for the badness score is the proportion of RAM that each | 203 | * The baseline for the badness score is the proportion of RAM that each |
209 | * task's rss, pagetable and swap space use. | 204 | * task's rss, pagetable and swap space use. |
210 | */ | 205 | */ |
211 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; | 206 | points = get_mm_rss(p->mm) + p->mm->nr_ptes + |
212 | points += get_mm_counter(p->mm, MM_SWAPENTS); | 207 | get_mm_counter(p->mm, MM_SWAPENTS); |
213 | |||
214 | points *= 1000; | ||
215 | points /= totalpages; | ||
216 | task_unlock(p); | 208 | task_unlock(p); |
217 | 209 | ||
218 | /* | 210 | /* |
@@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
220 | * implementation used by LSMs. | 212 | * implementation used by LSMs. |
221 | */ | 213 | */ |
222 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 214 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
223 | points -= 30; | 215 | adj -= 30; |
224 | 216 | ||
225 | /* | 217 | /* Normalize to oom_score_adj units */ |
226 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 218 | adj *= totalpages / 1000; |
227 | * either completely disable oom killing or always prefer a certain | 219 | points += adj; |
228 | * task. | ||
229 | */ | ||
230 | points += p->signal->oom_score_adj; | ||
231 | 220 | ||
232 | /* | 221 | /* |
233 | * Never return 0 for an eligible task that may be killed since it's | 222 | * Never return 0 for an eligible task regardless of the root bonus and |
234 | * possible that no single user task uses more than 0.1% of memory and | 223 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
235 | * no single admin tasks uses more than 3.0%. | ||
236 | */ | 224 | */ |
237 | if (points <= 0) | 225 | return points > 0 ? points : 1; |
238 | return 1; | ||
239 | return (points < 1000) ? points : 1000; | ||
240 | } | 226 | } |
241 | 227 | ||
242 | /* | 228 | /* |
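The rescaled score is now absolute pages rather than a 0-1000 ratio: rss plus page-table pages plus swap entries, with oom_score_adj normalized so that each adj point is worth totalpages/1000 pages. A toy, userspace-runnable model of that arithmetic with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		/* illustrative numbers: 4 GiB of RAM in 4 KiB pages */
		long totalpages = 1048576;
		long rss = 200000, nr_ptes = 500, swapents = 10000;
		long oom_score_adj = 100;	/* from /proc/<pid>/oom_score_adj */

		long points = rss + nr_ptes + swapents;
		long adj = oom_score_adj * (totalpages / 1000);

		points += adj;
		printf("badness = %ld\n", points > 0 ? points : 1);
		return 0;
	}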
@@ -302,99 +288,116 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
302 | } | 288 | } |
303 | #endif | 289 | #endif |
304 | 290 | ||
291 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
292 | unsigned long totalpages, const nodemask_t *nodemask, | ||
293 | bool force_kill) | ||
294 | { | ||
295 | if (task->exit_state) | ||
296 | return OOM_SCAN_CONTINUE; | ||
297 | if (oom_unkillable_task(task, NULL, nodemask)) | ||
298 | return OOM_SCAN_CONTINUE; | ||
299 | |||
300 | /* | ||
301 | * This task already has access to memory reserves and is being killed. | ||
302 | * Don't allow any other task to have access to the reserves. | ||
303 | */ | ||
304 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | ||
305 | if (unlikely(frozen(task))) | ||
306 | __thaw_task(task); | ||
307 | if (!force_kill) | ||
308 | return OOM_SCAN_ABORT; | ||
309 | } | ||
310 | if (!task->mm) | ||
311 | return OOM_SCAN_CONTINUE; | ||
312 | |||
313 | if (task->flags & PF_EXITING) { | ||
314 | /* | ||
315 | * If task is current and is in the process of releasing memory, | ||
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | ||
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | ||
322 | if (task == current) | ||
323 | return OOM_SCAN_SELECT; | ||
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | ||
334 | return OOM_SCAN_OK; | ||
335 | } | ||
336 | |||
305 | /* | 337 | /* |
306 | * Simple selection loop. We chose the process with the highest | 338 | * Simple selection loop. We chose the process with the highest |
307 | * number of 'points'. We expect the caller will lock the tasklist. | 339 | * number of 'points'. |
308 | * | 340 | * |
309 | * (not docbooked, we don't want this one cluttering up the manual) | 341 | * (not docbooked, we don't want this one cluttering up the manual) |
310 | */ | 342 | */ |
311 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 343 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
312 | unsigned long totalpages, struct mem_cgroup *memcg, | 344 | unsigned long totalpages, const nodemask_t *nodemask, |
313 | const nodemask_t *nodemask, bool force_kill) | 345 | bool force_kill) |
314 | { | 346 | { |
315 | struct task_struct *g, *p; | 347 | struct task_struct *g, *p; |
316 | struct task_struct *chosen = NULL; | 348 | struct task_struct *chosen = NULL; |
317 | *ppoints = 0; | 349 | unsigned long chosen_points = 0; |
318 | 350 | ||
351 | rcu_read_lock(); | ||
319 | do_each_thread(g, p) { | 352 | do_each_thread(g, p) { |
320 | unsigned int points; | 353 | unsigned int points; |
321 | 354 | ||
322 | if (p->exit_state) | 355 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
323 | continue; | 356 | force_kill)) { |
324 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | case OOM_SCAN_SELECT: |
325 | continue; | 358 | chosen = p; |
326 | 359 | chosen_points = ULONG_MAX; | |
327 | /* | 360 | /* fall through */ |
328 | * This task already has access to memory reserves and is | 361 | case OOM_SCAN_CONTINUE: |
329 | * being killed. Don't allow any other task access to the | ||
330 | * memory reserve. | ||
331 | * | ||
332 | * Note: this may have a chance of deadlock if it gets | ||
333 | * blocked waiting for another task which itself is waiting | ||
334 | * for memory. Is there a better alternative? | ||
335 | */ | ||
336 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | ||
337 | if (unlikely(frozen(p))) | ||
338 | __thaw_task(p); | ||
339 | if (!force_kill) | ||
340 | return ERR_PTR(-1UL); | ||
341 | } | ||
342 | if (!p->mm) | ||
343 | continue; | 362 | continue; |
344 | 363 | case OOM_SCAN_ABORT: | |
345 | if (p->flags & PF_EXITING) { | 364 | rcu_read_unlock(); |
346 | /* | 365 | return ERR_PTR(-1UL); |
347 | * If p is the current task and is in the process of | 366 | case OOM_SCAN_OK: |
348 | * releasing memory, we allow the "kill" to set | 367 | break; |
349 | * TIF_MEMDIE, which will allow it to gain access to | 368 | }; |
350 | * memory reserves. Otherwise, it may stall forever. | 369 | points = oom_badness(p, NULL, nodemask, totalpages); |
351 | * | 370 | if (points > chosen_points) { |
352 | * The loop isn't broken here, however, in case other | ||
353 | * threads are found to have already been oom killed. | ||
354 | */ | ||
355 | if (p == current) { | ||
356 | chosen = p; | ||
357 | *ppoints = 1000; | ||
358 | } else if (!force_kill) { | ||
359 | /* | ||
360 | * If this task is not being ptraced on exit, | ||
361 | * then wait for it to finish before killing | ||
362 | * some other task unnecessarily. | ||
363 | */ | ||
364 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) | ||
365 | return ERR_PTR(-1UL); | ||
366 | } | ||
367 | } | ||
368 | |||
369 | points = oom_badness(p, memcg, nodemask, totalpages); | ||
370 | if (points > *ppoints) { | ||
371 | chosen = p; | 371 | chosen = p; |
372 | *ppoints = points; | 372 | chosen_points = points; |
373 | } | 373 | } |
374 | } while_each_thread(g, p); | 374 | } while_each_thread(g, p); |
375 | if (chosen) | ||
376 | get_task_struct(chosen); | ||
377 | rcu_read_unlock(); | ||
375 | 378 | ||
379 | *ppoints = chosen_points * 1000 / totalpages; | ||
376 | return chosen; | 380 | return chosen; |
377 | } | 381 | } |
378 | 382 | ||
379 | /** | 383 | /** |
380 | * dump_tasks - dump current memory state of all system tasks | 384 | * dump_tasks - dump current memory state of all system tasks |
381 | * @mem: current's memory controller, if constrained | 385 | * @memcg: current's memory controller, if constrained |
382 | * @nodemask: nodemask passed to page allocator for mempolicy ooms | 386 | * @nodemask: nodemask passed to page allocator for mempolicy ooms |
383 | * | 387 | * |
384 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 388 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
385 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 389 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
386 | * are not shown. | 390 | * are not shown. |
387 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 391 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
388 | * value, oom_score_adj value, and name. | 392 | * swapents, oom_score_adj value, and name. |
389 | * | ||
390 | * Call with tasklist_lock read-locked. | ||
391 | */ | 393 | */ |
392 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 394 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
393 | { | 395 | { |
394 | struct task_struct *p; | 396 | struct task_struct *p; |
395 | struct task_struct *task; | 397 | struct task_struct *task; |
396 | 398 | ||
397 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 399 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); |
400 | rcu_read_lock(); | ||
398 | for_each_process(p) { | 401 | for_each_process(p) { |
399 | if (oom_unkillable_task(p, memcg, nodemask)) | 402 | if (oom_unkillable_task(p, memcg, nodemask)) |
400 | continue; | 403 | continue; |
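select_bad_process() now drives its loop from an oom_scan_t verdict per thread rather than open-coded checks, and takes a reference on the chosen task before dropping the RCU read lock. The enum itself is declared outside this diff; a sketch of the four verdicts as they are used above:

	enum oom_scan_t {
		OOM_SCAN_OK,		/* scan the thread and compute its badness */
		OOM_SCAN_CONTINUE,	/* skip this thread, keep iterating */
		OOM_SCAN_ABORT,		/* stop scanning entirely (a kill is in progress) */
		OOM_SCAN_SELECT,	/* pick this thread outright (ULONG_MAX points) */
	};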
@@ -409,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
409 | continue; | 412 | continue; |
410 | } | 413 | } |
411 | 414 | ||
412 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", |
413 | task->pid, task_uid(task), task->tgid, | 416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
414 | task->mm->total_vm, get_mm_rss(task->mm), | 417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
415 | task_cpu(task), task->signal->oom_adj, | 418 | task->mm->nr_ptes, |
419 | get_mm_counter(task->mm, MM_SWAPENTS), | ||
416 | task->signal->oom_score_adj, task->comm); | 420 | task->signal->oom_score_adj, task->comm); |
417 | task_unlock(task); | 421 | task_unlock(task); |
418 | } | 422 | } |
423 | rcu_read_unlock(); | ||
419 | } | 424 | } |
420 | 425 | ||
421 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 426 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
@@ -436,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
436 | } | 441 | } |
437 | 442 | ||
438 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 443 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
439 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 444 | /* |
440 | unsigned int points, unsigned long totalpages, | 445 | * Must be called while holding a reference to p, which will be released upon |
441 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 446 | * returning. |
442 | const char *message) | 447 | */ |
448 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
449 | unsigned int points, unsigned long totalpages, | ||
450 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
451 | const char *message) | ||
443 | { | 452 | { |
444 | struct task_struct *victim = p; | 453 | struct task_struct *victim = p; |
445 | struct task_struct *child; | 454 | struct task_struct *child; |
@@ -455,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
455 | */ | 464 | */ |
456 | if (p->flags & PF_EXITING) { | 465 | if (p->flags & PF_EXITING) { |
457 | set_tsk_thread_flag(p, TIF_MEMDIE); | 466 | set_tsk_thread_flag(p, TIF_MEMDIE); |
467 | put_task_struct(p); | ||
458 | return; | 468 | return; |
459 | } | 469 | } |
460 | 470 | ||
@@ -472,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
472 | * parent. This attempts to lose the minimal amount of work done while | 482 | * parent. This attempts to lose the minimal amount of work done while |
473 | * still freeing memory. | 483 | * still freeing memory. |
474 | */ | 484 | */ |
485 | read_lock(&tasklist_lock); | ||
475 | do { | 486 | do { |
476 | list_for_each_entry(child, &t->children, sibling) { | 487 | list_for_each_entry(child, &t->children, sibling) { |
477 | unsigned int child_points; | 488 | unsigned int child_points; |
@@ -484,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
484 | child_points = oom_badness(child, memcg, nodemask, | 495 | child_points = oom_badness(child, memcg, nodemask, |
485 | totalpages); | 496 | totalpages); |
486 | if (child_points > victim_points) { | 497 | if (child_points > victim_points) { |
498 | put_task_struct(victim); | ||
487 | victim = child; | 499 | victim = child; |
488 | victim_points = child_points; | 500 | victim_points = child_points; |
501 | get_task_struct(victim); | ||
489 | } | 502 | } |
490 | } | 503 | } |
491 | } while_each_thread(p, t); | 504 | } while_each_thread(p, t); |
505 | read_unlock(&tasklist_lock); | ||
492 | 506 | ||
493 | victim = find_lock_task_mm(victim); | 507 | rcu_read_lock(); |
494 | if (!victim) | 508 | p = find_lock_task_mm(victim); |
509 | if (!p) { | ||
510 | rcu_read_unlock(); | ||
511 | put_task_struct(victim); | ||
495 | return; | 512 | return; |
513 | } else if (victim != p) { | ||
514 | get_task_struct(p); | ||
515 | put_task_struct(victim); | ||
516 | victim = p; | ||
517 | } | ||
496 | 518 | ||
497 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 519 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
498 | mm = victim->mm; | 520 | mm = victim->mm; |
@@ -523,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
523 | task_unlock(p); | 545 | task_unlock(p); |
524 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 546 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
525 | } | 547 | } |
548 | rcu_read_unlock(); | ||
526 | 549 | ||
527 | set_tsk_thread_flag(victim, TIF_MEMDIE); | 550 | set_tsk_thread_flag(victim, TIF_MEMDIE); |
528 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 551 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
552 | put_task_struct(victim); | ||
529 | } | 553 | } |
530 | #undef K | 554 | #undef K |
531 | 555 | ||
532 | /* | 556 | /* |
533 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 557 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
534 | */ | 558 | */ |
535 | static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 559 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
536 | int order, const nodemask_t *nodemask) | 560 | int order, const nodemask_t *nodemask) |
537 | { | 561 | { |
538 | if (likely(!sysctl_panic_on_oom)) | 562 | if (likely(!sysctl_panic_on_oom)) |
539 | return; | 563 | return; |
@@ -546,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
546 | if (constraint != CONSTRAINT_NONE) | 570 | if (constraint != CONSTRAINT_NONE) |
547 | return; | 571 | return; |
548 | } | 572 | } |
549 | read_lock(&tasklist_lock); | ||
550 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 573 | dump_header(NULL, gfp_mask, order, NULL, nodemask); |
551 | read_unlock(&tasklist_lock); | ||
552 | panic("Out of memory: %s panic_on_oom is enabled\n", | 574 | panic("Out of memory: %s panic_on_oom is enabled\n", |
553 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 575 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
554 | } | 576 | } |
555 | 577 | ||
556 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
557 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
558 | int order) | ||
559 | { | ||
560 | unsigned long limit; | ||
561 | unsigned int points = 0; | ||
562 | struct task_struct *p; | ||
563 | |||
564 | /* | ||
565 | * If current has a pending SIGKILL, then automatically select it. The | ||
566 | * goal is to allow it to allocate so that it may quickly exit and free | ||
567 | * its memory. | ||
568 | */ | ||
569 | if (fatal_signal_pending(current)) { | ||
570 | set_thread_flag(TIF_MEMDIE); | ||
571 | return; | ||
572 | } | ||
573 | |||
574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | ||
576 | read_lock(&tasklist_lock); | ||
577 | p = select_bad_process(&points, limit, memcg, NULL, false); | ||
578 | if (p && PTR_ERR(p) != -1UL) | ||
579 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, | ||
580 | "Memory cgroup out of memory"); | ||
581 | read_unlock(&tasklist_lock); | ||
582 | } | ||
583 | #endif | ||
584 | |||
585 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 578 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
586 | 579 | ||
587 | int register_oom_notifier(struct notifier_block *nb) | 580 | int register_oom_notifier(struct notifier_block *nb) |
@@ -703,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
703 | struct task_struct *p; | 696 | struct task_struct *p; |
704 | unsigned long totalpages; | 697 | unsigned long totalpages; |
705 | unsigned long freed = 0; | 698 | unsigned long freed = 0; |
706 | unsigned int points; | 699 | unsigned int uninitialized_var(points); |
707 | enum oom_constraint constraint = CONSTRAINT_NONE; | 700 | enum oom_constraint constraint = CONSTRAINT_NONE; |
708 | int killed = 0; | 701 | int killed = 0; |
709 | 702 | ||
@@ -731,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
731 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 724 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
732 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 725 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); |
733 | 726 | ||
734 | read_lock(&tasklist_lock); | 727 | if (sysctl_oom_kill_allocating_task && current->mm && |
735 | if (sysctl_oom_kill_allocating_task && | ||
736 | !oom_unkillable_task(current, NULL, nodemask) && | 728 | !oom_unkillable_task(current, NULL, nodemask) && |
737 | current->mm) { | 729 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
730 | get_task_struct(current); | ||
738 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 731 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
739 | nodemask, | 732 | nodemask, |
740 | "Out of memory (oom_kill_allocating_task)"); | 733 | "Out of memory (oom_kill_allocating_task)"); |
741 | goto out; | 734 | goto out; |
742 | } | 735 | } |
743 | 736 | ||
744 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | 737 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); |
745 | force_kill); | ||
746 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 738 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
747 | if (!p) { | 739 | if (!p) { |
748 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 740 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
749 | read_unlock(&tasklist_lock); | ||
750 | panic("Out of memory and no killable processes...\n"); | 741 | panic("Out of memory and no killable processes...\n"); |
751 | } | 742 | } |
752 | if (PTR_ERR(p) != -1UL) { | 743 | if (PTR_ERR(p) != -1UL) { |
@@ -755,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
755 | killed = 1; | 746 | killed = 1; |
756 | } | 747 | } |
757 | out: | 748 | out: |
758 | read_unlock(&tasklist_lock); | ||
759 | |||
760 | /* | 749 | /* |
761 | * Give "p" a good chance of killing itself before we | 750 | * Give the killed threads a good chance of exiting before trying to |
762 | * retry to allocate memory unless "p" is current | 751 | * allocate memory again. |
763 | */ | 752 | */ |
764 | if (killed && !test_thread_flag(TIF_MEMDIE)) | 753 | if (killed) |
765 | schedule_timeout_uninterruptible(1); | 754 | schedule_timeout_killable(1); |
766 | } | 755 | } |
767 | 756 | ||
768 | /* | 757 | /* |
@@ -777,6 +766,5 @@ void pagefault_out_of_memory(void) | |||
777 | out_of_memory(NULL, 0, 0, NULL, false); | 766 | out_of_memory(NULL, 0, 0, NULL, false); |
778 | clear_system_oom(); | 767 | clear_system_oom(); |
779 | } | 768 | } |
780 | if (!test_thread_flag(TIF_MEMDIE)) | 769 | schedule_timeout_killable(1); |
781 | schedule_timeout_uninterruptible(1); | ||
782 | } | 770 | } |
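Both out_of_memory() and pagefault_out_of_memory() now back off with schedule_timeout_killable(1) whenever a kill was issued, instead of skipping the sleep when the current task is itself marked TIF_MEMDIE. The shape of the resulting retry loop, modelled as a plain user-space program (the helper names and numbers are made up):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Pretend allocation that starts succeeding after some memory is freed. */
static int free_pages = 0;
static bool try_alloc(void) { return free_pages > 0; }
static void kill_biggest_consumer(void) { free_pages += 64; }

int main(void)
{
	struct timespec pause = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };
	int attempt;

	for (attempt = 0; attempt < 10; attempt++) {
		if (try_alloc()) {
			printf("allocation succeeded on attempt %d\n", attempt);
			return 0;
		}
		kill_biggest_consumer();
		/*
		 * Analogue of schedule_timeout_killable(1): step off the
		 * CPU briefly so the victim can actually run and exit,
		 * instead of immediately re-entering reclaim.
		 */
		nanosleep(&pause, NULL);
	}
	return 1;
}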
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8ca2e7..5ad5ce23c1e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ | 35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <linux/timer.h> | ||
37 | #include <trace/events/writeback.h> | 38 | #include <trace/events/writeback.h> |
38 | 39 | ||
39 | /* | 40 | /* |
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit; | |||
135 | * measured in page writeback completions. | 136 | * measured in page writeback completions. |
136 | * | 137 | * |
137 | */ | 138 | */ |
138 | static struct prop_descriptor vm_completions; | 139 | static struct fprop_global writeout_completions; |
140 | |||
141 | static void writeout_period(unsigned long t); | ||
142 | /* Timer for aging of writeout_completions */ | ||
143 | static struct timer_list writeout_period_timer = | ||
144 | TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); | ||
145 | static unsigned long writeout_period_time = 0; | ||
146 | |||
147 | /* | ||
148 | * Length of period for aging writeout fractions of bdis. This is an | ||
149 | * arbitrarily chosen number. The longer the period, the slower fractions will | ||
150 | * reflect changes in current writeout rate. | ||
151 | */ | ||
152 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | ||
139 | 153 | ||
140 | /* | 154 | /* |
141 | * Work out the current dirty-memory clamping and background writeout | 155 | * Work out the current dirty-memory clamping and background writeout |
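The per-BDI writeout fractions move from the old prop_descriptor, whose aging period was recomputed from the dirty limits, to an fprop_global whose fractions are aged purely by time: a deferrable timer fires every VM_COMPLETIONS_PERIOD_LEN and decays all completion counts. A toy model of that period-based decay (this is an illustration of the idea, not the flex-proportions library):

#include <stdio.h>

#define NDEV 2

static unsigned long events[NDEV];   /* per-device completions */
static unsigned long total;          /* global completions */

static void writeout_inc(int dev)
{
	events[dev]++;
	total++;
}

/* Age everything by one period: each period halves past history. */
static void new_period(int periods)
{
	int d;

	while (periods-- > 0) {
		for (d = 0; d < NDEV; d++)
			events[d] /= 2;
		total /= 2;
	}
}

static void show_fractions(void)
{
	int d;

	for (d = 0; d < NDEV; d++)
		printf("dev%d: %lu/%lu\n", d, events[d], total ? total : 1);
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		writeout_inc(0);          /* dev0 busy in period 1 */
	new_period(1);
	for (i = 0; i < 100; i++)
		writeout_inc(1);          /* dev1 busy in period 2 */
	show_fractions();                 /* the recent writer dominates */
	return 0;
}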
@@ -204,7 +218,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
204 | * Returns the global number of pages potentially available for dirty | 218 | * Returns the global number of pages potentially available for dirty |
205 | * page cache. This is the base value for the global dirty limits. | 219 | * page cache. This is the base value for the global dirty limits. |
206 | */ | 220 | */ |
207 | unsigned long global_dirtyable_memory(void) | 221 | static unsigned long global_dirtyable_memory(void) |
208 | { | 222 | { |
209 | unsigned long x; | 223 | unsigned long x; |
210 | 224 | ||
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone) | |||
322 | zone_page_state(zone, NR_WRITEBACK) <= limit; | 336 | zone_page_state(zone, NR_WRITEBACK) <= limit; |
323 | } | 337 | } |
324 | 338 | ||
325 | /* | ||
326 | * couple the period to the dirty_ratio: | ||
327 | * | ||
328 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
329 | */ | ||
330 | static int calc_period_shift(void) | ||
331 | { | ||
332 | unsigned long dirty_total; | ||
333 | |||
334 | if (vm_dirty_bytes) | ||
335 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
336 | else | ||
337 | dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / | ||
338 | 100; | ||
339 | return 2 + ilog2(dirty_total - 1); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * update the period when the dirty threshold changes. | ||
344 | */ | ||
345 | static void update_completion_period(void) | ||
346 | { | ||
347 | int shift = calc_period_shift(); | ||
348 | prop_change_shift(&vm_completions, shift); | ||
349 | |||
350 | writeback_set_ratelimit(); | ||
351 | } | ||
352 | |||
353 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 339 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
354 | void __user *buffer, size_t *lenp, | 340 | void __user *buffer, size_t *lenp, |
355 | loff_t *ppos) | 341 | loff_t *ppos) |
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
383 | 369 | ||
384 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 370 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
385 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 371 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
386 | update_completion_period(); | 372 | writeback_set_ratelimit(); |
387 | vm_dirty_bytes = 0; | 373 | vm_dirty_bytes = 0; |
388 | } | 374 | } |
389 | return ret; | 375 | return ret; |
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
398 | 384 | ||
399 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | 385 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
400 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 386 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
401 | update_completion_period(); | 387 | writeback_set_ratelimit(); |
402 | vm_dirty_ratio = 0; | 388 | vm_dirty_ratio = 0; |
403 | } | 389 | } |
404 | return ret; | 390 | return ret; |
405 | } | 391 | } |
406 | 392 | ||
393 | static unsigned long wp_next_time(unsigned long cur_time) | ||
394 | { | ||
395 | cur_time += VM_COMPLETIONS_PERIOD_LEN; | ||
396 | /* 0 has a special meaning... */ | ||
397 | if (!cur_time) | ||
398 | return 1; | ||
399 | return cur_time; | ||
400 | } | ||
401 | |||
407 | /* | 402 | /* |
408 | * Increment the BDI's writeout completion count and the global writeout | 403 | * Increment the BDI's writeout completion count and the global writeout |
409 | * completion count. Called from test_clear_page_writeback(). | 404 | * completion count. Called from test_clear_page_writeback(). |
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
411 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 406 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
412 | { | 407 | { |
413 | __inc_bdi_stat(bdi, BDI_WRITTEN); | 408 | __inc_bdi_stat(bdi, BDI_WRITTEN); |
414 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 409 | __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, |
415 | bdi->max_prop_frac); | 410 | bdi->max_prop_frac); |
411 | /* First event after period switching was turned off? */ | ||
412 | if (!unlikely(writeout_period_time)) { | ||
413 | /* | ||
414 | * We can race with other __bdi_writeout_inc calls here but | ||
415 | * it does not cause any harm since the resulting time when | ||
416 | * timer will fire and what is in writeout_period_time will be | ||
417 | * roughly the same. | ||
418 | */ | ||
419 | writeout_period_time = wp_next_time(jiffies); | ||
420 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
421 | } | ||
416 | } | 422 | } |
417 | 423 | ||
418 | void bdi_writeout_inc(struct backing_dev_info *bdi) | 424 | void bdi_writeout_inc(struct backing_dev_info *bdi) |
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); | |||
431 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 437 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
432 | long *numerator, long *denominator) | 438 | long *numerator, long *denominator) |
433 | { | 439 | { |
434 | prop_fraction_percpu(&vm_completions, &bdi->completions, | 440 | fprop_fraction_percpu(&writeout_completions, &bdi->completions, |
435 | numerator, denominator); | 441 | numerator, denominator); |
436 | } | 442 | } |
437 | 443 | ||
438 | /* | 444 | /* |
445 | * On idle system, we can be called long after we scheduled because we use | ||
446 | * deferred timers so count with missed periods. | ||
447 | */ | ||
448 | static void writeout_period(unsigned long t) | ||
449 | { | ||
450 | int miss_periods = (jiffies - writeout_period_time) / | ||
451 | VM_COMPLETIONS_PERIOD_LEN; | ||
452 | |||
453 | if (fprop_new_period(&writeout_completions, miss_periods + 1)) { | ||
454 | writeout_period_time = wp_next_time(writeout_period_time + | ||
455 | miss_periods * VM_COMPLETIONS_PERIOD_LEN); | ||
456 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
457 | } else { | ||
458 | /* | ||
459 | * Aging has zeroed all fractions. Stop wasting CPU on period | ||
460 | * updates. | ||
461 | */ | ||
462 | writeout_period_time = 0; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* | ||
439 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all | 467 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all |
440 | * registered backing devices, which, for obvious reasons, can not | 468 | * registered backing devices, which, for obvious reasons, can not |
441 | * exceed 100%. | 469 | * exceed 100%. |
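Because the timer is deferrable it can fire long after its nominal expiry on an idle system, so the callback above counts how many whole periods were missed, ages the fractions once per missed period plus one, and stops rearming itself entirely once aging has zeroed everything. The catch-up arithmetic in isolation (jiffy values below are invented):

#include <stdio.h>

#define HZ 100
#define PERIOD_LEN (3 * HZ)

/*
 * Given when the period nominally expired and the current time, work out
 * how many extra whole periods elapsed and when the next expiry should be.
 */
static unsigned long catch_up(unsigned long expired, unsigned long now,
			      int *missed)
{
	*missed = (now - expired) / PERIOD_LEN;
	return expired + (*missed + 1) * PERIOD_LEN;
}

int main(void)
{
	unsigned long expired = 1000;
	unsigned long now = expired + 7 * PERIOD_LEN + 42;  /* woke up late */
	int missed;
	unsigned long next = catch_up(expired, now, &missed);

	/* The callback would age the fractions (missed + 1) times here. */
	printf("missed %d periods, next expiry at %lu\n", missed, next);
	return 0;
}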
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | |||
475 | ret = -EINVAL; | 503 | ret = -EINVAL; |
476 | } else { | 504 | } else { |
477 | bdi->max_ratio = max_ratio; | 505 | bdi->max_ratio = max_ratio; |
478 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 506 | bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; |
479 | } | 507 | } |
480 | spin_unlock_bh(&bdi_lock); | 508 | spin_unlock_bh(&bdi_lock); |
481 | 509 | ||
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
918 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | 946 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; |
919 | * | 947 | * |
920 | * However to get a more stable dirty_ratelimit, the below elaborated | 948 | * However to get a more stable dirty_ratelimit, the below elaborated |
921 | * code makes use of task_ratelimit to filter out sigular points and | 949 | * code makes use of task_ratelimit to filter out singular points and |
922 | * limit the step size. | 950 | * limit the step size. |
923 | * | 951 | * |
924 | * The below code essentially only uses the relative value of | 952 | * The below code essentially only uses the relative value of |
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
941 | * feel and care are stable dirty rate and small position error. | 969 | * feel and care are stable dirty rate and small position error. |
942 | * | 970 | * |
943 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | 971 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size |
944 | * and filter out the sigular points of balanced_dirty_ratelimit. Which | 972 | * and filter out the singular points of balanced_dirty_ratelimit. Which |
945 | * keeps jumping around randomly and can even leap far away at times | 973 | * keeps jumping around randomly and can even leap far away at times |
946 | * due to the small 200ms estimation period of dirty_rate (we want to | 974 | * due to the small 200ms estimation period of dirty_rate (we want to |
947 | * keep that period small to reduce time lags). | 975 | * keep that period small to reduce time lags). |
@@ -1504,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, | |||
1504 | void __user *buffer, size_t *length, loff_t *ppos) | 1532 | void __user *buffer, size_t *length, loff_t *ppos) |
1505 | { | 1533 | { |
1506 | proc_dointvec(table, write, buffer, length, ppos); | 1534 | proc_dointvec(table, write, buffer, length, ppos); |
1507 | bdi_arm_supers_timer(); | ||
1508 | return 0; | 1535 | return 0; |
1509 | } | 1536 | } |
1510 | 1537 | ||
@@ -1568,6 +1595,7 @@ void writeback_set_ratelimit(void) | |||
1568 | unsigned long background_thresh; | 1595 | unsigned long background_thresh; |
1569 | unsigned long dirty_thresh; | 1596 | unsigned long dirty_thresh; |
1570 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1597 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1598 | global_dirty_limit = dirty_thresh; | ||
1571 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); | 1599 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); |
1572 | if (ratelimit_pages < 16) | 1600 | if (ratelimit_pages < 16) |
1573 | ratelimit_pages = 16; | 1601 | ratelimit_pages = 16; |
@@ -1605,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
1605 | */ | 1633 | */ |
1606 | void __init page_writeback_init(void) | 1634 | void __init page_writeback_init(void) |
1607 | { | 1635 | { |
1608 | int shift; | ||
1609 | |||
1610 | writeback_set_ratelimit(); | 1636 | writeback_set_ratelimit(); |
1611 | register_cpu_notifier(&ratelimit_nb); | 1637 | register_cpu_notifier(&ratelimit_nb); |
1612 | 1638 | ||
1613 | shift = calc_period_shift(); | 1639 | fprop_global_init(&writeout_completions); |
1614 | prop_descriptor_init(&vm_completions, shift); | ||
1615 | } | 1640 | } |
1616 | 1641 | ||
1617 | /** | 1642 | /** |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a712fb9e04ce..c66fb875104a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -51,12 +51,12 @@ | |||
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | ||
55 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 57 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 58 | #include <linux/prefetch.h> |
59 | #include <linux/migrate.h> | ||
60 | #include <linux/page-debug-flags.h> | 60 | #include <linux/page-debug-flags.h> |
61 | 61 | ||
62 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
@@ -218,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
218 | 218 | ||
219 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
220 | 220 | ||
221 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 221 | /* |
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | ||
222 | { | 227 | { |
223 | 228 | ||
224 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -513,10 +518,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
513 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's | 518 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
514 | * order is recorded in page_private(page) field. | 519 | * order is recorded in page_private(page) field. |
515 | * So when we are allocating or freeing one, we can derive the state of the | 520 | * So when we are allocating or freeing one, we can derive the state of the |
516 | * other. That is, if we allocate a small block, and both were | 521 | * other. That is, if we allocate a small block, and both were |
517 | * free, the remainder of the region must be split into blocks. | 522 | * free, the remainder of the region must be split into blocks. |
518 | * If a block is freed, and its buddy is also free, then this | 523 | * If a block is freed, and its buddy is also free, then this |
519 | * triggers coalescing into a block of larger size. | 524 | * triggers coalescing into a block of larger size. |
520 | * | 525 | * |
521 | * -- wli | 526 | * -- wli |
522 | */ | 527 | */ |
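The comment kept as context here describes the invariant the rest of the file relies on: a block's buddy is found by flipping a single bit of its index, which makes both the split in expand() and the merge in the free path cheap at every level. A standalone sketch of that index arithmetic (standard buddy-allocator math, not code taken from this file):

#include <stdio.h>

/* Buddy of the block starting at page index "idx" at the given order. */
static unsigned long buddy_index(unsigned long idx, unsigned int order)
{
	return idx ^ (1UL << order);
}

/* After merging with its buddy, the combined block starts here. */
static unsigned long parent_index(unsigned long idx, unsigned int order)
{
	return idx & ~(1UL << order);
}

int main(void)
{
	unsigned long idx = 8;           /* block of 2^order pages at index 8 */
	unsigned int order = 3;

	printf("buddy of %lu at order %u is %lu\n",
	       idx, order, buddy_index(idx, order));          /* -> 0 */
	printf("merged block starts at %lu at order %u\n",
	       parent_index(idx, order), order + 1);           /* -> 0 */
	return 0;
}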
@@ -749,6 +754,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
749 | __free_pages(page, order); | 754 | __free_pages(page, order); |
750 | } | 755 | } |
751 | 756 | ||
757 | #ifdef CONFIG_CMA | ||
758 | /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ | ||
759 | void __init init_cma_reserved_pageblock(struct page *page) | ||
760 | { | ||
761 | unsigned i = pageblock_nr_pages; | ||
762 | struct page *p = page; | ||
763 | |||
764 | do { | ||
765 | __ClearPageReserved(p); | ||
766 | set_page_count(p, 0); | ||
767 | } while (++p, --i); | ||
768 | |||
769 | set_page_refcounted(page); | ||
770 | set_pageblock_migratetype(page, MIGRATE_CMA); | ||
771 | __free_pages(page, pageblock_order); | ||
772 | totalram_pages += pageblock_nr_pages; | ||
773 | } | ||
774 | #endif | ||
752 | 775 | ||
753 | /* | 776 | /* |
754 | * The order of subdivision here is critical for the IO subsystem. | 777 | * The order of subdivision here is critical for the IO subsystem. |
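init_cma_reserved_pageblock() converts a block that bootmem had marked reserved into ordinary allocator pages of type MIGRATE_CMA: every page loses PG_reserved and its count, the head page gets one reference, and the whole block is freed as a single pageblock_order allocation. A compressed user-space model of that walk-and-hand-over step (types and sizes are illustrative):

#include <stdio.h>
#include <stdbool.h>

#define BLOCK_PAGES 8                /* stand-in for pageblock_nr_pages */

struct fake_page {
	bool reserved;
	int count;
};

/* Prepare a reserved block and hand it over as one high-order chunk. */
static void release_reserved_block(struct fake_page *block)
{
	unsigned int i = BLOCK_PAGES;
	struct fake_page *p = block;

	do {                             /* same shape as the hunk's loop */
		p->reserved = false;
		p->count = 0;
	} while (++p, --i);

	block[0].count = 1;              /* head page carries the reference */
	printf("freed %d pages as a single order-3 block\n", BLOCK_PAGES);
}

int main(void)
{
	struct fake_page blk[BLOCK_PAGES];
	unsigned int i;

	for (i = 0; i < BLOCK_PAGES; i++) {
		blk[i].reserved = true;  /* as if marked reserved at boot */
		blk[i].count = 1;
	}
	release_reserved_block(blk);
	return 0;
}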
@@ -874,11 +897,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
874 | * This array describes the order lists are fallen back to when | 897 | * This array describes the order lists are fallen back to when |
875 | * the free lists for the desirable migrate type are depleted | 898 | * the free lists for the desirable migrate type are depleted |
876 | */ | 899 | */ |
877 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | 900 | static int fallbacks[MIGRATE_TYPES][4] = { |
878 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 901 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
879 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 902 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
880 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 903 | #ifdef CONFIG_CMA |
881 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | 904 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
905 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | ||
906 | #else | ||
907 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
908 | #endif | ||
909 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | ||
910 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | ||
882 | }; | 911 | }; |
883 | 912 | ||
884 | /* | 913 | /* |
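The fallback table changes shape: rows are now terminated by MIGRATE_RESERVE rather than being a fixed MIGRATE_TYPES-1 entries wide, which is what allows the CMA-aware MOVABLE row to be longer and the CMA row to contain nothing but the sentinel. A small model of walking such a sentinel-terminated preference list (enum values are illustrative):

#include <stdio.h>

enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, RESERVE, ISOLATE, NR_MT };

static const char *name[NR_MT] = {
	"unmovable", "reclaimable", "movable", "cma", "reserve", "isolate"
};

/* Per-type fallback preferences, each row ended by the RESERVE sentinel. */
static const enum mt fallbacks[NR_MT][4] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
	[RECLAIMABLE] = { UNMOVABLE, MOVABLE, RESERVE },
	[MOVABLE]     = { CMA, RECLAIMABLE, UNMOVABLE, RESERVE },
	[CMA]         = { RESERVE },
	[RESERVE]     = { RESERVE },
	[ISOLATE]     = { RESERVE },
};

static void show_fallbacks(enum mt start)
{
	int i;

	printf("%s falls back to:", name[start]);
	for (i = 0; fallbacks[start][i] != RESERVE; i++)
		printf(" %s", name[fallbacks[start][i]]);
	printf("\n");
}

int main(void)
{
	show_fallbacks(MOVABLE);
	show_fallbacks(UNMOVABLE);
	return 0;
}

The loop mirrors the change in __rmqueue_fallback() below, which now breaks on the sentinel instead of skipping it for a fixed number of iterations.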
@@ -929,7 +958,7 @@ static int move_freepages(struct zone *zone, | |||
929 | return pages_moved; | 958 | return pages_moved; |
930 | } | 959 | } |
931 | 960 | ||
932 | static int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
933 | int migratetype) | 962 | int migratetype) |
934 | { | 963 | { |
935 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
@@ -973,12 +1002,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
973 | /* Find the largest possible block of pages in the other list */ | 1002 | /* Find the largest possible block of pages in the other list */ |
974 | for (current_order = MAX_ORDER-1; current_order >= order; | 1003 | for (current_order = MAX_ORDER-1; current_order >= order; |
975 | --current_order) { | 1004 | --current_order) { |
976 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | 1005 | for (i = 0;; i++) { |
977 | migratetype = fallbacks[start_migratetype][i]; | 1006 | migratetype = fallbacks[start_migratetype][i]; |
978 | 1007 | ||
979 | /* MIGRATE_RESERVE handled later if necessary */ | 1008 | /* MIGRATE_RESERVE handled later if necessary */ |
980 | if (migratetype == MIGRATE_RESERVE) | 1009 | if (migratetype == MIGRATE_RESERVE) |
981 | continue; | 1010 | break; |
982 | 1011 | ||
983 | area = &(zone->free_area[current_order]); | 1012 | area = &(zone->free_area[current_order]); |
984 | if (list_empty(&area->free_list[migratetype])) | 1013 | if (list_empty(&area->free_list[migratetype])) |
@@ -993,11 +1022,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
993 | * pages to the preferred allocation list. If falling | 1022 | * pages to the preferred allocation list. If falling |
994 | * back for a reclaimable kernel allocation, be more | 1023 | * back for a reclaimable kernel allocation, be more |
995 | * aggressive about taking ownership of free pages | 1024 | * aggressive about taking ownership of free pages |
1025 | * | ||
1026 | * On the other hand, never change migration | ||
1027 | * type of MIGRATE_CMA pageblocks nor move CMA | ||
1028 | * pages on different free lists. We don't | ||
1029 | * want unmovable pages to be allocated from | ||
1030 | * MIGRATE_CMA areas. | ||
996 | */ | 1031 | */ |
997 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 1032 | if (!is_migrate_cma(migratetype) && |
998 | start_migratetype == MIGRATE_RECLAIMABLE || | 1033 | (unlikely(current_order >= pageblock_order / 2) || |
999 | page_group_by_mobility_disabled) { | 1034 | start_migratetype == MIGRATE_RECLAIMABLE || |
1000 | unsigned long pages; | 1035 | page_group_by_mobility_disabled)) { |
1036 | int pages; | ||
1001 | pages = move_freepages_block(zone, page, | 1037 | pages = move_freepages_block(zone, page, |
1002 | start_migratetype); | 1038 | start_migratetype); |
1003 | 1039 | ||
@@ -1015,11 +1051,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1015 | rmv_page_order(page); | 1051 | rmv_page_order(page); |
1016 | 1052 | ||
1017 | /* Take ownership for orders >= pageblock_order */ | 1053 | /* Take ownership for orders >= pageblock_order */ |
1018 | if (current_order >= pageblock_order) | 1054 | if (current_order >= pageblock_order && |
1055 | !is_migrate_cma(migratetype)) | ||
1019 | change_pageblock_range(page, current_order, | 1056 | change_pageblock_range(page, current_order, |
1020 | start_migratetype); | 1057 | start_migratetype); |
1021 | 1058 | ||
1022 | expand(zone, page, order, current_order, area, migratetype); | 1059 | expand(zone, page, order, current_order, area, |
1060 | is_migrate_cma(migratetype) | ||
1061 | ? migratetype : start_migratetype); | ||
1023 | 1062 | ||
1024 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1063 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1025 | start_migratetype, migratetype); | 1064 | start_migratetype, migratetype); |
@@ -1061,17 +1100,17 @@ retry_reserve: | |||
1061 | return page; | 1100 | return page; |
1062 | } | 1101 | } |
1063 | 1102 | ||
1064 | /* | 1103 | /* |
1065 | * Obtain a specified number of elements from the buddy allocator, all under | 1104 | * Obtain a specified number of elements from the buddy allocator, all under |
1066 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 1105 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
1067 | * Returns the number of new pages which were placed at *list. | 1106 | * Returns the number of new pages which were placed at *list. |
1068 | */ | 1107 | */ |
1069 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1108 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1070 | unsigned long count, struct list_head *list, | 1109 | unsigned long count, struct list_head *list, |
1071 | int migratetype, int cold) | 1110 | int migratetype, int cold) |
1072 | { | 1111 | { |
1073 | int i; | 1112 | int mt = migratetype, i; |
1074 | 1113 | ||
1075 | spin_lock(&zone->lock); | 1114 | spin_lock(&zone->lock); |
1076 | for (i = 0; i < count; ++i) { | 1115 | for (i = 0; i < count; ++i) { |
1077 | struct page *page = __rmqueue(zone, order, migratetype); | 1116 | struct page *page = __rmqueue(zone, order, migratetype); |
@@ -1091,7 +1130,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1091 | list_add(&page->lru, list); | 1130 | list_add(&page->lru, list); |
1092 | else | 1131 | else |
1093 | list_add_tail(&page->lru, list); | 1132 | list_add_tail(&page->lru, list); |
1094 | set_page_private(page, migratetype); | 1133 | if (IS_ENABLED(CONFIG_CMA)) { |
1134 | mt = get_pageblock_migratetype(page); | ||
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | ||
1136 | mt = migratetype; | ||
1137 | } | ||
1138 | set_page_private(page, mt); | ||
1095 | list = &page->lru; | 1139 | list = &page->lru; |
1096 | } | 1140 | } |
1097 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
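rmqueue_bulk() now records a migratetype on each page it moves to the per-cpu list: the pageblock's own type is kept when it is CMA or ISOLATE, otherwise the requested type is used, so a later flush of the pcp list returns the page to the free list it really belongs to. The tag-selection rule in isolation (assuming CONFIG_CMA, with invented enum values):

#include <stdio.h>
#include <stdbool.h>

enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA, RESERVE, ISOLATE };

static bool is_cma(enum mt t) { return t == CMA; }

/*
 * Decide what migratetype to record on the page: keep the pageblock's
 * type only if it is CMA or ISOLATE, otherwise record the type the
 * caller asked to allocate from.
 */
static enum mt page_tag(enum mt requested, enum mt pageblock)
{
	if (is_cma(pageblock) || pageblock == ISOLATE)
		return pageblock;
	return requested;
}

int main(void)
{
	printf("%d\n", page_tag(MOVABLE, CMA));      /* stays CMA (3) */
	printf("%d\n", page_tag(MOVABLE, ISOLATE));  /* stays ISOLATE (5) */
	printf("%d\n", page_tag(MOVABLE, MOVABLE));  /* requested type (2) */
	return 0;
}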
@@ -1118,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1118 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1119 | else | 1163 | else |
1120 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1121 | free_pcppages_bulk(zone, to_drain, pcp); | 1165 | if (to_drain > 0) { |
1122 | pcp->count -= to_drain; | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | ||
1168 | } | ||
1123 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1124 | } | 1170 | } |
1125 | #endif | 1171 | #endif |
@@ -1371,8 +1417,12 @@ int split_free_page(struct page *page) | |||
1371 | 1417 | ||
1372 | if (order >= pageblock_order - 1) { | 1418 | if (order >= pageblock_order - 1) { |
1373 | struct page *endpage = page + (1 << order) - 1; | 1419 | struct page *endpage = page + (1 << order) - 1; |
1374 | for (; page < endpage; page += pageblock_nr_pages) | 1420 | for (; page < endpage; page += pageblock_nr_pages) { |
1375 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1421 | int mt = get_pageblock_migratetype(page); |
1422 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | ||
1423 | set_pageblock_migratetype(page, | ||
1424 | MIGRATE_MOVABLE); | ||
1425 | } | ||
1376 | } | 1426 | } |
1377 | 1427 | ||
1378 | return 1 << order; | 1428 | return 1 << order; |
@@ -1485,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) | |||
1485 | } | 1535 | } |
1486 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1487 | 1537 | ||
1488 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1489 | { | 1539 | { |
1490 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1491 | return 0; | 1541 | return false; |
1492 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1493 | return 0; | 1543 | return false; |
1494 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1495 | return 0; | 1545 | return false; |
1496 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1497 | return 0; | 1547 | return false; |
1498 | 1548 | ||
1499 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1500 | } | 1550 | } |
@@ -1534,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); | |||
1534 | 1584 | ||
1535 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1536 | 1586 | ||
1537 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1538 | { | 1588 | { |
1539 | return 0; | 1589 | return false; |
1540 | } | 1590 | } |
1541 | 1591 | ||
1542 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
@@ -1550,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1550 | { | 1600 | { |
1551 | /* free_pages my go negative - that's OK */ | 1601 | /* free_pages my go negative - that's OK */ |
1552 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1553 | int o; | 1604 | int o; |
1554 | 1605 | ||
1555 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
@@ -1558,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1558 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1559 | min -= min / 4; | 1610 | min -= min / 4; |
1560 | 1611 | ||
1561 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1612 | if (free_pages <= min + lowmem_reserve) |
1562 | return false; | 1613 | return false; |
1563 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1564 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
@@ -1573,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1573 | return true; | 1624 | return true; |
1574 | } | 1625 | } |
1575 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1629 | { | ||
1630 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1636 | { | ||
1637 | return 0; | ||
1638 | } | ||
1639 | #endif | ||
1640 | |||
1576 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1577 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1578 | { | 1643 | { |
@@ -1588,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1588 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1589 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1590 | 1655 | ||
1656 | /* | ||
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1659 | * sleep although it could do so. But this is more desirable for memory | ||
1660 | * hotplug than sleeping which can cause a livelock in the direct | ||
1661 | * reclaim path. | ||
1662 | */ | ||
1663 | free_pages -= nr_zone_isolate_freepages(z); | ||
1591 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1592 | free_pages); | 1665 | free_pages); |
1593 | } | 1666 | } |
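zone_watermark_ok_safe() now subtracts an estimate of the free pages sitting in isolated pageblocks before running the usual order-by-order watermark test, since those pages cannot satisfy the allocation anyway. A self-contained model of the arithmetic in __zone_watermark_ok() with that subtraction applied up front (field names simplified, numbers invented):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

struct fake_zone {
	long free_pages;
	long free_by_order[MAX_ORDER]; /* free blocks at each order */
	long lowmem_reserve;
	long isolated_free;            /* pages in isolated pageblocks */
};

static bool watermark_ok(struct fake_zone *z, int order, long mark,
			 bool alloc_high, bool alloc_harder)
{
	long min = mark;
	long free = z->free_pages - z->isolated_free;
	int o;

	free -= (1 << order) - 1;      /* the request itself, minus one page */
	if (alloc_high)
		min -= min / 2;
	if (alloc_harder)
		min -= min / 4;

	if (free <= min + z->lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
		/* Blocks smaller than the request cannot be used. */
		free -= z->free_by_order[o] << o;
		min >>= 1;
		if (free <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct fake_zone z = {
		.free_pages = 1024,
		.free_by_order = { 256, 128, 64, 32 },
		.lowmem_reserve = 32,
		.isolated_free = 512,
	};

	printf("order-2 ok: %d\n", watermark_ok(&z, 2, 128, false, false));
	z.isolated_free = 0;
	printf("order-2 ok without isolation: %d\n",
	       watermark_ok(&z, 2, 128, false, false));
	return 0;
}

With half of the free pages isolated the order-2 request fails the check; with isolation cleared the same zone passes.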
@@ -1855,6 +1928,17 @@ this_zone_full: | |||
1855 | zlc_active = 0; | 1928 | zlc_active = 0; |
1856 | goto zonelist_scan; | 1929 | goto zonelist_scan; |
1857 | } | 1930 | } |
1931 | |||
1932 | if (page) | ||
1933 | /* | ||
1934 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
1935 | * necessary to allocate the page. The expectation is | ||
1936 | * that the caller is taking steps that will free more | ||
1937 | * memory. The caller should avoid the page being used | ||
1938 | * for !PFMEMALLOC purposes. | ||
1939 | */ | ||
1940 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
1941 | |||
1858 | return page; | 1942 | return page; |
1859 | } | 1943 | } |
1860 | 1944 | ||
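Pages that could only be obtained because watermarks were ignored are tagged via page->pfmemalloc, so that consumers such as the network stack can treat them as emergency reserves rather than ordinary memory. A minimal model of setting and honouring such a flag (the flag value and helper names are invented):

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_NO_WATERMARKS 0x04     /* illustrative flag value */

struct fake_page {
	bool pfmemalloc;
};

/* Record whether the page was only obtainable by ignoring watermarks. */
static void mark_page(struct fake_page *page, int alloc_flags)
{
	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
}

/* A consumer (e.g. a socket buffer pool) treating such pages as reserved. */
static bool may_use_for_ordinary_work(const struct fake_page *page)
{
	return !page->pfmemalloc;
}

int main(void)
{
	struct fake_page p;

	mark_page(&p, ALLOC_NO_WATERMARKS);
	printf("ordinary use allowed: %d\n", may_use_for_ordinary_work(&p));
	mark_page(&p, 0);
	printf("ordinary use allowed: %d\n", may_use_for_ordinary_work(&p));
	return 0;
}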
@@ -2018,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2018 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2102 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2019 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2103 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2020 | int migratetype, bool sync_migration, | 2104 | int migratetype, bool sync_migration, |
2021 | bool *deferred_compaction, | 2105 | bool *contended_compaction, bool *deferred_compaction, |
2022 | unsigned long *did_some_progress) | 2106 | unsigned long *did_some_progress) |
2023 | { | 2107 | { |
2024 | struct page *page; | 2108 | struct page *page; |
@@ -2033,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2033 | 2117 | ||
2034 | current->flags |= PF_MEMALLOC; | 2118 | current->flags |= PF_MEMALLOC; |
2035 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2119 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2036 | nodemask, sync_migration); | 2120 | nodemask, sync_migration, |
2121 | contended_compaction); | ||
2037 | current->flags &= ~PF_MEMALLOC; | 2122 | current->flags &= ~PF_MEMALLOC; |
2038 | if (*did_some_progress != COMPACT_SKIPPED) { | 2123 | if (*did_some_progress != COMPACT_SKIPPED) { |
2039 | 2124 | ||
@@ -2043,8 +2128,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2043 | 2128 | ||
2044 | page = get_page_from_freelist(gfp_mask, nodemask, | 2129 | page = get_page_from_freelist(gfp_mask, nodemask, |
2045 | order, zonelist, high_zoneidx, | 2130 | order, zonelist, high_zoneidx, |
2046 | alloc_flags, preferred_zone, | 2131 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2047 | migratetype); | 2132 | preferred_zone, migratetype); |
2048 | if (page) { | 2133 | if (page) { |
2049 | preferred_zone->compact_considered = 0; | 2134 | preferred_zone->compact_considered = 0; |
2050 | preferred_zone->compact_defer_shift = 0; | 2135 | preferred_zone->compact_defer_shift = 0; |
@@ -2079,23 +2164,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2079 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2164 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2080 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2165 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2081 | int migratetype, bool sync_migration, | 2166 | int migratetype, bool sync_migration, |
2082 | bool *deferred_compaction, | 2167 | bool *contended_compaction, bool *deferred_compaction, |
2083 | unsigned long *did_some_progress) | 2168 | unsigned long *did_some_progress) |
2084 | { | 2169 | { |
2085 | return NULL; | 2170 | return NULL; |
2086 | } | 2171 | } |
2087 | #endif /* CONFIG_COMPACTION */ | 2172 | #endif /* CONFIG_COMPACTION */ |
2088 | 2173 | ||
2089 | /* The really slow allocator path where we enter direct reclaim */ | 2174 | /* Perform direct synchronous page reclaim */ |
2090 | static inline struct page * | 2175 | static int |
2091 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2176 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, |
2092 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2177 | nodemask_t *nodemask) |
2093 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2094 | int migratetype, unsigned long *did_some_progress) | ||
2095 | { | 2178 | { |
2096 | struct page *page = NULL; | ||
2097 | struct reclaim_state reclaim_state; | 2179 | struct reclaim_state reclaim_state; |
2098 | bool drained = false; | 2180 | int progress; |
2099 | 2181 | ||
2100 | cond_resched(); | 2182 | cond_resched(); |
2101 | 2183 | ||
@@ -2106,7 +2188,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2106 | reclaim_state.reclaimed_slab = 0; | 2188 | reclaim_state.reclaimed_slab = 0; |
2107 | current->reclaim_state = &reclaim_state; | 2189 | current->reclaim_state = &reclaim_state; |
2108 | 2190 | ||
2109 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2191 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
2110 | 2192 | ||
2111 | current->reclaim_state = NULL; | 2193 | current->reclaim_state = NULL; |
2112 | lockdep_clear_current_reclaim_state(); | 2194 | lockdep_clear_current_reclaim_state(); |
@@ -2114,6 +2196,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2114 | 2196 | ||
2115 | cond_resched(); | 2197 | cond_resched(); |
2116 | 2198 | ||
2199 | return progress; | ||
2200 | } | ||
2201 | |||
2202 | /* The really slow allocator path where we enter direct reclaim */ | ||
2203 | static inline struct page * | ||
2204 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
2205 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
2206 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
2207 | int migratetype, unsigned long *did_some_progress) | ||
2208 | { | ||
2209 | struct page *page = NULL; | ||
2210 | bool drained = false; | ||
2211 | |||
2212 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
2213 | nodemask); | ||
2117 | if (unlikely(!(*did_some_progress))) | 2214 | if (unlikely(!(*did_some_progress))) |
2118 | return NULL; | 2215 | return NULL; |
2119 | 2216 | ||
@@ -2124,8 +2221,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2124 | retry: | 2221 | retry: |
2125 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2222 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2126 | zonelist, high_zoneidx, | 2223 | zonelist, high_zoneidx, |
2127 | alloc_flags, preferred_zone, | 2224 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2128 | migratetype); | 2225 | preferred_zone, migratetype); |
2129 | 2226 | ||
2130 | /* | 2227 | /* |
2131 | * If an allocation failed after direct reclaim, it could be because | 2228 | * If an allocation failed after direct reclaim, it could be because |
@@ -2209,15 +2306,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2209 | alloc_flags |= ALLOC_HARDER; | 2306 | alloc_flags |= ALLOC_HARDER; |
2210 | 2307 | ||
2211 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2308 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2212 | if (!in_interrupt() && | 2309 | if (gfp_mask & __GFP_MEMALLOC) |
2213 | ((current->flags & PF_MEMALLOC) || | 2310 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2214 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2311 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2312 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2313 | else if (!in_interrupt() && | ||
2314 | ((current->flags & PF_MEMALLOC) || | ||
2315 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2215 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2316 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2216 | } | 2317 | } |
2217 | 2318 | ||
2218 | return alloc_flags; | 2319 | return alloc_flags; |
2219 | } | 2320 | } |
2220 | 2321 | ||
2322 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
2323 | { | ||
2324 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | ||
2325 | } | ||
2326 | |||
2221 | static inline struct page * | 2327 | static inline struct page * |
2222 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2328 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2223 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2329 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
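gfp_to_alloc_flags() now grants ALLOC_NO_WATERMARKS to three classes of caller: anything passing __GFP_MEMALLOC, PF_MEMALLOC tasks running in softirq context, and the existing non-interrupt PF_MEMALLOC or TIF_MEMDIE cases, with __GFP_NOMEMALLOC still vetoing all of them; gfp_pfmemalloc_allowed() simply exposes that verdict. The precedence of the checks, condensed into a standalone predicate (flag values and the context struct are invented):

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC 0x1
#define GFP_MEMALLOC   0x2

struct ctx {
	bool pf_memalloc;        /* current->flags & PF_MEMALLOC */
	bool memdie;             /* TIF_MEMDIE set */
	bool in_serving_softirq;
	bool in_interrupt;
};

/* Mirrors the order of the checks in the hunk above. */
static bool no_watermarks_allowed(unsigned int gfp, const struct ctx *c)
{
	if (gfp & GFP_NOMEMALLOC)
		return false;
	if (gfp & GFP_MEMALLOC)
		return true;
	/* softirq counts as interrupt context, hence this dedicated branch */
	if (c->in_serving_softirq && c->pf_memalloc)
		return true;
	if (!c->in_interrupt && (c->pf_memalloc || c->memdie))
		return true;
	return false;
}

int main(void)
{
	struct ctx softirq = { .pf_memalloc = true, .in_serving_softirq = true,
			       .in_interrupt = true };
	struct ctx plain = { 0 };

	printf("%d\n", no_watermarks_allowed(GFP_MEMALLOC, &plain));    /* 1 */
	printf("%d\n", no_watermarks_allowed(0, &softirq));             /* 1 */
	printf("%d\n", no_watermarks_allowed(GFP_NOMEMALLOC, &softirq));/* 0 */
	return 0;
}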
@@ -2231,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2231 | unsigned long did_some_progress; | 2337 | unsigned long did_some_progress; |
2232 | bool sync_migration = false; | 2338 | bool sync_migration = false; |
2233 | bool deferred_compaction = false; | 2339 | bool deferred_compaction = false; |
2340 | bool contended_compaction = false; | ||
2234 | 2341 | ||
2235 | /* | 2342 | /* |
2236 | * In the slowpath, we sanity check order to avoid ever trying to | 2343 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2284,11 +2391,19 @@ rebalance: | |||
2284 | 2391 | ||
2285 | /* Allocate without watermarks if the context allows */ | 2392 | /* Allocate without watermarks if the context allows */ |
2286 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2393 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2394 | /* | ||
2395 | * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds | ||
2396 | * the allocation is high priority and these type of | ||
2397 | * allocations are system rather than user orientated | ||
2398 | */ | ||
2399 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | ||
2400 | |||
2287 | page = __alloc_pages_high_priority(gfp_mask, order, | 2401 | page = __alloc_pages_high_priority(gfp_mask, order, |
2288 | zonelist, high_zoneidx, nodemask, | 2402 | zonelist, high_zoneidx, nodemask, |
2289 | preferred_zone, migratetype); | 2403 | preferred_zone, migratetype); |
2290 | if (page) | 2404 | if (page) { |
2291 | goto got_pg; | 2405 | goto got_pg; |
2406 | } | ||
2292 | } | 2407 | } |
2293 | 2408 | ||
2294 | /* Atomic allocations - we can't balance anything */ | 2409 | /* Atomic allocations - we can't balance anything */ |
@@ -2312,6 +2427,7 @@ rebalance: | |||
2312 | nodemask, | 2427 | nodemask, |
2313 | alloc_flags, preferred_zone, | 2428 | alloc_flags, preferred_zone, |
2314 | migratetype, sync_migration, | 2429 | migratetype, sync_migration, |
2430 | &contended_compaction, | ||
2315 | &deferred_compaction, | 2431 | &deferred_compaction, |
2316 | &did_some_progress); | 2432 | &did_some_progress); |
2317 | if (page) | 2433 | if (page) |
@@ -2321,10 +2437,11 @@ rebalance: | |||
2321 | /* | 2437 | /* |
2322 | * If compaction is deferred for high-order allocations, it is because | 2438 | * If compaction is deferred for high-order allocations, it is because |
2323 | * sync compaction recently failed. In this is the case and the caller | 2439 | * sync compaction recently failed. In this is the case and the caller |
2324 | * has requested the system not be heavily disrupted, fail the | 2440 | * requested a movable allocation that does not heavily disrupt the |
2325 | * allocation now instead of entering direct reclaim | 2441 | * system then fail the allocation instead of entering direct reclaim. |
2326 | */ | 2442 | */ |
2327 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | 2443 | if ((deferred_compaction || contended_compaction) && |
2444 | (gfp_mask & __GFP_NO_KSWAPD)) | ||
2328 | goto nopage; | 2445 | goto nopage; |
2329 | 2446 | ||
2330 | /* Try direct reclaim and then allocating */ | 2447 | /* Try direct reclaim and then allocating */ |
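Direct compaction can now also report that it backed out because it contended on locks or needed to reschedule, and an allocation that asked not to wake kswapd treats that exactly like deferred compaction and fails fast rather than falling through to direct reclaim. The decision in isolation (a trivial sketch, names invented):

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether an allocation that asked not to disturb the system
 * (for example a THP fault using __GFP_NO_KSWAPD) should give up rather
 * than fall through to direct reclaim.
 */
static bool should_bail(bool deferred_compaction, bool contended_compaction,
			bool no_kswapd)
{
	return (deferred_compaction || contended_compaction) && no_kswapd;
}

int main(void)
{
	printf("%d\n", should_bail(false, true, true));   /* contended: bail */
	printf("%d\n", should_bail(false, true, false));  /* keep trying */
	return 0;
}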
@@ -2395,6 +2512,7 @@ rebalance: | |||
2395 | nodemask, | 2512 | nodemask, |
2396 | alloc_flags, preferred_zone, | 2513 | alloc_flags, preferred_zone, |
2397 | migratetype, sync_migration, | 2514 | migratetype, sync_migration, |
2515 | &contended_compaction, | ||
2398 | &deferred_compaction, | 2516 | &deferred_compaction, |
2399 | &did_some_progress); | 2517 | &did_some_progress); |
2400 | if (page) | 2518 | if (page) |
@@ -2407,8 +2525,8 @@ nopage: | |||
2407 | got_pg: | 2525 | got_pg: |
2408 | if (kmemcheck_enabled) | 2526 | if (kmemcheck_enabled) |
2409 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2527 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2410 | return page; | ||
2411 | 2528 | ||
2529 | return page; | ||
2412 | } | 2530 | } |
2413 | 2531 | ||
2414 | /* | 2532 | /* |
@@ -2974,7 +3092,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2974 | user_zonelist_order = oldval; | 3092 | user_zonelist_order = oldval; |
2975 | } else if (oldval != user_zonelist_order) { | 3093 | } else if (oldval != user_zonelist_order) { |
2976 | mutex_lock(&zonelists_mutex); | 3094 | mutex_lock(&zonelists_mutex); |
2977 | build_all_zonelists(NULL); | 3095 | build_all_zonelists(NULL, NULL); |
2978 | mutex_unlock(&zonelists_mutex); | 3096 | mutex_unlock(&zonelists_mutex); |
2979 | } | 3097 | } |
2980 | } | 3098 | } |
@@ -3353,14 +3471,21 @@ static void setup_zone_pageset(struct zone *zone); | |||
3353 | DEFINE_MUTEX(zonelists_mutex); | 3471 | DEFINE_MUTEX(zonelists_mutex); |
3354 | 3472 | ||
3355 | /* return values int ....just for stop_machine() */ | 3473 | /* return values int ....just for stop_machine() */ |
3356 | static __init_refok int __build_all_zonelists(void *data) | 3474 | static int __build_all_zonelists(void *data) |
3357 | { | 3475 | { |
3358 | int nid; | 3476 | int nid; |
3359 | int cpu; | 3477 | int cpu; |
3478 | pg_data_t *self = data; | ||
3360 | 3479 | ||
3361 | #ifdef CONFIG_NUMA | 3480 | #ifdef CONFIG_NUMA |
3362 | memset(node_load, 0, sizeof(node_load)); | 3481 | memset(node_load, 0, sizeof(node_load)); |
3363 | #endif | 3482 | #endif |
3483 | |||
3484 | if (self && !node_online(self->node_id)) { | ||
3485 | build_zonelists(self); | ||
3486 | build_zonelist_cache(self); | ||
3487 | } | ||
3488 | |||
3364 | for_each_online_node(nid) { | 3489 | for_each_online_node(nid) { |
3365 | pg_data_t *pgdat = NODE_DATA(nid); | 3490 | pg_data_t *pgdat = NODE_DATA(nid); |
3366 | 3491 | ||
@@ -3405,7 +3530,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3405 | * Called with zonelists_mutex held always | 3530 | * Called with zonelists_mutex held always |
3406 | * unless system_state == SYSTEM_BOOTING. | 3531 | * unless system_state == SYSTEM_BOOTING. |
3407 | */ | 3532 | */ |
3408 | void __ref build_all_zonelists(void *data) | 3533 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3409 | { | 3534 | { |
3410 | set_zonelist_order(); | 3535 | set_zonelist_order(); |
3411 | 3536 | ||
@@ -3417,10 +3542,10 @@ void __ref build_all_zonelists(void *data) | |||
3417 | /* we have to stop all cpus to guarantee there is no user | 3542 | /* we have to stop all cpus to guarantee there is no user |
3418 | of zonelist */ | 3543 | of zonelist */ |
3419 | #ifdef CONFIG_MEMORY_HOTPLUG | 3544 | #ifdef CONFIG_MEMORY_HOTPLUG |
3420 | if (data) | 3545 | if (zone) |
3421 | setup_zone_pageset((struct zone *)data); | 3546 | setup_zone_pageset(zone); |
3422 | #endif | 3547 | #endif |
3423 | stop_machine(__build_all_zonelists, NULL, NULL); | 3548 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3424 | /* cpuset refresh routine should be here */ | 3549 | /* cpuset refresh routine should be here */ |
3425 | } | 3550 | } |
3426 | vm_total_pages = nr_free_pagecache_pages(); | 3551 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3690,7 +3815,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
3690 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3815 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3691 | #endif | 3816 | #endif |
3692 | 3817 | ||
3693 | static int zone_batchsize(struct zone *zone) | 3818 | static int __meminit zone_batchsize(struct zone *zone) |
3694 | { | 3819 | { |
3695 | #ifdef CONFIG_MMU | 3820 | #ifdef CONFIG_MMU |
3696 | int batch; | 3821 | int batch; |
@@ -3772,7 +3897,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3772 | pcp->batch = PAGE_SHIFT * 8; | 3897 | pcp->batch = PAGE_SHIFT * 8; |
3773 | } | 3898 | } |
3774 | 3899 | ||
3775 | static void setup_zone_pageset(struct zone *zone) | 3900 | static void __meminit setup_zone_pageset(struct zone *zone) |
3776 | { | 3901 | { |
3777 | int cpu; | 3902 | int cpu; |
3778 | 3903 | ||
@@ -3845,32 +3970,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3845 | return 0; | 3970 | return 0; |
3846 | } | 3971 | } |
3847 | 3972 | ||
3848 | static int __zone_pcp_update(void *data) | ||
3849 | { | ||
3850 | struct zone *zone = data; | ||
3851 | int cpu; | ||
3852 | unsigned long batch = zone_batchsize(zone), flags; | ||
3853 | |||
3854 | for_each_possible_cpu(cpu) { | ||
3855 | struct per_cpu_pageset *pset; | ||
3856 | struct per_cpu_pages *pcp; | ||
3857 | |||
3858 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
3859 | pcp = &pset->pcp; | ||
3860 | |||
3861 | local_irq_save(flags); | ||
3862 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3863 | setup_pageset(pset, batch); | ||
3864 | local_irq_restore(flags); | ||
3865 | } | ||
3866 | return 0; | ||
3867 | } | ||
3868 | |||
3869 | void zone_pcp_update(struct zone *zone) | ||
3870 | { | ||
3871 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3872 | } | ||
3873 | |||
3874 | static __meminit void zone_pcp_init(struct zone *zone) | 3973 | static __meminit void zone_pcp_init(struct zone *zone) |
3875 | { | 3974 | { |
3876 | /* | 3975 | /* |
@@ -3886,7 +3985,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
3886 | zone_batchsize(zone)); | 3985 | zone_batchsize(zone)); |
3887 | } | 3986 | } |
3888 | 3987 | ||
3889 | __meminit int init_currently_empty_zone(struct zone *zone, | 3988 | int __meminit init_currently_empty_zone(struct zone *zone, |
3890 | unsigned long zone_start_pfn, | 3989 | unsigned long zone_start_pfn, |
3891 | unsigned long size, | 3990 | unsigned long size, |
3892 | enum memmap_context context) | 3991 | enum memmap_context context) |
@@ -4244,25 +4343,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4244 | 4343 | ||
4245 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4344 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4246 | 4345 | ||
4247 | /* Return a sensible default order for the pageblock size. */ | ||
4248 | static inline int pageblock_default_order(void) | ||
4249 | { | ||
4250 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4251 | return HUGETLB_PAGE_ORDER; | ||
4252 | |||
4253 | return MAX_ORDER-1; | ||
4254 | } | ||
4255 | |||
4256 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4346 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4257 | static inline void __init set_pageblock_order(unsigned int order) | 4347 | void __init set_pageblock_order(void) |
4258 | { | 4348 | { |
4349 | unsigned int order; | ||
4350 | |||
4259 | /* Check that pageblock_nr_pages has not already been setup */ | 4351 | /* Check that pageblock_nr_pages has not already been setup */ |
4260 | if (pageblock_order) | 4352 | if (pageblock_order) |
4261 | return; | 4353 | return; |
4262 | 4354 | ||
4355 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4356 | order = HUGETLB_PAGE_ORDER; | ||
4357 | else | ||
4358 | order = MAX_ORDER - 1; | ||
4359 | |||
4263 | /* | 4360 | /* |
4264 | * Assume the largest contiguous order of interest is a huge page. | 4361 | * Assume the largest contiguous order of interest is a huge page. |
4265 | * This value may be variable depending on boot parameters on IA64 | 4362 | * This value may be variable depending on boot parameters on IA64 and |
4363 | * powerpc. | ||
4266 | */ | 4364 | */ |
4267 | pageblock_order = order; | 4365 | pageblock_order = order; |
4268 | } | 4366 | } |
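With CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, the pageblock order is now chosen inside set_pageblock_order() itself at boot, rather than being computed by a separate pageblock_default_order() helper and passed in. The selection rule is small enough to show standalone (the shift constants below are invented examples):

#include <stdio.h>

#define PAGE_SHIFT 12
#define HPAGE_SHIFT 22               /* e.g. a 4MB huge page */
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
#define MAX_ORDER 11

/* Pageblocks track the huge page size when it is bigger than a base page. */
static unsigned int pick_pageblock_order(void)
{
	if (HPAGE_SHIFT > PAGE_SHIFT)
		return HUGETLB_PAGE_ORDER;
	return MAX_ORDER - 1;
}

int main(void)
{
	printf("pageblock_order = %u (%lu pages)\n",
	       pick_pageblock_order(), 1UL << pick_pageblock_order());
	return 0;
}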
@@ -4270,15 +4368,13 @@ static inline void __init set_pageblock_order(unsigned int order) | |||
4270 | 4368 | ||
4271 | /* | 4369 | /* |
4272 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4370 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4273 | * and pageblock_default_order() are unused as pageblock_order is set | 4371 | * is unused as pageblock_order is set at compile-time. See |
4274 | * at compile-time. See include/linux/pageblock-flags.h for the values of | 4372 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4275 | * pageblock_order based on the kernel config | 4373 | * the kernel config |
4276 | */ | 4374 | */ |
4277 | static inline int pageblock_default_order(unsigned int order) | 4375 | void __init set_pageblock_order(void) |
4278 | { | 4376 | { |
4279 | return MAX_ORDER-1; | ||
4280 | } | 4377 | } |
4281 | #define set_pageblock_order(x) do {} while (0) | ||
4282 | 4378 | ||
4283 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4379 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4284 | 4380 | ||
@@ -4287,6 +4383,8 @@ static inline int pageblock_default_order(unsigned int order) | |||
4287 | * - mark all pages reserved | 4383 | * - mark all pages reserved |
4288 | * - mark all memory queues empty | 4384 | * - mark all memory queues empty |
4289 | * - clear the memory bitmaps | 4385 | * - clear the memory bitmaps |
4386 | * | ||
4387 | * NOTE: pgdat should get zeroed by caller. | ||
4290 | */ | 4388 | */ |
4291 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4389 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4292 | unsigned long *zones_size, unsigned long *zholes_size) | 4390 | unsigned long *zones_size, unsigned long *zholes_size) |
@@ -4297,15 +4395,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4297 | int ret; | 4395 | int ret; |
4298 | 4396 | ||
4299 | pgdat_resize_init(pgdat); | 4397 | pgdat_resize_init(pgdat); |
4300 | pgdat->nr_zones = 0; | ||
4301 | init_waitqueue_head(&pgdat->kswapd_wait); | 4398 | init_waitqueue_head(&pgdat->kswapd_wait); |
4302 | pgdat->kswapd_max_order = 0; | 4399 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4303 | pgdat_page_cgroup_init(pgdat); | 4400 | pgdat_page_cgroup_init(pgdat); |
4304 | 4401 | ||
4305 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4402 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4306 | struct zone *zone = pgdat->node_zones + j; | 4403 | struct zone *zone = pgdat->node_zones + j; |
4307 | unsigned long size, realsize, memmap_pages; | 4404 | unsigned long size, realsize, memmap_pages; |
4308 | enum lru_list lru; | ||
4309 | 4405 | ||
4310 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4406 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4311 | realsize = size - zone_absent_pages_in_node(nid, j, | 4407 | realsize = size - zone_absent_pages_in_node(nid, j, |
@@ -4342,6 +4438,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4342 | 4438 | ||
4343 | zone->spanned_pages = size; | 4439 | zone->spanned_pages = size; |
4344 | zone->present_pages = realsize; | 4440 | zone->present_pages = realsize; |
4441 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4442 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4443 | zone->spanned_pages; | ||
4444 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4445 | #endif | ||
4345 | #ifdef CONFIG_NUMA | 4446 | #ifdef CONFIG_NUMA |
4346 | zone->node = nid; | 4447 | zone->node = nid; |
4347 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4448 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4355,18 +4456,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4355 | zone->zone_pgdat = pgdat; | 4456 | zone->zone_pgdat = pgdat; |
4356 | 4457 | ||
4357 | zone_pcp_init(zone); | 4458 | zone_pcp_init(zone); |
4358 | for_each_lru(lru) | 4459 | lruvec_init(&zone->lruvec, zone); |
4359 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); | ||
4360 | zone->reclaim_stat.recent_rotated[0] = 0; | ||
4361 | zone->reclaim_stat.recent_rotated[1] = 0; | ||
4362 | zone->reclaim_stat.recent_scanned[0] = 0; | ||
4363 | zone->reclaim_stat.recent_scanned[1] = 0; | ||
4364 | zap_zone_vm_stats(zone); | ||
4365 | zone->flags = 0; | ||
4366 | if (!size) | 4460 | if (!size) |
4367 | continue; | 4461 | continue; |
4368 | 4462 | ||
4369 | set_pageblock_order(pageblock_default_order()); | 4463 | set_pageblock_order(); |
4370 | setup_usemap(pgdat, zone, size); | 4464 | setup_usemap(pgdat, zone, size); |
4371 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4465 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4372 | size, MEMMAP_EARLY); | 4466 | size, MEMMAP_EARLY); |
@@ -4422,6 +4516,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4422 | { | 4516 | { |
4423 | pg_data_t *pgdat = NODE_DATA(nid); | 4517 | pg_data_t *pgdat = NODE_DATA(nid); |
4424 | 4518 | ||
4519 | /* pg_data_t should be reset to zero when it's allocated */ | ||
4520 | WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); | ||
4521 | |||
4425 | pgdat->node_id = nid; | 4522 | pgdat->node_id = nid; |
4426 | pgdat->node_start_pfn = node_start_pfn; | 4523 | pgdat->node_start_pfn = node_start_pfn; |
4427 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4524 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -4703,7 +4800,7 @@ out: | |||
4703 | } | 4800 | } |
4704 | 4801 | ||
4705 | /* Any regular memory on that node ? */ | 4802 | /* Any regular memory on that node ? */ |
4706 | static void check_for_regular_memory(pg_data_t *pgdat) | 4803 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4707 | { | 4804 | { |
4708 | #ifdef CONFIG_HIGHMEM | 4805 | #ifdef CONFIG_HIGHMEM |
4709 | enum zone_type zone_type; | 4806 | enum zone_type zone_type; |
@@ -4759,31 +4856,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4759 | find_zone_movable_pfns_for_nodes(); | 4856 | find_zone_movable_pfns_for_nodes(); |
4760 | 4857 | ||
4761 | /* Print out the zone ranges */ | 4858 | /* Print out the zone ranges */ |
4762 | printk("Zone PFN ranges:\n"); | 4859 | printk("Zone ranges:\n"); |
4763 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4860 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4764 | if (i == ZONE_MOVABLE) | 4861 | if (i == ZONE_MOVABLE) |
4765 | continue; | 4862 | continue; |
4766 | printk(" %-8s ", zone_names[i]); | 4863 | printk(KERN_CONT " %-8s ", zone_names[i]); |
4767 | if (arch_zone_lowest_possible_pfn[i] == | 4864 | if (arch_zone_lowest_possible_pfn[i] == |
4768 | arch_zone_highest_possible_pfn[i]) | 4865 | arch_zone_highest_possible_pfn[i]) |
4769 | printk("empty\n"); | 4866 | printk(KERN_CONT "empty\n"); |
4770 | else | 4867 | else |
4771 | printk("%0#10lx -> %0#10lx\n", | 4868 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4772 | arch_zone_lowest_possible_pfn[i], | 4869 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4773 | arch_zone_highest_possible_pfn[i]); | 4870 | (arch_zone_highest_possible_pfn[i] |
4871 | << PAGE_SHIFT) - 1); | ||
4774 | } | 4872 | } |
4775 | 4873 | ||
4776 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4874 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4777 | printk("Movable zone start PFN for each node\n"); | 4875 | printk("Movable zone start for each node\n"); |
4778 | for (i = 0; i < MAX_NUMNODES; i++) { | 4876 | for (i = 0; i < MAX_NUMNODES; i++) { |
4779 | if (zone_movable_pfn[i]) | 4877 | if (zone_movable_pfn[i]) |
4780 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); | 4878 | printk(" Node %d: %#010lx\n", i, |
4879 | zone_movable_pfn[i] << PAGE_SHIFT); | ||
4781 | } | 4880 | } |
4782 | 4881 | ||
4783 | /* Print out the early_node_map[] */ | 4882 | /* Print out the early_node_map[] */ |
4784 | printk("Early memory PFN ranges\n"); | 4883 | printk("Early memory node ranges\n"); |
4785 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4884 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4786 | printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); | 4885 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4886 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | ||
4787 | 4887 | ||
4788 | /* Initialise every node */ | 4888 | /* Initialise every node */ |
4789 | mminit_verify_pageflags_layout(); | 4889 | mminit_verify_pageflags_layout(); |
@@ -4976,14 +5076,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4976 | calculate_totalreserve_pages(); | 5076 | calculate_totalreserve_pages(); |
4977 | } | 5077 | } |
4978 | 5078 | ||
4979 | /** | 5079 | static void __setup_per_zone_wmarks(void) |
4980 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
4981 | * or when memory is hot-{added|removed} | ||
4982 | * | ||
4983 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
4984 | * correctly with respect to min_free_kbytes. | ||
4985 | */ | ||
4986 | void setup_per_zone_wmarks(void) | ||
4987 | { | 5080 | { |
4988 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 5081 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4989 | unsigned long lowmem_pages = 0; | 5082 | unsigned long lowmem_pages = 0; |
@@ -5030,6 +5123,11 @@ void setup_per_zone_wmarks(void) | |||
5030 | 5123 | ||
5031 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5124 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5032 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5125 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5126 | |||
5127 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5128 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5129 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5130 | |||
5033 | setup_zone_migrate_reserve(zone); | 5131 | setup_zone_migrate_reserve(zone); |
5034 | spin_unlock_irqrestore(&zone->lock, flags); | 5132 | spin_unlock_irqrestore(&zone->lock, flags); |
5035 | } | 5133 | } |
@@ -5038,6 +5136,20 @@ void setup_per_zone_wmarks(void) | |||
5038 | calculate_totalreserve_pages(); | 5136 | calculate_totalreserve_pages(); |
5039 | } | 5137 | } |
5040 | 5138 | ||
5139 | /** | ||
5140 | * setup_per_zone_wmarks - called when min_free_kbytes changes | ||
5141 | * or when memory is hot-{added|removed} | ||
5142 | * | ||
5143 | * Ensures that the watermark[min,low,high] values for each zone are set | ||
5144 | * correctly with respect to min_free_kbytes. | ||
5145 | */ | ||
5146 | void setup_per_zone_wmarks(void) | ||
5147 | { | ||
5148 | mutex_lock(&zonelists_mutex); | ||
5149 | __setup_per_zone_wmarks(); | ||
5150 | mutex_unlock(&zonelists_mutex); | ||
5151 | } | ||
5152 | |||
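To make the watermark arithmetic concrete, a rough worked example under assumed values (min_free_kbytes, zone sizes and the exact WMARK_MIN assignment are not part of this hunk):

/*
 * Assume 4 KiB pages and min_free_kbytes == 65536, so
 * pages_min == 65536 >> (PAGE_SHIFT - 10) == 16384 pages.
 * A non-highmem zone holding half of lowmem gets tmp == 8192, roughly:
 *   WMARK_MIN  ~= 8192
 *   WMARK_LOW  == WMARK_MIN + 8192/4 == 10240
 *   WMARK_HIGH == WMARK_MIN + 8192/2 == 12288
 * and each of the three is then raised by cma_wmark_pages(zone).
 * The new setup_per_zone_wmarks() wrapper only adds zonelists_mutex
 * around this logic, presumably so callers such as CMA's
 * __update_cma_watermarks() (later in this diff) cannot race.
 */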
5041 | /* | 5153 | /* |
5042 | * The inactive anon list should be small enough that the VM never has to | 5154 | * The inactive anon list should be small enough that the VM never has to |
5043 | * do too much work, but large enough that each inactive page has a chance | 5155 | * do too much work, but large enough that each inactive page has a chance |
@@ -5203,7 +5315,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5203 | int ret; | 5315 | int ret; |
5204 | 5316 | ||
5205 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5317 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5206 | if (!write || (ret == -EINVAL)) | 5318 | if (!write || (ret < 0)) |
5207 | return ret; | 5319 | return ret; |
5208 | for_each_populated_zone(zone) { | 5320 | for_each_populated_zone(zone) { |
5209 | for_each_possible_cpu(cpu) { | 5321 | for_each_possible_cpu(cpu) { |
@@ -5242,9 +5354,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5242 | int flags, | 5354 | int flags, |
5243 | unsigned int *_hash_shift, | 5355 | unsigned int *_hash_shift, |
5244 | unsigned int *_hash_mask, | 5356 | unsigned int *_hash_mask, |
5245 | unsigned long limit) | 5357 | unsigned long low_limit, |
5358 | unsigned long high_limit) | ||
5246 | { | 5359 | { |
5247 | unsigned long long max = limit; | 5360 | unsigned long long max = high_limit; |
5248 | unsigned long log2qty, size; | 5361 | unsigned long log2qty, size; |
5249 | void *table = NULL; | 5362 | void *table = NULL; |
5250 | 5363 | ||
@@ -5282,6 +5395,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5282 | } | 5395 | } |
5283 | max = min(max, 0x80000000ULL); | 5396 | max = min(max, 0x80000000ULL); |
5284 | 5397 | ||
5398 | if (numentries < low_limit) | ||
5399 | numentries = low_limit; | ||
5285 | if (numentries > max) | 5400 | if (numentries > max) |
5286 | numentries = max; | 5401 | numentries = max; |
5287 | 5402 | ||
@@ -5403,24 +5518,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5403 | } | 5518 | } |
5404 | 5519 | ||
5405 | /* | 5520 | /* |
5406 | * This is designed as sub function...plz see page_isolation.c also. | 5521 | * This function checks whether pageblock includes unmovable pages or not. |
5407 | * set/clear page block's type to be ISOLATE. | 5522 | * If @count is not zero, it is okay to include fewer than @count unmovable pages |
5408 | * page allocater never alloc memory from ISOLATE block. | 5523 | * |
5524 | * PageLRU check without isolation or lru_lock could race so that | ||
5525 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | ||
5526 | * expect this function to be exact. | ||
5409 | */ | 5527 | */ |
5410 | 5528 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |
5411 | static int | ||
5412 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5413 | { | 5529 | { |
5414 | unsigned long pfn, iter, found; | 5530 | unsigned long pfn, iter, found; |
5531 | int mt; | ||
5532 | |||
5415 | /* | 5533 | /* |
5416 | * For avoiding noise data, lru_add_drain_all() should be called | 5534 | * For avoiding noise data, lru_add_drain_all() should be called |
5417 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5535 | * If ZONE_MOVABLE, the zone never contains unmovable pages |
5418 | */ | 5536 | */ |
5419 | if (zone_idx(zone) == ZONE_MOVABLE) | 5537 | if (zone_idx(zone) == ZONE_MOVABLE) |
5420 | return true; | 5538 | return false; |
5421 | 5539 | mt = get_pageblock_migratetype(page); | |
5422 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) | 5540 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5423 | return true; | 5541 | return false; |
5424 | 5542 | ||
5425 | pfn = page_to_pfn(page); | 5543 | pfn = page_to_pfn(page); |
5426 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5544 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
@@ -5430,11 +5548,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5430 | continue; | 5548 | continue; |
5431 | 5549 | ||
5432 | page = pfn_to_page(check); | 5550 | page = pfn_to_page(check); |
5433 | if (!page_count(page)) { | 5551 | /* |
5552 | * We can't use page_count without pinning the page | ||
5553 | * because another CPU can free the compound page. | ||
5554 | * This check already skips compound tails of THP | ||
5555 | * because their page->_count is zero at all times. | ||
5556 | */ | ||
5557 | if (!atomic_read(&page->_count)) { | ||
5434 | if (PageBuddy(page)) | 5558 | if (PageBuddy(page)) |
5435 | iter += (1 << page_order(page)) - 1; | 5559 | iter += (1 << page_order(page)) - 1; |
5436 | continue; | 5560 | continue; |
5437 | } | 5561 | } |
5562 | |||
5438 | if (!PageLRU(page)) | 5563 | if (!PageLRU(page)) |
5439 | found++; | 5564 | found++; |
5440 | /* | 5565 | /* |
@@ -5451,9 +5576,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5451 | * page at boot. | 5576 | * page at boot. |
5452 | */ | 5577 | */ |
5453 | if (found > count) | 5578 | if (found > count) |
5454 | return false; | 5579 | return true; |
5455 | } | 5580 | } |
5456 | return true; | 5581 | return false; |
5457 | } | 5582 | } |
5458 | 5583 | ||
5459 | bool is_pageblock_removable_nolock(struct page *page) | 5584 | bool is_pageblock_removable_nolock(struct page *page) |
@@ -5477,80 +5602,304 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5477 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5602 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5478 | return false; | 5603 | return false; |
5479 | 5604 | ||
5480 | return __count_immobile_pages(zone, page, 0); | 5605 | return !has_unmovable_pages(zone, page, 0); |
5606 | } | ||
5607 | |||
5608 | #ifdef CONFIG_CMA | ||
5609 | |||
5610 | static unsigned long pfn_max_align_down(unsigned long pfn) | ||
5611 | { | ||
5612 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, | ||
5613 | pageblock_nr_pages) - 1); | ||
5481 | } | 5614 | } |
5482 | 5615 | ||
5483 | int set_migratetype_isolate(struct page *page) | 5616 | static unsigned long pfn_max_align_up(unsigned long pfn) |
5484 | { | 5617 | { |
5485 | struct zone *zone; | 5618 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, |
5486 | unsigned long flags, pfn; | 5619 | pageblock_nr_pages)); |
5487 | struct memory_isolate_notify arg; | 5620 | } |
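A quick sketch of what the two alignment helpers do, assuming the default MAX_ORDER of 11 (MAX_ORDER_NR_PAGES == 1024) and pageblock_nr_pages == 512, so the larger granularity is 1024 pfns:

/*
 * pfn_max_align_down(0x12345) == 0x12345 & ~0x3ff == 0x12000
 * pfn_max_align_up(0x12345)   == ALIGN(0x12345, 0x400) == 0x12400
 *
 * i.e. the [start, end) range passed to alloc_contig_range() is widened
 * outward to MAX_ORDER-aligned boundaries before any pageblock is
 * retyped to MIGRATE_ISOLATE.
 */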
5488 | int notifier_ret; | ||
5489 | int ret = -EBUSY; | ||
5490 | 5621 | ||
5491 | zone = page_zone(page); | 5622 | static struct page * |
5623 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5624 | int **resultp) | ||
5625 | { | ||
5626 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5627 | |||
5628 | if (PageHighMem(page)) | ||
5629 | gfp_mask |= __GFP_HIGHMEM; | ||
5492 | 5630 | ||
5631 | return alloc_page(gfp_mask); | ||
5632 | } | ||
5633 | |||
5634 | /* [start, end) must belong to a single zone. */ | ||
5635 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | ||
5636 | { | ||
5637 | /* This function is based on compact_zone() from compaction.c. */ | ||
5638 | |||
5639 | unsigned long pfn = start; | ||
5640 | unsigned int tries = 0; | ||
5641 | int ret = 0; | ||
5642 | |||
5643 | struct compact_control cc = { | ||
5644 | .nr_migratepages = 0, | ||
5645 | .order = -1, | ||
5646 | .zone = page_zone(pfn_to_page(start)), | ||
5647 | .sync = true, | ||
5648 | }; | ||
5649 | INIT_LIST_HEAD(&cc.migratepages); | ||
5650 | |||
5651 | migrate_prep_local(); | ||
5652 | |||
5653 | while (pfn < end || !list_empty(&cc.migratepages)) { | ||
5654 | if (fatal_signal_pending(current)) { | ||
5655 | ret = -EINTR; | ||
5656 | break; | ||
5657 | } | ||
5658 | |||
5659 | if (list_empty(&cc.migratepages)) { | ||
5660 | cc.nr_migratepages = 0; | ||
5661 | pfn = isolate_migratepages_range(cc.zone, &cc, | ||
5662 | pfn, end); | ||
5663 | if (!pfn) { | ||
5664 | ret = -EINTR; | ||
5665 | break; | ||
5666 | } | ||
5667 | tries = 0; | ||
5668 | } else if (++tries == 5) { | ||
5669 | ret = ret < 0 ? ret : -EBUSY; | ||
5670 | break; | ||
5671 | } | ||
5672 | |||
5673 | ret = migrate_pages(&cc.migratepages, | ||
5674 | __alloc_contig_migrate_alloc, | ||
5675 | 0, false, MIGRATE_SYNC); | ||
5676 | } | ||
5677 | |||
5678 | putback_lru_pages(&cc.migratepages); | ||
5679 | return ret > 0 ? 0 : ret; | ||
5680 | } | ||
5681 | |||
5682 | /* | ||
5683 | * Update zone's cma pages counter used for watermark level calculation. | ||
5684 | */ | ||
5685 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5686 | { | ||
5687 | unsigned long flags; | ||
5493 | spin_lock_irqsave(&zone->lock, flags); | 5688 | spin_lock_irqsave(&zone->lock, flags); |
5689 | zone->min_cma_pages += count; | ||
5690 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5691 | setup_per_zone_wmarks(); | ||
5692 | } | ||
5494 | 5693 | ||
5495 | pfn = page_to_pfn(page); | 5694 | /* |
5496 | arg.start_pfn = pfn; | 5695 | * Trigger memory pressure bump to reclaim some pages in order to be able to |
5497 | arg.nr_pages = pageblock_nr_pages; | 5696 | * allocate 'count' pages in single page units. Does similar work as the |
5498 | arg.pages_found = 0; | 5697 | * __alloc_pages_slowpath() function. |
5698 | */ | ||
5699 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5700 | { | ||
5701 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5702 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5703 | int did_some_progress = 0; | ||
5704 | int order = 1; | ||
5499 | 5705 | ||
5500 | /* | 5706 | /* |
5501 | * It may be possible to isolate a pageblock even if the | 5707 | * Increase the watermark levels to force kswapd to do its job and |
5502 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | 5708 | * stabilise at the new watermark level. |
5503 | * notifier chain is used by balloon drivers to return the | ||
5504 | * number of pages in a range that are held by the balloon | ||
5505 | * driver to shrink memory. If all the pages are accounted for | ||
5506 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5507 | * Later, for example, when memory hotplug notifier runs, these | ||
5508 | * pages reported as "can be isolated" should be isolated(freed) | ||
5509 | * by the balloon driver through the memory notifier chain. | ||
5510 | */ | 5709 | */ |
5511 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | 5710 | __update_cma_watermarks(zone, count); |
5512 | notifier_ret = notifier_to_errno(notifier_ret); | 5711 | |
5513 | if (notifier_ret) | 5712 | /* Obey watermarks as if the page was being allocated */ |
5514 | goto out; | 5713 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { |
5714 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5715 | |||
5716 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5717 | NULL); | ||
5718 | if (!did_some_progress) { | ||
5719 | /* Exhausted what can be done so it's blamo time */ | ||
5720 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5721 | } | ||
5722 | } | ||
5723 | |||
5724 | /* Restore original watermark levels. */ | ||
5725 | __update_cma_watermarks(zone, -count); | ||
5726 | |||
5727 | return count; | ||
5728 | } | ||
5729 | |||
5730 | /** | ||
5731 | * alloc_contig_range() -- tries to allocate given range of pages | ||
5732 | * @start: start PFN to allocate | ||
5733 | * @end: one-past-the-last PFN to allocate | ||
5734 | * @migratetype: migratetype of the underlying pageblocks (either | ||
5735 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks | ||
5736 | * in range must have the same migratetype and it must | ||
5737 | * be either of the two. | ||
5738 | * | ||
5739 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES | ||
5740 | * aligned, however it's the caller's responsibility to guarantee that | ||
5741 | * we are the only thread that changes migrate type of pageblocks the | ||
5742 | * pages fall in. | ||
5743 | * | ||
5744 | * The PFN range must belong to a single zone. | ||
5745 | * | ||
5746 | * Returns zero on success or negative error code. On success all | ||
5747 | * pages whose PFN is in [start, end) are allocated for the caller and | ||
5748 | * need to be freed with free_contig_range(). | ||
5749 | */ | ||
5750 | int alloc_contig_range(unsigned long start, unsigned long end, | ||
5751 | unsigned migratetype) | ||
5752 | { | ||
5753 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5754 | unsigned long outer_start, outer_end; | ||
5755 | int ret = 0, order; | ||
5756 | |||
5515 | /* | 5757 | /* |
5516 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 5758 | * What we do here is we mark all pageblocks in range as |
5517 | * We just check MOVABLE pages. | 5759 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
5760 | * have different sizes, and due to the way the page allocator | ||
5761 | * works, we align the range to the bigger of the two sizes so | ||
5762 | * that page allocator won't try to merge buddies from | ||
5763 | * different pageblocks and change MIGRATE_ISOLATE to some | ||
5764 | * other migration type. | ||
5765 | * | ||
5766 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we | ||
5767 | * migrate the pages from an unaligned range (ie. pages that | ||
5768 | * we are interested in). This will put all the pages in | ||
5769 | * range back to page allocator as MIGRATE_ISOLATE. | ||
5770 | * | ||
5771 | * When this is done, we take the pages in range from page | ||
5772 | * allocator removing them from the buddy system. This way | ||
5773 | * page allocator will never consider using them. | ||
5774 | * | ||
5775 | * This lets us mark the pageblocks back as | ||
5776 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the | ||
5777 | * aligned range but not in the unaligned, original range are | ||
5778 | * put back to page allocator so that buddy can use them. | ||
5518 | */ | 5779 | */ |
5519 | if (__count_immobile_pages(zone, page, arg.pages_found)) | 5780 | |
5520 | ret = 0; | 5781 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5782 | pfn_max_align_up(end), migratetype); | ||
5783 | if (ret) | ||
5784 | goto done; | ||
5785 | |||
5786 | ret = __alloc_contig_migrate_range(start, end); | ||
5787 | if (ret) | ||
5788 | goto done; | ||
5521 | 5789 | ||
5522 | /* | 5790 | /* |
5523 | * immobile means "not-on-lru" paes. If immobile is larger than | 5791 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES |
5524 | * removable-by-driver pages reported by notifier, we'll fail. | 5792 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's |
5793 | * more, all pages in [start, end) are free in page allocator. | ||
5794 | * What we are going to do is to allocate all pages from | ||
5795 | * [start, end) (that is remove them from page allocator). | ||
5796 | * | ||
5797 | * The only problem is that pages at the beginning and at the | ||
5798 | * end of the interesting range may not be aligned with pages that | ||
5799 | * page allocator holds, ie. they can be part of higher order | ||
5800 | * pages. Because of this, we reserve the bigger range and | ||
5801 | * once this is done free the pages we are not interested in. | ||
5802 | * | ||
5803 | * We don't have to hold zone->lock here because the pages are | ||
5804 | * isolated thus they won't get removed from buddy. | ||
5525 | */ | 5805 | */ |
5526 | 5806 | ||
5527 | out: | 5807 | lru_add_drain_all(); |
5528 | if (!ret) { | 5808 | drain_all_pages(); |
5529 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5809 | |
5530 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5810 | order = 0; |
5811 | outer_start = start; | ||
5812 | while (!PageBuddy(pfn_to_page(outer_start))) { | ||
5813 | if (++order >= MAX_ORDER) { | ||
5814 | ret = -EBUSY; | ||
5815 | goto done; | ||
5816 | } | ||
5817 | outer_start &= ~0UL << order; | ||
5531 | } | 5818 | } |
5532 | 5819 | ||
5533 | spin_unlock_irqrestore(&zone->lock, flags); | 5820 | /* Make sure the range is really isolated. */ |
5534 | if (!ret) | 5821 | if (test_pages_isolated(outer_start, end)) { |
5535 | drain_all_pages(); | 5822 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5823 | outer_start, end); | ||
5824 | ret = -EBUSY; | ||
5825 | goto done; | ||
5826 | } | ||
5827 | |||
5828 | /* | ||
5829 | * Reclaim enough pages to make sure that contiguous allocation | ||
5830 | * will not starve the system. | ||
5831 | */ | ||
5832 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5833 | |||
5834 | /* Grab isolated pages from freelists. */ | ||
5835 | outer_end = isolate_freepages_range(outer_start, end); | ||
5836 | if (!outer_end) { | ||
5837 | ret = -EBUSY; | ||
5838 | goto done; | ||
5839 | } | ||
5840 | |||
5841 | /* Free head and tail (if any) */ | ||
5842 | if (start != outer_start) | ||
5843 | free_contig_range(outer_start, start - outer_start); | ||
5844 | if (end != outer_end) | ||
5845 | free_contig_range(end, outer_end - end); | ||
5846 | |||
5847 | done: | ||
5848 | undo_isolate_page_range(pfn_max_align_down(start), | ||
5849 | pfn_max_align_up(end), migratetype); | ||
5536 | return ret; | 5850 | return ret; |
5537 | } | 5851 | } |
5538 | 5852 | ||
5539 | void unset_migratetype_isolate(struct page *page) | 5853 | void free_contig_range(unsigned long pfn, unsigned nr_pages) |
5540 | { | 5854 | { |
5541 | struct zone *zone; | 5855 | for (; nr_pages--; ++pfn) |
5542 | unsigned long flags; | 5856 | __free_page(pfn_to_page(pfn)); |
5543 | zone = page_zone(page); | 5857 | } |
5544 | spin_lock_irqsave(&zone->lock, flags); | 5858 | #endif |
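A minimal sketch of how a caller might use the pair of functions above (hypothetical helper names; the in-tree CMA allocator is the real consumer and is not part of this hunk). The pageblocks covering the range must already be MIGRATE_CMA or MIGRATE_MOVABLE and retyped only by this caller:

static struct page *grab_contig(unsigned long base_pfn, unsigned long nr_pages)
{
        int ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
                                     MIGRATE_CMA);

        if (ret)                        /* -EBUSY, -EINTR, ... */
                return NULL;
        return pfn_to_page(base_pfn);
}

static void release_contig(struct page *page, unsigned long nr_pages)
{
        free_contig_range(page_to_pfn(page), nr_pages);
}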
5545 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 5859 | |
5546 | goto out; | 5860 | #ifdef CONFIG_MEMORY_HOTPLUG |
5547 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 5861 | static int __meminit __zone_pcp_update(void *data) |
5548 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 5862 | { |
5549 | out: | 5863 | struct zone *zone = data; |
5550 | spin_unlock_irqrestore(&zone->lock, flags); | 5864 | int cpu; |
5865 | unsigned long batch = zone_batchsize(zone), flags; | ||
5866 | |||
5867 | for_each_possible_cpu(cpu) { | ||
5868 | struct per_cpu_pageset *pset; | ||
5869 | struct per_cpu_pages *pcp; | ||
5870 | |||
5871 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5872 | pcp = &pset->pcp; | ||
5873 | |||
5874 | local_irq_save(flags); | ||
5875 | if (pcp->count > 0) | ||
5876 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
5877 | setup_pageset(pset, batch); | ||
5878 | local_irq_restore(flags); | ||
5879 | } | ||
5880 | return 0; | ||
5551 | } | 5881 | } |
5552 | 5882 | ||
5883 | void __meminit zone_pcp_update(struct zone *zone) | ||
5884 | { | ||
5885 | stop_machine(__zone_pcp_update, zone, NULL); | ||
5886 | } | ||
5887 | #endif | ||
5888 | |||
5553 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5889 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5890 | void zone_pcp_reset(struct zone *zone) | ||
5891 | { | ||
5892 | unsigned long flags; | ||
5893 | |||
5894 | /* avoid races with drain_pages() */ | ||
5895 | local_irq_save(flags); | ||
5896 | if (zone->pageset != &boot_pageset) { | ||
5897 | free_percpu(zone->pageset); | ||
5898 | zone->pageset = &boot_pageset; | ||
5899 | } | ||
5900 | local_irq_restore(flags); | ||
5901 | } | ||
5902 | |||
5554 | /* | 5903 | /* |
5555 | * All pages in the range must be isolated before calling this. | 5904 | * All pages in the range must be isolated before calling this. |
5556 | */ | 5905 | */ |
@@ -5618,7 +5967,7 @@ bool is_free_buddy_page(struct page *page) | |||
5618 | } | 5967 | } |
5619 | #endif | 5968 | #endif |
5620 | 5969 | ||
5621 | static struct trace_print_flags pageflag_names[] = { | 5970 | static const struct trace_print_flags pageflag_names[] = { |
5622 | {1UL << PG_locked, "locked" }, | 5971 | {1UL << PG_locked, "locked" }, |
5623 | {1UL << PG_error, "error" }, | 5972 | {1UL << PG_error, "error" }, |
5624 | {1UL << PG_referenced, "referenced" }, | 5973 | {1UL << PG_referenced, "referenced" }, |
@@ -5653,7 +6002,9 @@ static struct trace_print_flags pageflag_names[] = { | |||
5653 | #ifdef CONFIG_MEMORY_FAILURE | 6002 | #ifdef CONFIG_MEMORY_FAILURE |
5654 | {1UL << PG_hwpoison, "hwpoison" }, | 6003 | {1UL << PG_hwpoison, "hwpoison" }, |
5655 | #endif | 6004 | #endif |
5656 | {-1UL, NULL }, | 6005 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
6006 | {1UL << PG_compound_lock, "compound_lock" }, | ||
6007 | #endif | ||
5657 | }; | 6008 | }; |
5658 | 6009 | ||
5659 | static void dump_page_flags(unsigned long flags) | 6010 | static void dump_page_flags(unsigned long flags) |
@@ -5662,12 +6013,14 @@ static void dump_page_flags(unsigned long flags) | |||
5662 | unsigned long mask; | 6013 | unsigned long mask; |
5663 | int i; | 6014 | int i; |
5664 | 6015 | ||
6016 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
6017 | |||
5665 | printk(KERN_ALERT "page flags: %#lx(", flags); | 6018 | printk(KERN_ALERT "page flags: %#lx(", flags); |
5666 | 6019 | ||
5667 | /* remove zone id */ | 6020 | /* remove zone id */ |
5668 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 6021 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
5669 | 6022 | ||
5670 | for (i = 0; pageflag_names[i].name && flags; i++) { | 6023 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
5671 | 6024 | ||
5672 | mask = pageflag_names[i].mask; | 6025 | mask = pageflag_names[i].mask; |
5673 | if ((flags & mask) != mask) | 6026 | if ((flags & mask) != mask) |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..5ddad0c6daa6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | 319 | ||
320 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 320 | #ifdef CONFIG_MEMCG_SWAP |
321 | 321 | ||
322 | static DEFINE_MUTEX(swap_cgroup_mutex); | 322 | static DEFINE_MUTEX(swap_cgroup_mutex); |
323 | struct swap_cgroup_ctrl { | 323 | struct swap_cgroup_ctrl { |
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
392 | 392 | ||
393 | /** | 393 | /** |
394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
395 | * @end: swap entry to be cmpxchged | 395 | * @ent: swap entry to be cmpxchged |
396 | * @old: old id | 396 | * @old: old id |
397 | * @new: new id | 397 | * @new: new id |
398 | * | 398 | * |
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
422 | /** | 422 | /** |
423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
424 | * @ent: swap entry to be recorded into | 424 | * @ent: swap entry to be recorded into |
425 | * @mem: mem_cgroup to be recorded | 425 | * @id: mem_cgroup to be recorded |
426 | * | 426 | * |
427 | * Returns old value at success, 0 at failure. | 427 | * Returns old value at success, 0 at failure. |
428 | * (Of course, old value can be 0.) | 428 | * (Of course, old value can be 0.) |
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..78eee32ee486 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -17,7 +17,9 @@ | |||
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/buffer_head.h> | ||
20 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | ||
21 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
22 | 24 | ||
23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 25 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
85 | bio_put(bio); | 87 | bio_put(bio); |
86 | } | 88 | } |
87 | 89 | ||
90 | int generic_swapfile_activate(struct swap_info_struct *sis, | ||
91 | struct file *swap_file, | ||
92 | sector_t *span) | ||
93 | { | ||
94 | struct address_space *mapping = swap_file->f_mapping; | ||
95 | struct inode *inode = mapping->host; | ||
96 | unsigned blocks_per_page; | ||
97 | unsigned long page_no; | ||
98 | unsigned blkbits; | ||
99 | sector_t probe_block; | ||
100 | sector_t last_block; | ||
101 | sector_t lowest_block = -1; | ||
102 | sector_t highest_block = 0; | ||
103 | int nr_extents = 0; | ||
104 | int ret; | ||
105 | |||
106 | blkbits = inode->i_blkbits; | ||
107 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
108 | |||
109 | /* | ||
110 | * Map all the blocks into the extent list. This code doesn't try | ||
111 | * to be very smart. | ||
112 | */ | ||
113 | probe_block = 0; | ||
114 | page_no = 0; | ||
115 | last_block = i_size_read(inode) >> blkbits; | ||
116 | while ((probe_block + blocks_per_page) <= last_block && | ||
117 | page_no < sis->max) { | ||
118 | unsigned block_in_page; | ||
119 | sector_t first_block; | ||
120 | |||
121 | first_block = bmap(inode, probe_block); | ||
122 | if (first_block == 0) | ||
123 | goto bad_bmap; | ||
124 | |||
125 | /* | ||
126 | * It must be PAGE_SIZE aligned on-disk | ||
127 | */ | ||
128 | if (first_block & (blocks_per_page - 1)) { | ||
129 | probe_block++; | ||
130 | goto reprobe; | ||
131 | } | ||
132 | |||
133 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
134 | block_in_page++) { | ||
135 | sector_t block; | ||
136 | |||
137 | block = bmap(inode, probe_block + block_in_page); | ||
138 | if (block == 0) | ||
139 | goto bad_bmap; | ||
140 | if (block != first_block + block_in_page) { | ||
141 | /* Discontiguity */ | ||
142 | probe_block++; | ||
143 | goto reprobe; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | first_block >>= (PAGE_SHIFT - blkbits); | ||
148 | if (page_no) { /* exclude the header page */ | ||
149 | if (first_block < lowest_block) | ||
150 | lowest_block = first_block; | ||
151 | if (first_block > highest_block) | ||
152 | highest_block = first_block; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
157 | */ | ||
158 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
159 | if (ret < 0) | ||
160 | goto out; | ||
161 | nr_extents += ret; | ||
162 | page_no++; | ||
163 | probe_block += blocks_per_page; | ||
164 | reprobe: | ||
165 | continue; | ||
166 | } | ||
167 | ret = nr_extents; | ||
168 | *span = 1 + highest_block - lowest_block; | ||
169 | if (page_no == 0) | ||
170 | page_no = 1; /* force Empty message */ | ||
171 | sis->max = page_no; | ||
172 | sis->pages = page_no - 1; | ||
173 | sis->highest_bit = page_no - 1; | ||
174 | out: | ||
175 | return ret; | ||
176 | bad_bmap: | ||
177 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
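A worked example of the extent mapping above, with illustrative numbers:

/*
 * With PAGE_SIZE == 4096 and a 1 KiB filesystem block size (blkbits == 10),
 * blocks_per_page == 4.  If bmap() reports blocks 2000..2003 for
 * probe_block 0..3, the run is contiguous and page-aligned (2000 & 3 == 0),
 * so page_no 0 maps to first_block 2000 >> 2 == 500 and one extent is
 * added.  Any discontiguity or misalignment simply advances probe_block
 * and reprobes at the next position.
 */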
181 | |||
88 | /* | 182 | /* |
89 | * We may have stale swap cache pages in memory: notice | 183 | * We may have stale swap cache pages in memory: notice |
90 | * them here and get rid of the unnecessary final write. | 184 | * them here and get rid of the unnecessary final write. |
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
93 | { | 187 | { |
94 | struct bio *bio; | 188 | struct bio *bio; |
95 | int ret = 0, rw = WRITE; | 189 | int ret = 0, rw = WRITE; |
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
96 | 191 | ||
97 | if (try_to_free_swap(page)) { | 192 | if (try_to_free_swap(page)) { |
98 | unlock_page(page); | 193 | unlock_page(page); |
99 | goto out; | 194 | goto out; |
100 | } | 195 | } |
196 | if (frontswap_store(page) == 0) { | ||
197 | set_page_writeback(page); | ||
198 | unlock_page(page); | ||
199 | end_page_writeback(page); | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | if (sis->flags & SWP_FILE) { | ||
204 | struct kiocb kiocb; | ||
205 | struct file *swap_file = sis->swap_file; | ||
206 | struct address_space *mapping = swap_file->f_mapping; | ||
207 | struct iovec iov = { | ||
208 | .iov_base = kmap(page), | ||
209 | .iov_len = PAGE_SIZE, | ||
210 | }; | ||
211 | |||
212 | init_sync_kiocb(&kiocb, swap_file); | ||
213 | kiocb.ki_pos = page_file_offset(page); | ||
214 | kiocb.ki_left = PAGE_SIZE; | ||
215 | kiocb.ki_nbytes = PAGE_SIZE; | ||
216 | |||
217 | unlock_page(page); | ||
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | ||
219 | &kiocb, &iov, | ||
220 | kiocb.ki_pos, 1); | ||
221 | kunmap(page); | ||
222 | if (ret == PAGE_SIZE) { | ||
223 | count_vm_event(PSWPOUT); | ||
224 | ret = 0; | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
102 | if (bio == NULL) { | 230 | if (bio == NULL) { |
103 | set_page_dirty(page); | 231 | set_page_dirty(page); |
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page) | |||
119 | { | 247 | { |
120 | struct bio *bio; | 248 | struct bio *bio; |
121 | int ret = 0; | 249 | int ret = 0; |
250 | struct swap_info_struct *sis = page_swap_info(page); | ||
122 | 251 | ||
123 | VM_BUG_ON(!PageLocked(page)); | 252 | VM_BUG_ON(!PageLocked(page)); |
124 | VM_BUG_ON(PageUptodate(page)); | 253 | VM_BUG_ON(PageUptodate(page)); |
254 | if (frontswap_load(page) == 0) { | ||
255 | SetPageUptodate(page); | ||
256 | unlock_page(page); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | if (sis->flags & SWP_FILE) { | ||
261 | struct file *swap_file = sis->swap_file; | ||
262 | struct address_space *mapping = swap_file->f_mapping; | ||
263 | |||
264 | ret = mapping->a_ops->readpage(swap_file, page); | ||
265 | if (!ret) | ||
266 | count_vm_event(PSWPIN); | ||
267 | return ret; | ||
268 | } | ||
269 | |||
125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 270 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
126 | if (bio == NULL) { | 271 | if (bio == NULL) { |
127 | unlock_page(page); | 272 | unlock_page(page); |
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page) | |||
133 | out: | 278 | out: |
134 | return ret; | 279 | return ret; |
135 | } | 280 | } |
281 | |||
282 | int swap_set_page_dirty(struct page *page) | ||
283 | { | ||
284 | struct swap_info_struct *sis = page_swap_info(page); | ||
285 | |||
286 | if (sis->flags & SWP_FILE) { | ||
287 | struct address_space *mapping = sis->swap_file->f_mapping; | ||
288 | return mapping->a_ops->set_page_dirty(page); | ||
289 | } else { | ||
290 | return __set_page_dirty_no_writeback(page); | ||
291 | } | ||
292 | } | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4ae42bb40892..247d1f175739 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -5,8 +5,101 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
11 | /* called while holding zone->lock */ | ||
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | ||
35 | struct zone *zone; | ||
36 | unsigned long flags, pfn; | ||
37 | struct memory_isolate_notify arg; | ||
38 | int notifier_ret; | ||
39 | int ret = -EBUSY; | ||
40 | |||
41 | zone = page_zone(page); | ||
42 | |||
43 | spin_lock_irqsave(&zone->lock, flags); | ||
44 | |||
45 | pfn = page_to_pfn(page); | ||
46 | arg.start_pfn = pfn; | ||
47 | arg.nr_pages = pageblock_nr_pages; | ||
48 | arg.pages_found = 0; | ||
49 | |||
50 | /* | ||
51 | * It may be possible to isolate a pageblock even if the | ||
52 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
53 | * notifier chain is used by balloon drivers to return the | ||
54 | * number of pages in a range that are held by the balloon | ||
55 | * driver to shrink memory. If all the pages are accounted for | ||
56 | * by balloons, are free, or on the LRU, isolation can continue. | ||
57 | * Later, for example, when memory hotplug notifier runs, these | ||
58 | * pages reported as "can be isolated" should be isolated(freed) | ||
59 | * by the balloon driver through the memory notifier chain. | ||
60 | */ | ||
61 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
62 | notifier_ret = notifier_to_errno(notifier_ret); | ||
63 | if (notifier_ret) | ||
64 | goto out; | ||
65 | /* | ||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
67 | * We just check MOVABLE pages. | ||
68 | */ | ||
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | ||
70 | ret = 0; | ||
71 | |||
72 | /* | ||
73 | * immobile means "not-on-lru" pages. If immobile is larger than | ||
74 | * removable-by-driver pages reported by notifier, we'll fail. | ||
75 | */ | ||
76 | |||
77 | out: | ||
78 | if (!ret) { | ||
79 | set_pageblock_isolate(page); | ||
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
81 | } | ||
82 | |||
83 | spin_unlock_irqrestore(&zone->lock, flags); | ||
84 | if (!ret) | ||
85 | drain_all_pages(); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
90 | { | ||
91 | struct zone *zone; | ||
92 | unsigned long flags; | ||
93 | zone = page_zone(page); | ||
94 | spin_lock_irqsave(&zone->lock, flags); | ||
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
96 | goto out; | ||
97 | move_freepages_block(zone, page, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&zone->lock, flags); | ||
101 | } | ||
102 | |||
10 | static inline struct page * | 103 | static inline struct page * |
11 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | 104 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) |
12 | { | 105 | { |
@@ -24,6 +117,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
24 | * to be MIGRATE_ISOLATE. | 117 | * to be MIGRATE_ISOLATE. |
25 | * @start_pfn: The lower PFN of the range to be isolated. | 118 | * @start_pfn: The lower PFN of the range to be isolated. |
26 | * @end_pfn: The upper PFN of the range to be isolated. | 119 | * @end_pfn: The upper PFN of the range to be isolated. |
120 | * @migratetype: migrate type to set in error recovery. | ||
27 | * | 121 | * |
28 | * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in | 122 | * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in |
29 | * the range will never be allocated. Any free pages and pages freed in the | 123 | * the range will never be allocated. Any free pages and pages freed in the |
@@ -32,8 +126,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
32 | * start_pfn/end_pfn must be aligned to pageblock_order. | 126 | * start_pfn/end_pfn must be aligned to pageblock_order. |
33 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. | 127 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. |
34 | */ | 128 | */ |
35 | int | 129 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
36 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | 130 | unsigned migratetype) |
37 | { | 131 | { |
38 | unsigned long pfn; | 132 | unsigned long pfn; |
39 | unsigned long undo_pfn; | 133 | unsigned long undo_pfn; |
@@ -56,7 +150,7 @@ undo: | |||
56 | for (pfn = start_pfn; | 150 | for (pfn = start_pfn; |
57 | pfn < undo_pfn; | 151 | pfn < undo_pfn; |
58 | pfn += pageblock_nr_pages) | 152 | pfn += pageblock_nr_pages) |
59 | unset_migratetype_isolate(pfn_to_page(pfn)); | 153 | unset_migratetype_isolate(pfn_to_page(pfn), migratetype); |
60 | 154 | ||
61 | return -EBUSY; | 155 | return -EBUSY; |
62 | } | 156 | } |
@@ -64,8 +158,8 @@ undo: | |||
64 | /* | 158 | /* |
65 | * Make isolated pages available again. | 159 | * Make isolated pages available again. |
66 | */ | 160 | */ |
67 | int | 161 | int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
68 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | 162 | unsigned migratetype) |
69 | { | 163 | { |
70 | unsigned long pfn; | 164 | unsigned long pfn; |
71 | struct page *page; | 165 | struct page *page; |
@@ -77,7 +171,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | |||
77 | page = __first_valid_page(pfn, pageblock_nr_pages); | 171 | page = __first_valid_page(pfn, pageblock_nr_pages); |
78 | if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 172 | if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
79 | continue; | 173 | continue; |
80 | unset_migratetype_isolate(page); | 174 | unset_migratetype_isolate(page, migratetype); |
81 | } | 175 | } |
82 | return 0; | 176 | return 0; |
83 | } | 177 | } |
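A sketch of the expected calling pattern for the updated interface (hypothetical caller; memory hotplug and alloc_contig_range() are the real users). The same migratetype must be passed to both calls so the undo path restores exactly what was there before isolation:

/* start_pfn/end_pfn must be pageblock aligned */
static int isolate_then_release(unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        if (ret)
                return ret;     /* some pageblock could not be isolated */

        /* ... migrate, offline or grab the range here ... */

        return undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}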
@@ -86,7 +180,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | |||
86 | * all pages in [start_pfn...end_pfn) must be in the same zone. | 180 | * all pages in [start_pfn...end_pfn) must be in the same zone. |
87 | * zone->lock must be held before call this. | 181 | * zone->lock must be held before call this. |
88 | * | 182 | * |
89 | * Returns 1 if all pages in the range is isolated. | 183 | * Returns 1 if all pages in the range are isolated. |
90 | */ | 184 | */ |
91 | static int | 185 | static int |
92 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | 186 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..6c118d012bb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
162 | 162 | ||
163 | /** | 163 | /** |
164 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
165 | * @mm: memory map to walk | ||
166 | * @addr: starting address | 165 | * @addr: starting address |
167 | * @end: ending address | 166 | * @end: ending address |
168 | * @walk: set of callbacks to invoke for each level of the tree | 167 | * @walk: set of callbacks to invoke for each level of the tree |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -360,7 +360,6 @@ err_free: | |||
360 | * @chunk: chunk to depopulate | 360 | * @chunk: chunk to depopulate |
361 | * @off: offset to the area to depopulate | 361 | * @off: offset to the area to depopulate |
362 | * @size: size of the area to depopulate in bytes | 362 | * @size: size of the area to depopulate in bytes |
363 | * @flush: whether to flush cache and tlb or not | ||
364 | * | 363 | * |
365 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
366 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 365 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
diff --git a/mm/percpu.c b/mm/percpu.c index f47af9123af7..bb4be7435ce3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1132,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl, | |||
1132 | for (alloc_end += gi->nr_units / upa; | 1132 | for (alloc_end += gi->nr_units / upa; |
1133 | alloc < alloc_end; alloc++) { | 1133 | alloc < alloc_end; alloc++) { |
1134 | if (!(alloc % apl)) { | 1134 | if (!(alloc % apl)) { |
1135 | printk("\n"); | 1135 | printk(KERN_CONT "\n"); |
1136 | printk("%spcpu-alloc: ", lvl); | 1136 | printk("%spcpu-alloc: ", lvl); |
1137 | } | 1137 | } |
1138 | printk("[%0*d] ", group_width, group); | 1138 | printk(KERN_CONT "[%0*d] ", group_width, group); |
1139 | 1139 | ||
1140 | for (unit_end += upa; unit < unit_end; unit++) | 1140 | for (unit_end += upa; unit < unit_end; unit++) |
1141 | if (gi->cpu_map[unit] != NR_CPUS) | 1141 | if (gi->cpu_map[unit] != NR_CPUS) |
1142 | printk("%0*d ", cpu_width, | 1142 | printk(KERN_CONT "%0*d ", cpu_width, |
1143 | gi->cpu_map[unit]); | 1143 | gi->cpu_map[unit]); |
1144 | else | 1144 | else |
1145 | printk("%s ", empty_str); | 1145 | printk(KERN_CONT "%s ", empty_str); |
1146 | } | 1146 | } |
1147 | } | 1147 | } |
1148 | printk("\n"); | 1148 | printk(KERN_CONT "\n"); |
1149 | } | 1149 | } |
1150 | 1150 | ||
1151 | /** | 1151 | /** |
@@ -1650,6 +1650,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1650 | areas[group] = ptr; | 1650 | areas[group] = ptr; |
1651 | 1651 | ||
1652 | base = min(ptr, base); | 1652 | base = min(ptr, base); |
1653 | } | ||
1654 | |||
1655 | /* | ||
1656 | * Copy data and free unused parts. This should happen after all | ||
1657 | * allocations are complete; otherwise, we may end up with | ||
1658 | * overlapping groups. | ||
1659 | */ | ||
1660 | for (group = 0; group < ai->nr_groups; group++) { | ||
1661 | struct pcpu_group_info *gi = &ai->groups[group]; | ||
1662 | void *ptr = areas[group]; | ||
1653 | 1663 | ||
1654 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { | 1664 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { |
1655 | if (gi->cpu_map[i] == NR_CPUS) { | 1665 | if (gi->cpu_map[i] == NR_CPUS) { |
@@ -1885,6 +1895,8 @@ void __init setup_per_cpu_areas(void) | |||
1885 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1895 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); |
1886 | if (!ai || !fc) | 1896 | if (!ai || !fc) |
1887 | panic("Failed to allocate memory for percpu areas."); | 1897 | panic("Failed to allocate memory for percpu areas."); |
1898 | /* kmemleak tracks the percpu allocations separately */ | ||
1899 | kmemleak_free(fc); | ||
1888 | 1900 | ||
1889 | ai->dyn_size = unit_size; | 1901 | ai->dyn_size = unit_size; |
1890 | ai->unit_size = unit_size; | 1902 | ai->unit_size = unit_size; |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a74fea182f1..74c0ddaa6fa0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
109 | 109 | ||
110 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | 110 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH |
111 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 111 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
112 | pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | 112 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, |
113 | pmd_t *pmdp) | 113 | pmd_t *pmdp) |
114 | { | 114 | { |
115 | pmd_t pmd = pmd_mksplitting(*pmdp); | 115 | pmd_t pmd = pmd_mksplitting(*pmdp); |
116 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 116 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c20ff48994c2..926b46649749 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid, | |||
371 | /* Check iovecs */ | 371 | /* Check iovecs */ |
372 | if (vm_write) | 372 | if (vm_write) |
373 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, | 373 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, |
374 | iovstack_l, &iov_l, 1); | 374 | iovstack_l, &iov_l); |
375 | else | 375 | else |
376 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, | 376 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, |
377 | iovstack_l, &iov_l, 1); | 377 | iovstack_l, &iov_l); |
378 | if (rc <= 0) | 378 | if (rc <= 0) |
379 | goto free_iovecs; | 379 | goto free_iovecs; |
380 | 380 | ||
381 | rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, | 381 | rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, |
382 | iovstack_r, &iov_r, 0); | 382 | iovstack_r, &iov_r); |
383 | if (rc <= 0) | 383 | if (rc <= 0) |
384 | goto free_iovecs; | 384 | goto free_iovecs; |
385 | 385 | ||
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid, | |||
438 | if (vm_write) | 438 | if (vm_write) |
439 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | 439 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, |
440 | UIO_FASTIOV, iovstack_l, | 440 | UIO_FASTIOV, iovstack_l, |
441 | &iov_l, 1); | 441 | &iov_l); |
442 | else | 442 | else |
443 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, | 443 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, |
444 | UIO_FASTIOV, iovstack_l, | 444 | UIO_FASTIOV, iovstack_l, |
445 | &iov_l, 1); | 445 | &iov_l); |
446 | if (rc <= 0) | 446 | if (rc <= 0) |
447 | goto free_iovecs; | 447 | goto free_iovecs; |
448 | rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, | 448 | rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, |
449 | UIO_FASTIOV, iovstack_r, | 449 | UIO_FASTIOV, iovstack_r, |
450 | &iov_r, 0); | 450 | &iov_r); |
451 | if (rc <= 0) | 451 | if (rc <= 0) |
452 | goto free_iovecs; | 452 | goto free_iovecs; |
453 | 453 | ||
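For context, the remote iovec in the syscalls above describes the target process's address space, which is why it is now only sanity-checked (CHECK_IOVEC_ONLY) rather than access_ok()-checked against the caller. A purely illustrative userspace view:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

/* remote_addr must be obtained by other means (e.g. from the target). */
ssize_t peek_remote(pid_t pid, void *remote_addr, void *buf, size_t len)
{
        struct iovec local  = { .iov_base = buf,         .iov_len = len };
        struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

        return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}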
diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e28..ea8f8fa21649 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/file.h> | ||
20 | 22 | ||
21 | /* | 23 | /* |
22 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping, | |||
562 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 564 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
563 | } | 565 | } |
564 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 566 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
567 | |||
568 | static ssize_t | ||
569 | do_readahead(struct address_space *mapping, struct file *filp, | ||
570 | pgoff_t index, unsigned long nr) | ||
571 | { | ||
572 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
573 | return -EINVAL; | ||
574 | |||
575 | force_page_cache_readahead(mapping, filp, index, nr); | ||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
580 | { | ||
581 | ssize_t ret; | ||
582 | struct file *file; | ||
583 | |||
584 | ret = -EBADF; | ||
585 | file = fget(fd); | ||
586 | if (file) { | ||
587 | if (file->f_mode & FMODE_READ) { | ||
588 | struct address_space *mapping = file->f_mapping; | ||
589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
591 | unsigned long len = end - start + 1; | ||
592 | ret = do_readahead(mapping, file, start, len); | ||
593 | } | ||
594 | fput(file); | ||
595 | } | ||
596 | return ret; | ||
597 | } | ||
598 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
599 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
600 | { | ||
601 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
602 | } | ||
603 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
604 | #endif | ||
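
Note: the sys_readahead() definition added above converts the byte range into page units (start page, last page, page count) before calling force_page_cache_readahead(). A minimal user-space sketch of driving that path through the readahead(2) wrapper follows; the file path and the 4096-byte page size printed here are assumptions for illustration only, not part of the patch.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/var/log/syslog", O_RDONLY);   /* example path, adjust as needed */
        long page = sysconf(_SC_PAGESIZE);
        off_t offset = 3000;
        size_t count = 10000;

        if (fd < 0)
            return 1;
        if (readahead(fd, offset, count) != 0)
            perror("readahead");

        /* Same rounding as the syscall: partial pages at both ends are included. */
        off_t start = offset / page;                  /* offset >> PAGE_CACHE_SHIFT */
        off_t end = (offset + count - 1) / page;      /* page holding the last byte */
        printf("readahead spans pages %ld..%ld (%ld pages)\n",
               (long)start, (long)end, (long)(end - start + 1));
        close(fd);
        return 0;
    }
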
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
755 | pte_unmap_unlock(pte, ptl); | 755 | pte_unmap_unlock(pte, ptl); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Pretend the page is referenced if the task has the | ||
759 | swap token and is in the middle of a page fault. */ | ||
760 | if (mm != current->mm && has_swap_token(mm) && | ||
761 | rwsem_is_locked(&mm->mmap_sem)) | ||
762 | referenced++; | ||
763 | |||
764 | (*mapcount)--; | 758 | (*mapcount)--; |
765 | 759 | ||
766 | if (referenced) | 760 | if (referenced) |
diff --git a/mm/shmem.c b/mm/shmem.c index f99ff3e50bd6..d4e184e2a38e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; | |||
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | 54 | #include <linux/pagevec.h> |
55 | #include <linux/percpu_counter.h> | 55 | #include <linux/percpu_counter.h> |
56 | #include <linux/falloc.h> | ||
56 | #include <linux/splice.h> | 57 | #include <linux/splice.h> |
57 | #include <linux/security.h> | 58 | #include <linux/security.h> |
58 | #include <linux/swapops.h> | 59 | #include <linux/swapops.h> |
@@ -83,12 +84,25 @@ struct shmem_xattr { | |||
83 | char value[0]; | 84 | char value[0]; |
84 | }; | 85 | }; |
85 | 86 | ||
87 | /* | ||
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | ||
89 | * (with i_mutex making sure that it has only one user at a time): | ||
90 | * we would prefer not to enlarge the shmem inode just for that. | ||
91 | */ | ||
92 | struct shmem_falloc { | ||
93 | pgoff_t start; /* start of range currently being fallocated */ | ||
94 | pgoff_t next; /* the next page offset to be fallocated */ | ||
95 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | ||
96 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ | ||
97 | }; | ||
98 | |||
86 | /* Flag allocation requirements to shmem_getpage */ | 99 | /* Flag allocation requirements to shmem_getpage */ |
87 | enum sgp_type { | 100 | enum sgp_type { |
88 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 101 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 102 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
90 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ | 103 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ |
91 | SGP_WRITE, /* may exceed i_size, may allocate page */ | 104 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ |
105 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
92 | }; | 106 | }; |
93 | 107 | ||
94 | #ifdef CONFIG_TMPFS | 108 | #ifdef CONFIG_TMPFS |
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void) | |||
103 | } | 117 | } |
104 | #endif | 118 | #endif |
105 | 119 | ||
120 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); | ||
121 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
122 | struct shmem_inode_info *info, pgoff_t index); | ||
106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 123 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); | 124 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
108 | 125 | ||
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
247 | } | 264 | } |
248 | 265 | ||
249 | /* | 266 | /* |
267 | * Sometimes, before we decide whether to proceed or to fail, we must check | ||
268 | * that an entry was not already brought back from swap by a racing thread. | ||
269 | * | ||
270 | * Checking page is not enough: by the time a SwapCache page is locked, it | ||
271 | * might be reused, and again be SwapCache, using the same swap as before. | ||
272 | */ | ||
273 | static bool shmem_confirm_swap(struct address_space *mapping, | ||
274 | pgoff_t index, swp_entry_t swap) | ||
275 | { | ||
276 | void *item; | ||
277 | |||
278 | rcu_read_lock(); | ||
279 | item = radix_tree_lookup(&mapping->page_tree, index); | ||
280 | rcu_read_unlock(); | ||
281 | return item == swp_to_radix_entry(swap); | ||
282 | } | ||
283 | |||
284 | /* | ||
250 | * Like add_to_page_cache_locked, but error if expected item has gone. | 285 | * Like add_to_page_cache_locked, but error if expected item has gone. |
251 | */ | 286 | */ |
252 | static int shmem_add_to_page_cache(struct page *page, | 287 | static int shmem_add_to_page_cache(struct page *page, |
253 | struct address_space *mapping, | 288 | struct address_space *mapping, |
254 | pgoff_t index, gfp_t gfp, void *expected) | 289 | pgoff_t index, gfp_t gfp, void *expected) |
255 | { | 290 | { |
256 | int error = 0; | 291 | int error; |
257 | 292 | ||
258 | VM_BUG_ON(!PageLocked(page)); | 293 | VM_BUG_ON(!PageLocked(page)); |
259 | VM_BUG_ON(!PageSwapBacked(page)); | 294 | VM_BUG_ON(!PageSwapBacked(page)); |
260 | 295 | ||
296 | page_cache_get(page); | ||
297 | page->mapping = mapping; | ||
298 | page->index = index; | ||
299 | |||
300 | spin_lock_irq(&mapping->tree_lock); | ||
261 | if (!expected) | 301 | if (!expected) |
262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 302 | error = radix_tree_insert(&mapping->page_tree, index, page); |
303 | else | ||
304 | error = shmem_radix_tree_replace(mapping, index, expected, | ||
305 | page); | ||
263 | if (!error) { | 306 | if (!error) { |
264 | page_cache_get(page); | 307 | mapping->nrpages++; |
265 | page->mapping = mapping; | 308 | __inc_zone_page_state(page, NR_FILE_PAGES); |
266 | page->index = index; | 309 | __inc_zone_page_state(page, NR_SHMEM); |
267 | 310 | spin_unlock_irq(&mapping->tree_lock); | |
268 | spin_lock_irq(&mapping->tree_lock); | 311 | } else { |
269 | if (!expected) | 312 | page->mapping = NULL; |
270 | error = radix_tree_insert(&mapping->page_tree, | 313 | spin_unlock_irq(&mapping->tree_lock); |
271 | index, page); | 314 | page_cache_release(page); |
272 | else | ||
273 | error = shmem_radix_tree_replace(mapping, index, | ||
274 | expected, page); | ||
275 | if (!error) { | ||
276 | mapping->nrpages++; | ||
277 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
278 | __inc_zone_page_state(page, NR_SHMEM); | ||
279 | spin_unlock_irq(&mapping->tree_lock); | ||
280 | } else { | ||
281 | page->mapping = NULL; | ||
282 | spin_unlock_irq(&mapping->tree_lock); | ||
283 | page_cache_release(page); | ||
284 | } | ||
285 | if (!expected) | ||
286 | radix_tree_preload_end(); | ||
287 | } | 315 | } |
288 | if (error) | ||
289 | mem_cgroup_uncharge_cache_page(page); | ||
290 | return error; | 316 | return error; |
291 | } | 317 | } |
292 | 318 | ||
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
423 | 449 | ||
424 | /* | 450 | /* |
425 | * Remove range of pages and swap entries from radix tree, and free them. | 451 | * Remove range of pages and swap entries from radix tree, and free them. |
452 | * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. | ||
426 | */ | 453 | */ |
427 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 454 | static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, |
455 | bool unfalloc) | ||
428 | { | 456 | { |
429 | struct address_space *mapping = inode->i_mapping; | 457 | struct address_space *mapping = inode->i_mapping; |
430 | struct shmem_inode_info *info = SHMEM_I(inode); | 458 | struct shmem_inode_info *info = SHMEM_I(inode); |
431 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 459 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
432 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 460 | pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; |
433 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); | 461 | unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); |
462 | unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); | ||
434 | struct pagevec pvec; | 463 | struct pagevec pvec; |
435 | pgoff_t indices[PAGEVEC_SIZE]; | 464 | pgoff_t indices[PAGEVEC_SIZE]; |
436 | long nr_swaps_freed = 0; | 465 | long nr_swaps_freed = 0; |
437 | pgoff_t index; | 466 | pgoff_t index; |
438 | int i; | 467 | int i; |
439 | 468 | ||
440 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | 469 | if (lend == -1) |
470 | end = -1; /* unsigned, so actually very big */ | ||
441 | 471 | ||
442 | pagevec_init(&pvec, 0); | 472 | pagevec_init(&pvec, 0); |
443 | index = start; | 473 | index = start; |
444 | while (index <= end) { | 474 | while (index < end) { |
445 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 475 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
446 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 476 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
447 | pvec.pages, indices); | 477 | pvec.pages, indices); |
448 | if (!pvec.nr) | 478 | if (!pvec.nr) |
449 | break; | 479 | break; |
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
452 | struct page *page = pvec.pages[i]; | 482 | struct page *page = pvec.pages[i]; |
453 | 483 | ||
454 | index = indices[i]; | 484 | index = indices[i]; |
455 | if (index > end) | 485 | if (index >= end) |
456 | break; | 486 | break; |
457 | 487 | ||
458 | if (radix_tree_exceptional_entry(page)) { | 488 | if (radix_tree_exceptional_entry(page)) { |
489 | if (unfalloc) | ||
490 | continue; | ||
459 | nr_swaps_freed += !shmem_free_swap(mapping, | 491 | nr_swaps_freed += !shmem_free_swap(mapping, |
460 | index, page); | 492 | index, page); |
461 | continue; | 493 | continue; |
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
463 | 495 | ||
464 | if (!trylock_page(page)) | 496 | if (!trylock_page(page)) |
465 | continue; | 497 | continue; |
466 | if (page->mapping == mapping) { | 498 | if (!unfalloc || !PageUptodate(page)) { |
467 | VM_BUG_ON(PageWriteback(page)); | 499 | if (page->mapping == mapping) { |
468 | truncate_inode_page(mapping, page); | 500 | VM_BUG_ON(PageWriteback(page)); |
501 | truncate_inode_page(mapping, page); | ||
502 | } | ||
469 | } | 503 | } |
470 | unlock_page(page); | 504 | unlock_page(page); |
471 | } | 505 | } |
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
476 | index++; | 510 | index++; |
477 | } | 511 | } |
478 | 512 | ||
479 | if (partial) { | 513 | if (partial_start) { |
480 | struct page *page = NULL; | 514 | struct page *page = NULL; |
481 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); | 515 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
482 | if (page) { | 516 | if (page) { |
483 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 517 | unsigned int top = PAGE_CACHE_SIZE; |
518 | if (start > end) { | ||
519 | top = partial_end; | ||
520 | partial_end = 0; | ||
521 | } | ||
522 | zero_user_segment(page, partial_start, top); | ||
484 | set_page_dirty(page); | 523 | set_page_dirty(page); |
485 | unlock_page(page); | 524 | unlock_page(page); |
486 | page_cache_release(page); | 525 | page_cache_release(page); |
487 | } | 526 | } |
488 | } | 527 | } |
528 | if (partial_end) { | ||
529 | struct page *page = NULL; | ||
530 | shmem_getpage(inode, end, &page, SGP_READ, NULL); | ||
531 | if (page) { | ||
532 | zero_user_segment(page, 0, partial_end); | ||
533 | set_page_dirty(page); | ||
534 | unlock_page(page); | ||
535 | page_cache_release(page); | ||
536 | } | ||
537 | } | ||
538 | if (start >= end) | ||
539 | return; | ||
489 | 540 | ||
490 | index = start; | 541 | index = start; |
491 | for ( ; ; ) { | 542 | for ( ; ; ) { |
492 | cond_resched(); | 543 | cond_resched(); |
493 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 544 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
494 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 545 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
495 | pvec.pages, indices); | 546 | pvec.pages, indices); |
496 | if (!pvec.nr) { | 547 | if (!pvec.nr) { |
497 | if (index == start) | 548 | if (index == start || unfalloc) |
498 | break; | 549 | break; |
499 | index = start; | 550 | index = start; |
500 | continue; | 551 | continue; |
501 | } | 552 | } |
502 | if (index == start && indices[0] > end) { | 553 | if ((index == start || unfalloc) && indices[0] >= end) { |
503 | shmem_deswap_pagevec(&pvec); | 554 | shmem_deswap_pagevec(&pvec); |
504 | pagevec_release(&pvec); | 555 | pagevec_release(&pvec); |
505 | break; | 556 | break; |
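
Note: shmem_undo_range() above splits a byte range into whole pages to drop plus up to two partial pages to zero. The stand-alone sketch below reproduces that arithmetic with a hypothetical 4096-byte page size; the helper name and the sample offsets are illustrative, and lend is the inclusive last byte, as in shmem_truncate_range(inode, offset, offset + len - 1).

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /*
     * Pages in [start, end) are removed outright.  If partial_start != 0,
     * bytes partial_start..PAGE_SIZE-1 of page start-1 are zeroed; if
     * partial_end != 0, bytes 0..partial_end-1 of page end are zeroed.
     * When start > end the whole hole lies inside one page and only the
     * first partial segment is used (the "top = partial_end" case above).
     */
    static void hole_bounds(unsigned long lstart, unsigned long lend)
    {
        unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long end = (lend + 1) >> PAGE_SHIFT;
        unsigned long partial_start = lstart & (PAGE_SIZE - 1);
        unsigned long partial_end = (lend + 1) & (PAGE_SIZE - 1);

        printf("start=%lu end=%lu partial_start=%lu partial_end=%lu\n",
               start, end, partial_start, partial_end);
    }

    int main(void)
    {
        hole_bounds(1000, 10999); /* start=1 end=2: page 1 dropped, pages 0 and 2 partly zeroed */
        hole_bounds(1000, 2999);  /* start=1 end=0: start > end, hole inside a single page */
        return 0;
    }
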
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
509 | struct page *page = pvec.pages[i]; | 560 | struct page *page = pvec.pages[i]; |
510 | 561 | ||
511 | index = indices[i]; | 562 | index = indices[i]; |
512 | if (index > end) | 563 | if (index >= end) |
513 | break; | 564 | break; |
514 | 565 | ||
515 | if (radix_tree_exceptional_entry(page)) { | 566 | if (radix_tree_exceptional_entry(page)) { |
567 | if (unfalloc) | ||
568 | continue; | ||
516 | nr_swaps_freed += !shmem_free_swap(mapping, | 569 | nr_swaps_freed += !shmem_free_swap(mapping, |
517 | index, page); | 570 | index, page); |
518 | continue; | 571 | continue; |
519 | } | 572 | } |
520 | 573 | ||
521 | lock_page(page); | 574 | lock_page(page); |
522 | if (page->mapping == mapping) { | 575 | if (!unfalloc || !PageUptodate(page)) { |
523 | VM_BUG_ON(PageWriteback(page)); | 576 | if (page->mapping == mapping) { |
524 | truncate_inode_page(mapping, page); | 577 | VM_BUG_ON(PageWriteback(page)); |
578 | truncate_inode_page(mapping, page); | ||
579 | } | ||
525 | } | 580 | } |
526 | unlock_page(page); | 581 | unlock_page(page); |
527 | } | 582 | } |
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
535 | info->swapped -= nr_swaps_freed; | 590 | info->swapped -= nr_swaps_freed; |
536 | shmem_recalc_inode(inode); | 591 | shmem_recalc_inode(inode); |
537 | spin_unlock(&info->lock); | 592 | spin_unlock(&info->lock); |
593 | } | ||
538 | 594 | ||
595 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
596 | { | ||
597 | shmem_undo_range(inode, lstart, lend, false); | ||
539 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 598 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
540 | } | 599 | } |
541 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 600 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode) | |||
597 | } | 656 | } |
598 | BUG_ON(inode->i_blocks); | 657 | BUG_ON(inode->i_blocks); |
599 | shmem_free_inode(inode->i_sb); | 658 | shmem_free_inode(inode->i_sb); |
600 | end_writeback(inode); | 659 | clear_inode(inode); |
601 | } | 660 | } |
602 | 661 | ||
603 | /* | 662 | /* |
604 | * If swap found in inode, free it and move page from swapcache to filecache. | 663 | * If swap found in inode, free it and move page from swapcache to filecache. |
605 | */ | 664 | */ |
606 | static int shmem_unuse_inode(struct shmem_inode_info *info, | 665 | static int shmem_unuse_inode(struct shmem_inode_info *info, |
607 | swp_entry_t swap, struct page *page) | 666 | swp_entry_t swap, struct page **pagep) |
608 | { | 667 | { |
609 | struct address_space *mapping = info->vfs_inode.i_mapping; | 668 | struct address_space *mapping = info->vfs_inode.i_mapping; |
610 | void *radswap; | 669 | void *radswap; |
611 | pgoff_t index; | 670 | pgoff_t index; |
612 | int error; | 671 | gfp_t gfp; |
672 | int error = 0; | ||
613 | 673 | ||
614 | radswap = swp_to_radix_entry(swap); | 674 | radswap = swp_to_radix_entry(swap); |
615 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 675 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
625 | if (shmem_swaplist.next != &info->swaplist) | 685 | if (shmem_swaplist.next != &info->swaplist) |
626 | list_move_tail(&shmem_swaplist, &info->swaplist); | 686 | list_move_tail(&shmem_swaplist, &info->swaplist); |
627 | 687 | ||
688 | gfp = mapping_gfp_mask(mapping); | ||
689 | if (shmem_should_replace_page(*pagep, gfp)) { | ||
690 | mutex_unlock(&shmem_swaplist_mutex); | ||
691 | error = shmem_replace_page(pagep, gfp, info, index); | ||
692 | mutex_lock(&shmem_swaplist_mutex); | ||
693 | /* | ||
694 | * We needed to drop mutex to make that restrictive page | ||
695 | * allocation, but the inode might have been freed while we | ||
696 | * dropped it: although a racing shmem_evict_inode() cannot | ||
697 | * complete without emptying the radix_tree, our page lock | ||
698 | * on this swapcache page is not enough to prevent that - | ||
699 | * free_swap_and_cache() of our swap entry will only | ||
700 | * trylock_page(), removing swap from radix_tree whatever. | ||
701 | * | ||
702 | * We must not proceed to shmem_add_to_page_cache() if the | ||
703 | * inode has been freed, but of course we cannot rely on | ||
704 | * inode or mapping or info to check that. However, we can | ||
705 | * safely check if our swap entry is still in use (and here | ||
706 | * it can't have got reused for another page): if it's still | ||
707 | * in use, then the inode cannot have been freed yet, and we | ||
708 | * can safely proceed (if it's no longer in use, that tells | ||
709 | * nothing about the inode, but we don't need to unuse swap). | ||
710 | */ | ||
711 | if (!page_swapcount(*pagep)) | ||
712 | error = -ENOENT; | ||
713 | } | ||
714 | |||
628 | /* | 715 | /* |
629 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | 716 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
630 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 717 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
631 | * beneath us (pagelock doesn't help until the page is in pagecache). | 718 | * beneath us (pagelock doesn't help until the page is in pagecache). |
632 | */ | 719 | */ |
633 | error = shmem_add_to_page_cache(page, mapping, index, | 720 | if (!error) |
721 | error = shmem_add_to_page_cache(*pagep, mapping, index, | ||
634 | GFP_NOWAIT, radswap); | 722 | GFP_NOWAIT, radswap); |
635 | /* which does mem_cgroup_uncharge_cache_page on error */ | ||
636 | |||
637 | if (error != -ENOMEM) { | 723 | if (error != -ENOMEM) { |
638 | /* | 724 | /* |
639 | * Truncation and eviction use free_swap_and_cache(), which | 725 | * Truncation and eviction use free_swap_and_cache(), which |
640 | * only does trylock page: if we raced, best clean up here. | 726 | * only does trylock page: if we raced, best clean up here. |
641 | */ | 727 | */ |
642 | delete_from_swap_cache(page); | 728 | delete_from_swap_cache(*pagep); |
643 | set_page_dirty(page); | 729 | set_page_dirty(*pagep); |
644 | if (!error) { | 730 | if (!error) { |
645 | spin_lock(&info->lock); | 731 | spin_lock(&info->lock); |
646 | info->swapped--; | 732 | info->swapped--; |
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
660 | struct list_head *this, *next; | 746 | struct list_head *this, *next; |
661 | struct shmem_inode_info *info; | 747 | struct shmem_inode_info *info; |
662 | int found = 0; | 748 | int found = 0; |
663 | int error; | 749 | int error = 0; |
750 | |||
751 | /* | ||
752 | * There's a faint possibility that swap page was replaced before | ||
753 | * caller locked it: caller will come back later with the right page. | ||
754 | */ | ||
755 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) | ||
756 | goto out; | ||
664 | 757 | ||
665 | /* | 758 | /* |
666 | * Charge page using GFP_KERNEL while we can wait, before taking | 759 | * Charge page using GFP_KERNEL while we can wait, before taking |
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
676 | list_for_each_safe(this, next, &shmem_swaplist) { | 769 | list_for_each_safe(this, next, &shmem_swaplist) { |
677 | info = list_entry(this, struct shmem_inode_info, swaplist); | 770 | info = list_entry(this, struct shmem_inode_info, swaplist); |
678 | if (info->swapped) | 771 | if (info->swapped) |
679 | found = shmem_unuse_inode(info, swap, page); | 772 | found = shmem_unuse_inode(info, swap, &page); |
680 | else | 773 | else |
681 | list_del_init(&info->swaplist); | 774 | list_del_init(&info->swaplist); |
682 | cond_resched(); | 775 | cond_resched(); |
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
685 | } | 778 | } |
686 | mutex_unlock(&shmem_swaplist_mutex); | 779 | mutex_unlock(&shmem_swaplist_mutex); |
687 | 780 | ||
688 | if (!found) | ||
689 | mem_cgroup_uncharge_cache_page(page); | ||
690 | if (found < 0) | 781 | if (found < 0) |
691 | error = found; | 782 | error = found; |
692 | out: | 783 | out: |
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
727 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 818 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
728 | goto redirty; | 819 | goto redirty; |
729 | } | 820 | } |
821 | |||
822 | /* | ||
823 | * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC | ||
824 | * value into swapfile.c, the only way we can correctly account for a | ||
825 | * fallocated page arriving here is now to initialize it and write it. | ||
826 | * | ||
827 | * That's okay for a page already fallocated earlier, but if we have | ||
828 | * not yet completed the fallocation, then (a) we want to keep track | ||
829 | * of this page in case we have to undo it, and (b) it may not be a | ||
830 | * good idea to continue anyway, once we're pushing into swap. So | ||
831 | * reactivate the page, and let shmem_fallocate() quit when too many. | ||
832 | */ | ||
833 | if (!PageUptodate(page)) { | ||
834 | if (inode->i_private) { | ||
835 | struct shmem_falloc *shmem_falloc; | ||
836 | spin_lock(&inode->i_lock); | ||
837 | shmem_falloc = inode->i_private; | ||
838 | if (shmem_falloc && | ||
839 | index >= shmem_falloc->start && | ||
840 | index < shmem_falloc->next) | ||
841 | shmem_falloc->nr_unswapped++; | ||
842 | else | ||
843 | shmem_falloc = NULL; | ||
844 | spin_unlock(&inode->i_lock); | ||
845 | if (shmem_falloc) | ||
846 | goto redirty; | ||
847 | } | ||
848 | clear_highpage(page); | ||
849 | flush_dcache_page(page); | ||
850 | SetPageUptodate(page); | ||
851 | } | ||
852 | |||
730 | swap = get_swap_page(); | 853 | swap = get_swap_page(); |
731 | if (!swap.val) | 854 | if (!swap.val) |
732 | goto redirty; | 855 | goto redirty; |
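
Note: the comment above describes how shmem_fallocate() publishes a struct shmem_falloc that lives on its own stack through inode->i_private, so that shmem_writepage() can bump nr_unswapped when reclaim keeps hitting the range still being fallocated. Below is a user-space model of that "publish a stack object under a lock, unpublish before returning" handshake, not kernel code: pthread_mutex_t stands in for inode->i_lock, and all names are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    struct falloc_progress {
        unsigned long start, next;    /* range currently being worked on */
        unsigned long nr_unswapped;   /* how often the consumer bounced a page */
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct falloc_progress *published;   /* plays the role of inode->i_private */

    /* Consumer side, like shmem_writepage() noticing a not-yet-uptodate page. */
    static void note_unswappable(unsigned long index)
    {
        pthread_mutex_lock(&lock);
        if (published && index >= published->start && index < published->next)
            published->nr_unswapped++;
        pthread_mutex_unlock(&lock);
    }

    /* Producer side, like shmem_fallocate() working through [start, end). */
    static void produce(unsigned long start, unsigned long end)
    {
        struct falloc_progress p = { .start = start, .next = start };

        pthread_mutex_lock(&lock);
        published = &p;                 /* publish the stack object */
        pthread_mutex_unlock(&lock);

        for (unsigned long i = start; i < end; i++) {
            p.next = i + 1;             /* only the producer advances this */
            note_unswappable(i);        /* stands in for concurrent reclaim */
        }

        pthread_mutex_lock(&lock);
        published = NULL;               /* unpublish before the stack frame dies */
        pthread_mutex_unlock(&lock);
        printf("nr_unswapped = %lu\n", p.nr_unswapped);
    }

    int main(void)
    {
        produce(0, 4);
        return 0;
    }

In the real patch, shmem_fallocate() compares nr_unswapped against nr_falloced and gives up with ENOMEM once writepage has refused more pages than fallocate has managed to add.
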
@@ -806,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | |||
806 | 929 | ||
807 | /* Create a pseudo vma that just contains the policy */ | 930 | /* Create a pseudo vma that just contains the policy */ |
808 | pvma.vm_start = 0; | 931 | pvma.vm_start = 0; |
809 | pvma.vm_pgoff = index; | 932 | /* Bias interleave by inode number to distribute better across nodes */ |
933 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
810 | pvma.vm_ops = NULL; | 934 | pvma.vm_ops = NULL; |
811 | pvma.vm_policy = spol; | 935 | pvma.vm_policy = spol; |
812 | return swapin_readahead(swap, gfp, &pvma, 0); | 936 | return swapin_readahead(swap, gfp, &pvma, 0); |
@@ -819,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
819 | 943 | ||
820 | /* Create a pseudo vma that just contains the policy */ | 944 | /* Create a pseudo vma that just contains the policy */ |
821 | pvma.vm_start = 0; | 945 | pvma.vm_start = 0; |
822 | pvma.vm_pgoff = index; | 946 | /* Bias interleave by inode number to distribute better across nodes */ |
947 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
823 | pvma.vm_ops = NULL; | 948 | pvma.vm_ops = NULL; |
824 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 949 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
825 | 950 | ||
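
Note: the two hunks above bias the pseudo-vma's vm_pgoff by the inode number so that MPOL_INTERLEAVE does not start every small tmpfs file on node 0. Very roughly, the interleave node works out near (pgoff + i_ino) mod nr_nodes when all nodes are in the policy mask; the snippet below is only that approximation with an assumed 4-node system, not the mempolicy code.

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_nodes = 4;

        /* Page 0 of three consecutive inodes now lands on different nodes. */
        for (unsigned long ino = 100; ino < 103; ino++)
            printf("inode %lu: page 0 -> node %lu (previously node 0 for every file)\n",
                   ino, (0 + ino) % nr_nodes);
        return 0;
    }
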
@@ -856,6 +981,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
856 | #endif | 981 | #endif |
857 | 982 | ||
858 | /* | 983 | /* |
984 | * When a page is moved from swapcache to shmem filecache (either by the | ||
985 | * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of | ||
986 | * shmem_unuse_inode()), it may have been read in earlier from swap, in | ||
987 | * ignorance of the mapping it belongs to. If that mapping has special | ||
988 | * constraints (like the gma500 GEM driver, which requires RAM below 4GB), | ||
989 | * we may need to copy to a suitable page before moving to filecache. | ||
990 | * | ||
991 | * In a future release, this may well be extended to respect cpuset and | ||
992 | * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); | ||
993 | * but for now it is a simple matter of zone. | ||
994 | */ | ||
995 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp) | ||
996 | { | ||
997 | return page_zonenum(page) > gfp_zone(gfp); | ||
998 | } | ||
999 | |||
1000 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
1001 | struct shmem_inode_info *info, pgoff_t index) | ||
1002 | { | ||
1003 | struct page *oldpage, *newpage; | ||
1004 | struct address_space *swap_mapping; | ||
1005 | pgoff_t swap_index; | ||
1006 | int error; | ||
1007 | |||
1008 | oldpage = *pagep; | ||
1009 | swap_index = page_private(oldpage); | ||
1010 | swap_mapping = page_mapping(oldpage); | ||
1011 | |||
1012 | /* | ||
1013 | * We have arrived here because our zones are constrained, so don't | ||
1014 | * limit chance of success by further cpuset and node constraints. | ||
1015 | */ | ||
1016 | gfp &= ~GFP_CONSTRAINT_MASK; | ||
1017 | newpage = shmem_alloc_page(gfp, info, index); | ||
1018 | if (!newpage) | ||
1019 | return -ENOMEM; | ||
1020 | |||
1021 | page_cache_get(newpage); | ||
1022 | copy_highpage(newpage, oldpage); | ||
1023 | flush_dcache_page(newpage); | ||
1024 | |||
1025 | __set_page_locked(newpage); | ||
1026 | SetPageUptodate(newpage); | ||
1027 | SetPageSwapBacked(newpage); | ||
1028 | set_page_private(newpage, swap_index); | ||
1029 | SetPageSwapCache(newpage); | ||
1030 | |||
1031 | /* | ||
1032 | * Our caller will very soon move newpage out of swapcache, but it's | ||
1033 | * a nice clean interface for us to replace oldpage by newpage there. | ||
1034 | */ | ||
1035 | spin_lock_irq(&swap_mapping->tree_lock); | ||
1036 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | ||
1037 | newpage); | ||
1038 | if (!error) { | ||
1039 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | ||
1040 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1041 | } | ||
1042 | spin_unlock_irq(&swap_mapping->tree_lock); | ||
1043 | |||
1044 | if (unlikely(error)) { | ||
1045 | /* | ||
1046 | * Is this possible? I think not, now that our callers check | ||
1047 | * both PageSwapCache and page_private after getting page lock; | ||
1048 | * but be defensive. Reverse old to newpage for clear and free. | ||
1049 | */ | ||
1050 | oldpage = newpage; | ||
1051 | } else { | ||
1052 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1053 | lru_cache_add_anon(newpage); | ||
1054 | *pagep = newpage; | ||
1055 | } | ||
1056 | |||
1057 | ClearPageSwapCache(oldpage); | ||
1058 | set_page_private(oldpage, 0); | ||
1059 | |||
1060 | unlock_page(oldpage); | ||
1061 | page_cache_release(oldpage); | ||
1062 | page_cache_release(oldpage); | ||
1063 | return error; | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
859 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate | 1067 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
860 | * | 1068 | * |
861 | * If we allocate a new one we do not mark it dirty. That's up to the | 1069 | * If we allocate a new one we do not mark it dirty. That's up to the |
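
Note: shmem_should_replace_page() above reduces the "does this swapcache page satisfy the mapping's placement constraint?" question to a zone comparison. The toy model below keeps only that shape; the enum ordering mirrors the kernel's DMA < DMA32 < NORMAL < HIGHMEM layout, but the function and values are illustrative, not the kernel implementation.

    #include <stdio.h>

    enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

    static int should_replace(enum zone page_zone, enum zone gfp_limit)
    {
        return page_zone > gfp_limit;   /* same shape as page_zonenum(page) > gfp_zone(gfp) */
    }

    int main(void)
    {
        /* e.g. a gma500-style mapping restricted to RAM below 4GB (DMA32) */
        printf("NORMAL page, DMA32 mapping: replace=%d\n",
               should_replace(ZONE_NORMAL, ZONE_DMA32));
        printf("DMA32 page, DMA32 mapping: replace=%d\n",
               should_replace(ZONE_DMA32, ZONE_DMA32));
        return 0;
    }
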
@@ -872,6 +1080,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
872 | swp_entry_t swap; | 1080 | swp_entry_t swap; |
873 | int error; | 1081 | int error; |
874 | int once = 0; | 1082 | int once = 0; |
1083 | int alloced = 0; | ||
875 | 1084 | ||
876 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) | 1085 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
877 | return -EFBIG; | 1086 | return -EFBIG; |
@@ -883,19 +1092,21 @@ repeat: | |||
883 | page = NULL; | 1092 | page = NULL; |
884 | } | 1093 | } |
885 | 1094 | ||
886 | if (sgp != SGP_WRITE && | 1095 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
887 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1096 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
888 | error = -EINVAL; | 1097 | error = -EINVAL; |
889 | goto failed; | 1098 | goto failed; |
890 | } | 1099 | } |
891 | 1100 | ||
1101 | /* fallocated page? */ | ||
1102 | if (page && !PageUptodate(page)) { | ||
1103 | if (sgp != SGP_READ) | ||
1104 | goto clear; | ||
1105 | unlock_page(page); | ||
1106 | page_cache_release(page); | ||
1107 | page = NULL; | ||
1108 | } | ||
892 | if (page || (sgp == SGP_READ && !swap.val)) { | 1109 | if (page || (sgp == SGP_READ && !swap.val)) { |
893 | /* | ||
894 | * Once we can get the page lock, it must be uptodate: | ||
895 | * if there were an error in reading back from swap, | ||
896 | * the page would not be inserted into the filecache. | ||
897 | */ | ||
898 | BUG_ON(page && !PageUptodate(page)); | ||
899 | *pagep = page; | 1110 | *pagep = page; |
900 | return 0; | 1111 | return 0; |
901 | } | 1112 | } |
@@ -923,26 +1134,31 @@ repeat: | |||
923 | 1134 | ||
924 | /* We have to do this with page locked to prevent races */ | 1135 | /* We have to do this with page locked to prevent races */ |
925 | lock_page(page); | 1136 | lock_page(page); |
1137 | if (!PageSwapCache(page) || page_private(page) != swap.val || | ||
1138 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1139 | error = -EEXIST; /* try again */ | ||
1140 | goto unlock; | ||
1141 | } | ||
926 | if (!PageUptodate(page)) { | 1142 | if (!PageUptodate(page)) { |
927 | error = -EIO; | 1143 | error = -EIO; |
928 | goto failed; | 1144 | goto failed; |
929 | } | 1145 | } |
930 | wait_on_page_writeback(page); | 1146 | wait_on_page_writeback(page); |
931 | 1147 | ||
932 | /* Someone may have already done it for us */ | 1148 | if (shmem_should_replace_page(page, gfp)) { |
933 | if (page->mapping) { | 1149 | error = shmem_replace_page(&page, gfp, info, index); |
934 | if (page->mapping == mapping && | 1150 | if (error) |
935 | page->index == index) | 1151 | goto failed; |
936 | goto done; | ||
937 | error = -EEXIST; | ||
938 | goto failed; | ||
939 | } | 1152 | } |
940 | 1153 | ||
941 | error = mem_cgroup_cache_charge(page, current->mm, | 1154 | error = mem_cgroup_cache_charge(page, current->mm, |
942 | gfp & GFP_RECLAIM_MASK); | 1155 | gfp & GFP_RECLAIM_MASK); |
943 | if (!error) | 1156 | if (!error) { |
944 | error = shmem_add_to_page_cache(page, mapping, index, | 1157 | error = shmem_add_to_page_cache(page, mapping, index, |
945 | gfp, swp_to_radix_entry(swap)); | 1158 | gfp, swp_to_radix_entry(swap)); |
1159 | /* We already confirmed swap, and make no allocation */ | ||
1160 | VM_BUG_ON(error); | ||
1161 | } | ||
946 | if (error) | 1162 | if (error) |
947 | goto failed; | 1163 | goto failed; |
948 | 1164 | ||
@@ -979,11 +1195,18 @@ repeat: | |||
979 | __set_page_locked(page); | 1195 | __set_page_locked(page); |
980 | error = mem_cgroup_cache_charge(page, current->mm, | 1196 | error = mem_cgroup_cache_charge(page, current->mm, |
981 | gfp & GFP_RECLAIM_MASK); | 1197 | gfp & GFP_RECLAIM_MASK); |
982 | if (!error) | ||
983 | error = shmem_add_to_page_cache(page, mapping, index, | ||
984 | gfp, NULL); | ||
985 | if (error) | 1198 | if (error) |
986 | goto decused; | 1199 | goto decused; |
1200 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
1201 | if (!error) { | ||
1202 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1203 | gfp, NULL); | ||
1204 | radix_tree_preload_end(); | ||
1205 | } | ||
1206 | if (error) { | ||
1207 | mem_cgroup_uncharge_cache_page(page); | ||
1208 | goto decused; | ||
1209 | } | ||
987 | lru_cache_add_anon(page); | 1210 | lru_cache_add_anon(page); |
988 | 1211 | ||
989 | spin_lock(&info->lock); | 1212 | spin_lock(&info->lock); |
@@ -991,19 +1214,36 @@ repeat: | |||
991 | inode->i_blocks += BLOCKS_PER_PAGE; | 1214 | inode->i_blocks += BLOCKS_PER_PAGE; |
992 | shmem_recalc_inode(inode); | 1215 | shmem_recalc_inode(inode); |
993 | spin_unlock(&info->lock); | 1216 | spin_unlock(&info->lock); |
1217 | alloced = true; | ||
994 | 1218 | ||
995 | clear_highpage(page); | 1219 | /* |
996 | flush_dcache_page(page); | 1220 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
997 | SetPageUptodate(page); | 1221 | */ |
1222 | if (sgp == SGP_FALLOC) | ||
1223 | sgp = SGP_WRITE; | ||
1224 | clear: | ||
1225 | /* | ||
1226 | * Let SGP_WRITE caller clear ends if write does not fill page; | ||
1227 | * but SGP_FALLOC on a page fallocated earlier must initialize | ||
1228 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1229 | */ | ||
1230 | if (sgp != SGP_WRITE) { | ||
1231 | clear_highpage(page); | ||
1232 | flush_dcache_page(page); | ||
1233 | SetPageUptodate(page); | ||
1234 | } | ||
998 | if (sgp == SGP_DIRTY) | 1235 | if (sgp == SGP_DIRTY) |
999 | set_page_dirty(page); | 1236 | set_page_dirty(page); |
1000 | } | 1237 | } |
1001 | done: | 1238 | |
1002 | /* Perhaps the file has been truncated since we checked */ | 1239 | /* Perhaps the file has been truncated since we checked */ |
1003 | if (sgp != SGP_WRITE && | 1240 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
1004 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1241 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1005 | error = -EINVAL; | 1242 | error = -EINVAL; |
1006 | goto trunc; | 1243 | if (alloced) |
1244 | goto trunc; | ||
1245 | else | ||
1246 | goto failed; | ||
1007 | } | 1247 | } |
1008 | *pagep = page; | 1248 | *pagep = page; |
1009 | return 0; | 1249 | return 0; |
@@ -1012,6 +1252,7 @@ done: | |||
1012 | * Error recovery. | 1252 | * Error recovery. |
1013 | */ | 1253 | */ |
1014 | trunc: | 1254 | trunc: |
1255 | info = SHMEM_I(inode); | ||
1015 | ClearPageDirty(page); | 1256 | ClearPageDirty(page); |
1016 | delete_from_page_cache(page); | 1257 | delete_from_page_cache(page); |
1017 | spin_lock(&info->lock); | 1258 | spin_lock(&info->lock); |
@@ -1019,19 +1260,16 @@ trunc: | |||
1019 | inode->i_blocks -= BLOCKS_PER_PAGE; | 1260 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1020 | spin_unlock(&info->lock); | 1261 | spin_unlock(&info->lock); |
1021 | decused: | 1262 | decused: |
1263 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1022 | if (sbinfo->max_blocks) | 1264 | if (sbinfo->max_blocks) |
1023 | percpu_counter_add(&sbinfo->used_blocks, -1); | 1265 | percpu_counter_add(&sbinfo->used_blocks, -1); |
1024 | unacct: | 1266 | unacct: |
1025 | shmem_unacct_blocks(info->flags, 1); | 1267 | shmem_unacct_blocks(info->flags, 1); |
1026 | failed: | 1268 | failed: |
1027 | if (swap.val && error != -EINVAL) { | 1269 | if (swap.val && error != -EINVAL && |
1028 | struct page *test = find_get_page(mapping, index); | 1270 | !shmem_confirm_swap(mapping, index, swap)) |
1029 | if (test && !radix_tree_exceptional_entry(test)) | 1271 | error = -EEXIST; |
1030 | page_cache_release(test); | 1272 | unlock: |
1031 | /* Have another try if the entry has changed */ | ||
1032 | if (test != swp_to_radix_entry(swap)) | ||
1033 | error = -EEXIST; | ||
1034 | } | ||
1035 | if (page) { | 1273 | if (page) { |
1036 | unlock_page(page); | 1274 | unlock_page(page); |
1037 | page_cache_release(page); | 1275 | page_cache_release(page); |
@@ -1043,7 +1281,7 @@ failed: | |||
1043 | spin_unlock(&info->lock); | 1281 | spin_unlock(&info->lock); |
1044 | goto repeat; | 1282 | goto repeat; |
1045 | } | 1283 | } |
1046 | if (error == -EEXIST) | 1284 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
1047 | goto repeat; | 1285 | goto repeat; |
1048 | return error; | 1286 | return error; |
1049 | } | 1287 | } |
@@ -1204,6 +1442,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1204 | if (pos + copied > inode->i_size) | 1442 | if (pos + copied > inode->i_size) |
1205 | i_size_write(inode, pos + copied); | 1443 | i_size_write(inode, pos + copied); |
1206 | 1444 | ||
1445 | if (!PageUptodate(page)) { | ||
1446 | if (copied < PAGE_CACHE_SIZE) { | ||
1447 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||
1448 | zero_user_segments(page, 0, from, | ||
1449 | from + copied, PAGE_CACHE_SIZE); | ||
1450 | } | ||
1451 | SetPageUptodate(page); | ||
1452 | } | ||
1207 | set_page_dirty(page); | 1453 | set_page_dirty(page); |
1208 | unlock_page(page); | 1454 | unlock_page(page); |
1209 | page_cache_release(page); | 1455 | page_cache_release(page); |
@@ -1365,6 +1611,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1365 | struct splice_pipe_desc spd = { | 1611 | struct splice_pipe_desc spd = { |
1366 | .pages = pages, | 1612 | .pages = pages, |
1367 | .partial = partial, | 1613 | .partial = partial, |
1614 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1368 | .flags = flags, | 1615 | .flags = flags, |
1369 | .ops = &page_cache_pipe_buf_ops, | 1616 | .ops = &page_cache_pipe_buf_ops, |
1370 | .spd_release = spd_release_page, | 1617 | .spd_release = spd_release_page, |
@@ -1453,7 +1700,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1453 | if (spd.nr_pages) | 1700 | if (spd.nr_pages) |
1454 | error = splice_to_pipe(pipe, &spd); | 1701 | error = splice_to_pipe(pipe, &spd); |
1455 | 1702 | ||
1456 | splice_shrink_spd(pipe, &spd); | 1703 | splice_shrink_spd(&spd); |
1457 | 1704 | ||
1458 | if (error > 0) { | 1705 | if (error > 0) { |
1459 | *ppos += error; | 1706 | *ppos += error; |
@@ -1462,6 +1709,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1462 | return error; | 1709 | return error; |
1463 | } | 1710 | } |
1464 | 1711 | ||
1712 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | ||
1713 | loff_t len) | ||
1714 | { | ||
1715 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1716 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
1717 | struct shmem_falloc shmem_falloc; | ||
1718 | pgoff_t start, index, end; | ||
1719 | int error; | ||
1720 | |||
1721 | mutex_lock(&inode->i_mutex); | ||
1722 | |||
1723 | if (mode & FALLOC_FL_PUNCH_HOLE) { | ||
1724 | struct address_space *mapping = file->f_mapping; | ||
1725 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | ||
1726 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | ||
1727 | |||
1728 | if ((u64)unmap_end > (u64)unmap_start) | ||
1729 | unmap_mapping_range(mapping, unmap_start, | ||
1730 | 1 + unmap_end - unmap_start, 0); | ||
1731 | shmem_truncate_range(inode, offset, offset + len - 1); | ||
1732 | /* No need to unmap again: hole-punching leaves COWed pages */ | ||
1733 | error = 0; | ||
1734 | goto out; | ||
1735 | } | ||
1736 | |||
1737 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | ||
1738 | error = inode_newsize_ok(inode, offset + len); | ||
1739 | if (error) | ||
1740 | goto out; | ||
1741 | |||
1742 | start = offset >> PAGE_CACHE_SHIFT; | ||
1743 | end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1744 | /* Try to avoid a swapstorm if len is impossible to satisfy */ | ||
1745 | if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { | ||
1746 | error = -ENOSPC; | ||
1747 | goto out; | ||
1748 | } | ||
1749 | |||
1750 | shmem_falloc.start = start; | ||
1751 | shmem_falloc.next = start; | ||
1752 | shmem_falloc.nr_falloced = 0; | ||
1753 | shmem_falloc.nr_unswapped = 0; | ||
1754 | spin_lock(&inode->i_lock); | ||
1755 | inode->i_private = &shmem_falloc; | ||
1756 | spin_unlock(&inode->i_lock); | ||
1757 | |||
1758 | for (index = start; index < end; index++) { | ||
1759 | struct page *page; | ||
1760 | |||
1761 | /* | ||
1762 | * Good, the fallocate(2) manpage permits EINTR: we may have | ||
1763 | * been interrupted because we are using up too much memory. | ||
1764 | */ | ||
1765 | if (signal_pending(current)) | ||
1766 | error = -EINTR; | ||
1767 | else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) | ||
1768 | error = -ENOMEM; | ||
1769 | else | ||
1770 | error = shmem_getpage(inode, index, &page, SGP_FALLOC, | ||
1771 | NULL); | ||
1772 | if (error) { | ||
1773 | /* Remove the !PageUptodate pages we added */ | ||
1774 | shmem_undo_range(inode, | ||
1775 | (loff_t)start << PAGE_CACHE_SHIFT, | ||
1776 | (loff_t)index << PAGE_CACHE_SHIFT, true); | ||
1777 | goto undone; | ||
1778 | } | ||
1779 | |||
1780 | /* | ||
1781 | * Inform shmem_writepage() how far we have reached. | ||
1782 | * No need for lock or barrier: we have the page lock. | ||
1783 | */ | ||
1784 | shmem_falloc.next++; | ||
1785 | if (!PageUptodate(page)) | ||
1786 | shmem_falloc.nr_falloced++; | ||
1787 | |||
1788 | /* | ||
1789 | * If !PageUptodate, leave it that way so that freeable pages | ||
1790 | * can be recognized if we need to rollback on error later. | ||
1791 | * But set_page_dirty so that memory pressure will swap rather | ||
1792 | * than free the pages we are allocating (and SGP_CACHE pages | ||
1793 | * might still be clean: we now need to mark those dirty too). | ||
1794 | */ | ||
1795 | set_page_dirty(page); | ||
1796 | unlock_page(page); | ||
1797 | page_cache_release(page); | ||
1798 | cond_resched(); | ||
1799 | } | ||
1800 | |||
1801 | if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) | ||
1802 | i_size_write(inode, offset + len); | ||
1803 | inode->i_ctime = CURRENT_TIME; | ||
1804 | undone: | ||
1805 | spin_lock(&inode->i_lock); | ||
1806 | inode->i_private = NULL; | ||
1807 | spin_unlock(&inode->i_lock); | ||
1808 | out: | ||
1809 | mutex_unlock(&inode->i_mutex); | ||
1810 | return error; | ||
1811 | } | ||
1812 | |||
1465 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1813 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1466 | { | 1814 | { |
1467 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1815 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
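
Note: with shmem_fallocate() wired up above, tmpfs files can be preallocated and hole-punched from user space. A small sketch exercising both modes follows; /dev/shm/demo is just an example path on a tmpfs mount, and the sizes are arbitrary.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/dev/shm/demo", O_RDWR | O_CREAT, 0600);
        if (fd < 0)
            return 1;

        /* Reserve 1 MiB up front: failure shows up as ENOSPC here, not later. */
        if (fallocate(fd, 0, 0, 1 << 20) != 0)
            perror("fallocate");

        /* Return the middle 256 KiB to the system; the file size is kept. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      256 << 10, 256 << 10) != 0)
            perror("punch hole");

        close(fd);
        unlink("/dev/shm/demo");
        return 0;
    }
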
@@ -1531,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
1531 | } | 1879 | } |
1532 | 1880 | ||
1533 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 1881 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
1534 | struct nameidata *nd) | 1882 | bool excl) |
1535 | { | 1883 | { |
1536 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); | 1884 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); |
1537 | } | 1885 | } |
@@ -1665,6 +2013,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1665 | kaddr = kmap_atomic(page); | 2013 | kaddr = kmap_atomic(page); |
1666 | memcpy(kaddr, symname, len); | 2014 | memcpy(kaddr, symname, len); |
1667 | kunmap_atomic(kaddr); | 2015 | kunmap_atomic(kaddr); |
2016 | SetPageUptodate(page); | ||
1668 | set_page_dirty(page); | 2017 | set_page_dirty(page); |
1669 | unlock_page(page); | 2018 | unlock_page(page); |
1670 | page_cache_release(page); | 2019 | page_cache_release(page); |
@@ -2033,11 +2382,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, | |||
2033 | return dentry; | 2382 | return dentry; |
2034 | } | 2383 | } |
2035 | 2384 | ||
2036 | static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, | 2385 | static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, |
2037 | int connectable) | 2386 | struct inode *parent) |
2038 | { | 2387 | { |
2039 | struct inode *inode = dentry->d_inode; | ||
2040 | |||
2041 | if (*len < 3) { | 2388 | if (*len < 3) { |
2042 | *len = 3; | 2389 | *len = 3; |
2043 | return 255; | 2390 | return 255; |
@@ -2075,6 +2422,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2075 | bool remount) | 2422 | bool remount) |
2076 | { | 2423 | { |
2077 | char *this_char, *value, *rest; | 2424 | char *this_char, *value, *rest; |
2425 | uid_t uid; | ||
2426 | gid_t gid; | ||
2078 | 2427 | ||
2079 | while (options != NULL) { | 2428 | while (options != NULL) { |
2080 | this_char = options; | 2429 | this_char = options; |
@@ -2134,15 +2483,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2134 | } else if (!strcmp(this_char,"uid")) { | 2483 | } else if (!strcmp(this_char,"uid")) { |
2135 | if (remount) | 2484 | if (remount) |
2136 | continue; | 2485 | continue; |
2137 | sbinfo->uid = simple_strtoul(value, &rest, 0); | 2486 | uid = simple_strtoul(value, &rest, 0); |
2138 | if (*rest) | 2487 | if (*rest) |
2139 | goto bad_val; | 2488 | goto bad_val; |
2489 | sbinfo->uid = make_kuid(current_user_ns(), uid); | ||
2490 | if (!uid_valid(sbinfo->uid)) | ||
2491 | goto bad_val; | ||
2140 | } else if (!strcmp(this_char,"gid")) { | 2492 | } else if (!strcmp(this_char,"gid")) { |
2141 | if (remount) | 2493 | if (remount) |
2142 | continue; | 2494 | continue; |
2143 | sbinfo->gid = simple_strtoul(value, &rest, 0); | 2495 | gid = simple_strtoul(value, &rest, 0); |
2144 | if (*rest) | 2496 | if (*rest) |
2145 | goto bad_val; | 2497 | goto bad_val; |
2498 | sbinfo->gid = make_kgid(current_user_ns(), gid); | ||
2499 | if (!gid_valid(sbinfo->gid)) | ||
2500 | goto bad_val; | ||
2146 | } else if (!strcmp(this_char,"mpol")) { | 2501 | } else if (!strcmp(this_char,"mpol")) { |
2147 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) | 2502 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) |
2148 | goto bad_val; | 2503 | goto bad_val; |
@@ -2210,10 +2565,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) | |||
2210 | seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); | 2565 | seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); |
2211 | if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) | 2566 | if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) |
2212 | seq_printf(seq, ",mode=%03ho", sbinfo->mode); | 2567 | seq_printf(seq, ",mode=%03ho", sbinfo->mode); |
2213 | if (sbinfo->uid != 0) | 2568 | if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) |
2214 | seq_printf(seq, ",uid=%u", sbinfo->uid); | 2569 | seq_printf(seq, ",uid=%u", |
2215 | if (sbinfo->gid != 0) | 2570 | from_kuid_munged(&init_user_ns, sbinfo->uid)); |
2216 | seq_printf(seq, ",gid=%u", sbinfo->gid); | 2571 | if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) |
2572 | seq_printf(seq, ",gid=%u", | ||
2573 | from_kgid_munged(&init_user_ns, sbinfo->gid)); | ||
2217 | shmem_show_mpol(seq, sbinfo->mpol); | 2574 | shmem_show_mpol(seq, sbinfo->mpol); |
2218 | return 0; | 2575 | return 0; |
2219 | } | 2576 | } |
@@ -2260,6 +2617,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2260 | } | 2617 | } |
2261 | } | 2618 | } |
2262 | sb->s_export_op = &shmem_export_ops; | 2619 | sb->s_export_op = &shmem_export_ops; |
2620 | sb->s_flags |= MS_NOSEC; | ||
2263 | #else | 2621 | #else |
2264 | sb->s_flags |= MS_NOUSER; | 2622 | sb->s_flags |= MS_NOUSER; |
2265 | #endif | 2623 | #endif |
@@ -2362,12 +2720,12 @@ static const struct file_operations shmem_file_operations = { | |||
2362 | .fsync = noop_fsync, | 2720 | .fsync = noop_fsync, |
2363 | .splice_read = shmem_file_splice_read, | 2721 | .splice_read = shmem_file_splice_read, |
2364 | .splice_write = generic_file_splice_write, | 2722 | .splice_write = generic_file_splice_write, |
2723 | .fallocate = shmem_fallocate, | ||
2365 | #endif | 2724 | #endif |
2366 | }; | 2725 | }; |
2367 | 2726 | ||
2368 | static const struct inode_operations shmem_inode_operations = { | 2727 | static const struct inode_operations shmem_inode_operations = { |
2369 | .setattr = shmem_setattr, | 2728 | .setattr = shmem_setattr, |
2370 | .truncate_range = shmem_truncate_range, | ||
2371 | #ifdef CONFIG_TMPFS_XATTR | 2729 | #ifdef CONFIG_TMPFS_XATTR |
2372 | .setxattr = shmem_setxattr, | 2730 | .setxattr = shmem_setxattr, |
2373 | .getxattr = shmem_getxattr, | 2731 | .getxattr = shmem_getxattr, |
@@ -68,7 +68,7 @@ | |||
68 | * Further notes from the original documentation: | 68 | * Further notes from the original documentation: |
69 | * | 69 | * |
70 | * 11 April '97. Started multi-threading - markhe | 70 | * 11 April '97. Started multi-threading - markhe |
71 | * The global cache-chain is protected by the mutex 'cache_chain_mutex'. | 71 | * The global cache-chain is protected by the mutex 'slab_mutex'. |
72 | * The sem is only needed when accessing/extending the cache-chain, which | 72 | * The sem is only needed when accessing/extending the cache-chain, which |
73 | * can never happen inside an interrupt (kmem_cache_create(), | 73 | * can never happen inside an interrupt (kmem_cache_create(), |
74 | * kmem_cache_shrink() and kmem_cache_reap()). | 74 | * kmem_cache_shrink() and kmem_cache_reap()). |
@@ -87,6 +87,7 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
90 | #include <linux/mm.h> | 91 | #include <linux/mm.h> |
91 | #include <linux/poison.h> | 92 | #include <linux/poison.h> |
92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> |
@@ -117,12 +118,16 @@ | |||
117 | #include <linux/memory.h> | 118 | #include <linux/memory.h> |
118 | #include <linux/prefetch.h> | 119 | #include <linux/prefetch.h> |
119 | 120 | ||
121 | #include <net/sock.h> | ||
122 | |||
120 | #include <asm/cacheflush.h> | 123 | #include <asm/cacheflush.h> |
121 | #include <asm/tlbflush.h> | 124 | #include <asm/tlbflush.h> |
122 | #include <asm/page.h> | 125 | #include <asm/page.h> |
123 | 126 | ||
124 | #include <trace/events/kmem.h> | 127 | #include <trace/events/kmem.h> |
125 | 128 | ||
129 | #include "internal.h" | ||
130 | |||
126 | /* | 131 | /* |
127 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
128 | * 0 for faster, smaller code (especially in the critical paths). | 133 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -151,6 +156,12 @@ | |||
151 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 156 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
152 | #endif | 157 | #endif |
153 | 158 | ||
159 | /* | ||
160 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
161 | * swap | ||
162 | */ | ||
163 | static bool pfmemalloc_active __read_mostly; | ||
164 | |||
154 | /* Legal flag mask for kmem_cache_create(). */ | 165 | /* Legal flag mask for kmem_cache_create(). */ |
155 | #if DEBUG | 166 | #if DEBUG |
156 | # define CREATE_MASK (SLAB_RED_ZONE | \ | 167 | # define CREATE_MASK (SLAB_RED_ZONE | \ |
@@ -256,9 +267,30 @@ struct array_cache { | |||
256 | * Must have this definition in here for the proper | 267 | * Must have this definition in here for the proper |
257 | * alignment of array_cache. Also simplifies accessing | 268 | * alignment of array_cache. Also simplifies accessing |
258 | * the entries. | 269 | * the entries. |
270 | * | ||
271 | * Entries should not be directly dereferenced as | ||
272 | * entries belonging to slabs marked pfmemalloc will | ||
273 | * have the lower bits set SLAB_OBJ_PFMEMALLOC | ||
259 | */ | 274 | */ |
260 | }; | 275 | }; |
261 | 276 | ||
277 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
278 | static inline bool is_obj_pfmemalloc(void *objp) | ||
279 | { | ||
280 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
281 | } | ||
282 | |||
283 | static inline void set_obj_pfmemalloc(void **objp) | ||
284 | { | ||
285 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | static inline void clear_obj_pfmemalloc(void **objp) | ||
290 | { | ||
291 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
292 | } | ||
293 | |||
262 | /* | 294 | /* |
263 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 295 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
264 | * cpuarrays are allocated from the generic caches... | 296 | * cpuarrays are allocated from the generic caches... |
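
Note: the SLAB_OBJ_PFMEMALLOC helpers above exploit the fact that slab objects are at least word-aligned, so bit 0 of an object pointer is always zero and can carry a per-entry flag without extra storage. A self-contained illustration of that tagging trick follows; the names tag/untag/is_tagged are made up for the sketch.

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define OBJ_TAG 1UL

    static int   is_tagged(void *p) { return (unsigned long)p & OBJ_TAG; }
    static void *tag(void *p)       { return (void *)((unsigned long)p | OBJ_TAG); }
    static void *untag(void *p)     { return (void *)((unsigned long)p & ~OBJ_TAG); }

    int main(void)
    {
        long *obj = malloc(sizeof(*obj));   /* malloc returns suitably aligned memory */
        void *entry = tag(obj);             /* flag and pointer packed into one word */

        assert(is_tagged(entry));
        *(long *)untag(entry) = 42;         /* must strip the tag before dereferencing */
        printf("%ld\n", *obj);

        free(obj);
        return 0;
    }
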
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
424 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: | 456 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: |
425 | * redzone word. | 457 | * redzone word. |
426 | * cachep->obj_offset: The real object. | 458 | * cachep->obj_offset: The real object. |
427 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 459 | * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
428 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address | 460 | * cachep->size - 1* BYTES_PER_WORD: last caller address |
429 | * [BYTES_PER_WORD long] | 461 | * [BYTES_PER_WORD long] |
430 | */ | 462 | */ |
431 | static int obj_offset(struct kmem_cache *cachep) | 463 | static int obj_offset(struct kmem_cache *cachep) |
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep) | |||
433 | return cachep->obj_offset; | 465 | return cachep->obj_offset; |
434 | } | 466 | } |
435 | 467 | ||
436 | static int obj_size(struct kmem_cache *cachep) | ||
437 | { | ||
438 | return cachep->obj_size; | ||
439 | } | ||
440 | |||
441 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 468 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
442 | { | 469 | { |
443 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 470 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | |||
449 | { | 476 | { |
450 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 477 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
451 | if (cachep->flags & SLAB_STORE_USER) | 478 | if (cachep->flags & SLAB_STORE_USER) |
452 | return (unsigned long long *)(objp + cachep->buffer_size - | 479 | return (unsigned long long *)(objp + cachep->size - |
453 | sizeof(unsigned long long) - | 480 | sizeof(unsigned long long) - |
454 | REDZONE_ALIGN); | 481 | REDZONE_ALIGN); |
455 | return (unsigned long long *) (objp + cachep->buffer_size - | 482 | return (unsigned long long *) (objp + cachep->size - |
456 | sizeof(unsigned long long)); | 483 | sizeof(unsigned long long)); |
457 | } | 484 | } |
458 | 485 | ||
459 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 486 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
460 | { | 487 | { |
461 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 488 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
462 | return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); | 489 | return (void **)(objp + cachep->size - BYTES_PER_WORD); |
463 | } | 490 | } |
464 | 491 | ||
465 | #else | 492 | #else |
466 | 493 | ||
467 | #define obj_offset(x) 0 | 494 | #define obj_offset(x) 0 |
468 | #define obj_size(cachep) (cachep->buffer_size) | ||
469 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 495 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
470 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 496 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
471 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 497 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
475 | #ifdef CONFIG_TRACING | 501 | #ifdef CONFIG_TRACING |
476 | size_t slab_buffer_size(struct kmem_cache *cachep) | 502 | size_t slab_buffer_size(struct kmem_cache *cachep) |
477 | { | 503 | { |
478 | return cachep->buffer_size; | 504 | return cachep->size; |
479 | } | 505 | } |
480 | EXPORT_SYMBOL(slab_buffer_size); | 506 | EXPORT_SYMBOL(slab_buffer_size); |
481 | #endif | 507 | #endif |
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size); | |||
489 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 515 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
490 | static bool slab_max_order_set __initdata; | 516 | static bool slab_max_order_set __initdata; |
491 | 517 | ||
492 | /* | ||
493 | * Functions for storing/retrieving the cachep and or slab from the page | ||
494 | * allocator. These are used to find the slab an obj belongs to. With kfree(), | ||
495 | * these are used to find the cache which an obj belongs to. | ||
496 | */ | ||
497 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | ||
498 | { | ||
499 | page->lru.next = (struct list_head *)cache; | ||
500 | } | ||
501 | |||
502 | static inline struct kmem_cache *page_get_cache(struct page *page) | 518 | static inline struct kmem_cache *page_get_cache(struct page *page) |
503 | { | 519 | { |
504 | page = compound_head(page); | 520 | page = compound_head(page); |
505 | BUG_ON(!PageSlab(page)); | 521 | BUG_ON(!PageSlab(page)); |
506 | return (struct kmem_cache *)page->lru.next; | 522 | return page->slab_cache; |
507 | } | ||
508 | |||
509 | static inline void page_set_slab(struct page *page, struct slab *slab) | ||
510 | { | ||
511 | page->lru.prev = (struct list_head *)slab; | ||
512 | } | ||
513 | |||
514 | static inline struct slab *page_get_slab(struct page *page) | ||
515 | { | ||
516 | BUG_ON(!PageSlab(page)); | ||
517 | return (struct slab *)page->lru.prev; | ||
518 | } | 523 | } |
519 | 524 | ||
520 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 525 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
521 | { | 526 | { |
522 | struct page *page = virt_to_head_page(obj); | 527 | struct page *page = virt_to_head_page(obj); |
523 | return page_get_cache(page); | 528 | return page->slab_cache; |
524 | } | 529 | } |
525 | 530 | ||
526 | static inline struct slab *virt_to_slab(const void *obj) | 531 | static inline struct slab *virt_to_slab(const void *obj) |
527 | { | 532 | { |
528 | struct page *page = virt_to_head_page(obj); | 533 | struct page *page = virt_to_head_page(obj); |
529 | return page_get_slab(page); | 534 | |
535 | VM_BUG_ON(!PageSlab(page)); | ||
536 | return page->slab_page; | ||
530 | } | 537 | } |
531 | 538 | ||
532 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, | 539 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
533 | unsigned int idx) | 540 | unsigned int idx) |
534 | { | 541 | { |
535 | return slab->s_mem + cache->buffer_size * idx; | 542 | return slab->s_mem + cache->size * idx; |
536 | } | 543 | } |
537 | 544 | ||
538 | /* | 545 | /* |
539 | * We want to avoid an expensive divide : (offset / cache->buffer_size) | 546 | * We want to avoid an expensive divide : (offset / cache->size) |
540 | * Using the fact that buffer_size is a constant for a particular cache, | 547 | * Using the fact that size is a constant for a particular cache, |
541 | * we can replace (offset / cache->buffer_size) by | 548 | * we can replace (offset / cache->size) by |
542 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | 549 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) |
543 | */ | 550 | */ |
544 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | 551 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, |
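
The comment above describes the reciprocal-divide trick obj_to_index() relies on: precompute a per-cache reciprocal once, then replace the division by a multiply and a shift. A stand-alone user-space sketch of the idea; the kernel's own helpers live in include/linux/reciprocal_div.h and may differ in detail:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Precompute once per cache: roughly ceil(2^32 / size). */
static uint32_t reciprocal_value(uint32_t size)
{
	return (uint32_t)(0xffffffffULL / size) + 1;
}

/* offset / size without a divide: multiply and shift. */
static uint32_t reciprocal_divide(uint32_t offset, uint32_t reciprocal)
{
	return (uint32_t)(((uint64_t)offset * reciprocal) >> 32);
}

int main(void)
{
	uint32_t size = 192;			/* stand-in for cache->size */
	uint32_t reciprocal = reciprocal_value(size);
	uint32_t offset = 5 * size;		/* models obj - slab->s_mem */

	/* Exact for offsets that are multiples of size, as slab offsets are. */
	assert(reciprocal_divide(offset, reciprocal) == offset / size);
	printf("index = %u\n", reciprocal_divide(offset, reciprocal));
	return 0;
}
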
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = { | |||
584 | .batchcount = 1, | 591 | .batchcount = 1, |
585 | .limit = BOOT_CPUCACHE_ENTRIES, | 592 | .limit = BOOT_CPUCACHE_ENTRIES, |
586 | .shared = 1, | 593 | .shared = 1, |
587 | .buffer_size = sizeof(struct kmem_cache), | 594 | .size = sizeof(struct kmem_cache), |
588 | .name = "kmem_cache", | 595 | .name = "kmem_cache", |
589 | }; | 596 | }; |
590 | 597 | ||
591 | #define BAD_ALIEN_MAGIC 0x01020304ul | 598 | #define BAD_ALIEN_MAGIC 0x01020304ul |
592 | 599 | ||
593 | /* | ||
594 | * chicken and egg problem: delay the per-cpu array allocation | ||
595 | * until the general caches are up. | ||
596 | */ | ||
597 | static enum { | ||
598 | NONE, | ||
599 | PARTIAL_AC, | ||
600 | PARTIAL_L3, | ||
601 | EARLY, | ||
602 | LATE, | ||
603 | FULL | ||
604 | } g_cpucache_up; | ||
605 | |||
606 | /* | ||
607 | * used by boot code to determine if it can use slab based allocator | ||
608 | */ | ||
609 | int slab_is_available(void) | ||
610 | { | ||
611 | return g_cpucache_up >= EARLY; | ||
612 | } | ||
613 | |||
614 | #ifdef CONFIG_LOCKDEP | 600 | #ifdef CONFIG_LOCKDEP |
615 | 601 | ||
616 | /* | 602 | /* |
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q) | |||
676 | { | 662 | { |
677 | struct cache_sizes *s = malloc_sizes; | 663 | struct cache_sizes *s = malloc_sizes; |
678 | 664 | ||
679 | if (g_cpucache_up < LATE) | 665 | if (slab_state < UP) |
680 | return; | 666 | return; |
681 | 667 | ||
682 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 668 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | |||
716 | } | 702 | } |
717 | #endif | 703 | #endif |
718 | 704 | ||
719 | /* | ||
720 | * Guard access to the cache-chain. | ||
721 | */ | ||
722 | static DEFINE_MUTEX(cache_chain_mutex); | ||
723 | static struct list_head cache_chain; | ||
724 | |||
725 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 705 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
726 | 706 | ||
727 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 707 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
951 | return nc; | 931 | return nc; |
952 | } | 932 | } |
953 | 933 | ||
934 | static inline bool is_slab_pfmemalloc(struct slab *slabp) | ||
935 | { | ||
936 | struct page *page = virt_to_page(slabp->s_mem); | ||
937 | |||
938 | return PageSlabPfmemalloc(page); | ||
939 | } | ||
940 | |||
941 | /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ | ||
942 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
943 | struct array_cache *ac) | ||
944 | { | ||
945 | struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; | ||
946 | struct slab *slabp; | ||
947 | unsigned long flags; | ||
948 | |||
949 | if (!pfmemalloc_active) | ||
950 | return; | ||
951 | |||
952 | spin_lock_irqsave(&l3->list_lock, flags); | ||
953 | list_for_each_entry(slabp, &l3->slabs_full, list) | ||
954 | if (is_slab_pfmemalloc(slabp)) | ||
955 | goto out; | ||
956 | |||
957 | list_for_each_entry(slabp, &l3->slabs_partial, list) | ||
958 | if (is_slab_pfmemalloc(slabp)) | ||
959 | goto out; | ||
960 | |||
961 | list_for_each_entry(slabp, &l3->slabs_free, list) | ||
962 | if (is_slab_pfmemalloc(slabp)) | ||
963 | goto out; | ||
964 | |||
965 | pfmemalloc_active = false; | ||
966 | out: | ||
967 | spin_unlock_irqrestore(&l3->list_lock, flags); | ||
968 | } | ||
969 | |||
970 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
971 | gfp_t flags, bool force_refill) | ||
972 | { | ||
973 | int i; | ||
974 | void *objp = ac->entry[--ac->avail]; | ||
975 | |||
976 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
977 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
978 | struct kmem_list3 *l3; | ||
979 | |||
980 | if (gfp_pfmemalloc_allowed(flags)) { | ||
981 | clear_obj_pfmemalloc(&objp); | ||
982 | return objp; | ||
983 | } | ||
984 | |||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
986 | for (i = 1; i < ac->avail; i++) { | ||
987 | /* If a !PFMEMALLOC object is found, swap them */ | ||
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
989 | objp = ac->entry[i]; | ||
990 | ac->entry[i] = ac->entry[ac->avail]; | ||
991 | ac->entry[ac->avail] = objp; | ||
992 | return objp; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * If there are empty slabs on the slabs_free list and we are | ||
998 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
999 | */ | ||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | ||
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | ||
1002 | struct slab *slabp = virt_to_slab(objp); | ||
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | ||
1004 | clear_obj_pfmemalloc(&objp); | ||
1005 | recheck_pfmemalloc_active(cachep, ac); | ||
1006 | return objp; | ||
1007 | } | ||
1008 | |||
1009 | /* No !PFMEMALLOC objects available */ | ||
1010 | ac->avail++; | ||
1011 | objp = NULL; | ||
1012 | } | ||
1013 | |||
1014 | return objp; | ||
1015 | } | ||
1016 | |||
1017 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
1018 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
1019 | { | ||
1020 | void *objp; | ||
1021 | |||
1022 | if (unlikely(sk_memalloc_socks())) | ||
1023 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
1024 | else | ||
1025 | objp = ac->entry[--ac->avail]; | ||
1026 | |||
1027 | return objp; | ||
1028 | } | ||
1029 | |||
1030 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1031 | void *objp) | ||
1032 | { | ||
1033 | if (unlikely(pfmemalloc_active)) { | ||
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
1035 | struct page *page = virt_to_page(objp); | ||
1036 | if (PageSlabPfmemalloc(page)) | ||
1037 | set_obj_pfmemalloc(&objp); | ||
1038 | } | ||
1039 | |||
1040 | return objp; | ||
1041 | } | ||
1042 | |||
1043 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1044 | void *objp) | ||
1045 | { | ||
1046 | if (unlikely(sk_memalloc_socks())) | ||
1047 | objp = __ac_put_obj(cachep, ac, objp); | ||
1048 | |||
1049 | ac->entry[ac->avail++] = objp; | ||
1050 | } | ||
1051 | |||
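
The new ac_get_obj()/ac_put_obj() paths above call is_obj_pfmemalloc(), set_obj_pfmemalloc() and clear_obj_pfmemalloc(), which are not shown in this hunk. A plausible sketch, assuming they tag the low bit of the object pointer (always clear for word-aligned slab objects); the names come from the patch, the bodies are an assumption:

#include <stdbool.h>

/* Assumed tag: bit 0 of an object pointer is free while the object sits
 * in an array_cache, so it can mark "came from a pfmemalloc slab". */
#define SLAB_OBJ_PFMEMALLOC	1UL

static inline bool is_obj_pfmemalloc(void *objp)
{
	return ((unsigned long)objp & SLAB_OBJ_PFMEMALLOC) != 0;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

Under that reading, ac_put_obj() marks objects as they enter the per-CPU array and ac_get_obj() either strips the mark or refuses the object, depending on gfp_pfmemalloc_allowed().
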
954 | /* | 1052 | /* |
955 | * Transfer objects in one arraycache to another. | 1053 | * Transfer objects in one arraycache to another. |
956 | * Locking must be handled by the caller. | 1054 | * Locking must be handled by the caller. |
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1127 | STATS_INC_ACOVERFLOW(cachep); | 1225 | STATS_INC_ACOVERFLOW(cachep); |
1128 | __drain_alien_cache(cachep, alien, nodeid); | 1226 | __drain_alien_cache(cachep, alien, nodeid); |
1129 | } | 1227 | } |
1130 | alien->entry[alien->avail++] = objp; | 1228 | ac_put_obj(cachep, alien, objp); |
1131 | spin_unlock(&alien->lock); | 1229 | spin_unlock(&alien->lock); |
1132 | } else { | 1230 | } else { |
1133 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | 1231 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); |
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1145 | * When hotplugging memory or a cpu, existing nodelists are not replaced if | 1243 | * When hotplugging memory or a cpu, existing nodelists are not replaced if |
1146 | * already in use. | 1244 | * already in use. |
1147 | * | 1245 | * |
1148 | * Must hold cache_chain_mutex. | 1246 | * Must hold slab_mutex. |
1149 | */ | 1247 | */ |
1150 | static int init_cache_nodelists_node(int node) | 1248 | static int init_cache_nodelists_node(int node) |
1151 | { | 1249 | { |
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node) | |||
1153 | struct kmem_list3 *l3; | 1251 | struct kmem_list3 *l3; |
1154 | const int memsize = sizeof(struct kmem_list3); | 1252 | const int memsize = sizeof(struct kmem_list3); |
1155 | 1253 | ||
1156 | list_for_each_entry(cachep, &cache_chain, next) { | 1254 | list_for_each_entry(cachep, &slab_caches, list) { |
1157 | /* | 1255 | /* |
1158 | * Set up the size64 kmemlist for cpu before we can | 1256 | * Set up the size64 kmemlist for cpu before we can |
1159 | * begin anything. Make sure some other cpu on this | 1257 | * begin anything. Make sure some other cpu on this |
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node) | |||
1169 | 1267 | ||
1170 | /* | 1268 | /* |
1171 | * The l3s don't come and go as CPUs come and | 1269 | * The l3s don't come and go as CPUs come and |
1172 | * go. cache_chain_mutex is sufficient | 1270 | * go. slab_mutex is sufficient |
1173 | * protection here. | 1271 | * protection here. |
1174 | */ | 1272 | */ |
1175 | cachep->nodelists[node] = l3; | 1273 | cachep->nodelists[node] = l3; |
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1191 | int node = cpu_to_mem(cpu); | 1289 | int node = cpu_to_mem(cpu); |
1192 | const struct cpumask *mask = cpumask_of_node(node); | 1290 | const struct cpumask *mask = cpumask_of_node(node); |
1193 | 1291 | ||
1194 | list_for_each_entry(cachep, &cache_chain, next) { | 1292 | list_for_each_entry(cachep, &slab_caches, list) { |
1195 | struct array_cache *nc; | 1293 | struct array_cache *nc; |
1196 | struct array_cache *shared; | 1294 | struct array_cache *shared; |
1197 | struct array_cache **alien; | 1295 | struct array_cache **alien; |
@@ -1241,7 +1339,7 @@ free_array_cache: | |||
1241 | * the respective cache's slabs, now we can go ahead and | 1339 | * the respective cache's slabs, now we can go ahead and |
1242 | * shrink each nodelist to its limit. | 1340 | * shrink each nodelist to its limit. |
1243 | */ | 1341 | */ |
1244 | list_for_each_entry(cachep, &cache_chain, next) { | 1342 | list_for_each_entry(cachep, &slab_caches, list) { |
1245 | l3 = cachep->nodelists[node]; | 1343 | l3 = cachep->nodelists[node]; |
1246 | if (!l3) | 1344 | if (!l3) |
1247 | continue; | 1345 | continue; |
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1270 | * Now we can go ahead with allocating the shared arrays and | 1368 | * Now we can go ahead with allocating the shared arrays and |
1271 | * array caches | 1369 | * array caches |
1272 | */ | 1370 | */ |
1273 | list_for_each_entry(cachep, &cache_chain, next) { | 1371 | list_for_each_entry(cachep, &slab_caches, list) { |
1274 | struct array_cache *nc; | 1372 | struct array_cache *nc; |
1275 | struct array_cache *shared = NULL; | 1373 | struct array_cache *shared = NULL; |
1276 | struct array_cache **alien = NULL; | 1374 | struct array_cache **alien = NULL; |
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1338 | switch (action) { | 1436 | switch (action) { |
1339 | case CPU_UP_PREPARE: | 1437 | case CPU_UP_PREPARE: |
1340 | case CPU_UP_PREPARE_FROZEN: | 1438 | case CPU_UP_PREPARE_FROZEN: |
1341 | mutex_lock(&cache_chain_mutex); | 1439 | mutex_lock(&slab_mutex); |
1342 | err = cpuup_prepare(cpu); | 1440 | err = cpuup_prepare(cpu); |
1343 | mutex_unlock(&cache_chain_mutex); | 1441 | mutex_unlock(&slab_mutex); |
1344 | break; | 1442 | break; |
1345 | case CPU_ONLINE: | 1443 | case CPU_ONLINE: |
1346 | case CPU_ONLINE_FROZEN: | 1444 | case CPU_ONLINE_FROZEN: |
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1350 | case CPU_DOWN_PREPARE: | 1448 | case CPU_DOWN_PREPARE: |
1351 | case CPU_DOWN_PREPARE_FROZEN: | 1449 | case CPU_DOWN_PREPARE_FROZEN: |
1352 | /* | 1450 | /* |
1353 | * Shutdown cache reaper. Note that the cache_chain_mutex is | 1451 | * Shutdown cache reaper. Note that the slab_mutex is |
1354 | * held so that if cache_reap() is invoked it cannot do | 1452 | * held so that if cache_reap() is invoked it cannot do |
1355 | * anything expensive but will only modify reap_work | 1453 | * anything expensive but will only modify reap_work |
1356 | * and reschedule the timer. | 1454 | * and reschedule the timer. |
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1377 | #endif | 1475 | #endif |
1378 | case CPU_UP_CANCELED: | 1476 | case CPU_UP_CANCELED: |
1379 | case CPU_UP_CANCELED_FROZEN: | 1477 | case CPU_UP_CANCELED_FROZEN: |
1380 | mutex_lock(&cache_chain_mutex); | 1478 | mutex_lock(&slab_mutex); |
1381 | cpuup_canceled(cpu); | 1479 | cpuup_canceled(cpu); |
1382 | mutex_unlock(&cache_chain_mutex); | 1480 | mutex_unlock(&slab_mutex); |
1383 | break; | 1481 | break; |
1384 | } | 1482 | } |
1385 | return notifier_from_errno(err); | 1483 | return notifier_from_errno(err); |
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { | |||
1395 | * Returns -EBUSY if all objects cannot be drained so that the node is not | 1493 | * Returns -EBUSY if all objects cannot be drained so that the node is not |
1396 | * removed. | 1494 | * removed. |
1397 | * | 1495 | * |
1398 | * Must hold cache_chain_mutex. | 1496 | * Must hold slab_mutex. |
1399 | */ | 1497 | */ |
1400 | static int __meminit drain_cache_nodelists_node(int node) | 1498 | static int __meminit drain_cache_nodelists_node(int node) |
1401 | { | 1499 | { |
1402 | struct kmem_cache *cachep; | 1500 | struct kmem_cache *cachep; |
1403 | int ret = 0; | 1501 | int ret = 0; |
1404 | 1502 | ||
1405 | list_for_each_entry(cachep, &cache_chain, next) { | 1503 | list_for_each_entry(cachep, &slab_caches, list) { |
1406 | struct kmem_list3 *l3; | 1504 | struct kmem_list3 *l3; |
1407 | 1505 | ||
1408 | l3 = cachep->nodelists[node]; | 1506 | l3 = cachep->nodelists[node]; |
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self, | |||
1433 | 1531 | ||
1434 | switch (action) { | 1532 | switch (action) { |
1435 | case MEM_GOING_ONLINE: | 1533 | case MEM_GOING_ONLINE: |
1436 | mutex_lock(&cache_chain_mutex); | 1534 | mutex_lock(&slab_mutex); |
1437 | ret = init_cache_nodelists_node(nid); | 1535 | ret = init_cache_nodelists_node(nid); |
1438 | mutex_unlock(&cache_chain_mutex); | 1536 | mutex_unlock(&slab_mutex); |
1439 | break; | 1537 | break; |
1440 | case MEM_GOING_OFFLINE: | 1538 | case MEM_GOING_OFFLINE: |
1441 | mutex_lock(&cache_chain_mutex); | 1539 | mutex_lock(&slab_mutex); |
1442 | ret = drain_cache_nodelists_node(nid); | 1540 | ret = drain_cache_nodelists_node(nid); |
1443 | mutex_unlock(&cache_chain_mutex); | 1541 | mutex_unlock(&slab_mutex); |
1444 | break; | 1542 | break; |
1445 | case MEM_ONLINE: | 1543 | case MEM_ONLINE: |
1446 | case MEM_OFFLINE: | 1544 | case MEM_OFFLINE: |
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void) | |||
1544 | node = numa_mem_id(); | 1642 | node = numa_mem_id(); |
1545 | 1643 | ||
1546 | /* 1) create the cache_cache */ | 1644 | /* 1) create the cache_cache */ |
1547 | INIT_LIST_HEAD(&cache_chain); | 1645 | INIT_LIST_HEAD(&slab_caches); |
1548 | list_add(&cache_cache.next, &cache_chain); | 1646 | list_add(&cache_cache.list, &slab_caches); |
1549 | cache_cache.colour_off = cache_line_size(); | 1647 | cache_cache.colour_off = cache_line_size(); |
1550 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1648 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
1551 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1649 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void) | |||
1553 | /* | 1651 | /* |
1554 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1652 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1555 | */ | 1653 | */ |
1556 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1654 | cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1557 | nr_node_ids * sizeof(struct kmem_list3 *); | 1655 | nr_node_ids * sizeof(struct kmem_list3 *); |
1558 | #if DEBUG | 1656 | cache_cache.object_size = cache_cache.size; |
1559 | cache_cache.obj_size = cache_cache.buffer_size; | 1657 | cache_cache.size = ALIGN(cache_cache.size, |
1560 | #endif | ||
1561 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, | ||
1562 | cache_line_size()); | 1658 | cache_line_size()); |
1563 | cache_cache.reciprocal_buffer_size = | 1659 | cache_cache.reciprocal_buffer_size = |
1564 | reciprocal_value(cache_cache.buffer_size); | 1660 | reciprocal_value(cache_cache.size); |
1565 | 1661 | ||
1566 | for (order = 0; order < MAX_ORDER; order++) { | 1662 | for (order = 0; order < MAX_ORDER; order++) { |
1567 | cache_estimate(order, cache_cache.buffer_size, | 1663 | cache_estimate(order, cache_cache.size, |
1568 | cache_line_size(), 0, &left_over, &cache_cache.num); | 1664 | cache_line_size(), 0, &left_over, &cache_cache.num); |
1569 | if (cache_cache.num) | 1665 | if (cache_cache.num) |
1570 | break; | 1666 | break; |
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void) | |||
1585 | * bug. | 1681 | * bug. |
1586 | */ | 1682 | */ |
1587 | 1683 | ||
1588 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1684 | sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, |
1589 | sizes[INDEX_AC].cs_size, | 1685 | sizes[INDEX_AC].cs_size, |
1590 | ARCH_KMALLOC_MINALIGN, | 1686 | ARCH_KMALLOC_MINALIGN, |
1591 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1687 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void) | |||
1593 | 1689 | ||
1594 | if (INDEX_AC != INDEX_L3) { | 1690 | if (INDEX_AC != INDEX_L3) { |
1595 | sizes[INDEX_L3].cs_cachep = | 1691 | sizes[INDEX_L3].cs_cachep = |
1596 | kmem_cache_create(names[INDEX_L3].name, | 1692 | __kmem_cache_create(names[INDEX_L3].name, |
1597 | sizes[INDEX_L3].cs_size, | 1693 | sizes[INDEX_L3].cs_size, |
1598 | ARCH_KMALLOC_MINALIGN, | 1694 | ARCH_KMALLOC_MINALIGN, |
1599 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1695 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void) | |||
1611 | * allow tighter packing of the smaller caches. | 1707 | * allow tighter packing of the smaller caches. |
1612 | */ | 1708 | */ |
1613 | if (!sizes->cs_cachep) { | 1709 | if (!sizes->cs_cachep) { |
1614 | sizes->cs_cachep = kmem_cache_create(names->name, | 1710 | sizes->cs_cachep = __kmem_cache_create(names->name, |
1615 | sizes->cs_size, | 1711 | sizes->cs_size, |
1616 | ARCH_KMALLOC_MINALIGN, | 1712 | ARCH_KMALLOC_MINALIGN, |
1617 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1713 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1618 | NULL); | 1714 | NULL); |
1619 | } | 1715 | } |
1620 | #ifdef CONFIG_ZONE_DMA | 1716 | #ifdef CONFIG_ZONE_DMA |
1621 | sizes->cs_dmacachep = kmem_cache_create( | 1717 | sizes->cs_dmacachep = __kmem_cache_create( |
1622 | names->name_dma, | 1718 | names->name_dma, |
1623 | sizes->cs_size, | 1719 | sizes->cs_size, |
1624 | ARCH_KMALLOC_MINALIGN, | 1720 | ARCH_KMALLOC_MINALIGN, |
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void) | |||
1676 | } | 1772 | } |
1677 | } | 1773 | } |
1678 | 1774 | ||
1679 | g_cpucache_up = EARLY; | 1775 | slab_state = UP; |
1680 | } | 1776 | } |
1681 | 1777 | ||
1682 | void __init kmem_cache_init_late(void) | 1778 | void __init kmem_cache_init_late(void) |
1683 | { | 1779 | { |
1684 | struct kmem_cache *cachep; | 1780 | struct kmem_cache *cachep; |
1685 | 1781 | ||
1686 | g_cpucache_up = LATE; | 1782 | slab_state = UP; |
1687 | 1783 | ||
1688 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1784 | /* Annotate slab for lockdep -- annotate the malloc caches */ |
1689 | init_lock_keys(); | 1785 | init_lock_keys(); |
1690 | 1786 | ||
1691 | /* 6) resize the head arrays to their final sizes */ | 1787 | /* 6) resize the head arrays to their final sizes */ |
1692 | mutex_lock(&cache_chain_mutex); | 1788 | mutex_lock(&slab_mutex); |
1693 | list_for_each_entry(cachep, &cache_chain, next) | 1789 | list_for_each_entry(cachep, &slab_caches, list) |
1694 | if (enable_cpucache(cachep, GFP_NOWAIT)) | 1790 | if (enable_cpucache(cachep, GFP_NOWAIT)) |
1695 | BUG(); | 1791 | BUG(); |
1696 | mutex_unlock(&cache_chain_mutex); | 1792 | mutex_unlock(&slab_mutex); |
1697 | 1793 | ||
1698 | /* Done! */ | 1794 | /* Done! */ |
1699 | g_cpucache_up = FULL; | 1795 | slab_state = FULL; |
1700 | 1796 | ||
1701 | /* | 1797 | /* |
1702 | * Register a cpu startup notifier callback that initializes | 1798 | * Register a cpu startup notifier callback that initializes |
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void) | |||
1727 | */ | 1823 | */ |
1728 | for_each_online_cpu(cpu) | 1824 | for_each_online_cpu(cpu) |
1729 | start_cpu_timer(cpu); | 1825 | start_cpu_timer(cpu); |
1826 | |||
1827 | /* Done! */ | ||
1828 | slab_state = FULL; | ||
1730 | return 0; | 1829 | return 0; |
1731 | } | 1830 | } |
1732 | __initcall(cpucache_init); | 1831 | __initcall(cpucache_init); |
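
The g_cpucache_up enum removed earlier is replaced by the shared slab_state variable, presumably declared in the new common slab header (mm/slab.h in the diffstat) alongside slab_mutex and slab_caches. A sketch of the states this file drives through, ordered so that comparisons such as slab_state < UP and slab_state >= FULL keep working; only the states used in this file are listed, and the declarations are an assumption:

/* Assumed shape of the shared bootstrap state (declared outside this file). */
enum slab_state {
	DOWN,			/* no slab allocator available yet */
	PARTIAL_ARRAYCACHE,	/* kmalloc cache for struct arraycache_init exists */
	PARTIAL_L3,		/* kmalloc cache for struct kmem_list3 exists */
	UP,			/* all kmalloc caches created (kmem_cache_init) */
	FULL,			/* cpucaches resized, reaper armed */
};

extern enum slab_state slab_state;	/* replaces g_cpucache_up */
extern struct mutex slab_mutex;		/* replaces cache_chain_mutex */
extern struct list_head slab_caches;	/* replaces cache_chain */
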
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1743 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1842 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
1744 | nodeid, gfpflags); | 1843 | nodeid, gfpflags); |
1745 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | 1844 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", |
1746 | cachep->name, cachep->buffer_size, cachep->gfporder); | 1845 | cachep->name, cachep->size, cachep->gfporder); |
1747 | 1846 | ||
1748 | for_each_online_node(node) { | 1847 | for_each_online_node(node) { |
1749 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; | 1848 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; |
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1798 | flags |= __GFP_COMP; | 1897 | flags |= __GFP_COMP; |
1799 | #endif | 1898 | #endif |
1800 | 1899 | ||
1801 | flags |= cachep->gfpflags; | 1900 | flags |= cachep->allocflags; |
1802 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1901 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1803 | flags |= __GFP_RECLAIMABLE; | 1902 | flags |= __GFP_RECLAIMABLE; |
1804 | 1903 | ||
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1809 | return NULL; | 1908 | return NULL; |
1810 | } | 1909 | } |
1811 | 1910 | ||
1911 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1912 | if (unlikely(page->pfmemalloc)) | ||
1913 | pfmemalloc_active = true; | ||
1914 | |||
1812 | nr_pages = (1 << cachep->gfporder); | 1915 | nr_pages = (1 << cachep->gfporder); |
1813 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1916 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1814 | add_zone_page_state(page_zone(page), | 1917 | add_zone_page_state(page_zone(page), |
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1816 | else | 1919 | else |
1817 | add_zone_page_state(page_zone(page), | 1920 | add_zone_page_state(page_zone(page), |
1818 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1921 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1819 | for (i = 0; i < nr_pages; i++) | 1922 | for (i = 0; i < nr_pages; i++) { |
1820 | __SetPageSlab(page + i); | 1923 | __SetPageSlab(page + i); |
1821 | 1924 | ||
1925 | if (page->pfmemalloc) | ||
1926 | SetPageSlabPfmemalloc(page + i); | ||
1927 | } | ||
1928 | |||
1822 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1929 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1823 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1930 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1824 | 1931 | ||
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1850 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1957 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1851 | while (i--) { | 1958 | while (i--) { |
1852 | BUG_ON(!PageSlab(page)); | 1959 | BUG_ON(!PageSlab(page)); |
1960 | __ClearPageSlabPfmemalloc(page); | ||
1853 | __ClearPageSlab(page); | 1961 | __ClearPageSlab(page); |
1854 | page++; | 1962 | page++; |
1855 | } | 1963 | } |
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
1874 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | 1982 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, |
1875 | unsigned long caller) | 1983 | unsigned long caller) |
1876 | { | 1984 | { |
1877 | int size = obj_size(cachep); | 1985 | int size = cachep->object_size; |
1878 | 1986 | ||
1879 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; | 1987 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; |
1880 | 1988 | ||
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | |||
1906 | 2014 | ||
1907 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | 2015 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) |
1908 | { | 2016 | { |
1909 | int size = obj_size(cachep); | 2017 | int size = cachep->object_size; |
1910 | addr = &((char *)addr)[obj_offset(cachep)]; | 2018 | addr = &((char *)addr)[obj_offset(cachep)]; |
1911 | 2019 | ||
1912 | memset(addr, val, size); | 2020 | memset(addr, val, size); |
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1966 | printk("\n"); | 2074 | printk("\n"); |
1967 | } | 2075 | } |
1968 | realobj = (char *)objp + obj_offset(cachep); | 2076 | realobj = (char *)objp + obj_offset(cachep); |
1969 | size = obj_size(cachep); | 2077 | size = cachep->object_size; |
1970 | for (i = 0; i < size && lines; i += 16, lines--) { | 2078 | for (i = 0; i < size && lines; i += 16, lines--) { |
1971 | int limit; | 2079 | int limit; |
1972 | limit = 16; | 2080 | limit = 16; |
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1983 | int lines = 0; | 2091 | int lines = 0; |
1984 | 2092 | ||
1985 | realobj = (char *)objp + obj_offset(cachep); | 2093 | realobj = (char *)objp + obj_offset(cachep); |
1986 | size = obj_size(cachep); | 2094 | size = cachep->object_size; |
1987 | 2095 | ||
1988 | for (i = 0; i < size; i++) { | 2096 | for (i = 0; i < size; i++) { |
1989 | char exp = POISON_FREE; | 2097 | char exp = POISON_FREE; |
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab | |||
2047 | 2155 | ||
2048 | if (cachep->flags & SLAB_POISON) { | 2156 | if (cachep->flags & SLAB_POISON) { |
2049 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2157 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2050 | if (cachep->buffer_size % PAGE_SIZE == 0 && | 2158 | if (cachep->size % PAGE_SIZE == 0 && |
2051 | OFF_SLAB(cachep)) | 2159 | OFF_SLAB(cachep)) |
2052 | kernel_map_pages(virt_to_page(objp), | 2160 | kernel_map_pages(virt_to_page(objp), |
2053 | cachep->buffer_size / PAGE_SIZE, 1); | 2161 | cachep->size / PAGE_SIZE, 1); |
2054 | else | 2162 | else |
2055 | check_poison_obj(cachep, objp); | 2163 | check_poison_obj(cachep, objp); |
2056 | #else | 2164 | #else |
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2194 | 2302 | ||
2195 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2303 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2196 | { | 2304 | { |
2197 | if (g_cpucache_up == FULL) | 2305 | if (slab_state >= FULL) |
2198 | return enable_cpucache(cachep, gfp); | 2306 | return enable_cpucache(cachep, gfp); |
2199 | 2307 | ||
2200 | if (g_cpucache_up == NONE) { | 2308 | if (slab_state == DOWN) { |
2201 | /* | 2309 | /* |
2202 | * Note: the first kmem_cache_create must create the cache | 2310 | * Note: the first kmem_cache_create must create the cache |
2203 | * that's used by kmalloc(24), otherwise the creation of | 2311 | * that's used by kmalloc(24), otherwise the creation of |
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2212 | */ | 2320 | */ |
2213 | set_up_list3s(cachep, SIZE_AC); | 2321 | set_up_list3s(cachep, SIZE_AC); |
2214 | if (INDEX_AC == INDEX_L3) | 2322 | if (INDEX_AC == INDEX_L3) |
2215 | g_cpucache_up = PARTIAL_L3; | 2323 | slab_state = PARTIAL_L3; |
2216 | else | 2324 | else |
2217 | g_cpucache_up = PARTIAL_AC; | 2325 | slab_state = PARTIAL_ARRAYCACHE; |
2218 | } else { | 2326 | } else { |
2219 | cachep->array[smp_processor_id()] = | 2327 | cachep->array[smp_processor_id()] = |
2220 | kmalloc(sizeof(struct arraycache_init), gfp); | 2328 | kmalloc(sizeof(struct arraycache_init), gfp); |
2221 | 2329 | ||
2222 | if (g_cpucache_up == PARTIAL_AC) { | 2330 | if (slab_state == PARTIAL_ARRAYCACHE) { |
2223 | set_up_list3s(cachep, SIZE_L3); | 2331 | set_up_list3s(cachep, SIZE_L3); |
2224 | g_cpucache_up = PARTIAL_L3; | 2332 | slab_state = PARTIAL_L3; |
2225 | } else { | 2333 | } else { |
2226 | int node; | 2334 | int node; |
2227 | for_each_online_node(node) { | 2335 | for_each_online_node(node) { |
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2247 | } | 2355 | } |
2248 | 2356 | ||
2249 | /** | 2357 | /** |
2250 | * kmem_cache_create - Create a cache. | 2358 | * __kmem_cache_create - Create a cache. |
2251 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 2359 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
2252 | * @size: The size of objects to be created in this cache. | 2360 | * @size: The size of objects to be created in this cache. |
2253 | * @align: The required alignment for the objects. | 2361 | * @align: The required alignment for the objects. |
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2274 | * as davem. | 2382 | * as davem. |
2275 | */ | 2383 | */ |
2276 | struct kmem_cache * | 2384 | struct kmem_cache * |
2277 | kmem_cache_create (const char *name, size_t size, size_t align, | 2385 | __kmem_cache_create (const char *name, size_t size, size_t align, |
2278 | unsigned long flags, void (*ctor)(void *)) | 2386 | unsigned long flags, void (*ctor)(void *)) |
2279 | { | 2387 | { |
2280 | size_t left_over, slab_size, ralign; | 2388 | size_t left_over, slab_size, ralign; |
2281 | struct kmem_cache *cachep = NULL, *pc; | 2389 | struct kmem_cache *cachep = NULL; |
2282 | gfp_t gfp; | 2390 | gfp_t gfp; |
2283 | 2391 | ||
2284 | /* | ||
2285 | * Sanity checks... these are all serious usage bugs. | ||
2286 | */ | ||
2287 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || | ||
2288 | size > KMALLOC_MAX_SIZE) { | ||
2289 | printk(KERN_ERR "%s: Early error in slab %s\n", __func__, | ||
2290 | name); | ||
2291 | BUG(); | ||
2292 | } | ||
2293 | |||
2294 | /* | ||
2295 | * We use cache_chain_mutex to ensure a consistent view of | ||
2296 | * cpu_online_mask as well. Please see cpuup_callback | ||
2297 | */ | ||
2298 | if (slab_is_available()) { | ||
2299 | get_online_cpus(); | ||
2300 | mutex_lock(&cache_chain_mutex); | ||
2301 | } | ||
2302 | |||
2303 | list_for_each_entry(pc, &cache_chain, next) { | ||
2304 | char tmp; | ||
2305 | int res; | ||
2306 | |||
2307 | /* | ||
2308 | * This happens when the module gets unloaded and doesn't | ||
2309 | * destroy its slab cache and no-one else reuses the vmalloc | ||
2310 | * area of the module. Print a warning. | ||
2311 | */ | ||
2312 | res = probe_kernel_address(pc->name, tmp); | ||
2313 | if (res) { | ||
2314 | printk(KERN_ERR | ||
2315 | "SLAB: cache with size %d has lost its name\n", | ||
2316 | pc->buffer_size); | ||
2317 | continue; | ||
2318 | } | ||
2319 | |||
2320 | if (!strcmp(pc->name, name)) { | ||
2321 | printk(KERN_ERR | ||
2322 | "kmem_cache_create: duplicate cache %s\n", name); | ||
2323 | dump_stack(); | ||
2324 | goto oops; | ||
2325 | } | ||
2326 | } | ||
2327 | |||
2328 | #if DEBUG | 2392 | #if DEBUG |
2329 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
2330 | #if FORCED_DEBUG | 2393 | #if FORCED_DEBUG |
2331 | /* | 2394 | /* |
2332 | * Enable redzoning and last user accounting, except for caches with | 2395 | * Enable redzoning and last user accounting, except for caches with |
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2415 | /* Get cache's description obj. */ | 2478 | /* Get cache's description obj. */ |
2416 | cachep = kmem_cache_zalloc(&cache_cache, gfp); | 2479 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
2417 | if (!cachep) | 2480 | if (!cachep) |
2418 | goto oops; | 2481 | return NULL; |
2419 | 2482 | ||
2420 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2483 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; |
2484 | cachep->object_size = size; | ||
2485 | cachep->align = align; | ||
2421 | #if DEBUG | 2486 | #if DEBUG |
2422 | cachep->obj_size = size; | ||
2423 | 2487 | ||
2424 | /* | 2488 | /* |
2425 | * Both debugging options require word-alignment which is calculated | 2489 | * Both debugging options require word-alignment which is calculated |
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2442 | } | 2506 | } |
2443 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2507 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2444 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2508 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
2445 | && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { | 2509 | && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { |
2446 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); | 2510 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); |
2447 | size = PAGE_SIZE; | 2511 | size = PAGE_SIZE; |
2448 | } | 2512 | } |
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2471 | printk(KERN_ERR | 2535 | printk(KERN_ERR |
2472 | "kmem_cache_create: couldn't create cache %s.\n", name); | 2536 | "kmem_cache_create: couldn't create cache %s.\n", name); |
2473 | kmem_cache_free(&cache_cache, cachep); | 2537 | kmem_cache_free(&cache_cache, cachep); |
2474 | cachep = NULL; | 2538 | return NULL; |
2475 | goto oops; | ||
2476 | } | 2539 | } |
2477 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) | 2540 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
2478 | + sizeof(struct slab), align); | 2541 | + sizeof(struct slab), align); |
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2508 | cachep->colour = left_over / cachep->colour_off; | 2571 | cachep->colour = left_over / cachep->colour_off; |
2509 | cachep->slab_size = slab_size; | 2572 | cachep->slab_size = slab_size; |
2510 | cachep->flags = flags; | 2573 | cachep->flags = flags; |
2511 | cachep->gfpflags = 0; | 2574 | cachep->allocflags = 0; |
2512 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) | 2575 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) |
2513 | cachep->gfpflags |= GFP_DMA; | 2576 | cachep->allocflags |= GFP_DMA; |
2514 | cachep->buffer_size = size; | 2577 | cachep->size = size; |
2515 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2578 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2516 | 2579 | ||
2517 | if (flags & CFLGS_OFF_SLAB) { | 2580 | if (flags & CFLGS_OFF_SLAB) { |
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2530 | 2593 | ||
2531 | if (setup_cpu_cache(cachep, gfp)) { | 2594 | if (setup_cpu_cache(cachep, gfp)) { |
2532 | __kmem_cache_destroy(cachep); | 2595 | __kmem_cache_destroy(cachep); |
2533 | cachep = NULL; | 2596 | return NULL; |
2534 | goto oops; | ||
2535 | } | 2597 | } |
2536 | 2598 | ||
2537 | if (flags & SLAB_DEBUG_OBJECTS) { | 2599 | if (flags & SLAB_DEBUG_OBJECTS) { |
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2545 | } | 2607 | } |
2546 | 2608 | ||
2547 | /* cache setup completed, link it into the list */ | 2609 | /* cache setup completed, link it into the list */ |
2548 | list_add(&cachep->next, &cache_chain); | 2610 | list_add(&cachep->list, &slab_caches); |
2549 | oops: | ||
2550 | if (!cachep && (flags & SLAB_PANIC)) | ||
2551 | panic("kmem_cache_create(): failed to create slab `%s'\n", | ||
2552 | name); | ||
2553 | if (slab_is_available()) { | ||
2554 | mutex_unlock(&cache_chain_mutex); | ||
2555 | put_online_cpus(); | ||
2556 | } | ||
2557 | return cachep; | 2611 | return cachep; |
2558 | } | 2612 | } |
2559 | EXPORT_SYMBOL(kmem_cache_create); | ||
2560 | 2613 | ||
2561 | #if DEBUG | 2614 | #if DEBUG |
2562 | static void check_irq_off(void) | 2615 | static void check_irq_off(void) |
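
With kmem_cache_create() renamed to __kmem_cache_create() and its EXPORT_SYMBOL dropped here, the argument checking, slab_mutex locking and SLAB_PANIC handling that this hunk removes presumably move into a common wrapper in mm/slab_common.c (new in the diffstat). A hedged sketch of how such a wrapper could look; the exact body is an assumption, not taken from this diff:

/*
 * Assumed common wrapper: serialise on slab_mutex, repeat the basic
 * sanity checks that used to live in this file, then hand off to the
 * allocator-specific __kmem_cache_create().
 */
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
		size_t align, unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;

	if (!name || in_interrupt() || size < sizeof(void *) ||
	    size > KMALLOC_MAX_SIZE)
		goto out;

	get_online_cpus();
	mutex_lock(&slab_mutex);
	s = __kmem_cache_create(name, size, align, flags, ctor);
	mutex_unlock(&slab_mutex);
	put_online_cpus();
out:
	if (!s && (flags & SLAB_PANIC))
		panic("kmem_cache_create: failed to create slab '%s'\n", name);
	return s;
}
EXPORT_SYMBOL(kmem_cache_create);
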
@@ -2671,7 +2724,7 @@ out: | |||
2671 | return nr_freed; | 2724 | return nr_freed; |
2672 | } | 2725 | } |
2673 | 2726 | ||
2674 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | 2727 | /* Called with slab_mutex held to protect against cpu hotplug */ |
2675 | static int __cache_shrink(struct kmem_cache *cachep) | 2728 | static int __cache_shrink(struct kmem_cache *cachep) |
2676 | { | 2729 | { |
2677 | int ret = 0, i = 0; | 2730 | int ret = 0, i = 0; |
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
2706 | BUG_ON(!cachep || in_interrupt()); | 2759 | BUG_ON(!cachep || in_interrupt()); |
2707 | 2760 | ||
2708 | get_online_cpus(); | 2761 | get_online_cpus(); |
2709 | mutex_lock(&cache_chain_mutex); | 2762 | mutex_lock(&slab_mutex); |
2710 | ret = __cache_shrink(cachep); | 2763 | ret = __cache_shrink(cachep); |
2711 | mutex_unlock(&cache_chain_mutex); | 2764 | mutex_unlock(&slab_mutex); |
2712 | put_online_cpus(); | 2765 | put_online_cpus(); |
2713 | return ret; | 2766 | return ret; |
2714 | } | 2767 | } |
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2736 | 2789 | ||
2737 | /* Find the cache in the chain of caches. */ | 2790 | /* Find the cache in the chain of caches. */ |
2738 | get_online_cpus(); | 2791 | get_online_cpus(); |
2739 | mutex_lock(&cache_chain_mutex); | 2792 | mutex_lock(&slab_mutex); |
2740 | /* | 2793 | /* |
2741 | * the chain is never empty, cache_cache is never destroyed | 2794 | * the chain is never empty, cache_cache is never destroyed |
2742 | */ | 2795 | */ |
2743 | list_del(&cachep->next); | 2796 | list_del(&cachep->list); |
2744 | if (__cache_shrink(cachep)) { | 2797 | if (__cache_shrink(cachep)) { |
2745 | slab_error(cachep, "Can't free all objects"); | 2798 | slab_error(cachep, "Can't free all objects"); |
2746 | list_add(&cachep->next, &cache_chain); | 2799 | list_add(&cachep->list, &slab_caches); |
2747 | mutex_unlock(&cache_chain_mutex); | 2800 | mutex_unlock(&slab_mutex); |
2748 | put_online_cpus(); | 2801 | put_online_cpus(); |
2749 | return; | 2802 | return; |
2750 | } | 2803 | } |
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2753 | rcu_barrier(); | 2806 | rcu_barrier(); |
2754 | 2807 | ||
2755 | __kmem_cache_destroy(cachep); | 2808 | __kmem_cache_destroy(cachep); |
2756 | mutex_unlock(&cache_chain_mutex); | 2809 | mutex_unlock(&slab_mutex); |
2757 | put_online_cpus(); | 2810 | put_online_cpus(); |
2758 | } | 2811 | } |
2759 | EXPORT_SYMBOL(kmem_cache_destroy); | 2812 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2840 | slab_error(cachep, "constructor overwrote the" | 2893 | slab_error(cachep, "constructor overwrote the" |
2841 | " start of an object"); | 2894 | " start of an object"); |
2842 | } | 2895 | } |
2843 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && | 2896 | if ((cachep->size % PAGE_SIZE) == 0 && |
2844 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2897 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2845 | kernel_map_pages(virt_to_page(objp), | 2898 | kernel_map_pages(virt_to_page(objp), |
2846 | cachep->buffer_size / PAGE_SIZE, 0); | 2899 | cachep->size / PAGE_SIZE, 0); |
2847 | #else | 2900 | #else |
2848 | if (cachep->ctor) | 2901 | if (cachep->ctor) |
2849 | cachep->ctor(objp); | 2902 | cachep->ctor(objp); |
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | |||
2857 | { | 2910 | { |
2858 | if (CONFIG_ZONE_DMA_FLAG) { | 2911 | if (CONFIG_ZONE_DMA_FLAG) { |
2859 | if (flags & GFP_DMA) | 2912 | if (flags & GFP_DMA) |
2860 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2913 | BUG_ON(!(cachep->allocflags & GFP_DMA)); |
2861 | else | 2914 | else |
2862 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2915 | BUG_ON(cachep->allocflags & GFP_DMA); |
2863 | } | 2916 | } |
2864 | } | 2917 | } |
2865 | 2918 | ||
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2918 | nr_pages <<= cache->gfporder; | 2971 | nr_pages <<= cache->gfporder; |
2919 | 2972 | ||
2920 | do { | 2973 | do { |
2921 | page_set_cache(page, cache); | 2974 | page->slab_cache = cache; |
2922 | page_set_slab(page, slab); | 2975 | page->slab_page = slab; |
2923 | page++; | 2976 | page++; |
2924 | } while (--nr_pages); | 2977 | } while (--nr_pages); |
2925 | } | 2978 | } |
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3057 | kfree_debugcheck(objp); | 3110 | kfree_debugcheck(objp); |
3058 | page = virt_to_head_page(objp); | 3111 | page = virt_to_head_page(objp); |
3059 | 3112 | ||
3060 | slabp = page_get_slab(page); | 3113 | slabp = page->slab_page; |
3061 | 3114 | ||
3062 | if (cachep->flags & SLAB_RED_ZONE) { | 3115 | if (cachep->flags & SLAB_RED_ZONE) { |
3063 | verify_redzone_free(cachep, objp); | 3116 | verify_redzone_free(cachep, objp); |
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3077 | #endif | 3130 | #endif |
3078 | if (cachep->flags & SLAB_POISON) { | 3131 | if (cachep->flags & SLAB_POISON) { |
3079 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3132 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3080 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 3133 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
3081 | store_stackinfo(cachep, objp, (unsigned long)caller); | 3134 | store_stackinfo(cachep, objp, (unsigned long)caller); |
3082 | kernel_map_pages(virt_to_page(objp), | 3135 | kernel_map_pages(virt_to_page(objp), |
3083 | cachep->buffer_size / PAGE_SIZE, 0); | 3136 | cachep->size / PAGE_SIZE, 0); |
3084 | } else { | 3137 | } else { |
3085 | poison_obj(cachep, objp, POISON_FREE); | 3138 | poison_obj(cachep, objp, POISON_FREE); |
3086 | } | 3139 | } |
@@ -3120,16 +3173,19 @@ bad: | |||
3120 | #define check_slabp(x,y) do { } while(0) | 3173 | #define check_slabp(x,y) do { } while(0) |
3121 | #endif | 3174 | #endif |
3122 | 3175 | ||
3123 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | 3176 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
3177 | bool force_refill) | ||
3124 | { | 3178 | { |
3125 | int batchcount; | 3179 | int batchcount; |
3126 | struct kmem_list3 *l3; | 3180 | struct kmem_list3 *l3; |
3127 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3128 | int node; | 3182 | int node; |
3129 | 3183 | ||
3130 | retry: | ||
3131 | check_irq_off(); | 3184 | check_irq_off(); |
3132 | node = numa_mem_id(); | 3185 | node = numa_mem_id(); |
3186 | if (unlikely(force_refill)) | ||
3187 | goto force_grow; | ||
3188 | retry: | ||
3133 | ac = cpu_cache_get(cachep); | 3189 | ac = cpu_cache_get(cachep); |
3134 | batchcount = ac->batchcount; | 3190 | batchcount = ac->batchcount; |
3135 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3191 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3179,8 +3235,8 @@ retry: | |||
3179 | STATS_INC_ACTIVE(cachep); | 3235 | STATS_INC_ACTIVE(cachep); |
3180 | STATS_SET_HIGH(cachep); | 3236 | STATS_SET_HIGH(cachep); |
3181 | 3237 | ||
3182 | ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, | 3238 | ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, |
3183 | node); | 3239 | node)); |
3184 | } | 3240 | } |
3185 | check_slabp(cachep, slabp); | 3241 | check_slabp(cachep, slabp); |
3186 | 3242 | ||
@@ -3199,18 +3255,22 @@ alloc_done: | |||
3199 | 3255 | ||
3200 | if (unlikely(!ac->avail)) { | 3256 | if (unlikely(!ac->avail)) { |
3201 | int x; | 3257 | int x; |
3258 | force_grow: | ||
3202 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 3259 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
3203 | 3260 | ||
3204 | /* cache_grow can reenable interrupts, then ac could change. */ | 3261 | /* cache_grow can reenable interrupts, then ac could change. */ |
3205 | ac = cpu_cache_get(cachep); | 3262 | ac = cpu_cache_get(cachep); |
3206 | if (!x && ac->avail == 0) /* no objects in sight? abort */ | 3263 | |
3264 | /* no objects in sight? abort */ | ||
3265 | if (!x && (ac->avail == 0 || force_refill)) | ||
3207 | return NULL; | 3266 | return NULL; |
3208 | 3267 | ||
3209 | if (!ac->avail) /* objects refilled by interrupt? */ | 3268 | if (!ac->avail) /* objects refilled by interrupt? */ |
3210 | goto retry; | 3269 | goto retry; |
3211 | } | 3270 | } |
3212 | ac->touched = 1; | 3271 | ac->touched = 1; |
3213 | return ac->entry[--ac->avail]; | 3272 | |
3273 | return ac_get_obj(cachep, ac, flags, force_refill); | ||
3214 | } | 3274 | } |
3215 | 3275 | ||
3216 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 3276 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3230 | return objp; | 3290 | return objp; |
3231 | if (cachep->flags & SLAB_POISON) { | 3291 | if (cachep->flags & SLAB_POISON) { |
3232 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3292 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3233 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 3293 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
3234 | kernel_map_pages(virt_to_page(objp), | 3294 | kernel_map_pages(virt_to_page(objp), |
3235 | cachep->buffer_size / PAGE_SIZE, 1); | 3295 | cachep->size / PAGE_SIZE, 1); |
3236 | else | 3296 | else |
3237 | check_poison_obj(cachep, objp); | 3297 | check_poison_obj(cachep, objp); |
3238 | #else | 3298 | #else |
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3261 | struct slab *slabp; | 3321 | struct slab *slabp; |
3262 | unsigned objnr; | 3322 | unsigned objnr; |
3263 | 3323 | ||
3264 | slabp = page_get_slab(virt_to_head_page(objp)); | 3324 | slabp = virt_to_head_page(objp)->slab_page; |
3265 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 3325 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; |
3266 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; | 3326 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; |
3267 | } | 3327 | } |
3268 | #endif | 3328 | #endif |
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3285 | if (cachep == &cache_cache) | 3345 | if (cachep == &cache_cache) |
3286 | return false; | 3346 | return false; |
3287 | 3347 | ||
3288 | return should_failslab(obj_size(cachep), flags, cachep->flags); | 3348 | return should_failslab(cachep->object_size, flags, cachep->flags); |
3289 | } | 3349 | } |
3290 | 3350 | ||
3291 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3351 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3292 | { | 3352 | { |
3293 | void *objp; | 3353 | void *objp; |
3294 | struct array_cache *ac; | 3354 | struct array_cache *ac; |
3355 | bool force_refill = false; | ||
3295 | 3356 | ||
3296 | check_irq_off(); | 3357 | check_irq_off(); |
3297 | 3358 | ||
3298 | ac = cpu_cache_get(cachep); | 3359 | ac = cpu_cache_get(cachep); |
3299 | if (likely(ac->avail)) { | 3360 | if (likely(ac->avail)) { |
3300 | STATS_INC_ALLOCHIT(cachep); | ||
3301 | ac->touched = 1; | 3361 | ac->touched = 1; |
3302 | objp = ac->entry[--ac->avail]; | 3362 | objp = ac_get_obj(cachep, ac, flags, false); |
3303 | } else { | 3363 | |
3304 | STATS_INC_ALLOCMISS(cachep); | ||
3305 | objp = cache_alloc_refill(cachep, flags); | ||
3306 | /* | 3364 | /* |
3307 | * the 'ac' may be updated by cache_alloc_refill(), | 3365 | * Allow for the possibility all avail objects are not allowed |
3308 | * and kmemleak_erase() requires its correct value. | 3366 | * by the current flags |
3309 | */ | 3367 | */ |
3310 | ac = cpu_cache_get(cachep); | 3368 | if (objp) { |
3369 | STATS_INC_ALLOCHIT(cachep); | ||
3370 | goto out; | ||
3371 | } | ||
3372 | force_refill = true; | ||
3311 | } | 3373 | } |
3374 | |||
3375 | STATS_INC_ALLOCMISS(cachep); | ||
3376 | objp = cache_alloc_refill(cachep, flags, force_refill); | ||
3377 | /* | ||
3378 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3379 | * and kmemleak_erase() requires its correct value. | ||
3380 | */ | ||
3381 | ac = cpu_cache_get(cachep); | ||
3382 | |||
3383 | out: | ||
3312 | /* | 3384 | /* |
3313 | * To avoid a false negative, if an object that is in one of the | 3385 | * To avoid a false negative, if an object that is in one of the |
3314 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3386 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
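
The reworked ____cache_alloc() above now funnels every per-CPU hit through ac_get_obj(), and only falls back to cache_alloc_refill(..., force_refill) when the array held nothing the caller may use. A stand-alone toy model of the filter inside __ac_get_obj() (pop an entry, and if it is reserved for emergency use, swap in the first clean one instead); names and types here are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct toy_obj { int id; bool reserved; };	/* "reserved" models the pfmemalloc tag */

/* Pop an entry; if it is reserved and the caller may not use reserves,
 * hand out the first unreserved entry instead and keep the reserved one
 * cached.  Returns NULL when only reserved entries remain, which is the
 * case where the real code forces a refill. */
static struct toy_obj *toy_get(struct toy_obj **stack, int *avail, bool allow_reserved)
{
	struct toy_obj *obj = stack[--(*avail)];

	if (obj->reserved && !allow_reserved) {
		for (int i = 0; i < *avail; i++) {
			if (!stack[i]->reserved) {
				struct toy_obj *clean = stack[i];
				stack[i] = obj;		/* keep the reserved one cached */
				return clean;
			}
		}
		(*avail)++;				/* nothing usable: put it back */
		return NULL;
	}
	return obj;
}

int main(void)
{
	struct toy_obj a = { 1, false }, b = { 2, true };
	struct toy_obj *stack[] = { &a, &b };		/* b is on top */
	int avail = 2;

	struct toy_obj *got = toy_get(stack, &avail, false);
	printf("got id %d, %d left\n", got ? got->id : -1, avail);	/* got id 1, 1 left */
	return 0;
}
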
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3336 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3408 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3337 | nid_alloc = cpuset_slab_spread_node(); | 3409 | nid_alloc = cpuset_slab_spread_node(); |
3338 | else if (current->mempolicy) | 3410 | else if (current->mempolicy) |
3339 | nid_alloc = slab_node(current->mempolicy); | 3411 | nid_alloc = slab_node(); |
3340 | if (nid_alloc != nid_here) | 3412 | if (nid_alloc != nid_here) |
3341 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3413 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3342 | return NULL; | 3414 | return NULL; |
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3368 | 3440 | ||
3369 | retry_cpuset: | 3441 | retry_cpuset: |
3370 | cpuset_mems_cookie = get_mems_allowed(); | 3442 | cpuset_mems_cookie = get_mems_allowed(); |
3371 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 3443 | zonelist = node_zonelist(slab_node(), flags); |
3372 | 3444 | ||
3373 | retry: | 3445 | retry: |
3374 | /* | 3446 | /* |
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3545 | out: | 3617 | out: |
3546 | local_irq_restore(save_flags); | 3618 | local_irq_restore(save_flags); |
3547 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3619 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3548 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | 3620 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, |
3549 | flags); | 3621 | flags); |
3550 | 3622 | ||
3551 | if (likely(ptr)) | 3623 | if (likely(ptr)) |
3552 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | 3624 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); |
3553 | 3625 | ||
3554 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3626 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3555 | memset(ptr, 0, obj_size(cachep)); | 3627 | memset(ptr, 0, cachep->object_size); |
3556 | 3628 | ||
3557 | return ptr; | 3629 | return ptr; |
3558 | } | 3630 | } |
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3607 | objp = __do_cache_alloc(cachep, flags); | 3679 | objp = __do_cache_alloc(cachep, flags); |
3608 | local_irq_restore(save_flags); | 3680 | local_irq_restore(save_flags); |
3609 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3681 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3610 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | 3682 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, |
3611 | flags); | 3683 | flags); |
3612 | prefetchw(objp); | 3684 | prefetchw(objp); |
3613 | 3685 | ||
3614 | if (likely(objp)) | 3686 | if (likely(objp)) |
3615 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | 3687 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); |
3616 | 3688 | ||
3617 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3689 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3618 | memset(objp, 0, obj_size(cachep)); | 3690 | memset(objp, 0, cachep->object_size); |
3619 | 3691 | ||
3620 | return objp; | 3692 | return objp; |
3621 | } | 3693 | } |
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3630 | struct kmem_list3 *l3; | 3702 | struct kmem_list3 *l3; |
3631 | 3703 | ||
3632 | for (i = 0; i < nr_objects; i++) { | 3704 | for (i = 0; i < nr_objects; i++) { |
3633 | void *objp = objpp[i]; | 3705 | void *objp; |
3634 | struct slab *slabp; | 3706 | struct slab *slabp; |
3635 | 3707 | ||
3708 | clear_obj_pfmemalloc(&objpp[i]); | ||
3709 | objp = objpp[i]; | ||
3710 | |||
3636 | slabp = virt_to_slab(objp); | 3711 | slabp = virt_to_slab(objp); |
3637 | l3 = cachep->nodelists[node]; | 3712 | l3 = cachep->nodelists[node]; |
3638 | list_del(&slabp->list); | 3713 | list_del(&slabp->list); |
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3731 | kmemleak_free_recursive(objp, cachep->flags); | 3806 | kmemleak_free_recursive(objp, cachep->flags); |
3732 | objp = cache_free_debugcheck(cachep, objp, caller); | 3807 | objp = cache_free_debugcheck(cachep, objp, caller); |
3733 | 3808 | ||
3734 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | 3809 | kmemcheck_slab_free(cachep, objp, cachep->object_size); |
3735 | 3810 | ||
3736 | /* | 3811 | /* |
3737 | * Skip calling cache_free_alien() when the platform is not numa. | 3812 | * Skip calling cache_free_alien() when the platform is not numa. |
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3750 | cache_flusharray(cachep, ac); | 3825 | cache_flusharray(cachep, ac); |
3751 | } | 3826 | } |
3752 | 3827 | ||
3753 | ac->entry[ac->avail++] = objp; | 3828 | ac_put_obj(cachep, ac, objp); |
3754 | } | 3829 | } |
3755 | 3830 | ||
3756 | /** | 3831 | /** |
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3766 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3841 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
3767 | 3842 | ||
3768 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3843 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3769 | obj_size(cachep), cachep->buffer_size, flags); | 3844 | cachep->object_size, cachep->size, flags); |
3770 | 3845 | ||
3771 | return ret; | 3846 | return ret; |
3772 | } | 3847 | } |
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3794 | __builtin_return_address(0)); | 3869 | __builtin_return_address(0)); |
3795 | 3870 | ||
3796 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3871 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3797 | obj_size(cachep), cachep->buffer_size, | 3872 | cachep->object_size, cachep->size, |
3798 | flags, nodeid); | 3873 | flags, nodeid); |
3799 | 3874 | ||
3800 | return ret; | 3875 | return ret; |
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3876 | ret = __cache_alloc(cachep, flags, caller); | 3951 | ret = __cache_alloc(cachep, flags, caller); |
3877 | 3952 | ||
3878 | trace_kmalloc((unsigned long) caller, ret, | 3953 | trace_kmalloc((unsigned long) caller, ret, |
3879 | size, cachep->buffer_size, flags); | 3954 | size, cachep->size, flags); |
3880 | 3955 | ||
3881 | return ret; | 3956 | return ret; |
3882 | } | 3957 | } |
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3916 | unsigned long flags; | 3991 | unsigned long flags; |
3917 | 3992 | ||
3918 | local_irq_save(flags); | 3993 | local_irq_save(flags); |
3919 | debug_check_no_locks_freed(objp, obj_size(cachep)); | 3994 | debug_check_no_locks_freed(objp, cachep->object_size); |
3920 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3995 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3921 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3996 | debug_check_no_obj_freed(objp, cachep->object_size); |
3922 | __cache_free(cachep, objp, __builtin_return_address(0)); | 3997 | __cache_free(cachep, objp, __builtin_return_address(0)); |
3923 | local_irq_restore(flags); | 3998 | local_irq_restore(flags); |
3924 | 3999 | ||
@@ -3947,8 +4022,9 @@ void kfree(const void *objp) | |||
3947 | local_irq_save(flags); | 4022 | local_irq_save(flags); |
3948 | kfree_debugcheck(objp); | 4023 | kfree_debugcheck(objp); |
3949 | c = virt_to_cache(objp); | 4024 | c = virt_to_cache(objp); |
3950 | debug_check_no_locks_freed(objp, obj_size(c)); | 4025 | debug_check_no_locks_freed(objp, c->object_size); |
3951 | debug_check_no_obj_freed(objp, obj_size(c)); | 4026 | |
4027 | debug_check_no_obj_freed(objp, c->object_size); | ||
3952 | __cache_free(c, (void *)objp, __builtin_return_address(0)); | 4028 | __cache_free(c, (void *)objp, __builtin_return_address(0)); |
3953 | local_irq_restore(flags); | 4029 | local_irq_restore(flags); |
3954 | } | 4030 | } |
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree); | |||
3956 | 4032 | ||
3957 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 4033 | unsigned int kmem_cache_size(struct kmem_cache *cachep) |
3958 | { | 4034 | { |
3959 | return obj_size(cachep); | 4035 | return cachep->object_size; |
3960 | } | 4036 | } |
3961 | EXPORT_SYMBOL(kmem_cache_size); | 4037 | EXPORT_SYMBOL(kmem_cache_size); |
3962 | 4038 | ||
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) | |||
4030 | return 0; | 4106 | return 0; |
4031 | 4107 | ||
4032 | fail: | 4108 | fail: |
4033 | if (!cachep->next.next) { | 4109 | if (!cachep->list.next) { |
4034 | /* Cache is not active yet. Roll back what we did */ | 4110 | /* Cache is not active yet. Roll back what we did */ |
4035 | node--; | 4111 | node--; |
4036 | while (node >= 0) { | 4112 | while (node >= 0) { |
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info) | |||
4065 | new->new[smp_processor_id()] = old; | 4141 | new->new[smp_processor_id()] = old; |
4066 | } | 4142 | } |
4067 | 4143 | ||
4068 | /* Always called with the cache_chain_mutex held */ | 4144 | /* Always called with the slab_mutex held */ |
4069 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4145 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4070 | int batchcount, int shared, gfp_t gfp) | 4146 | int batchcount, int shared, gfp_t gfp) |
4071 | { | 4147 | { |
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4109 | return alloc_kmemlist(cachep, gfp); | 4185 | return alloc_kmemlist(cachep, gfp); |
4110 | } | 4186 | } |
4111 | 4187 | ||
4112 | /* Called with cache_chain_mutex held always */ | 4188 | /* Called with slab_mutex held always */ |
4113 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4189 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4114 | { | 4190 | { |
4115 | int err; | 4191 | int err; |
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4124 | * The numbers are guessed, we should auto-tune as described by | 4200 | * The numbers are guessed, we should auto-tune as described by |
4125 | * Bonwick. | 4201 | * Bonwick. |
4126 | */ | 4202 | */ |
4127 | if (cachep->buffer_size > 131072) | 4203 | if (cachep->size > 131072) |
4128 | limit = 1; | 4204 | limit = 1; |
4129 | else if (cachep->buffer_size > PAGE_SIZE) | 4205 | else if (cachep->size > PAGE_SIZE) |
4130 | limit = 8; | 4206 | limit = 8; |
4131 | else if (cachep->buffer_size > 1024) | 4207 | else if (cachep->size > 1024) |
4132 | limit = 24; | 4208 | limit = 24; |
4133 | else if (cachep->buffer_size > 256) | 4209 | else if (cachep->size > 256) |
4134 | limit = 54; | 4210 | limit = 54; |
4135 | else | 4211 | else |
4136 | limit = 120; | 4212 | limit = 120; |
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4145 | * to a larger limit. Thus disabled by default. | 4221 | * to a larger limit. Thus disabled by default. |
4146 | */ | 4222 | */ |
4147 | shared = 0; | 4223 | shared = 0; |
4148 | if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) | 4224 | if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) |
4149 | shared = 8; | 4225 | shared = 8; |
4150 | 4226 | ||
4151 | #if DEBUG | 4227 | #if DEBUG |
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w) | |||
4211 | int node = numa_mem_id(); | 4287 | int node = numa_mem_id(); |
4212 | struct delayed_work *work = to_delayed_work(w); | 4288 | struct delayed_work *work = to_delayed_work(w); |
4213 | 4289 | ||
4214 | if (!mutex_trylock(&cache_chain_mutex)) | 4290 | if (!mutex_trylock(&slab_mutex)) |
4215 | /* Give up. Set up the next iteration. */ | 4291 | /* Give up. Set up the next iteration. */ |
4216 | goto out; | 4292 | goto out; |
4217 | 4293 | ||
4218 | list_for_each_entry(searchp, &cache_chain, next) { | 4294 | list_for_each_entry(searchp, &slab_caches, list) { |
4219 | check_irq_on(); | 4295 | check_irq_on(); |
4220 | 4296 | ||
4221 | /* | 4297 | /* |
@@ -4253,7 +4329,7 @@ next: | |||
4253 | cond_resched(); | 4329 | cond_resched(); |
4254 | } | 4330 | } |
4255 | check_irq_on(); | 4331 | check_irq_on(); |
4256 | mutex_unlock(&cache_chain_mutex); | 4332 | mutex_unlock(&slab_mutex); |
4257 | next_reap_node(); | 4333 | next_reap_node(); |
4258 | out: | 4334 | out: |
4259 | /* Set up the next iteration */ | 4335 | /* Set up the next iteration */ |
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
4289 | { | 4365 | { |
4290 | loff_t n = *pos; | 4366 | loff_t n = *pos; |
4291 | 4367 | ||
4292 | mutex_lock(&cache_chain_mutex); | 4368 | mutex_lock(&slab_mutex); |
4293 | if (!n) | 4369 | if (!n) |
4294 | print_slabinfo_header(m); | 4370 | print_slabinfo_header(m); |
4295 | 4371 | ||
4296 | return seq_list_start(&cache_chain, *pos); | 4372 | return seq_list_start(&slab_caches, *pos); |
4297 | } | 4373 | } |
4298 | 4374 | ||
4299 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 4375 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
4300 | { | 4376 | { |
4301 | return seq_list_next(p, &cache_chain, pos); | 4377 | return seq_list_next(p, &slab_caches, pos); |
4302 | } | 4378 | } |
4303 | 4379 | ||
4304 | static void s_stop(struct seq_file *m, void *p) | 4380 | static void s_stop(struct seq_file *m, void *p) |
4305 | { | 4381 | { |
4306 | mutex_unlock(&cache_chain_mutex); | 4382 | mutex_unlock(&slab_mutex); |
4307 | } | 4383 | } |
4308 | 4384 | ||
4309 | static int s_show(struct seq_file *m, void *p) | 4385 | static int s_show(struct seq_file *m, void *p) |
4310 | { | 4386 | { |
4311 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4387 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4312 | struct slab *slabp; | 4388 | struct slab *slabp; |
4313 | unsigned long active_objs; | 4389 | unsigned long active_objs; |
4314 | unsigned long num_objs; | 4390 | unsigned long num_objs; |
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4364 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4440 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4365 | 4441 | ||
4366 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 4442 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
4367 | name, active_objs, num_objs, cachep->buffer_size, | 4443 | name, active_objs, num_objs, cachep->size, |
4368 | cachep->num, (1 << cachep->gfporder)); | 4444 | cachep->num, (1 << cachep->gfporder)); |
4369 | seq_printf(m, " : tunables %4u %4u %4u", | 4445 | seq_printf(m, " : tunables %4u %4u %4u", |
4370 | cachep->limit, cachep->batchcount, cachep->shared); | 4446 | cachep->limit, cachep->batchcount, cachep->shared); |
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4454 | return -EINVAL; | 4530 | return -EINVAL; |
4455 | 4531 | ||
4456 | /* Find the cache in the chain of caches. */ | 4532 | /* Find the cache in the chain of caches. */ |
4457 | mutex_lock(&cache_chain_mutex); | 4533 | mutex_lock(&slab_mutex); |
4458 | res = -EINVAL; | 4534 | res = -EINVAL; |
4459 | list_for_each_entry(cachep, &cache_chain, next) { | 4535 | list_for_each_entry(cachep, &slab_caches, list) { |
4460 | if (!strcmp(cachep->name, kbuf)) { | 4536 | if (!strcmp(cachep->name, kbuf)) { |
4461 | if (limit < 1 || batchcount < 1 || | 4537 | if (limit < 1 || batchcount < 1 || |
4462 | batchcount > limit || shared < 0) { | 4538 | batchcount > limit || shared < 0) { |
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4469 | break; | 4545 | break; |
4470 | } | 4546 | } |
4471 | } | 4547 | } |
4472 | mutex_unlock(&cache_chain_mutex); | 4548 | mutex_unlock(&slab_mutex); |
4473 | if (res >= 0) | 4549 | if (res >= 0) |
4474 | res = count; | 4550 | res = count; |
4475 | return res; | 4551 | return res; |
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = { | |||
4492 | 4568 | ||
4493 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4569 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4494 | { | 4570 | { |
4495 | mutex_lock(&cache_chain_mutex); | 4571 | mutex_lock(&slab_mutex); |
4496 | return seq_list_start(&cache_chain, *pos); | 4572 | return seq_list_start(&slab_caches, *pos); |
4497 | } | 4573 | } |
4498 | 4574 | ||
4499 | static inline int add_caller(unsigned long *n, unsigned long v) | 4575 | static inline int add_caller(unsigned long *n, unsigned long v) |
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | |||
4532 | int i; | 4608 | int i; |
4533 | if (n[0] == n[1]) | 4609 | if (n[0] == n[1]) |
4534 | return; | 4610 | return; |
4535 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { | 4611 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { |
4536 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) | 4612 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) |
4537 | continue; | 4613 | continue; |
4538 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4614 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) |
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4558 | 4634 | ||
4559 | static int leaks_show(struct seq_file *m, void *p) | 4635 | static int leaks_show(struct seq_file *m, void *p) |
4560 | { | 4636 | { |
4561 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4637 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4562 | struct slab *slabp; | 4638 | struct slab *slabp; |
4563 | struct kmem_list3 *l3; | 4639 | struct kmem_list3 *l3; |
4564 | const char *name; | 4640 | const char *name; |
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4592 | name = cachep->name; | 4668 | name = cachep->name; |
4593 | if (n[0] == n[1]) { | 4669 | if (n[0] == n[1]) { |
4594 | /* Increase the buffer size */ | 4670 | /* Increase the buffer size */ |
4595 | mutex_unlock(&cache_chain_mutex); | 4671 | mutex_unlock(&slab_mutex); |
4596 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | 4672 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); |
4597 | if (!m->private) { | 4673 | if (!m->private) { |
4598 | /* Too bad, we are really out */ | 4674 | /* Too bad, we are really out */ |
4599 | m->private = n; | 4675 | m->private = n; |
4600 | mutex_lock(&cache_chain_mutex); | 4676 | mutex_lock(&slab_mutex); |
4601 | return -ENOMEM; | 4677 | return -ENOMEM; |
4602 | } | 4678 | } |
4603 | *(unsigned long *)m->private = n[0] * 2; | 4679 | *(unsigned long *)m->private = n[0] * 2; |
4604 | kfree(n); | 4680 | kfree(n); |
4605 | mutex_lock(&cache_chain_mutex); | 4681 | mutex_lock(&slab_mutex); |
4606 | /* Now make sure this entry will be retried */ | 4682 | /* Now make sure this entry will be retried */ |
4607 | m->count = m->size; | 4683 | m->count = m->size; |
4608 | return 0; | 4684 | return 0; |
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp) | |||
4677 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4753 | if (unlikely(objp == ZERO_SIZE_PTR)) |
4678 | return 0; | 4754 | return 0; |
4679 | 4755 | ||
4680 | return obj_size(virt_to_cache(objp)); | 4756 | return virt_to_cache(objp)->object_size; |
4681 | } | 4757 | } |
4682 | EXPORT_SYMBOL(ksize); | 4758 | EXPORT_SYMBOL(ksize); |
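
This is the last mm/slab.c hunk of the rename from obj_size(cachep)/cachep->buffer_size to the common cachep->object_size/cachep->size fields, which lets ksize(), the debug checks and the tracepoints read the same struct kmem_cache field names that SLUB uses. A rough userspace model of how the two fields relate; the struct and the numbers are purely illustrative.

#include <stdio.h>

/*
 * Simplified model of the two common fields:
 *   object_size - the payload handed back to callers (old obj_size())
 *   size        - the full per-object stride after alignment and any
 *                 debug padding (old buffer_size)
 */
struct kmem_cache_model {
        const char   *name;
        unsigned int  object_size;
        unsigned int  size;
};

/* ksize() reports the usable payload, not the stride. */
static unsigned int model_ksize(const struct kmem_cache_model *c)
{
        return c->object_size;
}

int main(void)
{
        struct kmem_cache_model c = {
                .name        = "demo-cache",
                .object_size = 52,      /* hypothetical requested size */
                .size        = 64,      /* rounded up to the cache's alignment */
        };

        printf("%s: object_size=%u size=%u ksize=%u\n",
               c.name, c.object_size, c.size, model_ksize(&c));
        return 0;
}
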
diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000000..db7848caaa25 --- /dev/null +++ b/mm/slab.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef MM_SLAB_H | ||
2 | #define MM_SLAB_H | ||
3 | /* | ||
4 | * Internal slab definitions | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * State of the slab allocator. | ||
9 | * | ||
10 | * This is used to describe the states of the allocator during bootup. | ||
11 | * Allocators use this to gradually bootstrap themselves. Most allocators | ||
12 | * have the problem that the structures used for managing slab caches are | ||
13 | * allocated from slab caches themselves. | ||
14 | */ | ||
15 | enum slab_state { | ||
16 | DOWN, /* No slab functionality yet */ | ||
17 | PARTIAL, /* SLUB: kmem_cache_node available */ | ||
18 | PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ | ||
19 | PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */ | ||
20 | UP, /* Slab caches usable but not all extras yet */ | ||
21 | FULL /* Everything is working */ | ||
22 | }; | ||
23 | |||
24 | extern enum slab_state slab_state; | ||
25 | |||
26 | /* The slab cache mutex protects the management structures during changes */ | ||
27 | extern struct mutex slab_mutex; | ||
28 | extern struct list_head slab_caches; | ||
29 | |||
30 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | ||
31 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
32 | |||
33 | #endif | ||
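
The shared slab_state enum gives all three allocators one vocabulary for how far bootstrap has progressed; slab_common.c below defines slab_is_available() as slab_state >= UP, and SLOB's kmem_cache_init()/kmem_cache_init_late() now simply bump the state. A compilable sketch of that gating pattern, where the early-boot fallback allocator is invented for illustration.

#include <stdio.h>

enum slab_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, PARTIAL_L3, UP, FULL };

static enum slab_state slab_state = DOWN;  /* advanced by the allocator's init */

static int slab_is_available(void)
{
        return slab_state >= UP;   /* same test as the new slab_common.c helper */
}

/* Hypothetical early-boot consumer: fall back to a static pool until the
 * slab allocator reports itself usable. */
static void *early_alloc(unsigned long size)
{
        static char bootmem[4096];
        static unsigned long used;

        if (!slab_is_available()) {
                void *p = &bootmem[used];
                used += size;      /* no bounds check: illustration only */
                return p;
        }
        return NULL;               /* a real caller would use kmalloc() here */
}

int main(void)
{
        printf("before init: %p\n", early_alloc(32));
        slab_state = UP;           /* what the allocator's kmem_cache_init() does */
        printf("after init:  %p\n", early_alloc(32));
        return 0;
}
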
diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000000..aa3ca5bb01b5 --- /dev/null +++ b/mm/slab_common.c | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * Slab allocator functions that are independent of the allocator strategy | ||
3 | * | ||
4 | * (C) 2012 Christoph Lameter <cl@linux.com> | ||
5 | */ | ||
6 | #include <linux/slab.h> | ||
7 | |||
8 | #include <linux/mm.h> | ||
9 | #include <linux/poison.h> | ||
10 | #include <linux/interrupt.h> | ||
11 | #include <linux/memory.h> | ||
12 | #include <linux/compiler.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | #include "slab.h" | ||
21 | |||
22 | enum slab_state slab_state; | ||
23 | LIST_HEAD(slab_caches); | ||
24 | DEFINE_MUTEX(slab_mutex); | ||
25 | |||
26 | /* | ||
27 | * kmem_cache_create - Create a cache. | ||
28 | * @name: A string which is used in /proc/slabinfo to identify this cache. | ||
29 | * @size: The size of objects to be created in this cache. | ||
30 | * @align: The required alignment for the objects. | ||
31 | * @flags: SLAB flags | ||
32 | * @ctor: A constructor for the objects. | ||
33 | * | ||
34 | * Returns a ptr to the cache on success, NULL on failure. | ||
35 | * Cannot be called within an interrupt, but can be interrupted. | ||
36 | * The @ctor is run when new pages are allocated by the cache. | ||
37 | * | ||
38 | * The flags are | ||
39 | * | ||
40 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | ||
41 | * to catch references to uninitialised memory. | ||
42 | * | ||
43 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | ||
44 | * for buffer overruns. | ||
45 | * | ||
46 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | ||
47 | * cacheline. This can be beneficial if you're counting cycles as closely | ||
48 | * as davem. | ||
49 | */ | ||
50 | |||
51 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | ||
52 | unsigned long flags, void (*ctor)(void *)) | ||
53 | { | ||
54 | struct kmem_cache *s = NULL; | ||
55 | |||
56 | #ifdef CONFIG_DEBUG_VM | ||
57 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
58 | size > KMALLOC_MAX_SIZE) { | ||
59 | printk(KERN_ERR "kmem_cache_create(%s) integrity check" | ||
60 | " failed\n", name); | ||
61 | goto out; | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | get_online_cpus(); | ||
66 | mutex_lock(&slab_mutex); | ||
67 | |||
68 | #ifdef CONFIG_DEBUG_VM | ||
69 | list_for_each_entry(s, &slab_caches, list) { | ||
70 | char tmp; | ||
71 | int res; | ||
72 | |||
73 | /* | ||
74 | * This happens when the module gets unloaded and doesn't | ||
75 | * destroy its slab cache and no-one else reuses the vmalloc | ||
76 | * area of the module. Print a warning. | ||
77 | */ | ||
78 | res = probe_kernel_address(s->name, tmp); | ||
79 | if (res) { | ||
80 | printk(KERN_ERR | ||
81 | "Slab cache with size %d has lost its name\n", | ||
82 | s->object_size); | ||
83 | continue; | ||
84 | } | ||
85 | |||
86 | if (!strcmp(s->name, name)) { | ||
87 | printk(KERN_ERR "kmem_cache_create(%s): Cache name" | ||
88 | " already exists.\n", | ||
89 | name); | ||
90 | dump_stack(); | ||
91 | s = NULL; | ||
92 | goto oops; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
97 | #endif | ||
98 | |||
99 | s = __kmem_cache_create(name, size, align, flags, ctor); | ||
100 | |||
101 | #ifdef CONFIG_DEBUG_VM | ||
102 | oops: | ||
103 | #endif | ||
104 | mutex_unlock(&slab_mutex); | ||
105 | put_online_cpus(); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_VM | ||
108 | out: | ||
109 | #endif | ||
110 | if (!s && (flags & SLAB_PANIC)) | ||
111 | panic("kmem_cache_create: Failed to create slab '%s'\n", name); | ||
112 | |||
113 | return s; | ||
114 | } | ||
115 | EXPORT_SYMBOL(kmem_cache_create); | ||
116 | |||
117 | int slab_is_available(void) | ||
118 | { | ||
119 | return slab_state >= UP; | ||
120 | } | ||
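
kmem_cache_create() is now the single entry point that runs the CONFIG_DEBUG_VM sanity checks under slab_mutex before handing off to the allocator's __kmem_cache_create(). Callers are unaffected; typical usage still looks roughly like the kernel-style sketch below (not a standalone program), where struct foo, the cache name and the flag choice are assumptions for illustration.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/slab.h>

struct foo {
        int  id;
        char payload[48];
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, NULL);
        if (!foo_cachep)
                return -ENOMEM;
        return 0;
}

static struct foo *foo_alloc(void)
{
        return kmem_cache_alloc(foo_cachep, GFP_KERNEL);
}

static void foo_free(struct foo *f)
{
        kmem_cache_free(foo_cachep, f);
}

static void foo_cache_exit(void)
{
        kmem_cache_destroy(foo_cachep);
}
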
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -59,6 +59,8 @@ | |||
59 | 59 | ||
60 | #include <linux/kernel.h> | 60 | #include <linux/kernel.h> |
61 | #include <linux/slab.h> | 61 | #include <linux/slab.h> |
62 | #include "slab.h" | ||
63 | |||
62 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
63 | #include <linux/swap.h> /* struct reclaim_state */ | 65 | #include <linux/swap.h> /* struct reclaim_state */ |
64 | #include <linux/cache.h> | 66 | #include <linux/cache.h> |
@@ -92,36 +94,6 @@ struct slob_block { | |||
92 | typedef struct slob_block slob_t; | 94 | typedef struct slob_block slob_t; |
93 | 95 | ||
94 | /* | 96 | /* |
95 | * We use struct page fields to manage some slob allocation aspects, | ||
96 | * however to avoid the horrible mess in include/linux/mm_types.h, we'll | ||
97 | * just define our own struct page type variant here. | ||
98 | */ | ||
99 | struct slob_page { | ||
100 | union { | ||
101 | struct { | ||
102 | unsigned long flags; /* mandatory */ | ||
103 | atomic_t _count; /* mandatory */ | ||
104 | slobidx_t units; /* free units left in page */ | ||
105 | unsigned long pad[2]; | ||
106 | slob_t *free; /* first free slob_t in page */ | ||
107 | struct list_head list; /* linked list of free pages */ | ||
108 | }; | ||
109 | struct page page; | ||
110 | }; | ||
111 | }; | ||
112 | static inline void struct_slob_page_wrong_size(void) | ||
113 | { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } | ||
114 | |||
115 | /* | ||
116 | * free_slob_page: call before a slob_page is returned to the page allocator. | ||
117 | */ | ||
118 | static inline void free_slob_page(struct slob_page *sp) | ||
119 | { | ||
120 | reset_page_mapcount(&sp->page); | ||
121 | sp->page.mapping = NULL; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * All partially free slob pages go on these lists. | 97 | * All partially free slob pages go on these lists. |
126 | */ | 98 | */ |
127 | #define SLOB_BREAK1 256 | 99 | #define SLOB_BREAK1 256 |
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium); | |||
131 | static LIST_HEAD(free_slob_large); | 103 | static LIST_HEAD(free_slob_large); |
132 | 104 | ||
133 | /* | 105 | /* |
134 | * is_slob_page: True for all slob pages (false for bigblock pages) | ||
135 | */ | ||
136 | static inline int is_slob_page(struct slob_page *sp) | ||
137 | { | ||
138 | return PageSlab((struct page *)sp); | ||
139 | } | ||
140 | |||
141 | static inline void set_slob_page(struct slob_page *sp) | ||
142 | { | ||
143 | __SetPageSlab((struct page *)sp); | ||
144 | } | ||
145 | |||
146 | static inline void clear_slob_page(struct slob_page *sp) | ||
147 | { | ||
148 | __ClearPageSlab((struct page *)sp); | ||
149 | } | ||
150 | |||
151 | static inline struct slob_page *slob_page(const void *addr) | ||
152 | { | ||
153 | return (struct slob_page *)virt_to_page(addr); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * slob_page_free: true for pages on free_slob_pages list. | 106 | * slob_page_free: true for pages on free_slob_pages list. |
158 | */ | 107 | */ |
159 | static inline int slob_page_free(struct slob_page *sp) | 108 | static inline int slob_page_free(struct page *sp) |
160 | { | 109 | { |
161 | return PageSlobFree((struct page *)sp); | 110 | return PageSlobFree(sp); |
162 | } | 111 | } |
163 | 112 | ||
164 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) | 113 | static void set_slob_page_free(struct page *sp, struct list_head *list) |
165 | { | 114 | { |
166 | list_add(&sp->list, list); | 115 | list_add(&sp->list, list); |
167 | __SetPageSlobFree((struct page *)sp); | 116 | __SetPageSlobFree(sp); |
168 | } | 117 | } |
169 | 118 | ||
170 | static inline void clear_slob_page_free(struct slob_page *sp) | 119 | static inline void clear_slob_page_free(struct page *sp) |
171 | { | 120 | { |
172 | list_del(&sp->list); | 121 | list_del(&sp->list); |
173 | __ClearPageSlobFree((struct page *)sp); | 122 | __ClearPageSlobFree(sp); |
174 | } | 123 | } |
175 | 124 | ||
176 | #define SLOB_UNIT sizeof(slob_t) | 125 | #define SLOB_UNIT sizeof(slob_t) |
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order) | |||
267 | /* | 216 | /* |
268 | * Allocate a slob block within a given slob_page sp. | 217 | * Allocate a slob block within a given slob_page sp. |
269 | */ | 218 | */ |
270 | static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | 219 | static void *slob_page_alloc(struct page *sp, size_t size, int align) |
271 | { | 220 | { |
272 | slob_t *prev, *cur, *aligned = NULL; | 221 | slob_t *prev, *cur, *aligned = NULL; |
273 | int delta = 0, units = SLOB_UNITS(size); | 222 | int delta = 0, units = SLOB_UNITS(size); |
274 | 223 | ||
275 | for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { | 224 | for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { |
276 | slobidx_t avail = slob_units(cur); | 225 | slobidx_t avail = slob_units(cur); |
277 | 226 | ||
278 | if (align) { | 227 | if (align) { |
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
296 | if (prev) | 245 | if (prev) |
297 | set_slob(prev, slob_units(prev), next); | 246 | set_slob(prev, slob_units(prev), next); |
298 | else | 247 | else |
299 | sp->free = next; | 248 | sp->freelist = next; |
300 | } else { /* fragment */ | 249 | } else { /* fragment */ |
301 | if (prev) | 250 | if (prev) |
302 | set_slob(prev, slob_units(prev), cur + units); | 251 | set_slob(prev, slob_units(prev), cur + units); |
303 | else | 252 | else |
304 | sp->free = cur + units; | 253 | sp->freelist = cur + units; |
305 | set_slob(cur + units, avail - units, next); | 254 | set_slob(cur + units, avail - units, next); |
306 | } | 255 | } |
307 | 256 | ||
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
320 | */ | 269 | */ |
321 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | 270 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) |
322 | { | 271 | { |
323 | struct slob_page *sp; | 272 | struct page *sp; |
324 | struct list_head *prev; | 273 | struct list_head *prev; |
325 | struct list_head *slob_list; | 274 | struct list_head *slob_list; |
326 | slob_t *b = NULL; | 275 | slob_t *b = NULL; |
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
341 | * If there's a node specification, search for a partial | 290 | * If there's a node specification, search for a partial |
342 | * page with a matching node id in the freelist. | 291 | * page with a matching node id in the freelist. |
343 | */ | 292 | */ |
344 | if (node != -1 && page_to_nid(&sp->page) != node) | 293 | if (node != -1 && page_to_nid(sp) != node) |
345 | continue; | 294 | continue; |
346 | #endif | 295 | #endif |
347 | /* Enough room on this page? */ | 296 | /* Enough room on this page? */ |
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
369 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); | 318 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); |
370 | if (!b) | 319 | if (!b) |
371 | return NULL; | 320 | return NULL; |
372 | sp = slob_page(b); | 321 | sp = virt_to_page(b); |
373 | set_slob_page(sp); | 322 | __SetPageSlab(sp); |
374 | 323 | ||
375 | spin_lock_irqsave(&slob_lock, flags); | 324 | spin_lock_irqsave(&slob_lock, flags); |
376 | sp->units = SLOB_UNITS(PAGE_SIZE); | 325 | sp->units = SLOB_UNITS(PAGE_SIZE); |
377 | sp->free = b; | 326 | sp->freelist = b; |
378 | INIT_LIST_HEAD(&sp->list); | 327 | INIT_LIST_HEAD(&sp->list); |
379 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | 328 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); |
380 | set_slob_page_free(sp, slob_list); | 329 | set_slob_page_free(sp, slob_list); |
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
392 | */ | 341 | */ |
393 | static void slob_free(void *block, int size) | 342 | static void slob_free(void *block, int size) |
394 | { | 343 | { |
395 | struct slob_page *sp; | 344 | struct page *sp; |
396 | slob_t *prev, *next, *b = (slob_t *)block; | 345 | slob_t *prev, *next, *b = (slob_t *)block; |
397 | slobidx_t units; | 346 | slobidx_t units; |
398 | unsigned long flags; | 347 | unsigned long flags; |
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size) | |||
402 | return; | 351 | return; |
403 | BUG_ON(!size); | 352 | BUG_ON(!size); |
404 | 353 | ||
405 | sp = slob_page(block); | 354 | sp = virt_to_page(block); |
406 | units = SLOB_UNITS(size); | 355 | units = SLOB_UNITS(size); |
407 | 356 | ||
408 | spin_lock_irqsave(&slob_lock, flags); | 357 | spin_lock_irqsave(&slob_lock, flags); |
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size) | |||
412 | if (slob_page_free(sp)) | 361 | if (slob_page_free(sp)) |
413 | clear_slob_page_free(sp); | 362 | clear_slob_page_free(sp); |
414 | spin_unlock_irqrestore(&slob_lock, flags); | 363 | spin_unlock_irqrestore(&slob_lock, flags); |
415 | clear_slob_page(sp); | 364 | __ClearPageSlab(sp); |
416 | free_slob_page(sp); | 365 | reset_page_mapcount(sp); |
417 | slob_free_pages(b, 0); | 366 | slob_free_pages(b, 0); |
418 | return; | 367 | return; |
419 | } | 368 | } |
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size) | |||
421 | if (!slob_page_free(sp)) { | 370 | if (!slob_page_free(sp)) { |
422 | /* This slob page is about to become partially free. Easy! */ | 371 | /* This slob page is about to become partially free. Easy! */ |
423 | sp->units = units; | 372 | sp->units = units; |
424 | sp->free = b; | 373 | sp->freelist = b; |
425 | set_slob(b, units, | 374 | set_slob(b, units, |
426 | (void *)((unsigned long)(b + | 375 | (void *)((unsigned long)(b + |
427 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | 376 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); |
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size) | |||
441 | */ | 390 | */ |
442 | sp->units += units; | 391 | sp->units += units; |
443 | 392 | ||
444 | if (b < sp->free) { | 393 | if (b < (slob_t *)sp->freelist) { |
445 | if (b + units == sp->free) { | 394 | if (b + units == sp->freelist) { |
446 | units += slob_units(sp->free); | 395 | units += slob_units(sp->freelist); |
447 | sp->free = slob_next(sp->free); | 396 | sp->freelist = slob_next(sp->freelist); |
448 | } | 397 | } |
449 | set_slob(b, units, sp->free); | 398 | set_slob(b, units, sp->freelist); |
450 | sp->free = b; | 399 | sp->freelist = b; |
451 | } else { | 400 | } else { |
452 | prev = sp->free; | 401 | prev = sp->freelist; |
453 | next = slob_next(prev); | 402 | next = slob_next(prev); |
454 | while (b > next) { | 403 | while (b > next) { |
455 | prev = next; | 404 | prev = next; |
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
522 | 471 | ||
523 | void kfree(const void *block) | 472 | void kfree(const void *block) |
524 | { | 473 | { |
525 | struct slob_page *sp; | 474 | struct page *sp; |
526 | 475 | ||
527 | trace_kfree(_RET_IP_, block); | 476 | trace_kfree(_RET_IP_, block); |
528 | 477 | ||
@@ -530,43 +479,36 @@ void kfree(const void *block) | |||
530 | return; | 479 | return; |
531 | kmemleak_free(block); | 480 | kmemleak_free(block); |
532 | 481 | ||
533 | sp = slob_page(block); | 482 | sp = virt_to_page(block); |
534 | if (is_slob_page(sp)) { | 483 | if (PageSlab(sp)) { |
535 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 484 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
536 | unsigned int *m = (unsigned int *)(block - align); | 485 | unsigned int *m = (unsigned int *)(block - align); |
537 | slob_free(m, *m + align); | 486 | slob_free(m, *m + align); |
538 | } else | 487 | } else |
539 | put_page(&sp->page); | 488 | put_page(sp); |
540 | } | 489 | } |
541 | EXPORT_SYMBOL(kfree); | 490 | EXPORT_SYMBOL(kfree); |
542 | 491 | ||
543 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ | 492 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ |
544 | size_t ksize(const void *block) | 493 | size_t ksize(const void *block) |
545 | { | 494 | { |
546 | struct slob_page *sp; | 495 | struct page *sp; |
547 | 496 | ||
548 | BUG_ON(!block); | 497 | BUG_ON(!block); |
549 | if (unlikely(block == ZERO_SIZE_PTR)) | 498 | if (unlikely(block == ZERO_SIZE_PTR)) |
550 | return 0; | 499 | return 0; |
551 | 500 | ||
552 | sp = slob_page(block); | 501 | sp = virt_to_page(block); |
553 | if (is_slob_page(sp)) { | 502 | if (PageSlab(sp)) { |
554 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 503 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
555 | unsigned int *m = (unsigned int *)(block - align); | 504 | unsigned int *m = (unsigned int *)(block - align); |
556 | return SLOB_UNITS(*m) * SLOB_UNIT; | 505 | return SLOB_UNITS(*m) * SLOB_UNIT; |
557 | } else | 506 | } else |
558 | return sp->page.private; | 507 | return sp->private; |
559 | } | 508 | } |
560 | EXPORT_SYMBOL(ksize); | 509 | EXPORT_SYMBOL(ksize); |
561 | 510 | ||
562 | struct kmem_cache { | 511 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
563 | unsigned int size, align; | ||
564 | unsigned long flags; | ||
565 | const char *name; | ||
566 | void (*ctor)(void *); | ||
567 | }; | ||
568 | |||
569 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
570 | size_t align, unsigned long flags, void (*ctor)(void *)) | 512 | size_t align, unsigned long flags, void (*ctor)(void *)) |
571 | { | 513 | { |
572 | struct kmem_cache *c; | 514 | struct kmem_cache *c; |
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
589 | c->align = ARCH_SLAB_MINALIGN; | 531 | c->align = ARCH_SLAB_MINALIGN; |
590 | if (c->align < align) | 532 | if (c->align < align) |
591 | c->align = align; | 533 | c->align = align; |
592 | } else if (flags & SLAB_PANIC) | ||
593 | panic("Cannot create slab cache %s\n", name); | ||
594 | 534 | ||
595 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | 535 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); |
536 | c->refcount = 1; | ||
537 | } | ||
596 | return c; | 538 | return c; |
597 | } | 539 | } |
598 | EXPORT_SYMBOL(kmem_cache_create); | ||
599 | 540 | ||
600 | void kmem_cache_destroy(struct kmem_cache *c) | 541 | void kmem_cache_destroy(struct kmem_cache *c) |
601 | { | 542 | { |
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
678 | } | 619 | } |
679 | EXPORT_SYMBOL(kmem_cache_shrink); | 620 | EXPORT_SYMBOL(kmem_cache_shrink); |
680 | 621 | ||
681 | static unsigned int slob_ready __read_mostly; | ||
682 | |||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return slob_ready; | ||
686 | } | ||
687 | |||
688 | void __init kmem_cache_init(void) | 622 | void __init kmem_cache_init(void) |
689 | { | 623 | { |
690 | slob_ready = 1; | 624 | slab_state = UP; |
691 | } | 625 | } |
692 | 626 | ||
693 | void __init kmem_cache_init_late(void) | 627 | void __init kmem_cache_init_late(void) |
694 | { | 628 | { |
695 | /* Nothing to do */ | 629 | slab_state = FULL; |
696 | } | 630 | } |
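
With struct slob_page gone, kfree() and ksize() operate on the real struct page (PageSlab(), page->freelist, page->units, page->private), but the kmalloc bookkeeping itself is unchanged: the requested size is stashed in a word just before the pointer returned to the caller. A small userspace model of that header trick, with malloc() standing in for slob_alloc() and a fixed alignment assumed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MINALIGN sizeof(unsigned int)   /* stand-in for the kmalloc alignment */

/* kmalloc side: allocate size + alignment and record the size in the
 * word that sits immediately before the pointer given to the caller. */
static void *toy_kmalloc(unsigned int size)
{
        char *m = malloc(size + MINALIGN);   /* stands in for slob_alloc() */

        if (!m)
                return NULL;
        *(unsigned int *)m = size;
        return m + MINALIGN;
}

/* ksize side: read the stashed word back. */
static unsigned int toy_ksize(const void *block)
{
        return *(const unsigned int *)((const char *)block - MINALIGN);
}

/* kfree side: step back to the real start of the allocation. */
static void toy_kfree(void *block)
{
        free((char *)block - MINALIGN);
}

int main(void)
{
        void *p = toy_kmalloc(100);

        if (!p)
                return 1;
        memset(p, 0, 100);
        printf("usable size: %u\n", toy_ksize(p));   /* prints 100 */
        toy_kfree(p);
        return 0;
}
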
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include "slab.h" | ||
19 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
20 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
21 | #include <linux/kmemcheck.h> | 22 | #include <linux/kmemcheck.h> |
@@ -33,15 +34,17 @@ | |||
33 | 34 | ||
34 | #include <trace/events/kmem.h> | 35 | #include <trace/events/kmem.h> |
35 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
36 | /* | 39 | /* |
37 | * Lock order: | 40 | * Lock order: |
38 | * 1. slub_lock (Global Semaphore) | 41 | * 1. slab_mutex (Global Mutex) |
39 | * 2. node->list_lock | 42 | * 2. node->list_lock |
40 | * 3. slab_lock(page) (Only on some arches and for debugging) | 43 | * 3. slab_lock(page) (Only on some arches and for debugging) |
41 | * | 44 | * |
42 | * slub_lock | 45 | * slab_mutex |
43 | * | 46 | * |
44 | * The role of the slub_lock is to protect the list of all the slabs | 47 | * The role of the slab_mutex is to protect the list of all the slabs |
45 | * and to synchronize major metadata changes to slab cache structures. | 48 | * and to synchronize major metadata changes to slab cache structures. |
46 | * | 49 | * |
47 | * The slab_lock is only used for debugging and on arches that do not | 50 | * The slab_lock is only used for debugging and on arches that do not |
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache); | |||
182 | static struct notifier_block slab_notifier; | 185 | static struct notifier_block slab_notifier; |
183 | #endif | 186 | #endif |
184 | 187 | ||
185 | static enum { | ||
186 | DOWN, /* No slab functionality available */ | ||
187 | PARTIAL, /* Kmem_cache_node works */ | ||
188 | UP, /* Everything works but does not show up in sysfs */ | ||
189 | SYSFS /* Sysfs up */ | ||
190 | } slab_state = DOWN; | ||
191 | |||
192 | /* A list of all slab caches on the system */ | ||
193 | static DECLARE_RWSEM(slub_lock); | ||
194 | static LIST_HEAD(slab_caches); | ||
195 | |||
196 | /* | 188 | /* |
197 | * Tracking user of a slab. | 189 | * Tracking user of a slab. |
198 | */ | 190 | */ |
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
237 | * Core slab cache functions | 229 | * Core slab cache functions |
238 | *******************************************************************/ | 230 | *******************************************************************/ |
239 | 231 | ||
240 | int slab_is_available(void) | ||
241 | { | ||
242 | return slab_state >= UP; | ||
243 | } | ||
244 | |||
245 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | 232 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) |
246 | { | 233 | { |
247 | return s->node[node]; | 234 | return s->node[node]; |
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
311 | * and whatever may come after it. | 298 | * and whatever may come after it. |
312 | */ | 299 | */ |
313 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | 300 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) |
314 | return s->objsize; | 301 | return s->object_size; |
315 | 302 | ||
316 | #endif | 303 | #endif |
317 | /* | 304 | /* |
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
609 | if (p > addr + 16) | 596 | if (p > addr + 16) |
610 | print_section("Bytes b4 ", p - 16, 16); | 597 | print_section("Bytes b4 ", p - 16, 16); |
611 | 598 | ||
612 | print_section("Object ", p, min_t(unsigned long, s->objsize, | 599 | print_section("Object ", p, min_t(unsigned long, s->object_size, |
613 | PAGE_SIZE)); | 600 | PAGE_SIZE)); |
614 | if (s->flags & SLAB_RED_ZONE) | 601 | if (s->flags & SLAB_RED_ZONE) |
615 | print_section("Redzone ", p + s->objsize, | 602 | print_section("Redzone ", p + s->object_size, |
616 | s->inuse - s->objsize); | 603 | s->inuse - s->object_size); |
617 | 604 | ||
618 | if (s->offset) | 605 | if (s->offset) |
619 | off = s->offset + sizeof(void *); | 606 | off = s->offset + sizeof(void *); |
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
655 | u8 *p = object; | 642 | u8 *p = object; |
656 | 643 | ||
657 | if (s->flags & __OBJECT_POISON) { | 644 | if (s->flags & __OBJECT_POISON) { |
658 | memset(p, POISON_FREE, s->objsize - 1); | 645 | memset(p, POISON_FREE, s->object_size - 1); |
659 | p[s->objsize - 1] = POISON_END; | 646 | p[s->object_size - 1] = POISON_END; |
660 | } | 647 | } |
661 | 648 | ||
662 | if (s->flags & SLAB_RED_ZONE) | 649 | if (s->flags & SLAB_RED_ZONE) |
663 | memset(p + s->objsize, val, s->inuse - s->objsize); | 650 | memset(p + s->object_size, val, s->inuse - s->object_size); |
664 | } | 651 | } |
665 | 652 | ||
666 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 653 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
705 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 692 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
706 | * 0xa5 (POISON_END) | 693 | * 0xa5 (POISON_END) |
707 | * | 694 | * |
708 | * object + s->objsize | 695 | * object + s->object_size |
709 | * Padding to reach word boundary. This is also used for Redzoning. | 696 | * Padding to reach word boundary. This is also used for Redzoning. |
710 | * Padding is extended by another word if Redzoning is enabled and | 697 | * Padding is extended by another word if Redzoning is enabled and |
711 | * objsize == inuse. | 698 | * object_size == inuse. |
712 | * | 699 | * |
713 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 700 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
714 | * 0xcc (RED_ACTIVE) for objects in use. | 701 | * 0xcc (RED_ACTIVE) for objects in use. |
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
727 | * object + s->size | 714 | * object + s->size |
728 | * Nothing is used beyond s->size. | 715 | * Nothing is used beyond s->size. |
729 | * | 716 | * |
730 | * If slabcaches are merged then the objsize and inuse boundaries are mostly | 717 | * If slabcaches are merged then the object_size and inuse boundaries are mostly |
731 | * ignored. And therefore no slab options that rely on these boundaries | 718 | * ignored. And therefore no slab options that rely on these boundaries |
732 | * may be used with merged slabcaches. | 719 | * may be used with merged slabcaches. |
733 | */ | 720 | */ |
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
787 | void *object, u8 val) | 774 | void *object, u8 val) |
788 | { | 775 | { |
789 | u8 *p = object; | 776 | u8 *p = object; |
790 | u8 *endobject = object + s->objsize; | 777 | u8 *endobject = object + s->object_size; |
791 | 778 | ||
792 | if (s->flags & SLAB_RED_ZONE) { | 779 | if (s->flags & SLAB_RED_ZONE) { |
793 | if (!check_bytes_and_report(s, page, object, "Redzone", | 780 | if (!check_bytes_and_report(s, page, object, "Redzone", |
794 | endobject, val, s->inuse - s->objsize)) | 781 | endobject, val, s->inuse - s->object_size)) |
795 | return 0; | 782 | return 0; |
796 | } else { | 783 | } else { |
797 | if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { | 784 | if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { |
798 | check_bytes_and_report(s, page, p, "Alignment padding", | 785 | check_bytes_and_report(s, page, p, "Alignment padding", |
799 | endobject, POISON_INUSE, s->inuse - s->objsize); | 786 | endobject, POISON_INUSE, s->inuse - s->object_size); |
800 | } | 787 | } |
801 | } | 788 | } |
802 | 789 | ||
803 | if (s->flags & SLAB_POISON) { | 790 | if (s->flags & SLAB_POISON) { |
804 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && | 791 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && |
805 | (!check_bytes_and_report(s, page, p, "Poison", p, | 792 | (!check_bytes_and_report(s, page, p, "Poison", p, |
806 | POISON_FREE, s->objsize - 1) || | 793 | POISON_FREE, s->object_size - 1) || |
807 | !check_bytes_and_report(s, page, p, "Poison", | 794 | !check_bytes_and_report(s, page, p, "Poison", |
808 | p + s->objsize - 1, POISON_END, 1))) | 795 | p + s->object_size - 1, POISON_END, 1))) |
809 | return 0; | 796 | return 0; |
810 | /* | 797 | /* |
811 | * check_pad_bytes cleans up on its own. | 798 | * check_pad_bytes cleans up on its own. |
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
926 | page->freelist); | 913 | page->freelist); |
927 | 914 | ||
928 | if (!alloc) | 915 | if (!alloc) |
929 | print_section("Object ", (void *)object, s->objsize); | 916 | print_section("Object ", (void *)object, s->object_size); |
930 | 917 | ||
931 | dump_stack(); | 918 | dump_stack(); |
932 | } | 919 | } |
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | |||
942 | lockdep_trace_alloc(flags); | 929 | lockdep_trace_alloc(flags); |
943 | might_sleep_if(flags & __GFP_WAIT); | 930 | might_sleep_if(flags & __GFP_WAIT); |
944 | 931 | ||
945 | return should_failslab(s->objsize, flags, s->flags); | 932 | return should_failslab(s->object_size, flags, s->flags); |
946 | } | 933 | } |
947 | 934 | ||
948 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) | 935 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) |
949 | { | 936 | { |
950 | flags &= gfp_allowed_mask; | 937 | flags &= gfp_allowed_mask; |
951 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 938 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
952 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); | 939 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
953 | } | 940 | } |
954 | 941 | ||
955 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 942 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
966 | unsigned long flags; | 953 | unsigned long flags; |
967 | 954 | ||
968 | local_irq_save(flags); | 955 | local_irq_save(flags); |
969 | kmemcheck_slab_free(s, x, s->objsize); | 956 | kmemcheck_slab_free(s, x, s->object_size); |
970 | debug_check_no_locks_freed(x, s->objsize); | 957 | debug_check_no_locks_freed(x, s->object_size); |
971 | local_irq_restore(flags); | 958 | local_irq_restore(flags); |
972 | } | 959 | } |
973 | #endif | 960 | #endif |
974 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 961 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
975 | debug_check_no_obj_freed(x, s->objsize); | 962 | debug_check_no_obj_freed(x, s->object_size); |
976 | } | 963 | } |
977 | 964 | ||
978 | /* | 965 | /* |
@@ -1207,7 +1194,7 @@ out: | |||
1207 | 1194 | ||
1208 | __setup("slub_debug", setup_slub_debug); | 1195 | __setup("slub_debug", setup_slub_debug); |
1209 | 1196 | ||
1210 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1197 | static unsigned long kmem_cache_flags(unsigned long object_size, |
1211 | unsigned long flags, const char *name, | 1198 | unsigned long flags, const char *name, |
1212 | void (*ctor)(void *)) | 1199 | void (*ctor)(void *)) |
1213 | { | 1200 | { |
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1237 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1224 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1238 | struct page *page) {} | 1225 | struct page *page) {} |
1239 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | 1226 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} |
1240 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1227 | static inline unsigned long kmem_cache_flags(unsigned long object_size, |
1241 | unsigned long flags, const char *name, | 1228 | unsigned long flags, const char *name, |
1242 | void (*ctor)(void *)) | 1229 | void (*ctor)(void *)) |
1243 | { | 1230 | { |
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1314 | stat(s, ORDER_FALLBACK); | 1301 | stat(s, ORDER_FALLBACK); |
1315 | } | 1302 | } |
1316 | 1303 | ||
1317 | if (flags & __GFP_WAIT) | 1304 | if (kmemcheck_enabled && page |
1318 | local_irq_disable(); | ||
1319 | |||
1320 | if (!page) | ||
1321 | return NULL; | ||
1322 | |||
1323 | if (kmemcheck_enabled | ||
1324 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1305 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1325 | int pages = 1 << oo_order(oo); | 1306 | int pages = 1 << oo_order(oo); |
1326 | 1307 | ||
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1336 | kmemcheck_mark_unallocated_pages(page, pages); | 1317 | kmemcheck_mark_unallocated_pages(page, pages); |
1337 | } | 1318 | } |
1338 | 1319 | ||
1320 | if (flags & __GFP_WAIT) | ||
1321 | local_irq_disable(); | ||
1322 | if (!page) | ||
1323 | return NULL; | ||
1324 | |||
1339 | page->objects = oo_objects(oo); | 1325 | page->objects = oo_objects(oo); |
1340 | mod_zone_page_state(page_zone(page), | 1326 | mod_zone_page_state(page_zone(page), |
1341 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1327 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1369,7 +1355,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1369 | 1355 | ||
1370 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1356 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1371 | page->slab = s; | 1357 | page->slab = s; |
1372 | page->flags |= 1 << PG_slab; | 1358 | __SetPageSlab(page); |
1359 | if (page->pfmemalloc) | ||
1360 | SetPageSlabPfmemalloc(page); | ||
1373 | 1361 | ||
1374 | start = page_address(page); | 1362 | start = page_address(page); |
1375 | 1363 | ||
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1413 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1401 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1414 | -pages); | 1402 | -pages); |
1415 | 1403 | ||
1404 | __ClearPageSlabPfmemalloc(page); | ||
1416 | __ClearPageSlab(page); | 1405 | __ClearPageSlab(page); |
1417 | reset_page_mapcount(page); | 1406 | reset_page_mapcount(page); |
1418 | if (current->reclaim_state) | 1407 | if (current->reclaim_state) |
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n, | |||
1490 | } | 1479 | } |
1491 | 1480 | ||
1492 | /* | 1481 | /* |
1493 | * Lock slab, remove from the partial list and put the object into the | 1482 | * Remove slab from the partial list, freeze it and |
1494 | * per cpu freelist. | 1483 | * return the pointer to the freelist. |
1495 | * | 1484 | * |
1496 | * Returns a list of objects or NULL if it fails. | 1485 | * Returns a list of objects or NULL if it fails. |
1497 | * | 1486 | * |
1498 | * Must hold list_lock. | 1487 | * Must hold list_lock since we modify the partial list. |
1499 | */ | 1488 | */ |
1500 | static inline void *acquire_slab(struct kmem_cache *s, | 1489 | static inline void *acquire_slab(struct kmem_cache *s, |
1501 | struct kmem_cache_node *n, struct page *page, | 1490 | struct kmem_cache_node *n, struct page *page, |
@@ -1510,22 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1510 | * The old freelist is the list of objects for the | 1499 | * The old freelist is the list of objects for the |
1511 | * per cpu allocation list. | 1500 | * per cpu allocation list. |
1512 | */ | 1501 | */ |
1513 | do { | 1502 | freelist = page->freelist; |
1514 | freelist = page->freelist; | 1503 | counters = page->counters; |
1515 | counters = page->counters; | 1504 | new.counters = counters; |
1516 | new.counters = counters; | 1505 | if (mode) { |
1517 | if (mode) | 1506 | new.inuse = page->objects; |
1518 | new.inuse = page->objects; | 1507 | new.freelist = NULL; |
1508 | } else { | ||
1509 | new.freelist = freelist; | ||
1510 | } | ||
1519 | 1511 | ||
1520 | VM_BUG_ON(new.frozen); | 1512 | VM_BUG_ON(new.frozen); |
1521 | new.frozen = 1; | 1513 | new.frozen = 1; |
1522 | 1514 | ||
1523 | } while (!__cmpxchg_double_slab(s, page, | 1515 | if (!__cmpxchg_double_slab(s, page, |
1524 | freelist, counters, | 1516 | freelist, counters, |
1525 | NULL, new.counters, | 1517 | new.freelist, new.counters, |
1526 | "lock and freeze")); | 1518 | "acquire_slab")) |
1519 | return NULL; | ||
1527 | 1520 | ||
1528 | remove_partial(n, page); | 1521 | remove_partial(n, page); |
1522 | WARN_ON(!freelist); | ||
1529 | return freelist; | 1523 | return freelist; |
1530 | } | 1524 | } |
1531 | 1525 | ||
@@ -1559,12 +1553,10 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1559 | 1553 | ||
1560 | if (!object) { | 1554 | if (!object) { |
1561 | c->page = page; | 1555 | c->page = page; |
1562 | c->node = page_to_nid(page); | ||
1563 | stat(s, ALLOC_FROM_PARTIAL); | 1556 | stat(s, ALLOC_FROM_PARTIAL); |
1564 | object = t; | 1557 | object = t; |
1565 | available = page->objects - page->inuse; | 1558 | available = page->objects - page->inuse; |
1566 | } else { | 1559 | } else { |
1567 | page->freelist = t; | ||
1568 | available = put_cpu_partial(s, page, 0); | 1560 | available = put_cpu_partial(s, page, 0); |
1569 | stat(s, CPU_PARTIAL_NODE); | 1561 | stat(s, CPU_PARTIAL_NODE); |
1570 | } | 1562 | } |
@@ -1579,7 +1571,7 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1579 | /* | 1571 | /* |
1580 | * Get a page from somewhere. Search in increasing NUMA distances. | 1572 | * Get a page from somewhere. Search in increasing NUMA distances. |
1581 | */ | 1573 | */ |
1582 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | 1574 | static void *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1583 | struct kmem_cache_cpu *c) | 1575 | struct kmem_cache_cpu *c) |
1584 | { | 1576 | { |
1585 | #ifdef CONFIG_NUMA | 1577 | #ifdef CONFIG_NUMA |
@@ -1614,7 +1606,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1614 | 1606 | ||
1615 | do { | 1607 | do { |
1616 | cpuset_mems_cookie = get_mems_allowed(); | 1608 | cpuset_mems_cookie = get_mems_allowed(); |
1617 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | zonelist = node_zonelist(slab_node(), flags); |
1618 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1619 | struct kmem_cache_node *n; | 1611 | struct kmem_cache_node *n; |
1620 | 1612 | ||
@@ -1728,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1728 | /* | 1720 | /* |
1729 | * Remove the cpu slab | 1721 | * Remove the cpu slab |
1730 | */ | 1722 | */ |
1731 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1723 | static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) |
1732 | { | 1724 | { |
1733 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | 1725 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; |
1734 | struct page *page = c->page; | ||
1735 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1726 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1736 | int lock = 0; | 1727 | int lock = 0; |
1737 | enum slab_modes l = M_NONE, m = M_NONE; | 1728 | enum slab_modes l = M_NONE, m = M_NONE; |
1738 | void *freelist; | ||
1739 | void *nextfree; | 1729 | void *nextfree; |
1740 | int tail = DEACTIVATE_TO_HEAD; | 1730 | int tail = DEACTIVATE_TO_HEAD; |
1741 | struct page new; | 1731 | struct page new; |
@@ -1746,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1746 | tail = DEACTIVATE_TO_TAIL; | 1736 | tail = DEACTIVATE_TO_TAIL; |
1747 | } | 1737 | } |
1748 | 1738 | ||
1749 | c->tid = next_tid(c->tid); | ||
1750 | c->page = NULL; | ||
1751 | freelist = c->freelist; | ||
1752 | c->freelist = NULL; | ||
1753 | |||
1754 | /* | 1739 | /* |
1755 | * Stage one: Free all available per cpu objects back | 1740 | * Stage one: Free all available per cpu objects back |
1756 | * to the page freelist while it is still frozen. Leave the | 1741 | * to the page freelist while it is still frozen. Leave the |
@@ -1876,21 +1861,31 @@ redo: | |||
1876 | } | 1861 | } |
1877 | } | 1862 | } |
1878 | 1863 | ||
1879 | /* Unfreeze all the cpu partial slabs */ | 1864 | /* |
1865 | * Unfreeze all the cpu partial slabs. | ||
1866 | * | ||
1867 | * This function must be called with interrupt disabled. | ||
1868 | */ | ||
1880 | static void unfreeze_partials(struct kmem_cache *s) | 1869 | static void unfreeze_partials(struct kmem_cache *s) |
1881 | { | 1870 | { |
1882 | struct kmem_cache_node *n = NULL; | 1871 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1883 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | 1872 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); |
1884 | struct page *page, *discard_page = NULL; | 1873 | struct page *page, *discard_page = NULL; |
1885 | 1874 | ||
1886 | while ((page = c->partial)) { | 1875 | while ((page = c->partial)) { |
1887 | enum slab_modes { M_PARTIAL, M_FREE }; | ||
1888 | enum slab_modes l, m; | ||
1889 | struct page new; | 1876 | struct page new; |
1890 | struct page old; | 1877 | struct page old; |
1891 | 1878 | ||
1892 | c->partial = page->next; | 1879 | c->partial = page->next; |
1893 | l = M_FREE; | 1880 | |
1881 | n2 = get_node(s, page_to_nid(page)); | ||
1882 | if (n != n2) { | ||
1883 | if (n) | ||
1884 | spin_unlock(&n->list_lock); | ||
1885 | |||
1886 | n = n2; | ||
1887 | spin_lock(&n->list_lock); | ||
1888 | } | ||
1894 | 1889 | ||
1895 | do { | 1890 | do { |
1896 | 1891 | ||
@@ -1903,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s) | |||
1903 | 1898 | ||
1904 | new.frozen = 0; | 1899 | new.frozen = 0; |
1905 | 1900 | ||
1906 | if (!new.inuse && (!n || n->nr_partial > s->min_partial)) | 1901 | } while (!__cmpxchg_double_slab(s, page, |
1907 | m = M_FREE; | ||
1908 | else { | ||
1909 | struct kmem_cache_node *n2 = get_node(s, | ||
1910 | page_to_nid(page)); | ||
1911 | |||
1912 | m = M_PARTIAL; | ||
1913 | if (n != n2) { | ||
1914 | if (n) | ||
1915 | spin_unlock(&n->list_lock); | ||
1916 | |||
1917 | n = n2; | ||
1918 | spin_lock(&n->list_lock); | ||
1919 | } | ||
1920 | } | ||
1921 | |||
1922 | if (l != m) { | ||
1923 | if (l == M_PARTIAL) { | ||
1924 | remove_partial(n, page); | ||
1925 | stat(s, FREE_REMOVE_PARTIAL); | ||
1926 | } else { | ||
1927 | add_partial(n, page, | ||
1928 | DEACTIVATE_TO_TAIL); | ||
1929 | stat(s, FREE_ADD_PARTIAL); | ||
1930 | } | ||
1931 | |||
1932 | l = m; | ||
1933 | } | ||
1934 | |||
1935 | } while (!cmpxchg_double_slab(s, page, | ||
1936 | old.freelist, old.counters, | 1902 | old.freelist, old.counters, |
1937 | new.freelist, new.counters, | 1903 | new.freelist, new.counters, |
1938 | "unfreezing slab")); | 1904 | "unfreezing slab")); |
1939 | 1905 | ||
1940 | if (m == M_FREE) { | 1906 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { |
1941 | page->next = discard_page; | 1907 | page->next = discard_page; |
1942 | discard_page = page; | 1908 | discard_page = page; |
1909 | } else { | ||
1910 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
1911 | stat(s, FREE_ADD_PARTIAL); | ||
1943 | } | 1912 | } |
1944 | } | 1913 | } |
1945 | 1914 | ||
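The rewritten loop batches the n->list_lock acquisition: it resolves each partial page's node up front, keeps the current node's lock across consecutive pages from the same node, and only switches locks when the node changes; empty slabs are parked on discard_page so the expensive free happens after all locks are dropped. A userspace sketch of that lock-batching walk (illustrative only; struct item, node_lock[] and drain() are made-up names, with pthread mutexes standing in for the per-node list_lock):

/* Sketch of batching per-node lock acquisition while draining a list of
 * items that each belong to some node; purely illustrative. */
#include <pthread.h>
#include <stdio.h>

#define NR_NODES 2

struct item {
        struct item *next;
        int node;
};

static pthread_mutex_t node_lock[NR_NODES] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static void drain(struct item *list)
{
        int locked = -1;        /* node whose lock we currently hold */

        while (list) {
                struct item *item = list;

                list = item->next;
                if (item->node != locked) {
                        if (locked >= 0)
                                pthread_mutex_unlock(&node_lock[locked]);
                        locked = item->node;
                        pthread_mutex_lock(&node_lock[locked]);
                }
                /* ... put the item back on its node's partial list here ... */
                printf("item %p returned to node %d\n", (void *)item, item->node);
        }
        if (locked >= 0)
                pthread_mutex_unlock(&node_lock[locked]);
}

int main(void)
{
        struct item c = { NULL, 1 }, b = { &c, 1 }, a = { &b, 0 };

        drain(&a);      /* takes node 0's lock once, then node 1's lock once */
        return 0;
}

Collecting empty slabs on discard_page first serves the same purpose as the deferred work here: nothing slow runs while a list_lock is held.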
@@ -2008,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
2008 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1977 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
2009 | { | 1978 | { |
2010 | stat(s, CPUSLAB_FLUSH); | 1979 | stat(s, CPUSLAB_FLUSH); |
2011 | deactivate_slab(s, c); | 1980 | deactivate_slab(s, c->page, c->freelist); |
1981 | |||
1982 | c->tid = next_tid(c->tid); | ||
1983 | c->page = NULL; | ||
1984 | c->freelist = NULL; | ||
2012 | } | 1985 | } |
2013 | 1986 | ||
2014 | /* | 1987 | /* |
@@ -2040,7 +2013,7 @@ static bool has_cpu_slab(int cpu, void *info) | |||
2040 | struct kmem_cache *s = info; | 2013 | struct kmem_cache *s = info; |
2041 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 2014 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
2042 | 2015 | ||
2043 | return !!(c->page); | 2016 | return c->page || c->partial; |
2044 | } | 2017 | } |
2045 | 2018 | ||
2046 | static void flush_all(struct kmem_cache *s) | 2019 | static void flush_all(struct kmem_cache *s) |
@@ -2052,10 +2025,10 @@ static void flush_all(struct kmem_cache *s) | |||
2052 | * Check if the objects in a per cpu structure fit numa | 2025 | * Check if the objects in a per cpu structure fit numa |
2053 | * locality expectations. | 2026 | * locality expectations. |
2054 | */ | 2027 | */ |
2055 | static inline int node_match(struct kmem_cache_cpu *c, int node) | 2028 | static inline int node_match(struct page *page, int node) |
2056 | { | 2029 | { |
2057 | #ifdef CONFIG_NUMA | 2030 | #ifdef CONFIG_NUMA |
2058 | if (node != NUMA_NO_NODE && c->node != node) | 2031 | if (node != NUMA_NO_NODE && page_to_nid(page) != node) |
2059 | return 0; | 2032 | return 0; |
2060 | #endif | 2033 | #endif |
2061 | return 1; | 2034 | return 1; |
@@ -2098,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2098 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2071 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
2099 | nid, gfpflags); | 2072 | nid, gfpflags); |
2100 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2073 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " |
2101 | "default order: %d, min order: %d\n", s->name, s->objsize, | 2074 | "default order: %d, min order: %d\n", s->name, s->object_size, |
2102 | s->size, oo_order(s->oo), oo_order(s->min)); | 2075 | s->size, oo_order(s->oo), oo_order(s->min)); |
2103 | 2076 | ||
2104 | if (oo_order(s->min) > get_order(s->objsize)) | 2077 | if (oo_order(s->min) > get_order(s->object_size)) |
2105 | printk(KERN_WARNING " %s debugging increased min order, use " | 2078 | printk(KERN_WARNING " %s debugging increased min order, use " |
2106 | "slub_debug=O to disable.\n", s->name); | 2079 | "slub_debug=O to disable.\n", s->name); |
2107 | 2080 | ||
@@ -2127,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2127 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2100 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
2128 | int node, struct kmem_cache_cpu **pc) | 2101 | int node, struct kmem_cache_cpu **pc) |
2129 | { | 2102 | { |
2130 | void *object; | 2103 | void *freelist; |
2131 | struct kmem_cache_cpu *c; | 2104 | struct kmem_cache_cpu *c = *pc; |
2132 | struct page *page = new_slab(s, flags, node); | 2105 | struct page *page; |
2106 | |||
2107 | freelist = get_partial(s, flags, node, c); | ||
2133 | 2108 | ||
2109 | if (freelist) | ||
2110 | return freelist; | ||
2111 | |||
2112 | page = new_slab(s, flags, node); | ||
2134 | if (page) { | 2113 | if (page) { |
2135 | c = __this_cpu_ptr(s->cpu_slab); | 2114 | c = __this_cpu_ptr(s->cpu_slab); |
2136 | if (c->page) | 2115 | if (c->page) |
@@ -2140,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2140 | * No other reference to the page yet so we can | 2119 | * No other reference to the page yet so we can |
2141 | * muck around with it freely without cmpxchg | 2120 | * muck around with it freely without cmpxchg |
2142 | */ | 2121 | */ |
2143 | object = page->freelist; | 2122 | freelist = page->freelist; |
2144 | page->freelist = NULL; | 2123 | page->freelist = NULL; |
2145 | 2124 | ||
2146 | stat(s, ALLOC_SLAB); | 2125 | stat(s, ALLOC_SLAB); |
2147 | c->node = page_to_nid(page); | ||
2148 | c->page = page; | 2126 | c->page = page; |
2149 | *pc = c; | 2127 | *pc = c; |
2150 | } else | 2128 | } else |
2151 | object = NULL; | 2129 | freelist = NULL; |
2152 | 2130 | ||
2153 | return object; | 2131 | return freelist; |
2132 | } | ||
2133 | |||
2134 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) | ||
2135 | { | ||
2136 | if (unlikely(PageSlabPfmemalloc(page))) | ||
2137 | return gfp_pfmemalloc_allowed(gfpflags); | ||
2138 | |||
2139 | return true; | ||
2154 | } | 2140 | } |
2155 | 2141 | ||
2156 | /* | 2142 | /* |
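new_slab_objects() now calls get_partial() itself before paying for new_slab(), so the slow path always prefers refilling from an existing partial slab, and pfmemalloc_match() keeps pages allocated from emergency reserves away from requests that are not entitled to them. A compressed userspace sketch of that "reuse before allocating fresh" ordering (hypothetical names; the partial list and the object counts are invented):

/* Illustrative two-tier refill: prefer recycling a partially used block,
 * fall back to a fresh allocation only when nothing can be reused. */
#include <stdio.h>
#include <stdlib.h>

struct block {
        struct block *next;
        int free_objects;
};

static struct block *partial_list;      /* stand-in for the node partial lists */

static struct block *get_partial_block(void)
{
        struct block *b = partial_list;

        if (b)
                partial_list = b->next;
        return b;
}

static struct block *refill(void)
{
        struct block *b = get_partial_block();  /* cheap: reuse existing memory */

        if (b)
                return b;

        b = calloc(1, sizeof(*b));              /* expensive: grab a new block */
        if (b)
                b->free_objects = 16;
        return b;
}

int main(void)
{
        struct block recycled = { NULL, 7 };
        struct block *b;

        partial_list = &recycled;

        b = refill();           /* comes from the partial list */
        printf("reused block with %d free objects\n", b->free_objects);

        b = refill();           /* partial list is empty now: freshly allocated */
        printf("fresh block with %d free objects\n", b->free_objects);
        free(b);
        return 0;
}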
@@ -2160,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2160 | * The page is still frozen if the return value is not NULL. | 2146 | * The page is still frozen if the return value is not NULL. |
2161 | * | 2147 | * |
2162 | * If this function returns NULL then the page has been unfrozen. | 2148 | * If this function returns NULL then the page has been unfrozen. |
2149 | * | ||
2150 | * This function must be called with interrupt disabled. | ||
2163 | */ | 2151 | */ |
2164 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) | 2152 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) |
2165 | { | 2153 | { |
@@ -2170,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2170 | do { | 2158 | do { |
2171 | freelist = page->freelist; | 2159 | freelist = page->freelist; |
2172 | counters = page->counters; | 2160 | counters = page->counters; |
2161 | |||
2173 | new.counters = counters; | 2162 | new.counters = counters; |
2174 | VM_BUG_ON(!new.frozen); | 2163 | VM_BUG_ON(!new.frozen); |
2175 | 2164 | ||
2176 | new.inuse = page->objects; | 2165 | new.inuse = page->objects; |
2177 | new.frozen = freelist != NULL; | 2166 | new.frozen = freelist != NULL; |
2178 | 2167 | ||
2179 | } while (!cmpxchg_double_slab(s, page, | 2168 | } while (!__cmpxchg_double_slab(s, page, |
2180 | freelist, counters, | 2169 | freelist, counters, |
2181 | NULL, new.counters, | 2170 | NULL, new.counters, |
2182 | "get_freelist")); | 2171 | "get_freelist")); |
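Both this loop and unfreeze_partials() above now use __cmpxchg_double_slab(), the variant that assumes interrupts are already disabled, to swap the page's (freelist, counters) pair atomically: get_freelist() grabs the whole remaining chain by setting freelist to NULL and marking every object in use in one shot. A userspace sketch of the double-word compare-and-exchange idea, assuming GCC/Clang __atomic builtins (build with something like -mcx16 and -latomic on x86-64); the counters are collapsed into one plain word here rather than SLUB's packed inuse/objects/frozen fields:

/* Sketch of the double-word cmpxchg pattern: the freelist pointer and the
 * packed counters must change together or not at all. */
#include <stdint.h>
#include <stdio.h>

struct slab_state {
        void     *freelist;     /* first free object, or NULL */
        uint64_t  counters;     /* inuse/objects/frozen packed together */
};

static struct slab_state page_state;

/* Take every remaining free object: freelist becomes NULL, all in use. */
static void *grab_freelist(uint64_t all_in_use)
{
        struct slab_state old, new;

        do {
                __atomic_load(&page_state, &old, __ATOMIC_RELAXED);
                new.freelist = NULL;
                new.counters = all_in_use;
        } while (!__atomic_compare_exchange(&page_state, &old, &new,
                                            0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED));

        return old.freelist;    /* the whole chain now belongs to this CPU */
}

int main(void)
{
        int dummy;

        page_state.freelist = &dummy;
        page_state.counters = 3;        /* pretend 3 of 16 objects are in use */

        printf("grabbed freelist %p\n", grab_freelist(16));
        return 0;
}

If another CPU frees an object in the meantime, either the freelist pointer or the counters no longer match and the loop simply retries with the fresh values.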
@@ -2203,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2203 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | 2192 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, |
2204 | unsigned long addr, struct kmem_cache_cpu *c) | 2193 | unsigned long addr, struct kmem_cache_cpu *c) |
2205 | { | 2194 | { |
2206 | void **object; | 2195 | void *freelist; |
2196 | struct page *page; | ||
2207 | unsigned long flags; | 2197 | unsigned long flags; |
2208 | 2198 | ||
2209 | local_irq_save(flags); | 2199 | local_irq_save(flags); |
@@ -2216,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
2216 | c = this_cpu_ptr(s->cpu_slab); | 2206 | c = this_cpu_ptr(s->cpu_slab); |
2217 | #endif | 2207 | #endif |
2218 | 2208 | ||
2219 | if (!c->page) | 2209 | page = c->page; |
2210 | if (!page) | ||
2220 | goto new_slab; | 2211 | goto new_slab; |
2221 | redo: | 2212 | redo: |
2222 | if (unlikely(!node_match(c, node))) { | 2213 | |
2214 | if (unlikely(!node_match(page, node))) { | ||
2223 | stat(s, ALLOC_NODE_MISMATCH); | 2215 | stat(s, ALLOC_NODE_MISMATCH); |
2224 | deactivate_slab(s, c); | 2216 | deactivate_slab(s, page, c->freelist); |
2217 | c->page = NULL; | ||
2218 | c->freelist = NULL; | ||
2219 | goto new_slab; | ||
2220 | } | ||
2221 | |||
2222 | /* | ||
2223 | * By rights, we should be searching for a slab page that was | ||
2224 | * PFMEMALLOC but right now, we are losing the pfmemalloc | ||
2225 | * information when the page leaves the per-cpu allocator | ||
2226 | */ | ||
2227 | if (unlikely(!pfmemalloc_match(page, gfpflags))) { | ||
2228 | deactivate_slab(s, page, c->freelist); | ||
2229 | c->page = NULL; | ||
2230 | c->freelist = NULL; | ||
2225 | goto new_slab; | 2231 | goto new_slab; |
2226 | } | 2232 | } |
2227 | 2233 | ||
2228 | /* must check again c->freelist in case of cpu migration or IRQ */ | 2234 | /* must check again c->freelist in case of cpu migration or IRQ */ |
2229 | object = c->freelist; | 2235 | freelist = c->freelist; |
2230 | if (object) | 2236 | if (freelist) |
2231 | goto load_freelist; | 2237 | goto load_freelist; |
2232 | 2238 | ||
2233 | stat(s, ALLOC_SLOWPATH); | 2239 | stat(s, ALLOC_SLOWPATH); |
2234 | 2240 | ||
2235 | object = get_freelist(s, c->page); | 2241 | freelist = get_freelist(s, page); |
2236 | 2242 | ||
2237 | if (!object) { | 2243 | if (!freelist) { |
2238 | c->page = NULL; | 2244 | c->page = NULL; |
2239 | stat(s, DEACTIVATE_BYPASS); | 2245 | stat(s, DEACTIVATE_BYPASS); |
2240 | goto new_slab; | 2246 | goto new_slab; |
@@ -2243,50 +2249,50 @@ redo: | |||
2243 | stat(s, ALLOC_REFILL); | 2249 | stat(s, ALLOC_REFILL); |
2244 | 2250 | ||
2245 | load_freelist: | 2251 | load_freelist: |
2246 | c->freelist = get_freepointer(s, object); | 2252 | /* |
2253 | * freelist is pointing to the list of objects to be used. | ||
2254 | * page is pointing to the page from which the objects are obtained. | ||
2255 | * That page must be frozen for per cpu allocations to work. | ||
2256 | */ | ||
2257 | VM_BUG_ON(!c->page->frozen); | ||
2258 | c->freelist = get_freepointer(s, freelist); | ||
2247 | c->tid = next_tid(c->tid); | 2259 | c->tid = next_tid(c->tid); |
2248 | local_irq_restore(flags); | 2260 | local_irq_restore(flags); |
2249 | return object; | 2261 | return freelist; |
2250 | 2262 | ||
2251 | new_slab: | 2263 | new_slab: |
2252 | 2264 | ||
2253 | if (c->partial) { | 2265 | if (c->partial) { |
2254 | c->page = c->partial; | 2266 | page = c->page = c->partial; |
2255 | c->partial = c->page->next; | 2267 | c->partial = page->next; |
2256 | c->node = page_to_nid(c->page); | ||
2257 | stat(s, CPU_PARTIAL_ALLOC); | 2268 | stat(s, CPU_PARTIAL_ALLOC); |
2258 | c->freelist = NULL; | 2269 | c->freelist = NULL; |
2259 | goto redo; | 2270 | goto redo; |
2260 | } | 2271 | } |
2261 | 2272 | ||
2262 | /* Then do expensive stuff like retrieving pages from the partial lists */ | 2273 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2263 | object = get_partial(s, gfpflags, node, c); | ||
2264 | |||
2265 | if (unlikely(!object)) { | ||
2266 | 2274 | ||
2267 | object = new_slab_objects(s, gfpflags, node, &c); | 2275 | if (unlikely(!freelist)) { |
2276 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
2277 | slab_out_of_memory(s, gfpflags, node); | ||
2268 | 2278 | ||
2269 | if (unlikely(!object)) { | 2279 | local_irq_restore(flags); |
2270 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2280 | return NULL; |
2271 | slab_out_of_memory(s, gfpflags, node); | ||
2272 | |||
2273 | local_irq_restore(flags); | ||
2274 | return NULL; | ||
2275 | } | ||
2276 | } | 2281 | } |
2277 | 2282 | ||
2278 | if (likely(!kmem_cache_debug(s))) | 2283 | page = c->page; |
2284 | if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) | ||
2279 | goto load_freelist; | 2285 | goto load_freelist; |
2280 | 2286 | ||
2281 | /* Only entered in the debug case */ | 2287 | /* Only entered in the debug case */ |
2282 | if (!alloc_debug_processing(s, c->page, object, addr)) | 2288 | if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) |
2283 | goto new_slab; /* Slab failed checks. Next slab needed */ | 2289 | goto new_slab; /* Slab failed checks. Next slab needed */ |
2284 | 2290 | ||
2285 | c->freelist = get_freepointer(s, object); | 2291 | deactivate_slab(s, page, get_freepointer(s, freelist)); |
2286 | deactivate_slab(s, c); | 2292 | c->page = NULL; |
2287 | c->node = NUMA_NO_NODE; | 2293 | c->freelist = NULL; |
2288 | local_irq_restore(flags); | 2294 | local_irq_restore(flags); |
2289 | return object; | 2295 | return freelist; |
2290 | } | 2296 | } |
2291 | 2297 | ||
2292 | /* | 2298 | /* |
@@ -2304,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
2304 | { | 2310 | { |
2305 | void **object; | 2311 | void **object; |
2306 | struct kmem_cache_cpu *c; | 2312 | struct kmem_cache_cpu *c; |
2313 | struct page *page; | ||
2307 | unsigned long tid; | 2314 | unsigned long tid; |
2308 | 2315 | ||
2309 | if (slab_pre_alloc_hook(s, gfpflags)) | 2316 | if (slab_pre_alloc_hook(s, gfpflags)) |
@@ -2329,8 +2336,8 @@ redo: | |||
2329 | barrier(); | 2336 | barrier(); |
2330 | 2337 | ||
2331 | object = c->freelist; | 2338 | object = c->freelist; |
2332 | if (unlikely(!object || !node_match(c, node))) | 2339 | page = c->page; |
2333 | 2340 | if (unlikely(!object || !node_match(page, node))) | |
2334 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2341 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2335 | 2342 | ||
2336 | else { | 2343 | else { |
@@ -2361,7 +2368,7 @@ redo: | |||
2361 | } | 2368 | } |
2362 | 2369 | ||
2363 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 2370 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
2364 | memset(object, 0, s->objsize); | 2371 | memset(object, 0, s->object_size); |
2365 | 2372 | ||
2366 | slab_post_alloc_hook(s, gfpflags, object); | 2373 | slab_post_alloc_hook(s, gfpflags, object); |
2367 | 2374 | ||
@@ -2372,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
2372 | { | 2379 | { |
2373 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2380 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); |
2374 | 2381 | ||
2375 | trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); | 2382 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); |
2376 | 2383 | ||
2377 | return ret; | 2384 | return ret; |
2378 | } | 2385 | } |
@@ -2402,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
2402 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2409 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
2403 | 2410 | ||
2404 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 2411 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
2405 | s->objsize, s->size, gfpflags, node); | 2412 | s->object_size, s->size, gfpflags, node); |
2406 | 2413 | ||
2407 | return ret; | 2414 | return ret; |
2408 | } | 2415 | } |
@@ -2766,7 +2773,7 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
2766 | } | 2773 | } |
2767 | 2774 | ||
2768 | static void | 2775 | static void |
2769 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2776 | init_kmem_cache_node(struct kmem_cache_node *n) |
2770 | { | 2777 | { |
2771 | n->nr_partial = 0; | 2778 | n->nr_partial = 0; |
2772 | spin_lock_init(&n->list_lock); | 2779 | spin_lock_init(&n->list_lock); |
@@ -2836,7 +2843,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2836 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2843 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
2837 | init_tracking(kmem_cache_node, n); | 2844 | init_tracking(kmem_cache_node, n); |
2838 | #endif | 2845 | #endif |
2839 | init_kmem_cache_node(n, kmem_cache_node); | 2846 | init_kmem_cache_node(n); |
2840 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2847 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2841 | 2848 | ||
2842 | add_partial(n, page, DEACTIVATE_TO_HEAD); | 2849 | add_partial(n, page, DEACTIVATE_TO_HEAD); |
@@ -2876,7 +2883,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) | |||
2876 | } | 2883 | } |
2877 | 2884 | ||
2878 | s->node[node] = n; | 2885 | s->node[node] = n; |
2879 | init_kmem_cache_node(n, s); | 2886 | init_kmem_cache_node(n); |
2880 | } | 2887 | } |
2881 | return 1; | 2888 | return 1; |
2882 | } | 2889 | } |
@@ -2897,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) | |||
2897 | static int calculate_sizes(struct kmem_cache *s, int forced_order) | 2904 | static int calculate_sizes(struct kmem_cache *s, int forced_order) |
2898 | { | 2905 | { |
2899 | unsigned long flags = s->flags; | 2906 | unsigned long flags = s->flags; |
2900 | unsigned long size = s->objsize; | 2907 | unsigned long size = s->object_size; |
2901 | unsigned long align = s->align; | 2908 | unsigned long align = s->align; |
2902 | int order; | 2909 | int order; |
2903 | 2910 | ||
@@ -2926,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2926 | * end of the object and the free pointer. If not then add an | 2933 | * end of the object and the free pointer. If not then add an |
2927 | * additional word to have some bytes to store Redzone information. | 2934 | * additional word to have some bytes to store Redzone information. |
2928 | */ | 2935 | */ |
2929 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 2936 | if ((flags & SLAB_RED_ZONE) && size == s->object_size) |
2930 | size += sizeof(void *); | 2937 | size += sizeof(void *); |
2931 | #endif | 2938 | #endif |
2932 | 2939 | ||
@@ -2974,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2974 | * user specified and the dynamic determination of cache line size | 2981 | * user specified and the dynamic determination of cache line size |
2975 | * on bootup. | 2982 | * on bootup. |
2976 | */ | 2983 | */ |
2977 | align = calculate_alignment(flags, align, s->objsize); | 2984 | align = calculate_alignment(flags, align, s->object_size); |
2978 | s->align = align; | 2985 | s->align = align; |
2979 | 2986 | ||
2980 | /* | 2987 | /* |
@@ -3022,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3022 | memset(s, 0, kmem_size); | 3029 | memset(s, 0, kmem_size); |
3023 | s->name = name; | 3030 | s->name = name; |
3024 | s->ctor = ctor; | 3031 | s->ctor = ctor; |
3025 | s->objsize = size; | 3032 | s->object_size = size; |
3026 | s->align = align; | 3033 | s->align = align; |
3027 | s->flags = kmem_cache_flags(size, flags, name, ctor); | 3034 | s->flags = kmem_cache_flags(size, flags, name, ctor); |
3028 | s->reserved = 0; | 3035 | s->reserved = 0; |
@@ -3037,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3037 | * Disable debugging flags that store metadata if the min slab | 3044 | * Disable debugging flags that store metadata if the min slab |
3038 | * order increased. | 3045 | * order increased. |
3039 | */ | 3046 | */ |
3040 | if (get_order(s->size) > get_order(s->objsize)) { | 3047 | if (get_order(s->size) > get_order(s->object_size)) { |
3041 | s->flags &= ~DEBUG_METADATA_FLAGS; | 3048 | s->flags &= ~DEBUG_METADATA_FLAGS; |
3042 | s->offset = 0; | 3049 | s->offset = 0; |
3043 | if (!calculate_sizes(s, -1)) | 3050 | if (!calculate_sizes(s, -1)) |
@@ -3111,7 +3118,7 @@ error: | |||
3111 | */ | 3118 | */ |
3112 | unsigned int kmem_cache_size(struct kmem_cache *s) | 3119 | unsigned int kmem_cache_size(struct kmem_cache *s) |
3113 | { | 3120 | { |
3114 | return s->objsize; | 3121 | return s->object_size; |
3115 | } | 3122 | } |
3116 | EXPORT_SYMBOL(kmem_cache_size); | 3123 | EXPORT_SYMBOL(kmem_cache_size); |
3117 | 3124 | ||
@@ -3189,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3189 | */ | 3196 | */ |
3190 | void kmem_cache_destroy(struct kmem_cache *s) | 3197 | void kmem_cache_destroy(struct kmem_cache *s) |
3191 | { | 3198 | { |
3192 | down_write(&slub_lock); | 3199 | mutex_lock(&slab_mutex); |
3193 | s->refcount--; | 3200 | s->refcount--; |
3194 | if (!s->refcount) { | 3201 | if (!s->refcount) { |
3195 | list_del(&s->list); | 3202 | list_del(&s->list); |
3196 | up_write(&slub_lock); | 3203 | mutex_unlock(&slab_mutex); |
3197 | if (kmem_cache_close(s)) { | 3204 | if (kmem_cache_close(s)) { |
3198 | printk(KERN_ERR "SLUB %s: %s called for cache that " | 3205 | printk(KERN_ERR "SLUB %s: %s called for cache that " |
3199 | "still has objects.\n", s->name, __func__); | 3206 | "still has objects.\n", s->name, __func__); |
@@ -3203,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
3203 | rcu_barrier(); | 3210 | rcu_barrier(); |
3204 | sysfs_slab_remove(s); | 3211 | sysfs_slab_remove(s); |
3205 | } else | 3212 | } else |
3206 | up_write(&slub_lock); | 3213 | mutex_unlock(&slab_mutex); |
3207 | } | 3214 | } |
3208 | EXPORT_SYMBOL(kmem_cache_destroy); | 3215 | EXPORT_SYMBOL(kmem_cache_destroy); |
3209 | 3216 | ||
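With slub_lock gone, kmem_cache_destroy() holds the shared slab_mutex only long enough to drop the reference and unlink the cache, and does the expensive teardown (flushing, the optional rcu_barrier(), sysfs removal) after the mutex is released. The shape of that pattern in a userspace sketch (struct cache, registry_mutex and cache_destroy() are invented names; a pthread mutex stands in for slab_mutex):

/* Sketch of "refcount under a mutex, teardown outside it": the registry
 * lock only protects the list and the counter, never the slow work. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
        struct cache *next;
        int refcount;
        const char *name;
};

static pthread_mutex_t registry_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct cache *registry;

static void cache_destroy(struct cache *s)
{
        int gone;

        pthread_mutex_lock(&registry_mutex);
        gone = (--s->refcount == 0);
        if (gone) {
                /* unlink from the registry while still holding the mutex */
                struct cache **p = &registry;

                while (*p && *p != s)
                        p = &(*p)->next;
                if (*p)
                        *p = s->next;
        }
        pthread_mutex_unlock(&registry_mutex);

        if (gone) {
                /* expensive teardown happens with the mutex dropped */
                printf("tearing down %s\n", s->name);
                free(s);
        }
}

int main(void)
{
        struct cache *s = calloc(1, sizeof(*s));

        s->refcount = 1;
        s->name = "demo-cache";
        s->next = registry;
        registry = s;

        cache_destroy(s);       /* last reference: unlink, then free */
        return 0;
}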
@@ -3265,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
3265 | 3272 | ||
3266 | /* | 3273 | /* |
3267 | * This function is called with IRQs disabled during early-boot on | 3274 | * This function is called with IRQs disabled during early-boot on |
3268 | * single CPU so there's no need to take slub_lock here. | 3275 | * single CPU so there's no need to take slab_mutex here. |
3269 | */ | 3276 | */ |
3270 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, | 3277 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, |
3271 | flags, NULL)) | 3278 | flags, NULL)) |
@@ -3550,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3550 | { | 3557 | { |
3551 | struct kmem_cache *s; | 3558 | struct kmem_cache *s; |
3552 | 3559 | ||
3553 | down_read(&slub_lock); | 3560 | mutex_lock(&slab_mutex); |
3554 | list_for_each_entry(s, &slab_caches, list) | 3561 | list_for_each_entry(s, &slab_caches, list) |
3555 | kmem_cache_shrink(s); | 3562 | kmem_cache_shrink(s); |
3556 | up_read(&slub_lock); | 3563 | mutex_unlock(&slab_mutex); |
3557 | 3564 | ||
3558 | return 0; | 3565 | return 0; |
3559 | } | 3566 | } |
@@ -3574,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3574 | if (offline_node < 0) | 3581 | if (offline_node < 0) |
3575 | return; | 3582 | return; |
3576 | 3583 | ||
3577 | down_read(&slub_lock); | 3584 | mutex_lock(&slab_mutex); |
3578 | list_for_each_entry(s, &slab_caches, list) { | 3585 | list_for_each_entry(s, &slab_caches, list) { |
3579 | n = get_node(s, offline_node); | 3586 | n = get_node(s, offline_node); |
3580 | if (n) { | 3587 | if (n) { |
@@ -3590,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3590 | kmem_cache_free(kmem_cache_node, n); | 3597 | kmem_cache_free(kmem_cache_node, n); |
3591 | } | 3598 | } |
3592 | } | 3599 | } |
3593 | up_read(&slub_lock); | 3600 | mutex_unlock(&slab_mutex); |
3594 | } | 3601 | } |
3595 | 3602 | ||
3596 | static int slab_mem_going_online_callback(void *arg) | 3603 | static int slab_mem_going_online_callback(void *arg) |
@@ -3613,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3613 | * allocate a kmem_cache_node structure in order to bring the node | 3620 | * allocate a kmem_cache_node structure in order to bring the node |
3614 | * online. | 3621 | * online. |
3615 | */ | 3622 | */ |
3616 | down_read(&slub_lock); | 3623 | mutex_lock(&slab_mutex); |
3617 | list_for_each_entry(s, &slab_caches, list) { | 3624 | list_for_each_entry(s, &slab_caches, list) { |
3618 | /* | 3625 | /* |
3619 | * XXX: kmem_cache_alloc_node will fallback to other nodes | 3626 | * XXX: kmem_cache_alloc_node will fallback to other nodes |
@@ -3625,11 +3632,11 @@ static int slab_mem_going_online_callback(void *arg) | |||
3625 | ret = -ENOMEM; | 3632 | ret = -ENOMEM; |
3626 | goto out; | 3633 | goto out; |
3627 | } | 3634 | } |
3628 | init_kmem_cache_node(n, s); | 3635 | init_kmem_cache_node(n); |
3629 | s->node[nid] = n; | 3636 | s->node[nid] = n; |
3630 | } | 3637 | } |
3631 | out: | 3638 | out: |
3632 | up_read(&slub_lock); | 3639 | mutex_unlock(&slab_mutex); |
3633 | return ret; | 3640 | return ret; |
3634 | } | 3641 | } |
3635 | 3642 | ||
@@ -3840,11 +3847,11 @@ void __init kmem_cache_init(void) | |||
3840 | 3847 | ||
3841 | if (s && s->size) { | 3848 | if (s && s->size) { |
3842 | char *name = kasprintf(GFP_NOWAIT, | 3849 | char *name = kasprintf(GFP_NOWAIT, |
3843 | "dma-kmalloc-%d", s->objsize); | 3850 | "dma-kmalloc-%d", s->object_size); |
3844 | 3851 | ||
3845 | BUG_ON(!name); | 3852 | BUG_ON(!name); |
3846 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, | 3853 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, |
3847 | s->objsize, SLAB_CACHE_DMA); | 3854 | s->object_size, SLAB_CACHE_DMA); |
3848 | } | 3855 | } |
3849 | } | 3856 | } |
3850 | #endif | 3857 | #endif |
@@ -3921,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3921 | return NULL; | 3928 | return NULL; |
3922 | } | 3929 | } |
3923 | 3930 | ||
3924 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3931 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
3925 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3932 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3926 | { | 3933 | { |
3927 | struct kmem_cache *s; | 3934 | struct kmem_cache *s; |
3928 | char *n; | 3935 | char *n; |
3929 | 3936 | ||
3930 | if (WARN_ON(!name)) | ||
3931 | return NULL; | ||
3932 | |||
3933 | down_write(&slub_lock); | ||
3934 | s = find_mergeable(size, align, flags, name, ctor); | 3937 | s = find_mergeable(size, align, flags, name, ctor); |
3935 | if (s) { | 3938 | if (s) { |
3936 | s->refcount++; | 3939 | s->refcount++; |
@@ -3938,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3938 | * Adjust the object sizes so that we clear | 3941 | * Adjust the object sizes so that we clear |
3939 | * the complete object on kzalloc. | 3942 | * the complete object on kzalloc. |
3940 | */ | 3943 | */ |
3941 | s->objsize = max(s->objsize, (int)size); | 3944 | s->object_size = max(s->object_size, (int)size); |
3942 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3945 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3943 | 3946 | ||
3944 | if (sysfs_slab_alias(s, name)) { | 3947 | if (sysfs_slab_alias(s, name)) { |
3945 | s->refcount--; | 3948 | s->refcount--; |
3946 | goto err; | 3949 | return NULL; |
3947 | } | 3950 | } |
3948 | up_write(&slub_lock); | ||
3949 | return s; | 3951 | return s; |
3950 | } | 3952 | } |
3951 | 3953 | ||
3952 | n = kstrdup(name, GFP_KERNEL); | 3954 | n = kstrdup(name, GFP_KERNEL); |
3953 | if (!n) | 3955 | if (!n) |
3954 | goto err; | 3956 | return NULL; |
3955 | 3957 | ||
3956 | s = kmalloc(kmem_size, GFP_KERNEL); | 3958 | s = kmalloc(kmem_size, GFP_KERNEL); |
3957 | if (s) { | 3959 | if (s) { |
3958 | if (kmem_cache_open(s, n, | 3960 | if (kmem_cache_open(s, n, |
3959 | size, align, flags, ctor)) { | 3961 | size, align, flags, ctor)) { |
3962 | int r; | ||
3963 | |||
3960 | list_add(&s->list, &slab_caches); | 3964 | list_add(&s->list, &slab_caches); |
3961 | up_write(&slub_lock); | 3965 | mutex_unlock(&slab_mutex); |
3962 | if (sysfs_slab_add(s)) { | 3966 | r = sysfs_slab_add(s); |
3963 | down_write(&slub_lock); | 3967 | mutex_lock(&slab_mutex); |
3964 | list_del(&s->list); | 3968 | |
3965 | kfree(n); | 3969 | if (!r) |
3966 | kfree(s); | 3970 | return s; |
3967 | goto err; | 3971 | |
3968 | } | 3972 | list_del(&s->list); |
3969 | return s; | 3973 | kmem_cache_close(s); |
3970 | } | 3974 | } |
3971 | kfree(n); | ||
3972 | kfree(s); | 3975 | kfree(s); |
3973 | } | 3976 | } |
3974 | err: | 3977 | kfree(n); |
3975 | up_write(&slub_lock); | 3978 | return NULL; |
3976 | |||
3977 | if (flags & SLAB_PANIC) | ||
3978 | panic("Cannot create slabcache %s\n", name); | ||
3979 | else | ||
3980 | s = NULL; | ||
3981 | return s; | ||
3982 | } | 3979 | } |
3983 | EXPORT_SYMBOL(kmem_cache_create); | ||
3984 | 3980 | ||
3985 | #ifdef CONFIG_SMP | 3981 | #ifdef CONFIG_SMP |
3986 | /* | 3982 | /* |
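kmem_cache_create() itself moves to the new mm/slab_common.c, and the renamed __kmem_cache_create() appears to run with slab_mutex already held by that wrapper: that is why it can publish the cache on slab_caches, drop the mutex around the possibly sleeping sysfs_slab_add(), retake it, and unwind with list_del() plus kmem_cache_close() when registration fails. A small sketch of that "unlock around a blocking hook, then unwind on failure" shape (registry_mutex, struct thing, publish_hook() and register_thing() are invented userspace names):

/* Sketch: register an object under a mutex, call a blocking/failing hook
 * with the mutex dropped, then either keep or unwind the registration. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t registry_mutex = PTHREAD_MUTEX_INITIALIZER;

struct thing {
        struct thing *next;
        const char *name;
};

static struct thing *registry;

static int publish_hook(struct thing *t)
{
        /* stand-in for sysfs_slab_add(): may sleep, may fail */
        printf("publishing %s\n", t->name);
        return 0;
}

/* Caller is expected to hold registry_mutex, as __kmem_cache_create's
 * unlock/relock pairing implies. */
static struct thing *register_thing(struct thing *t)
{
        int err;

        t->next = registry;
        registry = t;

        pthread_mutex_unlock(&registry_mutex);
        err = publish_hook(t);
        pthread_mutex_lock(&registry_mutex);

        if (!err)
                return t;

        /* unwind: unlink t wherever it now sits, since other registrations
         * may have happened while the mutex was dropped */
        for (struct thing **p = &registry; *p; p = &(*p)->next) {
                if (*p == t) {
                        *p = t->next;
                        break;
                }
        }
        return NULL;
}

int main(void)
{
        struct thing t = { NULL, "demo" };

        pthread_mutex_lock(&registry_mutex);
        register_thing(&t);
        pthread_mutex_unlock(&registry_mutex);
        return 0;
}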
@@ -3999,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
3999 | case CPU_UP_CANCELED_FROZEN: | 3995 | case CPU_UP_CANCELED_FROZEN: |
4000 | case CPU_DEAD: | 3996 | case CPU_DEAD: |
4001 | case CPU_DEAD_FROZEN: | 3997 | case CPU_DEAD_FROZEN: |
4002 | down_read(&slub_lock); | 3998 | mutex_lock(&slab_mutex); |
4003 | list_for_each_entry(s, &slab_caches, list) { | 3999 | list_for_each_entry(s, &slab_caches, list) { |
4004 | local_irq_save(flags); | 4000 | local_irq_save(flags); |
4005 | __flush_cpu_slab(s, cpu); | 4001 | __flush_cpu_slab(s, cpu); |
4006 | local_irq_restore(flags); | 4002 | local_irq_restore(flags); |
4007 | } | 4003 | } |
4008 | up_read(&slub_lock); | 4004 | mutex_unlock(&slab_mutex); |
4009 | break; | 4005 | break; |
4010 | default: | 4006 | default: |
4011 | break; | 4007 | break; |
@@ -4497,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4497 | 4493 | ||
4498 | for_each_possible_cpu(cpu) { | 4494 | for_each_possible_cpu(cpu) { |
4499 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 4495 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
4500 | int node = ACCESS_ONCE(c->node); | 4496 | int node; |
4501 | struct page *page; | 4497 | struct page *page; |
4502 | 4498 | ||
4503 | if (node < 0) | ||
4504 | continue; | ||
4505 | page = ACCESS_ONCE(c->page); | 4499 | page = ACCESS_ONCE(c->page); |
4506 | if (page) { | 4500 | if (!page) |
4507 | if (flags & SO_TOTAL) | 4501 | continue; |
4508 | x = page->objects; | ||
4509 | else if (flags & SO_OBJECTS) | ||
4510 | x = page->inuse; | ||
4511 | else | ||
4512 | x = 1; | ||
4513 | 4502 | ||
4514 | total += x; | 4503 | node = page_to_nid(page); |
4515 | nodes[node] += x; | 4504 | if (flags & SO_TOTAL) |
4516 | } | 4505 | x = page->objects; |
4517 | page = c->partial; | 4506 | else if (flags & SO_OBJECTS) |
4507 | x = page->inuse; | ||
4508 | else | ||
4509 | x = 1; | ||
4518 | 4510 | ||
4511 | total += x; | ||
4512 | nodes[node] += x; | ||
4513 | |||
4514 | page = ACCESS_ONCE(c->partial); | ||
4519 | if (page) { | 4515 | if (page) { |
4520 | x = page->pobjects; | 4516 | x = page->pobjects; |
4521 | total += x; | 4517 | total += x; |
4522 | nodes[node] += x; | 4518 | nodes[node] += x; |
4523 | } | 4519 | } |
4520 | |||
4524 | per_cpu[node]++; | 4521 | per_cpu[node]++; |
4525 | } | 4522 | } |
4526 | } | 4523 | } |
@@ -4620,7 +4617,7 @@ SLAB_ATTR_RO(align); | |||
4620 | 4617 | ||
4621 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) | 4618 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) |
4622 | { | 4619 | { |
4623 | return sprintf(buf, "%d\n", s->objsize); | 4620 | return sprintf(buf, "%d\n", s->object_size); |
4624 | } | 4621 | } |
4625 | SLAB_ATTR_RO(object_size); | 4622 | SLAB_ATTR_RO(object_size); |
4626 | 4623 | ||
@@ -5283,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5283 | const char *name; | 5280 | const char *name; |
5284 | int unmergeable; | 5281 | int unmergeable; |
5285 | 5282 | ||
5286 | if (slab_state < SYSFS) | 5283 | if (slab_state < FULL) |
5287 | /* Defer until later */ | 5284 | /* Defer until later */ |
5288 | return 0; | 5285 | return 0; |
5289 | 5286 | ||
@@ -5328,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5328 | 5325 | ||
5329 | static void sysfs_slab_remove(struct kmem_cache *s) | 5326 | static void sysfs_slab_remove(struct kmem_cache *s) |
5330 | { | 5327 | { |
5331 | if (slab_state < SYSFS) | 5328 | if (slab_state < FULL) |
5332 | /* | 5329 | /* |
5333 | * Sysfs has not been setup yet so no need to remove the | 5330 | * Sysfs has not been setup yet so no need to remove the |
5334 | * cache from sysfs. | 5331 | * cache from sysfs. |
@@ -5356,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
5356 | { | 5353 | { |
5357 | struct saved_alias *al; | 5354 | struct saved_alias *al; |
5358 | 5355 | ||
5359 | if (slab_state == SYSFS) { | 5356 | if (slab_state == FULL) { |
5360 | /* | 5357 | /* |
5361 | * If we have a leftover link then remove it. | 5358 | * If we have a leftover link then remove it. |
5362 | */ | 5359 | */ |
@@ -5380,16 +5377,16 @@ static int __init slab_sysfs_init(void) | |||
5380 | struct kmem_cache *s; | 5377 | struct kmem_cache *s; |
5381 | int err; | 5378 | int err; |
5382 | 5379 | ||
5383 | down_write(&slub_lock); | 5380 | mutex_lock(&slab_mutex); |
5384 | 5381 | ||
5385 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5382 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5386 | if (!slab_kset) { | 5383 | if (!slab_kset) { |
5387 | up_write(&slub_lock); | 5384 | mutex_unlock(&slab_mutex); |
5388 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5385 | printk(KERN_ERR "Cannot register slab subsystem.\n"); |
5389 | return -ENOSYS; | 5386 | return -ENOSYS; |
5390 | } | 5387 | } |
5391 | 5388 | ||
5392 | slab_state = SYSFS; | 5389 | slab_state = FULL; |
5393 | 5390 | ||
5394 | list_for_each_entry(s, &slab_caches, list) { | 5391 | list_for_each_entry(s, &slab_caches, list) { |
5395 | err = sysfs_slab_add(s); | 5392 | err = sysfs_slab_add(s); |
@@ -5405,11 +5402,11 @@ static int __init slab_sysfs_init(void) | |||
5405 | err = sysfs_slab_alias(al->s, al->name); | 5402 | err = sysfs_slab_alias(al->s, al->name); |
5406 | if (err) | 5403 | if (err) |
5407 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5404 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" |
5408 | " %s to sysfs\n", s->name); | 5405 | " %s to sysfs\n", al->name); |
5409 | kfree(al); | 5406 | kfree(al); |
5410 | } | 5407 | } |
5411 | 5408 | ||
5412 | up_write(&slub_lock); | 5409 | mutex_unlock(&slab_mutex); |
5413 | resiliency_test(); | 5410 | resiliency_test(); |
5414 | return 0; | 5411 | return 0; |
5415 | } | 5412 | } |
@@ -5424,7 +5421,7 @@ __initcall(slab_sysfs_init); | |||
5424 | static void print_slabinfo_header(struct seq_file *m) | 5421 | static void print_slabinfo_header(struct seq_file *m) |
5425 | { | 5422 | { |
5426 | seq_puts(m, "slabinfo - version: 2.1\n"); | 5423 | seq_puts(m, "slabinfo - version: 2.1\n"); |
5427 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | 5424 | seq_puts(m, "# name <active_objs> <num_objs> <object_size> " |
5428 | "<objperslab> <pagesperslab>"); | 5425 | "<objperslab> <pagesperslab>"); |
5429 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | 5426 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); |
5430 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | 5427 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); |
@@ -5435,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
5435 | { | 5432 | { |
5436 | loff_t n = *pos; | 5433 | loff_t n = *pos; |
5437 | 5434 | ||
5438 | down_read(&slub_lock); | 5435 | mutex_lock(&slab_mutex); |
5439 | if (!n) | 5436 | if (!n) |
5440 | print_slabinfo_header(m); | 5437 | print_slabinfo_header(m); |
5441 | 5438 | ||
@@ -5449,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
5449 | 5446 | ||
5450 | static void s_stop(struct seq_file *m, void *p) | 5447 | static void s_stop(struct seq_file *m, void *p) |
5451 | { | 5448 | { |
5452 | up_read(&slub_lock); | 5449 | mutex_unlock(&slab_mutex); |
5453 | } | 5450 | } |
5454 | 5451 | ||
5455 | static int s_show(struct seq_file *m, void *p) | 5452 | static int s_show(struct seq_file *m, void *p) |
diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
65 | 65 | ||
66 | if (slab_is_available()) { | 66 | if (slab_is_available()) { |
67 | if (node_state(nid, N_HIGH_MEMORY)) | 67 | if (node_state(nid, N_HIGH_MEMORY)) |
68 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 68 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
69 | else | 69 | else |
70 | section = kmalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
73 | 73 | } | |
74 | if (section) | ||
75 | memset(section, 0, array_size); | ||
76 | 74 | ||
77 | return section; | 75 | return section; |
78 | } | 76 | } |
79 | 77 | ||
80 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) | 78 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
81 | { | 79 | { |
82 | static DEFINE_SPINLOCK(index_init_lock); | ||
83 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
84 | struct mem_section *section; | 81 | struct mem_section *section; |
85 | int ret = 0; | 82 | int ret = 0; |
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | section = sparse_index_alloc(nid); | 87 | section = sparse_index_alloc(nid); |
91 | if (!section) | 88 | if (!section) |
92 | return -ENOMEM; | 89 | return -ENOMEM; |
93 | /* | ||
94 | * This lock keeps two different sections from | ||
95 | * reallocating for the same index | ||
96 | */ | ||
97 | spin_lock(&index_init_lock); | ||
98 | |||
99 | if (mem_section[root]) { | ||
100 | ret = -EEXIST; | ||
101 | goto out; | ||
102 | } | ||
103 | 90 | ||
104 | mem_section[root] = section; | 91 | mem_section[root] = section; |
105 | out: | 92 | |
106 | spin_unlock(&index_init_lock); | ||
107 | return ret; | 93 | return ret; |
108 | } | 94 | } |
109 | #else /* !SPARSEMEM_EXTREME */ | 95 | #else /* !SPARSEMEM_EXTREME */ |
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms) | |||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
121 | VM_BUG_ON(root_nr == NR_SECTION_ROOTS); | ||
122 | |||
135 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 123 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
136 | } | 124 | } |
137 | 125 | ||
@@ -273,10 +261,11 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
273 | #ifdef CONFIG_MEMORY_HOTREMOVE | 261 | #ifdef CONFIG_MEMORY_HOTREMOVE |
274 | static unsigned long * __init | 262 | static unsigned long * __init |
275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 263 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long count) | 264 | unsigned long size) |
277 | { | 265 | { |
278 | unsigned long section_nr; | 266 | unsigned long goal, limit; |
279 | 267 | unsigned long *p; | |
268 | int nid; | ||
280 | /* | 269 | /* |
281 | * A page may contain usemaps for other sections preventing the | 270 | * A page may contain usemaps for other sections preventing the |
282 | * page being freed and making a section unremovable while | 271 | * page being freed and making a section unremovable while |
@@ -287,8 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
287 | * from the same section as the pgdat where possible to avoid | 276 | * from the same section as the pgdat where possible to avoid |
288 | * this problem. | 277 | * this problem. |
289 | */ | 278 | */ |
290 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 279 | goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); |
291 | return alloc_bootmem_section(usemap_size() * count, section_nr); | 280 | limit = goal + (1UL << PA_SECTION_SHIFT); |
281 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | ||
282 | again: | ||
283 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
284 | SMP_CACHE_BYTES, goal, limit); | ||
285 | if (!p && limit) { | ||
286 | limit = 0; | ||
287 | goto again; | ||
288 | } | ||
289 | return p; | ||
292 | } | 290 | } |
293 | 291 | ||
294 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
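The usemap allocation now calls ___alloc_bootmem_node_nopanic() with a goal/limit window covering the section that holds the pgdat and, if that window is exhausted, retries once with limit set to 0, i.e. anywhere on the node. The "try the preferred range, then relax the constraint" retry looks like this in a generic userspace sketch (alloc_in_range() and alloc_usemap() are invented stand-ins, and the constrained attempt is forced to fail for the demonstration):

/* Sketch of a constrained allocation with a single relaxation step:
 * first try inside a preferred window, then retry with no upper limit. */
#include <stdint.h>
#include <stdio.h>

/* stand-in for ___alloc_bootmem_node_nopanic(): pretend the preferred
 * window (goal..limit) is already exhausted, so only the unconstrained
 * retry succeeds */
static void *alloc_in_range(size_t size, uintptr_t goal, uintptr_t limit)
{
        static char pool[4096];

        (void)goal;
        if (limit)              /* constrained attempt: nothing left in-window */
                return NULL;
        return size <= sizeof(pool) ? pool : NULL;
}

static void *alloc_usemap(size_t size, uintptr_t goal, uintptr_t limit)
{
        void *p;
again:
        p = alloc_in_range(size, goal, limit);
        if (!p && limit) {
                limit = 0;      /* relax: anywhere on the node beats failing */
                goto again;
        }
        return p;
}

int main(void)
{
        void *p = alloc_usemap(128, 0x100000, 0x200000);

        printf("usemap at %p\n", p);
        return 0;
}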
@@ -332,9 +330,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
332 | #else | 330 | #else |
333 | static unsigned long * __init | 331 | static unsigned long * __init |
334 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
335 | unsigned long count) | 333 | unsigned long size) |
336 | { | 334 | { |
337 | return NULL; | 335 | return alloc_bootmem_node_nopanic(pgdat, size); |
338 | } | 336 | } |
339 | 337 | ||
340 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -352,13 +350,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
352 | int size = usemap_size(); | 350 | int size = usemap_size(); |
353 | 351 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 352 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 353 | size * usemap_count); |
356 | if (!usemap) { | 354 | if (!usemap) { |
357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 355 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
358 | if (!usemap) { | 356 | return; |
359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
360 | return; | ||
361 | } | ||
362 | } | 357 | } |
363 | 358 | ||
364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 359 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
@@ -486,6 +481,9 @@ void __init sparse_init(void) | |||
486 | struct page **map_map; | 481 | struct page **map_map; |
487 | #endif | 482 | #endif |
488 | 483 | ||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | ||
485 | set_pageblock_order(); | ||
486 | |||
489 | /* | 487 | /* |
490 | * map is using big page (aka 2M in x86 64 bit) | 488 | * map is using big page (aka 2M in x86 64 bit) |
491 | * usemap is less one page (aka 24 bytes) | 489 | * usemap is less one page (aka 24 bytes) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | |||
47 | static void __page_cache_release(struct page *page) | 47 | static void __page_cache_release(struct page *page) |
48 | { | 48 | { |
49 | if (PageLRU(page)) { | 49 | if (PageLRU(page)) { |
50 | unsigned long flags; | ||
51 | struct zone *zone = page_zone(page); | 50 | struct zone *zone = page_zone(page); |
51 | struct lruvec *lruvec; | ||
52 | unsigned long flags; | ||
52 | 53 | ||
53 | spin_lock_irqsave(&zone->lru_lock, flags); | 54 | spin_lock_irqsave(&zone->lru_lock, flags); |
55 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
54 | VM_BUG_ON(!PageLRU(page)); | 56 | VM_BUG_ON(!PageLRU(page)); |
55 | __ClearPageLRU(page); | 57 | __ClearPageLRU(page); |
56 | del_page_from_lru_list(zone, page, page_off_lru(page)); | 58 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
57 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 59 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
58 | } | 60 | } |
59 | } | 61 | } |
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page) | |||
82 | if (likely(page != page_head && | 84 | if (likely(page != page_head && |
83 | get_page_unless_zero(page_head))) { | 85 | get_page_unless_zero(page_head))) { |
84 | unsigned long flags; | 86 | unsigned long flags; |
87 | |||
88 | /* | ||
89 | * THP can not break up slab pages so avoid taking | ||
90 | * compound_lock(). Slab performs non-atomic bit ops | ||
91 | * on page->flags for better performance. In particular | ||
92 | * slab_unlock() in slub used to be a hot path. It is | ||
93 | * still hot on arches that do not support | ||
94 | * this_cpu_cmpxchg_double(). | ||
95 | */ | ||
96 | if (PageSlab(page_head)) { | ||
97 | if (PageTail(page)) { | ||
98 | if (put_page_testzero(page_head)) | ||
99 | VM_BUG_ON(1); | ||
100 | |||
101 | atomic_dec(&page->_mapcount); | ||
102 | goto skip_lock_tail; | ||
103 | } else | ||
104 | goto skip_lock; | ||
105 | } | ||
85 | /* | 106 | /* |
86 | * page_head wasn't a dangling pointer but it | 107 | * page_head wasn't a dangling pointer but it |
87 | * may not be a head page anymore by the time | 108 | * may not be a head page anymore by the time |
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page) | |||
92 | if (unlikely(!PageTail(page))) { | 113 | if (unlikely(!PageTail(page))) { |
93 | /* __split_huge_page_refcount run before us */ | 114 | /* __split_huge_page_refcount run before us */ |
94 | compound_unlock_irqrestore(page_head, flags); | 115 | compound_unlock_irqrestore(page_head, flags); |
95 | VM_BUG_ON(PageHead(page_head)); | 116 | skip_lock: |
96 | if (put_page_testzero(page_head)) | 117 | if (put_page_testzero(page_head)) |
97 | __put_single_page(page_head); | 118 | __put_single_page(page_head); |
98 | out_put_single: | 119 | out_put_single: |
99 | if (put_page_testzero(page)) | 120 | if (put_page_testzero(page)) |
100 | __put_single_page(page); | 121 | __put_single_page(page); |
101 | return; | 122 | return; |
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page) | |||
115 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 136 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
116 | VM_BUG_ON(atomic_read(&page->_count) != 0); | 137 | VM_BUG_ON(atomic_read(&page->_count) != 0); |
117 | compound_unlock_irqrestore(page_head, flags); | 138 | compound_unlock_irqrestore(page_head, flags); |
139 | |||
140 | skip_lock_tail: | ||
118 | if (put_page_testzero(page_head)) { | 141 | if (put_page_testzero(page_head)) { |
119 | if (PageHead(page_head)) | 142 | if (PageHead(page_head)) |
120 | __put_compound_page(page_head); | 143 | __put_compound_page(page_head); |
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page) | |||
162 | struct page *page_head = compound_trans_head(page); | 185 | struct page *page_head = compound_trans_head(page); |
163 | 186 | ||
164 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 187 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
188 | |||
189 | /* Ref to put_compound_page() comment. */ | ||
190 | if (PageSlab(page_head)) { | ||
191 | if (likely(PageTail(page))) { | ||
192 | __get_page_tail_foll(page, false); | ||
193 | return true; | ||
194 | } else { | ||
195 | put_page(page_head); | ||
196 | return false; | ||
197 | } | ||
198 | } | ||
199 | |||
165 | /* | 200 | /* |
166 | * page_head wasn't a dangling pointer but it | 201 | * page_head wasn't a dangling pointer but it |
167 | * may not be a head page anymore by the time | 202 | * may not be a head page anymore by the time |
@@ -201,12 +236,65 @@ void put_pages_list(struct list_head *pages) | |||
201 | } | 236 | } |
202 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
203 | 238 | ||
239 | /* | ||
240 | * get_kernel_pages() - pin kernel pages in memory | ||
241 | * @kiov: An array of struct kvec structures | ||
242 | * @nr_segs: number of segments to pin | ||
243 | * @write: pinning for read/write, currently ignored | ||
244 | * @pages: array that receives pointers to the pages pinned. | ||
245 | * Should be at least nr_segs long. | ||
246 | * | ||
247 | * Returns number of pages pinned. This may be fewer than the number | ||
248 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
249 | * were pinned, returns -errno. Each page returned must be released | ||
250 | * with a put_page() call when it is finished with. | ||
251 | */ | ||
252 | int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, | ||
253 | struct page **pages) | ||
254 | { | ||
255 | int seg; | ||
256 | |||
257 | for (seg = 0; seg < nr_segs; seg++) { | ||
258 | if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) | ||
259 | return seg; | ||
260 | |||
261 | pages[seg] = kmap_to_page(kiov[seg].iov_base); | ||
262 | page_cache_get(pages[seg]); | ||
263 | } | ||
264 | |||
265 | return seg; | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(get_kernel_pages); | ||
268 | |||
269 | /* | ||
270 | * get_kernel_page() - pin a kernel page in memory | ||
271 | * @start: starting kernel address | ||
272 | * @write: pinning for read/write, currently ignored | ||
273 | * @pages: array that receives pointer to the page pinned. | ||
274 | * Must be at least nr_segs long. | ||
275 | * | ||
276 | * Returns 1 if page is pinned. If the page was not pinned, returns | ||
277 | * -errno. The page returned must be released with a put_page() call | ||
278 | * when it is finished with. | ||
279 | */ | ||
280 | int get_kernel_page(unsigned long start, int write, struct page **pages) | ||
281 | { | ||
282 | const struct kvec kiov = { | ||
283 | .iov_base = (void *)start, | ||
284 | .iov_len = PAGE_SIZE | ||
285 | }; | ||
286 | |||
287 | return get_kernel_pages(&kiov, 1, write, pages); | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(get_kernel_page); | ||
290 | |||
204 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 291 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
205 | void (*move_fn)(struct page *page, void *arg), | 292 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
206 | void *arg) | 293 | void *arg) |
207 | { | 294 | { |
208 | int i; | 295 | int i; |
209 | struct zone *zone = NULL; | 296 | struct zone *zone = NULL; |
297 | struct lruvec *lruvec; | ||
210 | unsigned long flags = 0; | 298 | unsigned long flags = 0; |
211 | 299 | ||
212 | for (i = 0; i < pagevec_count(pvec); i++) { | 300 | for (i = 0; i < pagevec_count(pvec); i++) { |
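The new get_kernel_pages()/get_kernel_page() helpers pin the pages backing kernel virtual addresses so they can be handed to code that wants struct page arrays, apparently in support of the swap-over-network I/O work visible elsewhere in this diffstat (mm/page_io.c). A hedged, kernel-context sketch of how a caller might use the single-page variant on a kmalloc'd buffer; this is illustrative and not part of the patch, and it assumes the declarations are reachable via linux/mm.h and that a PAGE_SIZE kmalloc() buffer is page-aligned:

/* Illustrative kernel-side usage of the helpers added above: pin the page
 * backing a kernel buffer, use it, then drop the reference.  Assumes a
 * page-sized, page-aligned buffer; error handling trimmed. */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>

static int pin_kernel_buffer_demo(void)
{
        void *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
        struct page *page;
        int pinned;

        if (!buf)
                return -ENOMEM;

        pinned = get_kernel_page((unsigned long)buf, 0, &page);
        if (pinned == 1) {
                /* page now holds an extra reference; hand it to code that
                 * wants struct page + offset rather than a virtual address */
                put_page(page);
        }

        kfree(buf);
        return pinned == 1 ? 0 : -EFAULT;
}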
@@ -220,7 +308,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
220 | spin_lock_irqsave(&zone->lru_lock, flags); | 308 | spin_lock_irqsave(&zone->lru_lock, flags); |
221 | } | 309 | } |
222 | 310 | ||
223 | (*move_fn)(page, arg); | 311 | lruvec = mem_cgroup_page_lruvec(page, zone); |
312 | (*move_fn)(page, lruvec, arg); | ||
224 | } | 313 | } |
225 | if (zone) | 314 | if (zone) |
226 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 315 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -228,16 +317,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
228 | pagevec_reinit(pvec); | 317 | pagevec_reinit(pvec); |
229 | } | 318 | } |
230 | 319 | ||
231 | static void pagevec_move_tail_fn(struct page *page, void *arg) | 320 | static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, |
321 | void *arg) | ||
232 | { | 322 | { |
233 | int *pgmoved = arg; | 323 | int *pgmoved = arg; |
234 | 324 | ||
235 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 325 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
236 | enum lru_list lru = page_lru_base_type(page); | 326 | enum lru_list lru = page_lru_base_type(page); |
237 | struct lruvec *lruvec; | ||
238 | |||
239 | lruvec = mem_cgroup_lru_move_lists(page_zone(page), | ||
240 | page, lru, lru); | ||
241 | list_move_tail(&page->lru, &lruvec->lists[lru]); | 327 | list_move_tail(&page->lru, &lruvec->lists[lru]); |
242 | (*pgmoved)++; | 328 | (*pgmoved)++; |
243 | } | 329 | } |
@@ -276,41 +362,30 @@ void rotate_reclaimable_page(struct page *page) | |||
276 | } | 362 | } |
277 | } | 363 | } |
278 | 364 | ||
279 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, | 365 | static void update_page_reclaim_stat(struct lruvec *lruvec, |
280 | int file, int rotated) | 366 | int file, int rotated) |
281 | { | 367 | { |
282 | struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; | 368 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
283 | struct zone_reclaim_stat *memcg_reclaim_stat; | ||
284 | |||
285 | memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); | ||
286 | 369 | ||
287 | reclaim_stat->recent_scanned[file]++; | 370 | reclaim_stat->recent_scanned[file]++; |
288 | if (rotated) | 371 | if (rotated) |
289 | reclaim_stat->recent_rotated[file]++; | 372 | reclaim_stat->recent_rotated[file]++; |
290 | |||
291 | if (!memcg_reclaim_stat) | ||
292 | return; | ||
293 | |||
294 | memcg_reclaim_stat->recent_scanned[file]++; | ||
295 | if (rotated) | ||
296 | memcg_reclaim_stat->recent_rotated[file]++; | ||
297 | } | 373 | } |
298 | 374 | ||
299 | static void __activate_page(struct page *page, void *arg) | 375 | static void __activate_page(struct page *page, struct lruvec *lruvec, |
376 | void *arg) | ||
300 | { | 377 | { |
301 | struct zone *zone = page_zone(page); | ||
302 | |||
303 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 378 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
304 | int file = page_is_file_cache(page); | 379 | int file = page_is_file_cache(page); |
305 | int lru = page_lru_base_type(page); | 380 | int lru = page_lru_base_type(page); |
306 | del_page_from_lru_list(zone, page, lru); | ||
307 | 381 | ||
382 | del_page_from_lru_list(page, lruvec, lru); | ||
308 | SetPageActive(page); | 383 | SetPageActive(page); |
309 | lru += LRU_ACTIVE; | 384 | lru += LRU_ACTIVE; |
310 | add_page_to_lru_list(zone, page, lru); | 385 | add_page_to_lru_list(page, lruvec, lru); |
311 | __count_vm_event(PGACTIVATE); | ||
312 | 386 | ||
313 | update_page_reclaim_stat(zone, page, file, 1); | 387 | __count_vm_event(PGACTIVATE); |
388 | update_page_reclaim_stat(lruvec, file, 1); | ||
314 | } | 389 | } |
315 | } | 390 | } |
316 | 391 | ||
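With the lruvec passed down from pagevec_lru_move_fn(), __activate_page() reduces to list surgery plus one stat update on that lruvec: unlink from the inactive list, set PG_active, relink on the active list, and bump PGACTIVATE and the recent_scanned/recent_rotated counters. A userspace sketch of those mechanics (struct entry, struct lists and the helpers are invented, locking and page flags are ignored, and the counters only roughly mirror update_page_reclaim_stat()):

/* Sketch of "activate": move an entry from the inactive to the active
 * list of its lruvec-like container and update the reclaim counters. */
#include <stdio.h>

struct entry {
        struct entry *prev, *next;
        int active;
};

struct lists {
        struct entry inactive;          /* circular list heads */
        struct entry active;
        unsigned long recent_scanned;
        unsigned long recent_rotated;
};

static void list_del_entry(struct entry *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
}

static void list_add_entry(struct entry *head, struct entry *e)
{
        e->next = head->next;
        e->prev = head;
        head->next->prev = e;
        head->next = e;
}

static void activate(struct lists *l, struct entry *e)
{
        list_del_entry(e);              /* like del_page_from_lru_list() */
        e->active = 1;                  /* like SetPageActive() */
        list_add_entry(&l->active, e);  /* like add_page_to_lru_list() */
        l->recent_scanned++;            /* like update_page_reclaim_stat(.., 1) */
        l->recent_rotated++;
}

int main(void)
{
        struct lists l = {
                .inactive = { &l.inactive, &l.inactive },
                .active   = { &l.active,   &l.active   },
        };
        struct entry e = { 0 };

        list_add_entry(&l.inactive, &e);
        activate(&l, &e);
        printf("entry active=%d, rotated=%lu\n", e.active, l.recent_rotated);
        return 0;
}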
@@ -347,7 +422,7 @@ void activate_page(struct page *page) | |||
347 | struct zone *zone = page_zone(page); | 422 | struct zone *zone = page_zone(page); |
348 | 423 | ||
349 | spin_lock_irq(&zone->lru_lock); | 424 | spin_lock_irq(&zone->lru_lock); |
350 | __activate_page(page, NULL); | 425 | __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); |
351 | spin_unlock_irq(&zone->lru_lock); | 426 | spin_unlock_irq(&zone->lru_lock); |
352 | } | 427 | } |
353 | #endif | 428 | #endif |
@@ -414,11 +489,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) | |||
414 | void add_page_to_unevictable_list(struct page *page) | 489 | void add_page_to_unevictable_list(struct page *page) |
415 | { | 490 | { |
416 | struct zone *zone = page_zone(page); | 491 | struct zone *zone = page_zone(page); |
492 | struct lruvec *lruvec; | ||
417 | 493 | ||
418 | spin_lock_irq(&zone->lru_lock); | 494 | spin_lock_irq(&zone->lru_lock); |
495 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
419 | SetPageUnevictable(page); | 496 | SetPageUnevictable(page); |
420 | SetPageLRU(page); | 497 | SetPageLRU(page); |
421 | add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); | 498 | add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); |
422 | spin_unlock_irq(&zone->lru_lock); | 499 | spin_unlock_irq(&zone->lru_lock); |
423 | } | 500 | } |
424 | 501 | ||
@@ -443,11 +520,11 @@ void add_page_to_unevictable_list(struct page *page) | |||
443 | * be written out by flusher threads as this is much more effective | 520 | * be written out by flusher threads as this is much more effective |
444 | * than the single-page writeout from reclaim. | 521 | * than the single-page writeout from reclaim. |
445 | */ | 522 | */ |
446 | static void lru_deactivate_fn(struct page *page, void *arg) | 523 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, |
524 | void *arg) | ||
447 | { | 525 | { |
448 | int lru, file; | 526 | int lru, file; |
449 | bool active; | 527 | bool active; |
450 | struct zone *zone = page_zone(page); | ||
451 | 528 | ||
452 | if (!PageLRU(page)) | 529 | if (!PageLRU(page)) |
453 | return; | 530 | return; |
@@ -460,13 +537,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
460 | return; | 537 | return; |
461 | 538 | ||
462 | active = PageActive(page); | 539 | active = PageActive(page); |
463 | |||
464 | file = page_is_file_cache(page); | 540 | file = page_is_file_cache(page); |
465 | lru = page_lru_base_type(page); | 541 | lru = page_lru_base_type(page); |
466 | del_page_from_lru_list(zone, page, lru + active); | 542 | |
543 | del_page_from_lru_list(page, lruvec, lru + active); | ||
467 | ClearPageActive(page); | 544 | ClearPageActive(page); |
468 | ClearPageReferenced(page); | 545 | ClearPageReferenced(page); |
469 | add_page_to_lru_list(zone, page, lru); | 546 | add_page_to_lru_list(page, lruvec, lru); |
470 | 547 | ||
471 | if (PageWriteback(page) || PageDirty(page)) { | 548 | if (PageWriteback(page) || PageDirty(page)) { |
472 | /* | 549 | /* |
@@ -476,19 +553,17 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
476 | */ | 553 | */ |
477 | SetPageReclaim(page); | 554 | SetPageReclaim(page); |
478 | } else { | 555 | } else { |
479 | struct lruvec *lruvec; | ||
480 | /* | 556 | /* |
481 | * The page's writeback ended while it sat in the pagevec. | 557 | * The page's writeback ended while it sat in the pagevec. |
482 | * Move the page to the tail of the inactive list. | 558 | * Move the page to the tail of the inactive list. |
483 | */ | 559 | */ |
484 | lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); | ||
485 | list_move_tail(&page->lru, &lruvec->lists[lru]); | 560 | list_move_tail(&page->lru, &lruvec->lists[lru]); |
486 | __count_vm_event(PGROTATED); | 561 | __count_vm_event(PGROTATED); |
487 | } | 562 | } |
488 | 563 | ||
489 | if (active) | 564 | if (active) |
490 | __count_vm_event(PGDEACTIVATE); | 565 | __count_vm_event(PGDEACTIVATE); |
491 | update_page_reclaim_stat(zone, page, file, 0); | 566 | update_page_reclaim_stat(lruvec, file, 0); |
492 | } | 567 | } |
493 | 568 | ||
494 | /* | 569 | /* |
@@ -588,6 +663,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
588 | int i; | 663 | int i; |
589 | LIST_HEAD(pages_to_free); | 664 | LIST_HEAD(pages_to_free); |
590 | struct zone *zone = NULL; | 665 | struct zone *zone = NULL; |
666 | struct lruvec *lruvec; | ||
591 | unsigned long uninitialized_var(flags); | 667 | unsigned long uninitialized_var(flags); |
592 | 668 | ||
593 | for (i = 0; i < nr; i++) { | 669 | for (i = 0; i < nr; i++) { |
@@ -615,9 +691,11 @@ void release_pages(struct page **pages, int nr, int cold) | |||
615 | zone = pagezone; | 691 | zone = pagezone; |
616 | spin_lock_irqsave(&zone->lru_lock, flags); | 692 | spin_lock_irqsave(&zone->lru_lock, flags); |
617 | } | 693 | } |
694 | |||
695 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
618 | VM_BUG_ON(!PageLRU(page)); | 696 | VM_BUG_ON(!PageLRU(page)); |
619 | __ClearPageLRU(page); | 697 | __ClearPageLRU(page); |
620 | del_page_from_lru_list(zone, page, page_off_lru(page)); | 698 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
621 | } | 699 | } |
622 | 700 | ||
623 | list_add(&page->lru, &pages_to_free); | 701 | list_add(&page->lru, &pages_to_free); |
@@ -649,8 +727,8 @@ EXPORT_SYMBOL(__pagevec_release); | |||
649 | 727 | ||
650 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 728 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
651 | /* used by __split_huge_page_refcount() */ | 729 | /* used by __split_huge_page_refcount() */ |
652 | void lru_add_page_tail(struct zone* zone, | 730 | void lru_add_page_tail(struct page *page, struct page *page_tail, |
653 | struct page *page, struct page *page_tail) | 731 | struct lruvec *lruvec) |
654 | { | 732 | { |
655 | int uninitialized_var(active); | 733 | int uninitialized_var(active); |
656 | enum lru_list lru; | 734 | enum lru_list lru; |
@@ -659,7 +737,8 @@ void lru_add_page_tail(struct zone* zone, | |||
659 | VM_BUG_ON(!PageHead(page)); | 737 | VM_BUG_ON(!PageHead(page)); |
660 | VM_BUG_ON(PageCompound(page_tail)); | 738 | VM_BUG_ON(PageCompound(page_tail)); |
661 | VM_BUG_ON(PageLRU(page_tail)); | 739 | VM_BUG_ON(PageLRU(page_tail)); |
662 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); | 740 | VM_BUG_ON(NR_CPUS != 1 && |
741 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | ||
663 | 742 | ||
664 | SetPageLRU(page_tail); | 743 | SetPageLRU(page_tail); |
665 | 744 | ||
@@ -688,20 +767,20 @@ void lru_add_page_tail(struct zone* zone, | |||
688 | * Use the standard add function to put page_tail on the list, | 767 | * Use the standard add function to put page_tail on the list, |
689 | * but then correct its position so they all end up in order. | 768 | * but then correct its position so they all end up in order. |
690 | */ | 769 | */ |
691 | add_page_to_lru_list(zone, page_tail, lru); | 770 | add_page_to_lru_list(page_tail, lruvec, lru); |
692 | list_head = page_tail->lru.prev; | 771 | list_head = page_tail->lru.prev; |
693 | list_move_tail(&page_tail->lru, list_head); | 772 | list_move_tail(&page_tail->lru, list_head); |
694 | } | 773 | } |
695 | 774 | ||
696 | if (!PageUnevictable(page)) | 775 | if (!PageUnevictable(page)) |
697 | update_page_reclaim_stat(zone, page_tail, file, active); | 776 | update_page_reclaim_stat(lruvec, file, active); |
698 | } | 777 | } |
699 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 778 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
700 | 779 | ||
701 | static void __pagevec_lru_add_fn(struct page *page, void *arg) | 780 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, |
781 | void *arg) | ||
702 | { | 782 | { |
703 | enum lru_list lru = (enum lru_list)arg; | 783 | enum lru_list lru = (enum lru_list)arg; |
704 | struct zone *zone = page_zone(page); | ||
705 | int file = is_file_lru(lru); | 784 | int file = is_file_lru(lru); |
706 | int active = is_active_lru(lru); | 785 | int active = is_active_lru(lru); |
707 | 786 | ||
@@ -712,8 +791,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) | |||
712 | SetPageLRU(page); | 791 | SetPageLRU(page); |
713 | if (active) | 792 | if (active) |
714 | SetPageActive(page); | 793 | SetPageActive(page); |
715 | add_page_to_lru_list(zone, page, lru); | 794 | add_page_to_lru_list(page, lruvec, lru); |
716 | update_page_reclaim_stat(zone, page, file, active); | 795 | update_page_reclaim_stat(lruvec, file, active); |
717 | } | 796 | } |
718 | 797 | ||
719 | /* | 798 | /* |
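Every mm/swap.c hunk above follows the same conversion: the per-page helpers (pagevec_move_tail_fn, __activate_page, lru_deactivate_fn, lru_add_page_tail, __pagevec_lru_add_fn) now take a struct lruvec * that the caller resolves once with mem_cgroup_page_lruvec() under zone->lru_lock, instead of each callback re-deriving it from page_zone() and mem_cgroup_lru_move_lists(). A minimal userspace sketch of that calling convention, with illustrative stand-in names rather than kernel APIs:

/* Model of the callback change above: the walker resolves the per-item
 * context once and hands it to the callback, instead of every callback
 * re-deriving it.  All names are illustrative stand-ins. */
#include <stdio.h>

struct item    { int id; int ctx_id; };
struct context { int id; };

static struct context contexts[2] = { { 0 }, { 1 } };

/* stand-in for mem_cgroup_page_lruvec(): derive the context from the item */
static struct context *lookup_context(struct item *it)
{
	return &contexts[it->ctx_id];
}

/* new-style callback: context arrives already resolved */
static void move_fn(struct item *it, struct context *ctx, void *arg)
{
	int *moved = arg;
	printf("item %d handled in context %d\n", it->id, ctx->id);
	(*moved)++;
}

/* stand-in for pagevec_lru_move_fn(): resolve context once per item */
static void walk(struct item *items, int n,
		 void (*fn)(struct item *, struct context *, void *),
		 void *arg)
{
	for (int i = 0; i < n; i++)
		fn(&items[i], lookup_context(&items[i]), arg);
}

int main(void)
{
	struct item v[3] = { { 1, 0 }, { 2, 1 }, { 3, 0 } };
	int moved = 0;

	walk(v, 3, move_fn, &moved);
	printf("moved %d items\n", moved);
	return 0;
}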
diff --git a/mm/swap_state.c b/mm/swap_state.c index 9d3dd3763cf7..0cb36fb1f61c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | ||
17 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
19 | #include <linux/page_cgroup.h> | 20 | #include <linux/page_cgroup.h> |
@@ -26,7 +27,7 @@ | |||
26 | */ | 27 | */ |
27 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
29 | .set_page_dirty = __set_page_dirty_nobuffers, | 30 | .set_page_dirty = swap_set_page_dirty, |
30 | .migratepage = migrate_page, | 31 | .migratepage = migrate_page, |
31 | }; | 32 | }; |
32 | 33 | ||
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
376 | unsigned long offset = swp_offset(entry); | 377 | unsigned long offset = swp_offset(entry); |
377 | unsigned long start_offset, end_offset; | 378 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | 379 | unsigned long mask = (1UL << page_cluster) - 1; |
380 | struct blk_plug plug; | ||
379 | 381 | ||
380 | /* Read a page_cluster sized and aligned cluster around offset. */ | 382 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | start_offset = offset & ~mask; | 383 | start_offset = offset & ~mask; |
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
383 | if (!start_offset) /* First page is swap header. */ | 385 | if (!start_offset) /* First page is swap header. */ |
384 | start_offset++; | 386 | start_offset++; |
385 | 387 | ||
388 | blk_start_plug(&plug); | ||
386 | for (offset = start_offset; offset <= end_offset ; offset++) { | 389 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | /* Ok, do the async read-ahead now */ | 390 | /* Ok, do the async read-ahead now */ |
388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 391 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
391 | continue; | 394 | continue; |
392 | page_cache_release(page); | 395 | page_cache_release(page); |
393 | } | 396 | } |
397 | blk_finish_plug(&plug); | ||
398 | |||
394 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 399 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
395 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 400 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
396 | } | 401 | } |
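The blk_start_plug()/blk_finish_plug() pair added above lets the page_cluster worth of read_swap_cache_async() calls queue their I/O and submit it as one batch when the plug is released. A small userspace model of the plug idea, illustrative only and not the block layer API:

/* While a "plug" is active, submissions are queued; finish_plug() flushes
 * them in one batch.  Illustrative only. */
#include <stdio.h>

#define MAX_BATCH 16

struct plug {
	unsigned long queued[MAX_BATCH];
	int nr;
};

static void start_plug(struct plug *p) { p->nr = 0; }

static void submit_read(struct plug *p, unsigned long offset)
{
	if (p && p->nr < MAX_BATCH)
		p->queued[p->nr++] = offset;	/* defer while plugged */
	else
		printf("dispatch read at offset %lu\n", offset);
}

static void finish_plug(struct plug *p)
{
	printf("flushing %d queued reads as one batch\n", p->nr);
	for (int i = 0; i < p->nr; i++)
		printf("  dispatch read at offset %lu\n", p->queued[i]);
	p->nr = 0;
}

int main(void)
{
	struct plug plug;
	unsigned long start = 8, end = 15;	/* a page_cluster-sized window */

	start_plug(&plug);
	for (unsigned long off = start; off <= end; off++)
		submit_read(&plug, off);	/* analogous to read_swap_cache_async() */
	finish_plug(&plug);
	return 0;
}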
diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..14e254c768fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -31,6 +31,9 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | ||
35 | #include <linux/swapfile.h> | ||
36 | #include <linux/export.h> | ||
34 | 37 | ||
35 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
36 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
42 | static void free_swap_count_continuations(struct swap_info_struct *); | 45 | static void free_swap_count_continuations(struct swap_info_struct *); |
43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 46 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
44 | 47 | ||
45 | static DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
46 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
47 | long nr_swap_pages; | 50 | long nr_swap_pages; |
48 | long total_swap_pages; | 51 | long total_swap_pages; |
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
53 | static const char Bad_offset[] = "Bad swap offset entry "; | 56 | static const char Bad_offset[] = "Bad swap offset entry "; |
54 | static const char Unused_offset[] = "Unused swap offset entry "; | 57 | static const char Unused_offset[] = "Unused swap offset entry "; |
55 | 58 | ||
56 | static struct swap_list_t swap_list = {-1, -1}; | 59 | struct swap_list_t swap_list = {-1, -1}; |
57 | 60 | ||
58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 61 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
59 | 62 | ||
60 | static DEFINE_MUTEX(swapon_mutex); | 63 | static DEFINE_MUTEX(swapon_mutex); |
61 | 64 | ||
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
546 | 549 | ||
547 | /* free if no reference */ | 550 | /* free if no reference */ |
548 | if (!usage) { | 551 | if (!usage) { |
549 | struct gendisk *disk = p->bdev->bd_disk; | ||
550 | if (offset < p->lowest_bit) | 552 | if (offset < p->lowest_bit) |
551 | p->lowest_bit = offset; | 553 | p->lowest_bit = offset; |
552 | if (offset > p->highest_bit) | 554 | if (offset > p->highest_bit) |
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
559 | if ((p->flags & SWP_BLKDEV) && | 561 | frontswap_invalidate_page(p->type, offset); |
560 | disk->fops->swap_slot_free_notify) | 562 | if (p->flags & SWP_BLKDEV) { |
561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 563 | struct gendisk *disk = p->bdev->bd_disk; |
564 | if (disk->fops->swap_slot_free_notify) | ||
565 | disk->fops->swap_slot_free_notify(p->bdev, | ||
566 | offset); | ||
567 | } | ||
562 | } | 568 | } |
563 | 569 | ||
564 | return usage; | 570 | return usage; |
@@ -601,7 +607,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
601 | * This does not give an exact answer when swap count is continued, | 607 | * This does not give an exact answer when swap count is continued, |
602 | * but does include the high COUNT_CONTINUED flag to allow for that. | 608 | * but does include the high COUNT_CONTINUED flag to allow for that. |
603 | */ | 609 | */ |
604 | static inline int page_swapcount(struct page *page) | 610 | int page_swapcount(struct page *page) |
605 | { | 611 | { |
606 | int count = 0; | 612 | int count = 0; |
607 | struct swap_info_struct *p; | 613 | struct swap_info_struct *p; |
@@ -717,37 +723,6 @@ int free_swap_and_cache(swp_entry_t entry) | |||
717 | return p != NULL; | 723 | return p != NULL; |
718 | } | 724 | } |
719 | 725 | ||
720 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
721 | /** | ||
722 | * mem_cgroup_count_swap_user - count the user of a swap entry | ||
723 | * @ent: the swap entry to be checked | ||
724 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
725 | * | ||
726 | * Returns the number of the user of the swap entry. The number is valid only | ||
727 | * for swaps of anonymous pages. | ||
728 | * If the entry is found on swap cache, the page is stored to pagep with | ||
729 | * refcount of it being incremented. | ||
730 | */ | ||
731 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
732 | { | ||
733 | struct page *page; | ||
734 | struct swap_info_struct *p; | ||
735 | int count = 0; | ||
736 | |||
737 | page = find_get_page(&swapper_space, ent.val); | ||
738 | if (page) | ||
739 | count += page_mapcount(page); | ||
740 | p = swap_info_get(ent); | ||
741 | if (p) { | ||
742 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
743 | spin_unlock(&swap_lock); | ||
744 | } | ||
745 | |||
746 | *pagep = page; | ||
747 | return count; | ||
748 | } | ||
749 | #endif | ||
750 | |||
751 | #ifdef CONFIG_HIBERNATION | 726 | #ifdef CONFIG_HIBERNATION |
752 | /* | 727 | /* |
753 | * Find the swap type that corresponds to given device (if any). | 728 | * Find the swap type that corresponds to given device (if any). |
@@ -860,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
860 | 835 | ||
861 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 836 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
862 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 837 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
863 | if (ret > 0) | 838 | mem_cgroup_cancel_charge_swapin(memcg); |
864 | mem_cgroup_cancel_charge_swapin(memcg); | ||
865 | ret = 0; | 839 | ret = 0; |
866 | goto out; | 840 | goto out; |
867 | } | 841 | } |
@@ -1016,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
1016 | } | 990 | } |
1017 | 991 | ||
1018 | /* | 992 | /* |
1019 | * Scan swap_map from current position to next entry still in use. | 993 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
994 | * from current position to next entry still in use. | ||
1020 | * Recycle to start on reaching the end, returning 0 when empty. | 995 | * Recycle to start on reaching the end, returning 0 when empty. |
1021 | */ | 996 | */ |
1022 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 997 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
1023 | unsigned int prev) | 998 | unsigned int prev, bool frontswap) |
1024 | { | 999 | { |
1025 | unsigned int max = si->max; | 1000 | unsigned int max = si->max; |
1026 | unsigned int i = prev; | 1001 | unsigned int i = prev; |
@@ -1046,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1046 | prev = 0; | 1021 | prev = 0; |
1047 | i = 1; | 1022 | i = 1; |
1048 | } | 1023 | } |
1024 | if (frontswap) { | ||
1025 | if (frontswap_test(si, i)) | ||
1026 | break; | ||
1027 | else | ||
1028 | continue; | ||
1029 | } | ||
1049 | count = si->swap_map[i]; | 1030 | count = si->swap_map[i]; |
1050 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1031 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1051 | break; | 1032 | break; |
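find_next_to_unuse() above keeps its circular walk (advance from prev, recycle to slot 1 past the end, return 0 when nothing is left) and gains a frontswap mode in which only slots set in the per-device frontswap map are candidates, regardless of swap_map counts. A standalone sketch of that scan, with made-up data structures:

/* Walk a swap map circularly from 'prev', recycling to slot 1 at the end,
 * and, when the frontswap flag is set, consider only slots marked in a
 * per-device bitmap.  Slot 0 is the header and is never returned; a return
 * of 0 means "nothing left".  Illustrative stand-ins only. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_SLOTS 16

static unsigned char swap_map[MAX_SLOTS];	/* use counts per slot */
static bool frontswap_map[MAX_SLOTS];		/* bit per slot: page is in frontswap */

static unsigned int find_next_to_unuse(unsigned int prev, bool frontswap)
{
	unsigned int i = prev;
	bool wrapped = false;

	for (;;) {
		if (++i >= MAX_SLOTS) {
			if (wrapped)
				return 0;	/* scanned everything */
			i = 1;			/* recycle, skipping header slot 0 */
			wrapped = true;
		}
		if (frontswap) {
			if (frontswap_map[i])
				return i;
			continue;
		}
		if (swap_map[i])
			return i;
	}
}

int main(void)
{
	swap_map[3] = 1;
	swap_map[9] = 2;
	frontswap_map[9] = true;

	printf("next in use after 5: %u\n", find_next_to_unuse(5, false));	/* 9 */
	printf("next in use after 9: %u\n", find_next_to_unuse(9, false));	/* 3, after wrap */
	printf("next frontswap slot: %u\n", find_next_to_unuse(0, true));	/* 9 */
	return 0;
}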
@@ -1057,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1057 | * We completely avoid races by reading each swap page in advance, | 1038 | * We completely avoid races by reading each swap page in advance, |
1058 | * and then search for the process using it. All the necessary | 1039 | * and then search for the process using it. All the necessary |
1059 | * page table adjustments can then be made atomically. | 1040 | * page table adjustments can then be made atomically. |
1041 | * | ||
1042 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
1043 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
1060 | */ | 1044 | */ |
1061 | static int try_to_unuse(unsigned int type) | 1045 | int try_to_unuse(unsigned int type, bool frontswap, |
1046 | unsigned long pages_to_unuse) | ||
1062 | { | 1047 | { |
1063 | struct swap_info_struct *si = swap_info[type]; | 1048 | struct swap_info_struct *si = swap_info[type]; |
1064 | struct mm_struct *start_mm; | 1049 | struct mm_struct *start_mm; |
@@ -1091,7 +1076,7 @@ static int try_to_unuse(unsigned int type) | |||
1091 | * one pass through swap_map is enough, but not necessarily: | 1076 | * one pass through swap_map is enough, but not necessarily: |
1092 | * there are races when an instance of an entry might be missed. | 1077 | * there are races when an instance of an entry might be missed. |
1093 | */ | 1078 | */ |
1094 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1079 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
1095 | if (signal_pending(current)) { | 1080 | if (signal_pending(current)) { |
1096 | retval = -EINTR; | 1081 | retval = -EINTR; |
1097 | break; | 1082 | break; |
@@ -1258,6 +1243,10 @@ static int try_to_unuse(unsigned int type) | |||
1258 | * interactive performance. | 1243 | * interactive performance. |
1259 | */ | 1244 | */ |
1260 | cond_resched(); | 1245 | cond_resched(); |
1246 | if (frontswap && pages_to_unuse > 0) { | ||
1247 | if (!--pages_to_unuse) | ||
1248 | break; | ||
1249 | } | ||
1261 | } | 1250 | } |
1262 | 1251 | ||
1263 | mmput(start_mm); | 1252 | mmput(start_mm); |
@@ -1341,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1341 | list_del(&se->list); | 1330 | list_del(&se->list); |
1342 | kfree(se); | 1331 | kfree(se); |
1343 | } | 1332 | } |
1333 | |||
1334 | if (sis->flags & SWP_FILE) { | ||
1335 | struct file *swap_file = sis->swap_file; | ||
1336 | struct address_space *mapping = swap_file->f_mapping; | ||
1337 | |||
1338 | sis->flags &= ~SWP_FILE; | ||
1339 | mapping->a_ops->swap_deactivate(swap_file); | ||
1340 | } | ||
1344 | } | 1341 | } |
1345 | 1342 | ||
1346 | /* | 1343 | /* |
@@ -1349,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1349 | * | 1346 | * |
1350 | * This function rather assumes that it is called in ascending page order. | 1347 | * This function rather assumes that it is called in ascending page order. |
1351 | */ | 1348 | */ |
1352 | static int | 1349 | int |
1353 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1350 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1354 | unsigned long nr_pages, sector_t start_block) | 1351 | unsigned long nr_pages, sector_t start_block) |
1355 | { | 1352 | { |
@@ -1422,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1422 | */ | 1419 | */ |
1423 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1420 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1424 | { | 1421 | { |
1425 | struct inode *inode; | 1422 | struct file *swap_file = sis->swap_file; |
1426 | unsigned blocks_per_page; | 1423 | struct address_space *mapping = swap_file->f_mapping; |
1427 | unsigned long page_no; | 1424 | struct inode *inode = mapping->host; |
1428 | unsigned blkbits; | ||
1429 | sector_t probe_block; | ||
1430 | sector_t last_block; | ||
1431 | sector_t lowest_block = -1; | ||
1432 | sector_t highest_block = 0; | ||
1433 | int nr_extents = 0; | ||
1434 | int ret; | 1425 | int ret; |
1435 | 1426 | ||
1436 | inode = sis->swap_file->f_mapping->host; | ||
1437 | if (S_ISBLK(inode->i_mode)) { | 1427 | if (S_ISBLK(inode->i_mode)) { |
1438 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1428 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1439 | *span = sis->pages; | 1429 | *span = sis->pages; |
1440 | goto out; | 1430 | return ret; |
1441 | } | 1431 | } |
1442 | 1432 | ||
1443 | blkbits = inode->i_blkbits; | 1433 | if (mapping->a_ops->swap_activate) { |
1444 | blocks_per_page = PAGE_SIZE >> blkbits; | 1434 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
1445 | 1435 | if (!ret) { | |
1446 | /* | 1436 | sis->flags |= SWP_FILE; |
1447 | * Map all the blocks into the extent list. This code doesn't try | 1437 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1448 | * to be very smart. | 1438 | *span = sis->pages; |
1449 | */ | ||
1450 | probe_block = 0; | ||
1451 | page_no = 0; | ||
1452 | last_block = i_size_read(inode) >> blkbits; | ||
1453 | while ((probe_block + blocks_per_page) <= last_block && | ||
1454 | page_no < sis->max) { | ||
1455 | unsigned block_in_page; | ||
1456 | sector_t first_block; | ||
1457 | |||
1458 | first_block = bmap(inode, probe_block); | ||
1459 | if (first_block == 0) | ||
1460 | goto bad_bmap; | ||
1461 | |||
1462 | /* | ||
1463 | * It must be PAGE_SIZE aligned on-disk | ||
1464 | */ | ||
1465 | if (first_block & (blocks_per_page - 1)) { | ||
1466 | probe_block++; | ||
1467 | goto reprobe; | ||
1468 | } | ||
1469 | |||
1470 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
1471 | block_in_page++) { | ||
1472 | sector_t block; | ||
1473 | |||
1474 | block = bmap(inode, probe_block + block_in_page); | ||
1475 | if (block == 0) | ||
1476 | goto bad_bmap; | ||
1477 | if (block != first_block + block_in_page) { | ||
1478 | /* Discontiguity */ | ||
1479 | probe_block++; | ||
1480 | goto reprobe; | ||
1481 | } | ||
1482 | } | ||
1483 | |||
1484 | first_block >>= (PAGE_SHIFT - blkbits); | ||
1485 | if (page_no) { /* exclude the header page */ | ||
1486 | if (first_block < lowest_block) | ||
1487 | lowest_block = first_block; | ||
1488 | if (first_block > highest_block) | ||
1489 | highest_block = first_block; | ||
1490 | } | 1439 | } |
1440 | return ret; | ||
1441 | } | ||
1491 | 1442 | ||
1492 | /* | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1493 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
1494 | */ | ||
1495 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
1496 | if (ret < 0) | ||
1497 | goto out; | ||
1498 | nr_extents += ret; | ||
1499 | page_no++; | ||
1500 | probe_block += blocks_per_page; | ||
1501 | reprobe: | ||
1502 | continue; | ||
1503 | } | ||
1504 | ret = nr_extents; | ||
1505 | *span = 1 + highest_block - lowest_block; | ||
1506 | if (page_no == 0) | ||
1507 | page_no = 1; /* force Empty message */ | ||
1508 | sis->max = page_no; | ||
1509 | sis->pages = page_no - 1; | ||
1510 | sis->highest_bit = page_no - 1; | ||
1511 | out: | ||
1512 | return ret; | ||
1513 | bad_bmap: | ||
1514 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
1515 | ret = -EINVAL; | ||
1516 | goto out; | ||
1517 | } | 1444 | } |
1518 | 1445 | ||
1519 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1520 | unsigned char *swap_map) | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | ||
1521 | { | 1449 | { |
1522 | int i, prev; | 1450 | int i, prev; |
1523 | 1451 | ||
@@ -1527,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1527 | else | 1455 | else |
1528 | p->prio = --least_priority; | 1456 | p->prio = --least_priority; |
1529 | p->swap_map = swap_map; | 1457 | p->swap_map = swap_map; |
1458 | frontswap_map_set(p, frontswap_map); | ||
1530 | p->flags |= SWP_WRITEOK; | 1459 | p->flags |= SWP_WRITEOK; |
1531 | nr_swap_pages += p->pages; | 1460 | nr_swap_pages += p->pages; |
1532 | total_swap_pages += p->pages; | 1461 | total_swap_pages += p->pages; |
@@ -1543,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1543 | swap_list.head = swap_list.next = p->type; | 1472 | swap_list.head = swap_list.next = p->type; |
1544 | else | 1473 | else |
1545 | swap_info[prev]->next = p->type; | 1474 | swap_info[prev]->next = p->type; |
1475 | frontswap_init(p->type); | ||
1546 | spin_unlock(&swap_lock); | 1476 | spin_unlock(&swap_lock); |
1547 | } | 1477 | } |
1548 | 1478 | ||
@@ -1616,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1616 | spin_unlock(&swap_lock); | 1546 | spin_unlock(&swap_lock); |
1617 | 1547 | ||
1618 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1548 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1619 | err = try_to_unuse(type); | 1549 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1620 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1550 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1621 | 1551 | ||
1622 | if (err) { | 1552 | if (err) { |
@@ -1627,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1627 | * sys_swapoff for this swap_info_struct at this point. | 1557 | * sys_swapoff for this swap_info_struct at this point. |
1628 | */ | 1558 | */ |
1629 | /* re-insert swap space back into swap_list */ | 1559 | /* re-insert swap space back into swap_list */ |
1630 | enable_swap_info(p, p->prio, p->swap_map); | 1560 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1631 | goto out_dput; | 1561 | goto out_dput; |
1632 | } | 1562 | } |
1633 | 1563 | ||
@@ -1653,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1653 | swap_map = p->swap_map; | 1583 | swap_map = p->swap_map; |
1654 | p->swap_map = NULL; | 1584 | p->swap_map = NULL; |
1655 | p->flags = 0; | 1585 | p->flags = 0; |
1586 | frontswap_invalidate_area(type); | ||
1656 | spin_unlock(&swap_lock); | 1587 | spin_unlock(&swap_lock); |
1657 | mutex_unlock(&swapon_mutex); | 1588 | mutex_unlock(&swapon_mutex); |
1658 | vfree(swap_map); | 1589 | vfree(swap_map); |
1590 | vfree(frontswap_map_get(p)); | ||
1659 | /* Destroy swap account information */ | 1591 | /* Destroy swap account information */ |
1660 | swap_cgroup_swapoff(type); | 1592 | swap_cgroup_swapoff(type); |
1661 | 1593 | ||
@@ -1924,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1924 | 1856 | ||
1925 | /* | 1857 | /* |
1926 | * Find out how many pages are allowed for a single swap | 1858 | * Find out how many pages are allowed for a single swap |
1927 | * device. There are three limiting factors: 1) the number | 1859 | * device. There are two limiting factors: 1) the number |
1928 | * of bits for the swap offset in the swp_entry_t type, and | 1860 | * of bits for the swap offset in the swp_entry_t type, and |
1929 | * 2) the number of bits in the swap pte as defined by the | 1861 | * 2) the number of bits in the swap pte as defined by the |
1930 | * the different architectures, and 3) the number of free bits | 1862 | * different architectures. In order to find the |
1931 | * in an exceptional radix_tree entry. In order to find the | ||
1932 | * largest possible bit mask, a swap entry with swap type 0 | 1863 | * largest possible bit mask, a swap entry with swap type 0 |
1933 | * and swap offset ~0UL is created, encoded to a swap pte, | 1864 | * and swap offset ~0UL is created, encoded to a swap pte, |
1934 | * decoded to a swp_entry_t again, and finally the swap | 1865 | * decoded to a swp_entry_t again, and finally the swap |
1935 | * offset is extracted. This will mask all the bits from | 1866 | * offset is extracted. This will mask all the bits from |
1936 | * the initial ~0UL mask that can't be encoded in either | 1867 | * the initial ~0UL mask that can't be encoded in either |
1937 | * the swp_entry_t or the architecture definition of a | 1868 | * the swp_entry_t or the architecture definition of a |
1938 | * swap pte. Then the same is done for a radix_tree entry. | 1869 | * swap pte. |
1939 | */ | 1870 | */ |
1940 | maxpages = swp_offset(pte_to_swp_entry( | 1871 | maxpages = swp_offset(pte_to_swp_entry( |
1941 | swp_entry_to_pte(swp_entry(0, ~0UL)))); | 1872 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1942 | maxpages = swp_offset(radix_to_swp_entry( | ||
1943 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1944 | |||
1945 | if (maxpages > swap_header->info.last_page) { | 1873 | if (maxpages > swap_header->info.last_page) { |
1946 | maxpages = swap_header->info.last_page + 1; | 1874 | maxpages = swap_header->info.last_page + 1; |
1947 | /* p->max is an unsigned int: don't overflow it */ | 1875 | /* p->max is an unsigned int: don't overflow it */ |
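The simplified read_swap_header() comment above boils down to one round trip: encode the largest conceivable offset, ~0UL, into a swap pte and decode it again, and the bits that survive define the largest offset either representation can hold; adding one gives maxpages. A userspace model with a made-up bit layout, assuming a 64-bit unsigned long:

/* The bit widths below are invented for illustration; each architecture
 * defines its own swap pte layout. */
#include <stdio.h>

#define SWP_OFFSET_BITS	58	/* offset bits in the arch-independent swp_entry_t */
#define PTE_OFFSET_BITS	27	/* offset bits the (fake) swap pte can hold */

typedef struct { unsigned long val; } swp_entry_t;
typedef struct { unsigned long pte; } pte_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e;
	e.val = (type << SWP_OFFSET_BITS) |
		(offset & ((1UL << SWP_OFFSET_BITS) - 1));
	return e;
}

static unsigned long swp_offset(swp_entry_t e)
{
	return e.val & ((1UL << SWP_OFFSET_BITS) - 1);
}

static pte_t swp_entry_to_pte(swp_entry_t e)
{
	pte_t p;
	p.pte = (e.val >> SWP_OFFSET_BITS) << PTE_OFFSET_BITS;	 /* type */
	p.pte |= swp_offset(e) & ((1UL << PTE_OFFSET_BITS) - 1); /* truncated offset */
	return p;
}

static swp_entry_t pte_to_swp_entry(pte_t p)
{
	return swp_entry(p.pte >> PTE_OFFSET_BITS,
			 p.pte & ((1UL << PTE_OFFSET_BITS) - 1));
}

int main(void)
{
	unsigned long maxpages;

	/* same round trip as read_swap_header(): only bits surviving both
	 * encodings are usable, so this yields the largest offset + 1 */
	maxpages = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
	printf("max swap pages per device: %lu\n", maxpages);
	return 0;
}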
@@ -2019,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2019 | sector_t span; | 1947 | sector_t span; |
2020 | unsigned long maxpages; | 1948 | unsigned long maxpages; |
2021 | unsigned char *swap_map = NULL; | 1949 | unsigned char *swap_map = NULL; |
1950 | unsigned long *frontswap_map = NULL; | ||
2022 | struct page *page = NULL; | 1951 | struct page *page = NULL; |
2023 | struct inode *inode = NULL; | 1952 | struct inode *inode = NULL; |
2024 | 1953 | ||
@@ -2102,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2102 | error = nr_extents; | 2031 | error = nr_extents; |
2103 | goto bad_swap; | 2032 | goto bad_swap; |
2104 | } | 2033 | } |
2034 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
2035 | if (frontswap_enabled) | ||
2036 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
2105 | 2037 | ||
2106 | if (p->bdev) { | 2038 | if (p->bdev) { |
2107 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2039 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
@@ -2117,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2117 | if (swap_flags & SWAP_FLAG_PREFER) | 2049 | if (swap_flags & SWAP_FLAG_PREFER) |
2118 | prio = | 2050 | prio = |
2119 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2051 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2120 | enable_swap_info(p, prio, swap_map); | 2052 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2121 | 2053 | ||
2122 | printk(KERN_INFO "Adding %uk swap on %s. " | 2054 | printk(KERN_INFO "Adding %uk swap on %s. " |
2123 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2055 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
2124 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2056 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
2125 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2057 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2126 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2058 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2127 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2059 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2060 | (frontswap_map) ? "FS" : ""); | ||
2128 | 2061 | ||
2129 | mutex_unlock(&swapon_mutex); | 2062 | mutex_unlock(&swapon_mutex); |
2130 | atomic_inc(&proc_poll_event); | 2063 | atomic_inc(&proc_poll_event); |
@@ -2292,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry) | |||
2292 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 2225 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2293 | } | 2226 | } |
2294 | 2227 | ||
2228 | struct swap_info_struct *page_swap_info(struct page *page) | ||
2229 | { | ||
2230 | swp_entry_t swap = { .val = page_private(page) }; | ||
2231 | BUG_ON(!PageSwapCache(page)); | ||
2232 | return swap_info[swp_type(swap)]; | ||
2233 | } | ||
2234 | |||
2235 | /* | ||
2236 | * out-of-line __page_file_ methods to avoid include hell. | ||
2237 | */ | ||
2238 | struct address_space *__page_file_mapping(struct page *page) | ||
2239 | { | ||
2240 | VM_BUG_ON(!PageSwapCache(page)); | ||
2241 | return page_swap_info(page)->swap_file->f_mapping; | ||
2242 | } | ||
2243 | EXPORT_SYMBOL_GPL(__page_file_mapping); | ||
2244 | |||
2245 | pgoff_t __page_file_index(struct page *page) | ||
2246 | { | ||
2247 | swp_entry_t swap = { .val = page_private(page) }; | ||
2248 | VM_BUG_ON(!PageSwapCache(page)); | ||
2249 | return swp_offset(swap); | ||
2250 | } | ||
2251 | EXPORT_SYMBOL_GPL(__page_file_index); | ||
2252 | |||
2295 | /* | 2253 | /* |
2296 | * add_swap_count_continuation - called when a swap count is duplicated | 2254 | * add_swap_count_continuation - called when a swap count is duplicated |
2297 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2255 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
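The new page_swap_info()/__page_file_mapping()/__page_file_index() helpers above rely on the fact that a swap cache page keeps its swp_entry_t in page_private(), so the entry's offset doubles as the page's index in the backing swap file. A tiny userspace model of that lookup, reusing the made-up bit split from the earlier sketch:

/* Illustrative only: 'struct page' and the bit layout are stand-ins. */
#include <stdio.h>

#define SWP_OFFSET_BITS 58	/* same invented split as the sketch above */

typedef struct { unsigned long val; } swp_entry_t;
struct page { unsigned long private; };	/* stand-in for page_private() */

static unsigned long swp_offset(swp_entry_t e)
{
	return e.val & ((1UL << SWP_OFFSET_BITS) - 1);
}

/* analogue of __page_file_index(): offset into the swap file, in pages */
static unsigned long page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page->private };
	return swp_offset(swap);
}

int main(void)
{
	struct page page = { .private = (2UL << SWP_OFFSET_BITS) | 1234 };

	printf("swap file index: %lu\n", page_file_index(&page));	/* 1234 */
	return 0;
}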
diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd54..000000000000 --- a/mm/thrash.c +++ /dev/null | |||
@@ -1,155 +0,0 @@ | |||
1 | /* | ||
2 | * mm/thrash.c | ||
3 | * | ||
4 | * Copyright (C) 2004, Red Hat, Inc. | ||
5 | * Copyright (C) 2004, Rik van Riel <riel@redhat.com> | ||
6 | * Released under the GPL, see the file COPYING for details. | ||
7 | * | ||
8 | * Simple token based thrashing protection, using the algorithm | ||
9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html | ||
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
18 | */ | ||
19 | |||
20 | #include <linux/jiffies.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/swap.h> | ||
24 | #include <linux/memcontrol.h> | ||
25 | |||
26 | #include <trace/events/vmscan.h> | ||
27 | |||
28 | #define TOKEN_AGING_INTERVAL (0xFF) | ||
29 | |||
30 | static DEFINE_SPINLOCK(swap_token_lock); | ||
31 | struct mm_struct *swap_token_mm; | ||
32 | static struct mem_cgroup *swap_token_memcg; | ||
33 | |||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
36 | { | ||
37 | struct mem_cgroup *memcg; | ||
38 | |||
39 | memcg = try_get_mem_cgroup_from_mm(mm); | ||
40 | if (memcg) | ||
41 | css_put(mem_cgroup_css(memcg)); | ||
42 | |||
43 | return memcg; | ||
44 | } | ||
45 | #else | ||
46 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
47 | { | ||
48 | return NULL; | ||
49 | } | ||
50 | #endif | ||
51 | |||
52 | void grab_swap_token(struct mm_struct *mm) | ||
53 | { | ||
54 | int current_interval; | ||
55 | unsigned int old_prio = mm->token_priority; | ||
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
58 | |||
59 | global_faults++; | ||
60 | |||
61 | current_interval = global_faults - mm->faultstamp; | ||
62 | |||
63 | if (!spin_trylock(&swap_token_lock)) | ||
64 | return; | ||
65 | |||
66 | /* First come first served */ | ||
67 | if (!swap_token_mm) | ||
68 | goto replace_token; | ||
69 | |||
70 | /* | ||
71 | * Usually, we don't need priority aging because long interval faults | ||
72 | * makes priority decrease quickly. But there is one exception. If the | ||
73 | * token owner task is sleeping, it never make long interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * of priority aging are | ||
76 | * 1) An aging interval is reasonable enough long. Too short aging | ||
77 | * interval makes quick swap token lost and decrease performance. | ||
78 | * 2) The swap token owner task have to get priority aging even if | ||
79 | * it's under sleep. | ||
80 | */ | ||
81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | ||
82 | swap_token_mm->token_priority /= 2; | ||
83 | last_aging = global_faults; | ||
84 | } | ||
85 | |||
86 | if (mm == swap_token_mm) { | ||
87 | mm->token_priority += 2; | ||
88 | goto update_priority; | ||
89 | } | ||
90 | |||
91 | if (current_interval < mm->last_interval) | ||
92 | mm->token_priority++; | ||
93 | else { | ||
94 | if (likely(mm->token_priority > 0)) | ||
95 | mm->token_priority--; | ||
96 | } | ||
97 | |||
98 | /* Check if we deserve the token */ | ||
99 | if (mm->token_priority > swap_token_mm->token_priority) | ||
100 | goto replace_token; | ||
101 | |||
102 | update_priority: | ||
103 | trace_update_swap_token_priority(mm, old_prio, swap_token_mm); | ||
104 | |||
105 | out: | ||
106 | mm->faultstamp = global_faults; | ||
107 | mm->last_interval = current_interval; | ||
108 | spin_unlock(&swap_token_lock); | ||
109 | return; | ||
110 | |||
111 | replace_token: | ||
112 | mm->token_priority += 2; | ||
113 | trace_replace_swap_token(swap_token_mm, mm); | ||
114 | swap_token_mm = mm; | ||
115 | swap_token_memcg = swap_token_memcg_from_mm(mm); | ||
116 | last_aging = global_faults; | ||
117 | goto out; | ||
118 | } | ||
119 | |||
120 | /* Called on process exit. */ | ||
121 | void __put_swap_token(struct mm_struct *mm) | ||
122 | { | ||
123 | spin_lock(&swap_token_lock); | ||
124 | if (likely(mm == swap_token_mm)) { | ||
125 | trace_put_swap_token(swap_token_mm); | ||
126 | swap_token_mm = NULL; | ||
127 | swap_token_memcg = NULL; | ||
128 | } | ||
129 | spin_unlock(&swap_token_lock); | ||
130 | } | ||
131 | |||
132 | static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) | ||
133 | { | ||
134 | if (!a) | ||
135 | return true; | ||
136 | if (!b) | ||
137 | return true; | ||
138 | if (a == b) | ||
139 | return true; | ||
140 | return false; | ||
141 | } | ||
142 | |||
143 | void disable_swap_token(struct mem_cgroup *memcg) | ||
144 | { | ||
145 | /* memcg reclaim don't disable unrelated mm token. */ | ||
146 | if (match_memcg(memcg, swap_token_memcg)) { | ||
147 | spin_lock(&swap_token_lock); | ||
148 | if (match_memcg(memcg, swap_token_memcg)) { | ||
149 | trace_disable_swap_token(swap_token_mm); | ||
150 | swap_token_mm = NULL; | ||
151 | swap_token_memcg = NULL; | ||
152 | } | ||
153 | spin_unlock(&swap_token_lock); | ||
154 | } | ||
155 | } | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..75801acdaac7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) | |||
602 | } | 602 | } |
603 | EXPORT_SYMBOL(vmtruncate); | 603 | EXPORT_SYMBOL(vmtruncate); |
604 | 604 | ||
605 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
606 | { | ||
607 | struct address_space *mapping = inode->i_mapping; | ||
608 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
609 | loff_t holelen = 1 + lend - holebegin; | ||
610 | |||
611 | /* | ||
612 | * If the underlying filesystem is not going to provide | ||
613 | * a way to truncate a range of blocks (punch a hole) - | ||
614 | * we should return failure right now. | ||
615 | */ | ||
616 | if (!inode->i_op->truncate_range) | ||
617 | return -ENOSYS; | ||
618 | |||
619 | mutex_lock(&inode->i_mutex); | ||
620 | inode_dio_wait(inode); | ||
621 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
622 | inode->i_op->truncate_range(inode, lstart, lend); | ||
623 | /* unmap again to remove racily COWed private pages */ | ||
624 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
625 | mutex_unlock(&inode->i_mutex); | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | /** | 605 | /** |
631 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched | 606 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched |
632 | * @inode: inode | 607 | * @inode: inode |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/export.h> | 4 | #include <linux/export.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/security.h> | ||
7 | #include <asm/uaccess.h> | 8 | #include <asm/uaccess.h> |
8 | 9 | ||
9 | #include "internal.h" | 10 | #include "internal.h" |
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
341 | } | 342 | } |
342 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 343 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
343 | 344 | ||
345 | unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | ||
346 | unsigned long len, unsigned long prot, | ||
347 | unsigned long flag, unsigned long pgoff) | ||
348 | { | ||
349 | unsigned long ret; | ||
350 | struct mm_struct *mm = current->mm; | ||
351 | |||
352 | ret = security_mmap_file(file, prot, flag); | ||
353 | if (!ret) { | ||
354 | down_write(&mm->mmap_sem); | ||
355 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | ||
356 | up_write(&mm->mmap_sem); | ||
357 | } | ||
358 | return ret; | ||
359 | } | ||
360 | |||
361 | unsigned long vm_mmap(struct file *file, unsigned long addr, | ||
362 | unsigned long len, unsigned long prot, | ||
363 | unsigned long flag, unsigned long offset) | ||
364 | { | ||
365 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) | ||
366 | return -EINVAL; | ||
367 | if (unlikely(offset & ~PAGE_MASK)) | ||
368 | return -EINVAL; | ||
369 | |||
370 | return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); | ||
371 | } | ||
372 | EXPORT_SYMBOL(vm_mmap); | ||
373 | |||
344 | /* Tracepoints definitions. */ | 374 | /* Tracepoints definitions. */ |
345 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 375 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
346 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 376 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
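vm_mmap() above rejects a byte offset that is not page aligned and one whose sum with the page-aligned length wraps around, then hands the page-granular offset to vm_mmap_pgoff(), which takes mmap_sem around do_mmap_pgoff() after the security_mmap_file() check. A standalone sketch of just those argument checks, assuming a 4K page size:

/* Illustrative only: reject an unaligned offset, and catch
 * offset + PAGE_ALIGN(len) wrapping past the top of the address space. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)
#define EINVAL		22

static long check_mmap_args(unsigned long len, unsigned long offset)
{
	if (offset + PAGE_ALIGN(len) < offset)
		return -EINVAL;			/* offset + len wraps around */
	if (offset & ~PAGE_MASK)
		return -EINVAL;			/* offset not page aligned */
	return (long)(offset >> PAGE_SHIFT);	/* page offset handed onward */
}

int main(void)
{
	printf("aligned, in range: %ld\n", check_mmap_args(8192, 0x10000));
	printf("unaligned offset:  %ld\n", check_mmap_args(8192, 0x10001));
	printf("wrapping offset:   %ld\n", check_mmap_args(8192, ~0UL & PAGE_MASK));
	return 0;
}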
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b449..2bb90b1d241c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -413,11 +413,11 @@ nocache: | |||
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
416 | n = rb_next(&first->rb_node); | 416 | if (list_is_last(&first->list, &vmap_area_list)) |
417 | if (n) | ||
418 | first = rb_entry(n, struct vmap_area, rb_node); | ||
419 | else | ||
420 | goto found; | 417 | goto found; |
418 | |||
419 | first = list_entry(first->list.next, | ||
420 | struct vmap_area, list); | ||
421 | } | 421 | } |
422 | 422 | ||
423 | found: | 423 | found: |
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
904 | 904 | ||
905 | BUG_ON(size & ~PAGE_MASK); | 905 | BUG_ON(size & ~PAGE_MASK); |
906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
907 | if (WARN_ON(size == 0)) { | ||
908 | /* | ||
909 | * Allocating 0 bytes isn't what the caller wants, since | ||
910 | * get_order(0) returns a nonsensical order. Just warn and | ||
911 | * bail out early. | ||
912 | */ | ||
913 | return NULL; | ||
914 | } | ||
907 | order = get_order(size); | 915 | order = get_order(size); |
908 | 916 | ||
909 | again: | 917 | again: |
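The WARN_ON(size == 0) added to vb_alloc() above guards against get_order(0): with the classic loop-based generic implementation, size - 1 underflows and the result is an enormous bogus order rather than an error. A userspace copy of that historical loop shows the effect (PAGE_SHIFT of 12 assumed):

#include <stdio.h>

#define PAGE_SHIFT 12

/* copy of an old asm-generic get_order() loop, for illustration */
static int get_order(unsigned long size)
{
	int order;

	size = (size - 1) >> (PAGE_SHIFT - 1);	/* size == 0 underflows here */
	order = -1;
	do {
		size >>= 1;
		order++;
	} while (size);
	return order;
}

int main(void)
{
	printf("get_order(1)    = %d\n", get_order(1));		/* 0 */
	printf("get_order(4096) = %d\n", get_order(4096));	/* 0 */
	printf("get_order(8192) = %d\n", get_order(8192));	/* 1 */
	printf("get_order(0)    = %d\n", get_order(0));	/* absurdly large */
	return 0;
}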
@@ -1185,9 +1193,10 @@ void __init vmalloc_init(void) | |||
1185 | /* Import existing vmlist entries. */ | 1193 | /* Import existing vmlist entries. */ |
1186 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1194 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1187 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); | 1195 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1188 | va->flags = tmp->flags | VM_VM_AREA; | 1196 | va->flags = VM_VM_AREA; |
1189 | va->va_start = (unsigned long)tmp->addr; | 1197 | va->va_start = (unsigned long)tmp->addr; |
1190 | va->va_end = va->va_start + tmp->size; | 1198 | va->va_end = va->va_start + tmp->size; |
1199 | va->vm = tmp; | ||
1191 | __insert_vmap_area(va); | 1200 | __insert_vmap_area(va); |
1192 | } | 1201 | } |
1193 | 1202 | ||
@@ -1279,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock); | |||
1279 | struct vm_struct *vmlist; | 1288 | struct vm_struct *vmlist; |
1280 | 1289 | ||
1281 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1290 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1282 | unsigned long flags, void *caller) | 1291 | unsigned long flags, const void *caller) |
1283 | { | 1292 | { |
1284 | vm->flags = flags; | 1293 | vm->flags = flags; |
1285 | vm->addr = (void *)va->va_start; | 1294 | vm->addr = (void *)va->va_start; |
@@ -1305,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) | |||
1305 | } | 1314 | } |
1306 | 1315 | ||
1307 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1316 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1308 | unsigned long flags, void *caller) | 1317 | unsigned long flags, const void *caller) |
1309 | { | 1318 | { |
1310 | setup_vmalloc_vm(vm, va, flags, caller); | 1319 | setup_vmalloc_vm(vm, va, flags, caller); |
1311 | insert_vmalloc_vmlist(vm); | 1320 | insert_vmalloc_vmlist(vm); |
@@ -1313,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1313 | 1322 | ||
1314 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1323 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1315 | unsigned long align, unsigned long flags, unsigned long start, | 1324 | unsigned long align, unsigned long flags, unsigned long start, |
1316 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1325 | unsigned long end, int node, gfp_t gfp_mask, const void *caller) |
1317 | { | 1326 | { |
1318 | struct vmap_area *va; | 1327 | struct vmap_area *va; |
1319 | struct vm_struct *area; | 1328 | struct vm_struct *area; |
@@ -1374,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area); | |||
1374 | 1383 | ||
1375 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | 1384 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, |
1376 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
1377 | void *caller) | 1386 | const void *caller) |
1378 | { | 1387 | { |
1379 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1380 | caller); | 1389 | caller); |
@@ -1396,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
1396 | } | 1405 | } |
1397 | 1406 | ||
1398 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1399 | void *caller) | 1408 | const void *caller) |
1400 | { | 1409 | { |
1401 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1402 | -1, GFP_KERNEL, caller); | 1411 | -1, GFP_KERNEL, caller); |
1403 | } | 1412 | } |
1404 | 1413 | ||
1405 | static struct vm_struct *find_vm_area(const void *addr) | 1414 | /** |
1415 | * find_vm_area - find a continuous kernel virtual area | ||
1416 | * @addr: base address | ||
1417 | * | ||
1418 | * Search for the kernel VM area starting at @addr, and return it. | ||
1419 | * It is up to the caller to do all required locking to keep the returned | ||
1420 | * pointer valid. | ||
1421 | */ | ||
1422 | struct vm_struct *find_vm_area(const void *addr) | ||
1406 | { | 1423 | { |
1407 | struct vmap_area *va; | 1424 | struct vmap_area *va; |
1408 | 1425 | ||
@@ -1567,9 +1584,9 @@ EXPORT_SYMBOL(vmap); | |||
1567 | 1584 | ||
1568 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1585 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1569 | gfp_t gfp_mask, pgprot_t prot, | 1586 | gfp_t gfp_mask, pgprot_t prot, |
1570 | int node, void *caller); | 1587 | int node, const void *caller); |
1571 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1588 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1572 | pgprot_t prot, int node, void *caller) | 1589 | pgprot_t prot, int node, const void *caller) |
1573 | { | 1590 | { |
1574 | const int order = 0; | 1591 | const int order = 0; |
1575 | struct page **pages; | 1592 | struct page **pages; |
@@ -1642,7 +1659,7 @@ fail: | |||
1642 | */ | 1659 | */ |
1643 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1660 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1644 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1661 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1645 | pgprot_t prot, int node, void *caller) | 1662 | pgprot_t prot, int node, const void *caller) |
1646 | { | 1663 | { |
1647 | struct vm_struct *area; | 1664 | struct vm_struct *area; |
1648 | void *addr; | 1665 | void *addr; |
@@ -1698,7 +1715,7 @@ fail: | |||
1698 | */ | 1715 | */ |
1699 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1716 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1700 | gfp_t gfp_mask, pgprot_t prot, | 1717 | gfp_t gfp_mask, pgprot_t prot, |
1701 | int node, void *caller) | 1718 | int node, const void *caller) |
1702 | { | 1719 | { |
1703 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1720 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
1704 | gfp_mask, prot, node, caller); | 1721 | gfp_mask, prot, node, caller); |
@@ -1974,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
1974 | * IOREMAP area is treated as memory hole and no copy is done. | 1991 | * IOREMAP area is treated as memory hole and no copy is done. |
1975 | * | 1992 | * |
1976 | * If [addr...addr+count) doesn't include any intersection with an alive | 1993 | * If [addr...addr+count) doesn't include any intersection with an alive |
1977 | * vm_struct area, returns 0. | 1994 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
1978 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
1979 | * the caller should guarantee KM_USER0 is not used. | ||
1980 | * | 1995 | * |
1981 | * Note: In usual ops, vread() is never necessary because the caller | 1996 | * Note: In usual ops, vread() is never necessary because the caller |
1982 | * should know vmalloc() area is valid and can use memcpy(). | 1997 | * should know vmalloc() area is valid and can use memcpy(). |
@@ -2050,9 +2065,7 @@ finished: | |||
2050 | * IOREMAP area is treated as memory hole and no copy is done. | 2065 | * IOREMAP area is treated as memory hole and no copy is done. |
2051 | * | 2066 | * |
2052 | * If [addr...addr+count) doesn't include any intersection with an alive | 2067 | * If [addr...addr+count) doesn't include any intersection with an alive |
2053 | * vm_struct area, returns 0. | 2068 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
2054 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
2055 | * the caller should guarantee KM_USER0 is not used. | ||
2056 | * | 2069 | * |
2057 | * Note: In usual ops, vwrite() is never necessary because the caller | 2070 | * Note: In usual ops, vwrite() is never necessary because the caller |
2058 | * should know vmalloc() area is valid and can use memcpy(). | 2071 | * should know vmalloc() area is valid and can use memcpy(). |
@@ -2375,8 +2388,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2375 | return NULL; | 2388 | return NULL; |
2376 | } | 2389 | } |
2377 | 2390 | ||
2378 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); | 2391 | vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); |
2379 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); | 2392 | vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); |
2380 | if (!vas || !vms) | 2393 | if (!vas || !vms) |
2381 | goto err_free2; | 2394 | goto err_free2; |
2382 | 2395 | ||
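Switching from kzalloc(sizeof(x) * nr, ...) to kcalloc(nr, sizeof(x), ...) above matters because the two-argument form can reject a multiplication that overflows, whereas the open-coded product silently wraps and yields an undersized buffer. A userspace sketch of the same check, illustrative only:

#include <stdio.h>
#include <stdlib.h>

static void *alloc_array_checked(size_t n, size_t size)
{
	if (size && n > (size_t)-1 / size)
		return NULL;			/* n * size would overflow */
	return calloc(n, size);			/* zeroed, like kcalloc() */
}

int main(void)
{
	size_t huge = ((size_t)-1 / sizeof(long)) + 2;

	/* open-coded multiply wraps around to a tiny request */
	printf("wrapped request size: %zu bytes\n", huge * sizeof(long));

	if (!alloc_array_checked(huge, sizeof(long)))
		printf("checked allocation correctly refused\n");

	void *p = alloc_array_checked(16, sizeof(long));
	if (p) {
		printf("16-element array allocated and zeroed\n");
		free(p);
	}
	return 0;
}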
diff --git a/mm/vmscan.c b/mm/vmscan.c index 33c332bbab73..8d01243d9560 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -53,24 +53,6 @@ | |||
53 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
54 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
55 | 55 | ||
56 | /* | ||
57 | * reclaim_mode determines how the inactive list is shrunk | ||
58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages | ||
59 | * RECLAIM_MODE_ASYNC: Do not block | ||
60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback | ||
61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
62 | * page from the LRU and reclaim all pages within a | ||
63 | * naturally aligned range | ||
64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
65 | * order-0 pages and then compact the zone | ||
66 | */ | ||
67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
73 | |||
74 | struct scan_control { | 56 | struct scan_control { |
75 | /* Incremented by the number of inactive pages that were scanned */ | 57 | /* Incremented by the number of inactive pages that were scanned */ |
76 | unsigned long nr_scanned; | 58 | unsigned long nr_scanned; |
@@ -96,11 +78,8 @@ struct scan_control { | |||
96 | 78 | ||
97 | int order; | 79 | int order; |
98 | 80 | ||
99 | /* | 81 | /* Scan (total_size >> priority) pages at once */ |
100 | * Intend to reclaim enough continuous memory rather than reclaim | 82 | int priority; |
101 | * enough amount of memory. i.e, mode for high order allocation. | ||
102 | */ | ||
103 | reclaim_mode_t reclaim_mode; | ||
104 | 83 | ||
105 | /* | 84 | /* |
106 | * The memory cgroup that hit its limit and as a result is the | 85 | * The memory cgroup that hit its limit and as a result is the |
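The new scan_control->priority field above replaces the per-call priority plumbing: each reclaim pass scans at most (lru size >> priority) pages, so the scan window doubles every time the priority drops toward zero. A small numeric illustration, with DEF_PRIORITY = 12 as the usual starting point and a made-up LRU size:

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
	unsigned long lru_size = 1UL << 20;	/* pretend 1M pages on the list */

	for (int priority = DEF_PRIORITY; priority >= 0; priority--)
		printf("priority %2d: scan up to %lu pages\n",
		       priority, lru_size >> priority);
	return 0;
}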
@@ -115,11 +94,6 @@ struct scan_control { | |||
115 | nodemask_t *nodemask; | 94 | nodemask_t *nodemask; |
116 | }; | 95 | }; |
117 | 96 | ||
118 | struct mem_cgroup_zone { | ||
119 | struct mem_cgroup *mem_cgroup; | ||
120 | struct zone *zone; | ||
121 | }; | ||
122 | |||
123 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 97 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
124 | 98 | ||
125 | #ifdef ARCH_HAS_PREFETCH | 99 | #ifdef ARCH_HAS_PREFETCH |
@@ -159,49 +133,26 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
159 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
160 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
161 | 135 | ||
162 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 136 | #ifdef CONFIG_MEMCG |
163 | static bool global_reclaim(struct scan_control *sc) | 137 | static bool global_reclaim(struct scan_control *sc) |
164 | { | 138 | { |
165 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
166 | } | 140 | } |
167 | |||
168 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
169 | { | ||
170 | return !mz->mem_cgroup; | ||
171 | } | ||
172 | #else | 141 | #else |
173 | static bool global_reclaim(struct scan_control *sc) | 142 | static bool global_reclaim(struct scan_control *sc) |
174 | { | 143 | { |
175 | return true; | 144 | return true; |
176 | } | 145 | } |
177 | |||
178 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
179 | { | ||
180 | return true; | ||
181 | } | ||
182 | #endif | 146 | #endif |
183 | 147 | ||
184 | static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) | 148 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
185 | { | 149 | { |
186 | if (!scanning_global_lru(mz)) | 150 | if (!mem_cgroup_disabled()) |
187 | return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); | 151 | return mem_cgroup_get_lru_size(lruvec, lru); |
188 | 152 | ||
189 | return &mz->zone->reclaim_stat; | 153 | return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); |
190 | } | 154 | } |
191 | 155 | ||
192 | static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, | ||
193 | enum lru_list lru) | ||
194 | { | ||
195 | if (!scanning_global_lru(mz)) | ||
196 | return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, | ||
197 | zone_to_nid(mz->zone), | ||
198 | zone_idx(mz->zone), | ||
199 | BIT(lru)); | ||
200 | |||
201 | return zone_page_state(mz->zone, NR_LRU_BASE + lru); | ||
202 | } | ||
203 | |||
204 | |||
205 | /* | 156 | /* |
206 | * Add a shrinker callback to be called from the vm | 157 | * Add a shrinker callback to be called from the vm |
207 | */ | 158 | */ |
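
With mem_cgroup_zone gone, get_lru_size() is now the one place that decides between the memcg-aware LRU counter and the plain per-zone vmstat figure. The standalone sketch below models that dispatch with stub fields; struct lru_stats and its members are illustrative stand-ins, not the kernel's lruvec:

#include <stdbool.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
		LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

/* Illustrative stand-ins for the lruvec and zone counters. */
struct lru_stats {
	bool memcg_disabled;                     /* cgroup accounting off */
	unsigned long memcg_size[NR_LRU_LISTS];  /* per-cgroup LRU sizes */
	unsigned long zone_size[NR_LRU_LISTS];   /* global per-zone sizes */
};

/* Model of get_lru_size(): use the memcg counter while memcg is active,
 * otherwise fall back to the zone-wide statistic. */
static unsigned long get_lru_size(const struct lru_stats *s, enum lru_list lru)
{
	if (!s->memcg_disabled)
		return s->memcg_size[lru];
	return s->zone_size[lru];
}

int main(void)
{
	struct lru_stats s = {
		.memcg_disabled = true,
		.zone_size = { 100, 50, 400, 200 },
	};

	printf("inactive file: %lu\n", get_lru_size(&s, LRU_INACTIVE_FILE));
	return 0;
}
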
@@ -364,39 +315,6 @@ out: | |||
364 | return ret; | 315 | return ret; |
365 | } | 316 | } |
366 | 317 | ||
367 | static void set_reclaim_mode(int priority, struct scan_control *sc, | ||
368 | bool sync) | ||
369 | { | ||
370 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; | ||
371 | |||
372 | /* | ||
373 | * Initially assume we are entering either lumpy reclaim or | ||
374 | * reclaim/compaction. Depending on the order, we will either set the | ||
375 | * sync mode or just reclaim order-0 pages later. | ||
376 | */ | ||
377 | if (COMPACTION_BUILD) | ||
378 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; | ||
379 | else | ||
380 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
381 | |||
382 | /* | ||
383 | * Avoid using lumpy reclaim or reclaim/compaction if possible by | ||
384 | * restricting when it's set to either costly allocations or when | ||
385 | * under memory pressure | ||
386 | */ | ||
387 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
388 | sc->reclaim_mode |= syncmode; | ||
389 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
390 | sc->reclaim_mode |= syncmode; | ||
391 | else | ||
392 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
393 | } | ||
394 | |||
395 | static void reset_reclaim_mode(struct scan_control *sc) | ||
396 | { | ||
397 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
398 | } | ||
399 | |||
400 | static inline int is_page_cache_freeable(struct page *page) | 318 | static inline int is_page_cache_freeable(struct page *page) |
401 | { | 319 | { |
402 | /* | 320 | /* |
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, | |||
416 | return 1; | 334 | return 1; |
417 | if (bdi == current->backing_dev_info) | 335 | if (bdi == current->backing_dev_info) |
418 | return 1; | 336 | return 1; |
419 | |||
420 | /* lumpy reclaim for hugepage often need a lot of write */ | ||
421 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
422 | return 1; | ||
423 | return 0; | 337 | return 0; |
424 | } | 338 | } |
425 | 339 | ||
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
523 | /* synchronous write or broken a_ops? */ | 437 | /* synchronous write or broken a_ops? */ |
524 | ClearPageReclaim(page); | 438 | ClearPageReclaim(page); |
525 | } | 439 | } |
526 | trace_mm_vmscan_writepage(page, | 440 | trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); |
527 | trace_reclaim_flags(page, sc->reclaim_mode)); | ||
528 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
529 | return PAGE_SUCCESS; | 442 | return PAGE_SUCCESS; |
530 | } | 443 | } |
@@ -701,19 +614,15 @@ enum page_references { | |||
701 | }; | 614 | }; |
702 | 615 | ||
703 | static enum page_references page_check_references(struct page *page, | 616 | static enum page_references page_check_references(struct page *page, |
704 | struct mem_cgroup_zone *mz, | ||
705 | struct scan_control *sc) | 617 | struct scan_control *sc) |
706 | { | 618 | { |
707 | int referenced_ptes, referenced_page; | 619 | int referenced_ptes, referenced_page; |
708 | unsigned long vm_flags; | 620 | unsigned long vm_flags; |
709 | 621 | ||
710 | referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); | 622 | referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, |
623 | &vm_flags); | ||
711 | referenced_page = TestClearPageReferenced(page); | 624 | referenced_page = TestClearPageReferenced(page); |
712 | 625 | ||
713 | /* Lumpy reclaim - ignore references */ | ||
714 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
715 | return PAGEREF_RECLAIM; | ||
716 | |||
717 | /* | 626 | /* |
718 | * Mlock lost the isolation race with us. Let try_to_unmap() | 627 | * Mlock lost the isolation race with us. Let try_to_unmap() |
719 | * move the page to the unevictable list. | 628 | * move the page to the unevictable list. |
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page, | |||
722 | return PAGEREF_RECLAIM; | 631 | return PAGEREF_RECLAIM; |
723 | 632 | ||
724 | if (referenced_ptes) { | 633 | if (referenced_ptes) { |
725 | if (PageAnon(page)) | 634 | if (PageSwapBacked(page)) |
726 | return PAGEREF_ACTIVATE; | 635 | return PAGEREF_ACTIVATE; |
727 | /* | 636 | /* |
728 | * All mapped pages start out with page table | 637 | * All mapped pages start out with page table |
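
After this hunk, page_check_references() drops the lumpy special case and keys activation on PageSwapBacked() rather than PageAnon(), so referenced shmem/tmpfs pages get the same protection as anon. A pure-function model of only the branches visible here; the file-page heuristics further down the real function are not reproduced, and the names are illustrative:

#include <stdbool.h>
#include <stdio.h>

enum page_references {
	PAGEREF_RECLAIM,	/* reclaim the page */
	PAGEREF_ACTIVATE,	/* move it back to the active list */
	PAGEREF_KEEP,		/* placeholder for the elided paths */
};

/* Model of the visible branches: mlocked pages go back to try_to_unmap(),
 * referenced swap-backed pages get activated, everything else is left to
 * the logic not shown in this hunk. */
static enum page_references check_references(bool vm_locked,
					     int referenced_ptes,
					     bool swap_backed)
{
	if (vm_locked)
		return PAGEREF_RECLAIM;	/* let try_to_unmap() park it as unevictable */

	if (referenced_ptes && swap_backed)
		return PAGEREF_ACTIVATE; /* referenced anon/shmem: protect it */

	return PAGEREF_KEEP;		/* elided file-page heuristics go here */
}

int main(void)
{
	printf("%d\n", check_references(false, 2, true));	/* -> PAGEREF_ACTIVATE */
	return 0;
}
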
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page, | |||
763 | * shrink_page_list() returns the number of reclaimed pages | 672 | * shrink_page_list() returns the number of reclaimed pages |
764 | */ | 673 | */ |
765 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
766 | struct mem_cgroup_zone *mz, | 675 | struct zone *zone, |
767 | struct scan_control *sc, | 676 | struct scan_control *sc, |
768 | int priority, | ||
769 | unsigned long *ret_nr_dirty, | 677 | unsigned long *ret_nr_dirty, |
770 | unsigned long *ret_nr_writeback) | 678 | unsigned long *ret_nr_writeback) |
771 | { | 679 | { |
@@ -779,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
779 | 687 | ||
780 | cond_resched(); | 688 | cond_resched(); |
781 | 689 | ||
690 | mem_cgroup_uncharge_start(); | ||
782 | while (!list_empty(page_list)) { | 691 | while (!list_empty(page_list)) { |
783 | enum page_references references; | 692 | enum page_references references; |
784 | struct address_space *mapping; | 693 | struct address_space *mapping; |
@@ -794,7 +703,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
794 | goto keep; | 703 | goto keep; |
795 | 704 | ||
796 | VM_BUG_ON(PageActive(page)); | 705 | VM_BUG_ON(PageActive(page)); |
797 | VM_BUG_ON(page_zone(page) != mz->zone); | 706 | VM_BUG_ON(page_zone(page) != zone); |
798 | 707 | ||
799 | sc->nr_scanned++; | 708 | sc->nr_scanned++; |
800 | 709 | ||
@@ -812,23 +721,44 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
812 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 721 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
813 | 722 | ||
814 | if (PageWriteback(page)) { | 723 | if (PageWriteback(page)) { |
815 | nr_writeback++; | ||
816 | /* | 724 | /* |
817 | * Synchronous reclaim cannot queue pages for | 725 | * memcg doesn't have any dirty pages throttling so we |
818 | * writeback due to the possibility of stack overflow | 726 | * could easily OOM just because too many pages are in |
819 | * but if it encounters a page under writeback, wait | 727 | * writeback and there is nothing else to reclaim. |
820 | * for the IO to complete. | 728 | * |
729 | * Check __GFP_IO, certainly because a loop driver | ||
730 | * thread might enter reclaim, and deadlock if it waits | ||
731 | * on a page for which it is needed to do the write | ||
732 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | ||
733 | * but more thought would probably show more reasons. | ||
734 | * | ||
735 | * Don't require __GFP_FS, since we're not going into | ||
736 | * the FS, just waiting on its writeback completion. | ||
737 | * Worryingly, ext4 gfs2 and xfs allocate pages with | ||
738 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
739 | * testing may_enter_fs here is liable to OOM on them. | ||
821 | */ | 740 | */ |
822 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | 741 | if (global_reclaim(sc) || |
823 | may_enter_fs) | 742 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { |
824 | wait_on_page_writeback(page); | 743 | /* |
825 | else { | 744 | * This is slightly racy - end_page_writeback() |
826 | unlock_page(page); | 745 | * might have just cleared PageReclaim, then |
827 | goto keep_lumpy; | 746 | * setting PageReclaim here end up interpreted |
747 | * as PageReadahead - but that does not matter | ||
748 | * enough to care. What we do want is for this | ||
749 | * page to have PageReclaim set next time memcg | ||
750 | * reclaim reaches the tests above, so it will | ||
751 | * then wait_on_page_writeback() to avoid OOM; | ||
752 | * and it's also appropriate in global reclaim. | ||
753 | */ | ||
754 | SetPageReclaim(page); | ||
755 | nr_writeback++; | ||
756 | goto keep_locked; | ||
828 | } | 757 | } |
758 | wait_on_page_writeback(page); | ||
829 | } | 759 | } |
830 | 760 | ||
831 | references = page_check_references(page, mz, sc); | 761 | references = page_check_references(page, sc); |
832 | switch (references) { | 762 | switch (references) { |
833 | case PAGEREF_ACTIVATE: | 763 | case PAGEREF_ACTIVATE: |
834 | goto activate_locked; | 764 | goto activate_locked; |
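
The new PageWriteback handling only lets a memcg reclaimer block on writeback when the page is already PageReclaim and __GFP_IO is allowed; global reclaim, and any first encounter with the page, just tags it and keeps it locked. A standalone model of that predicate, with boolean parameters standing in for the page and gfp state:

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the decision in the PageWriteback branch above:
 * true  -> wait_on_page_writeback() (memcg reclaim only, and only for a
 *          page already marked PageReclaim while __GFP_IO is set)
 * false -> SetPageReclaim(), bump nr_writeback, goto keep_locked
 */
static bool should_wait_on_writeback(bool global_reclaim,
				     bool page_reclaim_set,
				     bool gfp_allows_io)
{
	if (global_reclaim || !page_reclaim_set || !gfp_allows_io)
		return false;
	return true;
}

int main(void)
{
	/* memcg reclaim, second pass over a PageReclaim page, IO allowed */
	printf("%d\n", should_wait_on_writeback(false, true, true));	/* 1 */
	/* global reclaim never stalls here */
	printf("%d\n", should_wait_on_writeback(true, true, true));	/* 0 */
	return 0;
}
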
@@ -879,7 +809,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
879 | * unless under significant pressure. | 809 | * unless under significant pressure. |
880 | */ | 810 | */ |
881 | if (page_is_file_cache(page) && | 811 | if (page_is_file_cache(page) && |
882 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | 812 | (!current_is_kswapd() || |
813 | sc->priority >= DEF_PRIORITY - 2)) { | ||
883 | /* | 814 | /* |
884 | * Immediately reclaim when written back. | 815 | * Immediately reclaim when written back. |
885 | * Similar in principle to deactivate_page() | 816 | * Similar in principle to deactivate_page() |
@@ -908,7 +839,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
908 | goto activate_locked; | 839 | goto activate_locked; |
909 | case PAGE_SUCCESS: | 840 | case PAGE_SUCCESS: |
910 | if (PageWriteback(page)) | 841 | if (PageWriteback(page)) |
911 | goto keep_lumpy; | 842 | goto keep; |
912 | if (PageDirty(page)) | 843 | if (PageDirty(page)) |
913 | goto keep; | 844 | goto keep; |
914 | 845 | ||
@@ -994,7 +925,6 @@ cull_mlocked: | |||
994 | try_to_free_swap(page); | 925 | try_to_free_swap(page); |
995 | unlock_page(page); | 926 | unlock_page(page); |
996 | putback_lru_page(page); | 927 | putback_lru_page(page); |
997 | reset_reclaim_mode(sc); | ||
998 | continue; | 928 | continue; |
999 | 929 | ||
1000 | activate_locked: | 930 | activate_locked: |
@@ -1007,8 +937,6 @@ activate_locked: | |||
1007 | keep_locked: | 937 | keep_locked: |
1008 | unlock_page(page); | 938 | unlock_page(page); |
1009 | keep: | 939 | keep: |
1010 | reset_reclaim_mode(sc); | ||
1011 | keep_lumpy: | ||
1012 | list_add(&page->lru, &ret_pages); | 940 | list_add(&page->lru, &ret_pages); |
1013 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 941 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
1014 | } | 942 | } |
@@ -1020,12 +948,13 @@ keep_lumpy: | |||
1020 | * will encounter the same problem | 948 | * will encounter the same problem |
1021 | */ | 949 | */ |
1022 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) | 950 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) |
1023 | zone_set_flag(mz->zone, ZONE_CONGESTED); | 951 | zone_set_flag(zone, ZONE_CONGESTED); |
1024 | 952 | ||
1025 | free_hot_cold_page_list(&free_pages, 1); | 953 | free_hot_cold_page_list(&free_pages, 1); |
1026 | 954 | ||
1027 | list_splice(&ret_pages, page_list); | 955 | list_splice(&ret_pages, page_list); |
1028 | count_vm_events(PGACTIVATE, pgactivate); | 956 | count_vm_events(PGACTIVATE, pgactivate); |
957 | mem_cgroup_uncharge_end(); | ||
1029 | *ret_nr_dirty += nr_dirty; | 958 | *ret_nr_dirty += nr_dirty; |
1030 | *ret_nr_writeback += nr_writeback; | 959 | *ret_nr_writeback += nr_writeback; |
1031 | return nr_reclaimed; | 960 | return nr_reclaimed; |
@@ -1041,34 +970,15 @@ keep_lumpy: | |||
1041 | * | 970 | * |
1042 | * returns 0 on success, -ve errno on failure. | 971 | * returns 0 on success, -ve errno on failure. |
1043 | */ | 972 | */ |
1044 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | 973 | int __isolate_lru_page(struct page *page, isolate_mode_t mode) |
1045 | { | 974 | { |
1046 | bool all_lru_mode; | ||
1047 | int ret = -EINVAL; | 975 | int ret = -EINVAL; |
1048 | 976 | ||
1049 | /* Only take pages on the LRU. */ | 977 | /* Only take pages on the LRU. */ |
1050 | if (!PageLRU(page)) | 978 | if (!PageLRU(page)) |
1051 | return ret; | 979 | return ret; |
1052 | 980 | ||
1053 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | 981 | /* Do not give back unevictable pages for compaction */ |
1054 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1055 | |||
1056 | /* | ||
1057 | * When checking the active state, we need to be sure we are | ||
1058 | * dealing with comparable boolean values. Take the logical not | ||
1059 | * of each. | ||
1060 | */ | ||
1061 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) | ||
1062 | return ret; | ||
1063 | |||
1064 | if (!all_lru_mode && !!page_is_file_cache(page) != file) | ||
1065 | return ret; | ||
1066 | |||
1067 | /* | ||
1068 | * When this function is being called for lumpy reclaim, we | ||
1069 | * initially look into all LRU pages, active, inactive and | ||
1070 | * unevictable; only give shrink_page_list evictable pages. | ||
1071 | */ | ||
1072 | if (PageUnevictable(page)) | 982 | if (PageUnevictable(page)) |
1073 | return ret; | 983 | return ret; |
1074 | 984 | ||
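
With the lumpy ISOLATE_ACTIVE/ISOLATE_INACTIVE juggling removed, the visible head of __isolate_lru_page() reduces to "must be on an LRU, must not be unevictable". A tiny model of just those two guards; the isolate_mode handling lower in the function is not shown, and can_isolate is an illustrative name:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the leading checks only: refuse pages that are not on an LRU
 * or that sit on the unevictable list; 0 means "may be isolated". */
static int can_isolate(bool on_lru, bool unevictable)
{
	if (!on_lru)
		return -EINVAL;
	if (unevictable)	/* do not hand unevictable pages to compaction */
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("%d %d\n", can_isolate(true, false), can_isolate(true, true));
	return 0;
}
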
@@ -1135,54 +1045,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1135 | * Appropriate locks must be held before calling this function. | 1045 | * Appropriate locks must be held before calling this function. |
1136 | * | 1046 | * |
1137 | * @nr_to_scan: The number of pages to look through on the list. | 1047 | * @nr_to_scan: The number of pages to look through on the list. |
1138 | * @mz: The mem_cgroup_zone to pull pages from. | 1048 | * @lruvec: The LRU vector to pull pages from. |
1139 | * @dst: The temp list to put pages on to. | 1049 | * @dst: The temp list to put pages on to. |
1140 | * @nr_scanned: The number of pages that were scanned. | 1050 | * @nr_scanned: The number of pages that were scanned. |
1141 | * @sc: The scan_control struct for this reclaim session | 1051 | * @sc: The scan_control struct for this reclaim session |
1142 | * @mode: One of the LRU isolation modes | 1052 | * @mode: One of the LRU isolation modes |
1143 | * @active: True [1] if isolating active pages | 1053 | * @lru: LRU list id for isolating |
1144 | * @file: True [1] if isolating file [!anon] pages | ||
1145 | * | 1054 | * |
1146 | * returns how many pages were moved onto *@dst. | 1055 | * returns how many pages were moved onto *@dst. |
1147 | */ | 1056 | */ |
1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1057 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1149 | struct mem_cgroup_zone *mz, struct list_head *dst, | 1058 | struct lruvec *lruvec, struct list_head *dst, |
1150 | unsigned long *nr_scanned, struct scan_control *sc, | 1059 | unsigned long *nr_scanned, struct scan_control *sc, |
1151 | isolate_mode_t mode, int active, int file) | 1060 | isolate_mode_t mode, enum lru_list lru) |
1152 | { | 1061 | { |
1153 | struct lruvec *lruvec; | 1062 | struct list_head *src = &lruvec->lists[lru]; |
1154 | struct list_head *src; | ||
1155 | unsigned long nr_taken = 0; | 1063 | unsigned long nr_taken = 0; |
1156 | unsigned long nr_lumpy_taken = 0; | ||
1157 | unsigned long nr_lumpy_dirty = 0; | ||
1158 | unsigned long nr_lumpy_failed = 0; | ||
1159 | unsigned long scan; | 1064 | unsigned long scan; |
1160 | int lru = LRU_BASE; | ||
1161 | |||
1162 | lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); | ||
1163 | if (active) | ||
1164 | lru += LRU_ACTIVE; | ||
1165 | if (file) | ||
1166 | lru += LRU_FILE; | ||
1167 | src = &lruvec->lists[lru]; | ||
1168 | 1065 | ||
1169 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 1066 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
1170 | struct page *page; | 1067 | struct page *page; |
1171 | unsigned long pfn; | 1068 | int nr_pages; |
1172 | unsigned long end_pfn; | ||
1173 | unsigned long page_pfn; | ||
1174 | int zone_id; | ||
1175 | 1069 | ||
1176 | page = lru_to_page(src); | 1070 | page = lru_to_page(src); |
1177 | prefetchw_prev_lru_page(page, src, flags); | 1071 | prefetchw_prev_lru_page(page, src, flags); |
1178 | 1072 | ||
1179 | VM_BUG_ON(!PageLRU(page)); | 1073 | VM_BUG_ON(!PageLRU(page)); |
1180 | 1074 | ||
1181 | switch (__isolate_lru_page(page, mode, file)) { | 1075 | switch (__isolate_lru_page(page, mode)) { |
1182 | case 0: | 1076 | case 0: |
1183 | mem_cgroup_lru_del(page); | 1077 | nr_pages = hpage_nr_pages(page); |
1078 | mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); | ||
1184 | list_move(&page->lru, dst); | 1079 | list_move(&page->lru, dst); |
1185 | nr_taken += hpage_nr_pages(page); | 1080 | nr_taken += nr_pages; |
1186 | break; | 1081 | break; |
1187 | 1082 | ||
1188 | case -EBUSY: | 1083 | case -EBUSY: |
@@ -1193,93 +1088,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1193 | default: | 1088 | default: |
1194 | BUG(); | 1089 | BUG(); |
1195 | } | 1090 | } |
1196 | |||
1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) | ||
1198 | continue; | ||
1199 | |||
1200 | /* | ||
1201 | * Attempt to take all pages in the order aligned region | ||
1202 | * surrounding the tag page. Only take those pages of | ||
1203 | * the same active state as that tag page. We may safely | ||
1204 | * round the target page pfn down to the requested order | ||
1205 | * as the mem_map is guaranteed valid out to MAX_ORDER; | ||
1206 | * if that page is in a different zone we will detect | ||
1207 | * it from its zone id and abort this block scan. | ||
1208 | */ | ||
1209 | zone_id = page_zone_id(page); | ||
1210 | page_pfn = page_to_pfn(page); | ||
1211 | pfn = page_pfn & ~((1 << sc->order) - 1); | ||
1212 | end_pfn = pfn + (1 << sc->order); | ||
1213 | for (; pfn < end_pfn; pfn++) { | ||
1214 | struct page *cursor_page; | ||
1215 | |||
1216 | /* The target page is in the block, ignore it. */ | ||
1217 | if (unlikely(pfn == page_pfn)) | ||
1218 | continue; | ||
1219 | |||
1220 | /* Avoid holes within the zone. */ | ||
1221 | if (unlikely(!pfn_valid_within(pfn))) | ||
1222 | break; | ||
1223 | |||
1224 | cursor_page = pfn_to_page(pfn); | ||
1225 | |||
1226 | /* Check that we have not crossed a zone boundary. */ | ||
1227 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | ||
1228 | break; | ||
1229 | |||
1230 | /* | ||
1231 | * If we don't have enough swap space, reclaiming of | ||
1232 | * anon page which don't already have a swap slot is | ||
1233 | * pointless. | ||
1234 | */ | ||
1235 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && | ||
1236 | !PageSwapCache(cursor_page)) | ||
1237 | break; | ||
1238 | |||
1239 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | ||
1240 | unsigned int isolated_pages; | ||
1241 | |||
1242 | mem_cgroup_lru_del(cursor_page); | ||
1243 | list_move(&cursor_page->lru, dst); | ||
1244 | isolated_pages = hpage_nr_pages(cursor_page); | ||
1245 | nr_taken += isolated_pages; | ||
1246 | nr_lumpy_taken += isolated_pages; | ||
1247 | if (PageDirty(cursor_page)) | ||
1248 | nr_lumpy_dirty += isolated_pages; | ||
1249 | scan++; | ||
1250 | pfn += isolated_pages - 1; | ||
1251 | } else { | ||
1252 | /* | ||
1253 | * Check if the page is freed already. | ||
1254 | * | ||
1255 | * We can't use page_count() as that | ||
1256 | * requires compound_head and we don't | ||
1257 | * have a pin on the page here. If a | ||
1258 | * page is tail, we may or may not | ||
1259 | * have isolated the head, so assume | ||
1260 | * it's not free, it'd be tricky to | ||
1261 | * track the head status without a | ||
1262 | * page pin. | ||
1263 | */ | ||
1264 | if (!PageTail(cursor_page) && | ||
1265 | !atomic_read(&cursor_page->_count)) | ||
1266 | continue; | ||
1267 | break; | ||
1268 | } | ||
1269 | } | ||
1270 | |||
1271 | /* If we break out of the loop above, lumpy reclaim failed */ | ||
1272 | if (pfn < end_pfn) | ||
1273 | nr_lumpy_failed++; | ||
1274 | } | 1091 | } |
1275 | 1092 | ||
1276 | *nr_scanned = scan; | 1093 | *nr_scanned = scan; |
1277 | 1094 | trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, | |
1278 | trace_mm_vmscan_lru_isolate(sc->order, | 1095 | nr_taken, mode, is_file_lru(lru)); |
1279 | nr_to_scan, scan, | ||
1280 | nr_taken, | ||
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | ||
1282 | mode, file); | ||
1283 | return nr_taken; | 1096 | return nr_taken; |
1284 | } | 1097 | } |
1285 | 1098 | ||
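
isolate_lru_pages() now just walks one lruvec list from its cold tail, moving at most nr_to_scan entries onto a private list and accumulating their page counts (hpage_nr_pages() counts a 2MB THP as 512 base pages on x86 with 4K pages). The sketch below models that batching with a hand-rolled circular list; struct item and its fields are illustrative:

#include <stdio.h>

struct item {
	struct item *prev, *next;	/* circular, list_head style */
	int nr_pages;			/* 1 for a base page, 512 for a 2MB THP */
};

static void list_init(struct item *head)
{
	head->prev = head->next = head;
}

static void list_add_front(struct item *head, struct item *it)
{
	it->next = head->next;
	it->prev = head;
	head->next->prev = it;
	head->next = it;
}

static void list_move_front(struct item *head, struct item *it)
{
	it->prev->next = it->next;
	it->next->prev = it->prev;
	list_add_front(head, it);
}

/* Model of the isolation loop: pull entries from the cold tail (src->prev),
 * stop after nr_to_scan of them or when the list empties, and report both
 * the entries scanned and the pages taken. */
static unsigned long isolate(struct item *src, struct item *dst,
			     unsigned long nr_to_scan, unsigned long *scanned)
{
	unsigned long nr_taken = 0, scan;

	for (scan = 0; scan < nr_to_scan && src->prev != src; scan++) {
		struct item *it = src->prev;

		nr_taken += it->nr_pages;	/* hpage_nr_pages() analogue */
		list_move_front(dst, it);
	}
	*scanned = scan;
	return nr_taken;
}

int main(void)
{
	struct item head, dst;
	struct item pages[4] = { { .nr_pages = 1 }, { .nr_pages = 1 },
				 { .nr_pages = 512 }, { .nr_pages = 1 } };
	unsigned long scanned, taken;

	list_init(&head);
	list_init(&dst);
	for (int i = 0; i < 4; i++)
		list_add_front(&head, &pages[i]);

	taken = isolate(&head, &dst, 3, &scanned);
	printf("taken=%lu scanned=%lu\n", taken, scanned);	/* taken=514 scanned=3 */
	return 0;
}
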
@@ -1316,15 +1129,16 @@ int isolate_lru_page(struct page *page) | |||
1316 | 1129 | ||
1317 | if (PageLRU(page)) { | 1130 | if (PageLRU(page)) { |
1318 | struct zone *zone = page_zone(page); | 1131 | struct zone *zone = page_zone(page); |
1132 | struct lruvec *lruvec; | ||
1319 | 1133 | ||
1320 | spin_lock_irq(&zone->lru_lock); | 1134 | spin_lock_irq(&zone->lru_lock); |
1135 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1321 | if (PageLRU(page)) { | 1136 | if (PageLRU(page)) { |
1322 | int lru = page_lru(page); | 1137 | int lru = page_lru(page); |
1323 | ret = 0; | ||
1324 | get_page(page); | 1138 | get_page(page); |
1325 | ClearPageLRU(page); | 1139 | ClearPageLRU(page); |
1326 | 1140 | del_page_from_lru_list(page, lruvec, lru); | |
1327 | del_page_from_lru_list(zone, page, lru); | 1141 | ret = 0; |
1328 | } | 1142 | } |
1329 | spin_unlock_irq(&zone->lru_lock); | 1143 | spin_unlock_irq(&zone->lru_lock); |
1330 | } | 1144 | } |
@@ -1357,11 +1171,10 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1357 | } | 1171 | } |
1358 | 1172 | ||
1359 | static noinline_for_stack void | 1173 | static noinline_for_stack void |
1360 | putback_inactive_pages(struct mem_cgroup_zone *mz, | 1174 | putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) |
1361 | struct list_head *page_list) | ||
1362 | { | 1175 | { |
1363 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1176 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1364 | struct zone *zone = mz->zone; | 1177 | struct zone *zone = lruvec_zone(lruvec); |
1365 | LIST_HEAD(pages_to_free); | 1178 | LIST_HEAD(pages_to_free); |
1366 | 1179 | ||
1367 | /* | 1180 | /* |
@@ -1379,9 +1192,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1379 | spin_lock_irq(&zone->lru_lock); | 1192 | spin_lock_irq(&zone->lru_lock); |
1380 | continue; | 1193 | continue; |
1381 | } | 1194 | } |
1195 | |||
1196 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1197 | |||
1382 | SetPageLRU(page); | 1198 | SetPageLRU(page); |
1383 | lru = page_lru(page); | 1199 | lru = page_lru(page); |
1384 | add_page_to_lru_list(zone, page, lru); | 1200 | add_page_to_lru_list(page, lruvec, lru); |
1201 | |||
1385 | if (is_active_lru(lru)) { | 1202 | if (is_active_lru(lru)) { |
1386 | int file = is_file_lru(lru); | 1203 | int file = is_file_lru(lru); |
1387 | int numpages = hpage_nr_pages(page); | 1204 | int numpages = hpage_nr_pages(page); |
@@ -1390,7 +1207,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1390 | if (put_page_testzero(page)) { | 1207 | if (put_page_testzero(page)) { |
1391 | __ClearPageLRU(page); | 1208 | __ClearPageLRU(page); |
1392 | __ClearPageActive(page); | 1209 | __ClearPageActive(page); |
1393 | del_page_from_lru_list(zone, page, lru); | 1210 | del_page_from_lru_list(page, lruvec, lru); |
1394 | 1211 | ||
1395 | if (unlikely(PageCompound(page))) { | 1212 | if (unlikely(PageCompound(page))) { |
1396 | spin_unlock_irq(&zone->lru_lock); | 1213 | spin_unlock_irq(&zone->lru_lock); |
@@ -1407,112 +1224,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, | |||
1407 | list_splice(&pages_to_free, page_list); | 1224 | list_splice(&pages_to_free, page_list); |
1408 | } | 1225 | } |
1409 | 1226 | ||
1410 | static noinline_for_stack void | ||
1411 | update_isolated_counts(struct mem_cgroup_zone *mz, | ||
1412 | struct list_head *page_list, | ||
1413 | unsigned long *nr_anon, | ||
1414 | unsigned long *nr_file) | ||
1415 | { | ||
1416 | struct zone *zone = mz->zone; | ||
1417 | unsigned int count[NR_LRU_LISTS] = { 0, }; | ||
1418 | unsigned long nr_active = 0; | ||
1419 | struct page *page; | ||
1420 | int lru; | ||
1421 | |||
1422 | /* | ||
1423 | * Count pages and clear active flags | ||
1424 | */ | ||
1425 | list_for_each_entry(page, page_list, lru) { | ||
1426 | int numpages = hpage_nr_pages(page); | ||
1427 | lru = page_lru_base_type(page); | ||
1428 | if (PageActive(page)) { | ||
1429 | lru += LRU_ACTIVE; | ||
1430 | ClearPageActive(page); | ||
1431 | nr_active += numpages; | ||
1432 | } | ||
1433 | count[lru] += numpages; | ||
1434 | } | ||
1435 | |||
1436 | preempt_disable(); | ||
1437 | __count_vm_events(PGDEACTIVATE, nr_active); | ||
1438 | |||
1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | ||
1440 | -count[LRU_ACTIVE_FILE]); | ||
1441 | __mod_zone_page_state(zone, NR_INACTIVE_FILE, | ||
1442 | -count[LRU_INACTIVE_FILE]); | ||
1443 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, | ||
1444 | -count[LRU_ACTIVE_ANON]); | ||
1445 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | ||
1446 | -count[LRU_INACTIVE_ANON]); | ||
1447 | |||
1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | ||
1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | ||
1450 | |||
1451 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); | ||
1452 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); | ||
1453 | preempt_enable(); | ||
1454 | } | ||
1455 | |||
1456 | /* | ||
1457 | * Returns true if a direct reclaim should wait on pages under writeback. | ||
1458 | * | ||
1459 | * If we are direct reclaiming for contiguous pages and we do not reclaim | ||
1460 | * everything in the list, try again and wait for writeback IO to complete. | ||
1461 | * This will stall high-order allocations noticeably. Only do that when really | ||
1462 | * need to free the pages under high memory pressure. | ||
1463 | */ | ||
1464 | static inline bool should_reclaim_stall(unsigned long nr_taken, | ||
1465 | unsigned long nr_freed, | ||
1466 | int priority, | ||
1467 | struct scan_control *sc) | ||
1468 | { | ||
1469 | int lumpy_stall_priority; | ||
1470 | |||
1471 | /* kswapd should not stall on sync IO */ | ||
1472 | if (current_is_kswapd()) | ||
1473 | return false; | ||
1474 | |||
1475 | /* Only stall on lumpy reclaim */ | ||
1476 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | ||
1477 | return false; | ||
1478 | |||
1479 | /* If we have reclaimed everything on the isolated list, no stall */ | ||
1480 | if (nr_freed == nr_taken) | ||
1481 | return false; | ||
1482 | |||
1483 | /* | ||
1484 | * For high-order allocations, there are two stall thresholds. | ||
1485 | * High-cost allocations stall immediately, whereas lower- | ||
1486 | * order allocations such as stacks require the scanning | ||
1487 | * priority to be much higher before stalling. | ||
1488 | */ | ||
1489 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1490 | lumpy_stall_priority = DEF_PRIORITY; | ||
1491 | else | ||
1492 | lumpy_stall_priority = DEF_PRIORITY / 3; | ||
1493 | |||
1494 | return priority <= lumpy_stall_priority; | ||
1495 | } | ||
1496 | |||
1497 | /* | 1227 | /* |
1498 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1228 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1499 | * of reclaimed pages | 1229 | * of reclaimed pages |
1500 | */ | 1230 | */ |
1501 | static noinline_for_stack unsigned long | 1231 | static noinline_for_stack unsigned long |
1502 | shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | 1232 | shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, |
1503 | struct scan_control *sc, int priority, int file) | 1233 | struct scan_control *sc, enum lru_list lru) |
1504 | { | 1234 | { |
1505 | LIST_HEAD(page_list); | 1235 | LIST_HEAD(page_list); |
1506 | unsigned long nr_scanned; | 1236 | unsigned long nr_scanned; |
1507 | unsigned long nr_reclaimed = 0; | 1237 | unsigned long nr_reclaimed = 0; |
1508 | unsigned long nr_taken; | 1238 | unsigned long nr_taken; |
1509 | unsigned long nr_anon; | ||
1510 | unsigned long nr_file; | ||
1511 | unsigned long nr_dirty = 0; | 1239 | unsigned long nr_dirty = 0; |
1512 | unsigned long nr_writeback = 0; | 1240 | unsigned long nr_writeback = 0; |
1513 | isolate_mode_t isolate_mode = ISOLATE_INACTIVE; | 1241 | isolate_mode_t isolate_mode = 0; |
1514 | struct zone *zone = mz->zone; | 1242 | int file = is_file_lru(lru); |
1515 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1243 | struct zone *zone = lruvec_zone(lruvec); |
1244 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | ||
1516 | 1245 | ||
1517 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1246 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1518 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1247 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1522,10 +1251,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | return SWAP_CLUSTER_MAX; | 1251 | return SWAP_CLUSTER_MAX; |
1523 | } | 1252 | } |
1524 | 1253 | ||
1525 | set_reclaim_mode(priority, sc, false); | ||
1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1527 | isolate_mode |= ISOLATE_ACTIVE; | ||
1528 | |||
1529 | lru_add_drain(); | 1254 | lru_add_drain(); |
1530 | 1255 | ||
1531 | if (!sc->may_unmap) | 1256 | if (!sc->may_unmap) |
@@ -1535,47 +1260,43 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1535 | 1260 | ||
1536 | spin_lock_irq(&zone->lru_lock); | 1261 | spin_lock_irq(&zone->lru_lock); |
1537 | 1262 | ||
1538 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, | 1263 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, |
1539 | sc, isolate_mode, 0, file); | 1264 | &nr_scanned, sc, isolate_mode, lru); |
1265 | |||
1266 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); | ||
1267 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | ||
1268 | |||
1540 | if (global_reclaim(sc)) { | 1269 | if (global_reclaim(sc)) { |
1541 | zone->pages_scanned += nr_scanned; | 1270 | zone->pages_scanned += nr_scanned; |
1542 | if (current_is_kswapd()) | 1271 | if (current_is_kswapd()) |
1543 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1272 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); |
1544 | nr_scanned); | ||
1545 | else | 1273 | else |
1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1274 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); |
1547 | nr_scanned); | ||
1548 | } | 1275 | } |
1549 | spin_unlock_irq(&zone->lru_lock); | 1276 | spin_unlock_irq(&zone->lru_lock); |
1550 | 1277 | ||
1551 | if (nr_taken == 0) | 1278 | if (nr_taken == 0) |
1552 | return 0; | 1279 | return 0; |
1553 | 1280 | ||
1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); | 1281 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, |
1555 | |||
1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | ||
1557 | &nr_dirty, &nr_writeback); | 1282 | &nr_dirty, &nr_writeback); |
1558 | 1283 | ||
1559 | /* Check if we should synchronously wait for writeback */ | ||
1560 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | ||
1561 | set_reclaim_mode(priority, sc, true); | ||
1562 | nr_reclaimed += shrink_page_list(&page_list, mz, sc, | ||
1563 | priority, &nr_dirty, &nr_writeback); | ||
1564 | } | ||
1565 | |||
1566 | spin_lock_irq(&zone->lru_lock); | 1284 | spin_lock_irq(&zone->lru_lock); |
1567 | 1285 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | 1286 | reclaim_stat->recent_scanned[file] += nr_taken; |
1569 | reclaim_stat->recent_scanned[1] += nr_file; | ||
1570 | 1287 | ||
1571 | if (current_is_kswapd()) | 1288 | if (global_reclaim(sc)) { |
1572 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1289 | if (current_is_kswapd()) |
1573 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1290 | __count_zone_vm_events(PGSTEAL_KSWAPD, zone, |
1291 | nr_reclaimed); | ||
1292 | else | ||
1293 | __count_zone_vm_events(PGSTEAL_DIRECT, zone, | ||
1294 | nr_reclaimed); | ||
1295 | } | ||
1574 | 1296 | ||
1575 | putback_inactive_pages(mz, &page_list); | 1297 | putback_inactive_pages(lruvec, &page_list); |
1576 | 1298 | ||
1577 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); | 1299 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1578 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); | ||
1579 | 1300 | ||
1580 | spin_unlock_irq(&zone->lru_lock); | 1301 | spin_unlock_irq(&zone->lru_lock); |
1581 | 1302 | ||
@@ -1604,14 +1325,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1604 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | 1325 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any |
1605 | * isolated page is PageWriteback | 1326 | * isolated page is PageWriteback |
1606 | */ | 1327 | */ |
1607 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | 1328 | if (nr_writeback && nr_writeback >= |
1329 | (nr_taken >> (DEF_PRIORITY - sc->priority))) | ||
1608 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1330 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1609 | 1331 | ||
1610 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1332 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1611 | zone_idx(zone), | 1333 | zone_idx(zone), |
1612 | nr_scanned, nr_reclaimed, | 1334 | nr_scanned, nr_reclaimed, |
1613 | priority, | 1335 | sc->priority, |
1614 | trace_shrink_flags(file, sc->reclaim_mode)); | 1336 | trace_shrink_flags(file)); |
1615 | return nr_reclaimed; | 1337 | return nr_reclaimed; |
1616 | } | 1338 | } |
1617 | 1339 | ||
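
The writeback throttle above scales with reclaim priority: wait_iff_congested() kicks in once nr_writeback reaches nr_taken >> (DEF_PRIORITY - priority), i.e. the whole batch at DEF_PRIORITY, half of it one step lower, and any single page by DEF_PRIORITY - 6. A quick arithmetic model, with DEF_PRIORITY and SWAP_CLUSTER_MAX taken at their usual values for this kernel generation:

#include <stdio.h>

#define DEF_PRIORITY		12	/* usual kernel default */
#define SWAP_CLUSTER_MAX	32UL	/* typical isolation batch size */

/* Model of the throttle test: stall (wait_iff_congested) once the pages
 * still under writeback reach nr_taken >> (DEF_PRIORITY - priority). */
static int should_throttle(unsigned long nr_writeback, unsigned long nr_taken,
			   int priority)
{
	return nr_writeback &&
	       nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority));
}

int main(void)
{
	/* At DEF_PRIORITY the whole batch must be under writeback ...      */
	printf("%d\n", should_throttle(31, SWAP_CLUSTER_MAX, DEF_PRIORITY));	/* 0 */
	printf("%d\n", should_throttle(32, SWAP_CLUSTER_MAX, DEF_PRIORITY));	/* 1 */
	/* ... while by DEF_PRIORITY - 6 a single writeback page is enough. */
	printf("%d\n", should_throttle(1, SWAP_CLUSTER_MAX, DEF_PRIORITY - 6));	/* 1 */
	return 0;
}
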
@@ -1633,30 +1355,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1633 | * But we had to alter page->flags anyway. | 1355 | * But we had to alter page->flags anyway. |
1634 | */ | 1356 | */ |
1635 | 1357 | ||
1636 | static void move_active_pages_to_lru(struct zone *zone, | 1358 | static void move_active_pages_to_lru(struct lruvec *lruvec, |
1637 | struct list_head *list, | 1359 | struct list_head *list, |
1638 | struct list_head *pages_to_free, | 1360 | struct list_head *pages_to_free, |
1639 | enum lru_list lru) | 1361 | enum lru_list lru) |
1640 | { | 1362 | { |
1363 | struct zone *zone = lruvec_zone(lruvec); | ||
1641 | unsigned long pgmoved = 0; | 1364 | unsigned long pgmoved = 0; |
1642 | struct page *page; | 1365 | struct page *page; |
1366 | int nr_pages; | ||
1643 | 1367 | ||
1644 | while (!list_empty(list)) { | 1368 | while (!list_empty(list)) { |
1645 | struct lruvec *lruvec; | ||
1646 | |||
1647 | page = lru_to_page(list); | 1369 | page = lru_to_page(list); |
1370 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1648 | 1371 | ||
1649 | VM_BUG_ON(PageLRU(page)); | 1372 | VM_BUG_ON(PageLRU(page)); |
1650 | SetPageLRU(page); | 1373 | SetPageLRU(page); |
1651 | 1374 | ||
1652 | lruvec = mem_cgroup_lru_add_list(zone, page, lru); | 1375 | nr_pages = hpage_nr_pages(page); |
1376 | mem_cgroup_update_lru_size(lruvec, lru, nr_pages); | ||
1653 | list_move(&page->lru, &lruvec->lists[lru]); | 1377 | list_move(&page->lru, &lruvec->lists[lru]); |
1654 | pgmoved += hpage_nr_pages(page); | 1378 | pgmoved += nr_pages; |
1655 | 1379 | ||
1656 | if (put_page_testzero(page)) { | 1380 | if (put_page_testzero(page)) { |
1657 | __ClearPageLRU(page); | 1381 | __ClearPageLRU(page); |
1658 | __ClearPageActive(page); | 1382 | __ClearPageActive(page); |
1659 | del_page_from_lru_list(zone, page, lru); | 1383 | del_page_from_lru_list(page, lruvec, lru); |
1660 | 1384 | ||
1661 | if (unlikely(PageCompound(page))) { | 1385 | if (unlikely(PageCompound(page))) { |
1662 | spin_unlock_irq(&zone->lru_lock); | 1386 | spin_unlock_irq(&zone->lru_lock); |
@@ -1672,9 +1396,9 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1672 | } | 1396 | } |
1673 | 1397 | ||
1674 | static void shrink_active_list(unsigned long nr_to_scan, | 1398 | static void shrink_active_list(unsigned long nr_to_scan, |
1675 | struct mem_cgroup_zone *mz, | 1399 | struct lruvec *lruvec, |
1676 | struct scan_control *sc, | 1400 | struct scan_control *sc, |
1677 | int priority, int file) | 1401 | enum lru_list lru) |
1678 | { | 1402 | { |
1679 | unsigned long nr_taken; | 1403 | unsigned long nr_taken; |
1680 | unsigned long nr_scanned; | 1404 | unsigned long nr_scanned; |
@@ -1683,15 +1407,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1683 | LIST_HEAD(l_active); | 1407 | LIST_HEAD(l_active); |
1684 | LIST_HEAD(l_inactive); | 1408 | LIST_HEAD(l_inactive); |
1685 | struct page *page; | 1409 | struct page *page; |
1686 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1410 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1687 | unsigned long nr_rotated = 0; | 1411 | unsigned long nr_rotated = 0; |
1688 | isolate_mode_t isolate_mode = ISOLATE_ACTIVE; | 1412 | isolate_mode_t isolate_mode = 0; |
1689 | struct zone *zone = mz->zone; | 1413 | int file = is_file_lru(lru); |
1414 | struct zone *zone = lruvec_zone(lruvec); | ||
1690 | 1415 | ||
1691 | lru_add_drain(); | 1416 | lru_add_drain(); |
1692 | 1417 | ||
1693 | reset_reclaim_mode(sc); | ||
1694 | |||
1695 | if (!sc->may_unmap) | 1418 | if (!sc->may_unmap) |
1696 | isolate_mode |= ISOLATE_UNMAPPED; | 1419 | isolate_mode |= ISOLATE_UNMAPPED; |
1697 | if (!sc->may_writepage) | 1420 | if (!sc->may_writepage) |
@@ -1699,18 +1422,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1699 | 1422 | ||
1700 | spin_lock_irq(&zone->lru_lock); | 1423 | spin_lock_irq(&zone->lru_lock); |
1701 | 1424 | ||
1702 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, | 1425 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
1703 | isolate_mode, 1, file); | 1426 | &nr_scanned, sc, isolate_mode, lru); |
1704 | if (global_reclaim(sc)) | 1427 | if (global_reclaim(sc)) |
1705 | zone->pages_scanned += nr_scanned; | 1428 | zone->pages_scanned += nr_scanned; |
1706 | 1429 | ||
1707 | reclaim_stat->recent_scanned[file] += nr_taken; | 1430 | reclaim_stat->recent_scanned[file] += nr_taken; |
1708 | 1431 | ||
1709 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); | 1432 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); |
1710 | if (file) | 1433 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); |
1711 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); | ||
1712 | else | ||
1713 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); | ||
1714 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | 1434 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); |
1715 | spin_unlock_irq(&zone->lru_lock); | 1435 | spin_unlock_irq(&zone->lru_lock); |
1716 | 1436 | ||
@@ -1732,7 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1732 | } | 1452 | } |
1733 | } | 1453 | } |
1734 | 1454 | ||
1735 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1455 | if (page_referenced(page, 0, sc->target_mem_cgroup, |
1456 | &vm_flags)) { | ||
1736 | nr_rotated += hpage_nr_pages(page); | 1457 | nr_rotated += hpage_nr_pages(page); |
1737 | /* | 1458 | /* |
1738 | * Identify referenced, file-backed active pages and | 1459 | * Identify referenced, file-backed active pages and |
@@ -1765,10 +1486,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1765 | */ | 1486 | */ |
1766 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1487 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1767 | 1488 | ||
1768 | move_active_pages_to_lru(zone, &l_active, &l_hold, | 1489 | move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); |
1769 | LRU_ACTIVE + file * LRU_FILE); | 1490 | move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); |
1770 | move_active_pages_to_lru(zone, &l_inactive, &l_hold, | ||
1771 | LRU_BASE + file * LRU_FILE); | ||
1772 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1491 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1773 | spin_unlock_irq(&zone->lru_lock); | 1492 | spin_unlock_irq(&zone->lru_lock); |
1774 | 1493 | ||
@@ -1791,13 +1510,12 @@ static int inactive_anon_is_low_global(struct zone *zone) | |||
1791 | 1510 | ||
1792 | /** | 1511 | /** |
1793 | * inactive_anon_is_low - check if anonymous pages need to be deactivated | 1512 | * inactive_anon_is_low - check if anonymous pages need to be deactivated |
1794 | * @zone: zone to check | 1513 | * @lruvec: LRU vector to check |
1795 | * @sc: scan control of this context | ||
1796 | * | 1514 | * |
1797 | * Returns true if the zone does not have enough inactive anon pages, | 1515 | * Returns true if the zone does not have enough inactive anon pages, |
1798 | * meaning some active anon pages need to be deactivated. | 1516 | * meaning some active anon pages need to be deactivated. |
1799 | */ | 1517 | */ |
1800 | static int inactive_anon_is_low(struct mem_cgroup_zone *mz) | 1518 | static int inactive_anon_is_low(struct lruvec *lruvec) |
1801 | { | 1519 | { |
1802 | /* | 1520 | /* |
1803 | * If we don't have swap space, anonymous page deactivation | 1521 | * If we don't have swap space, anonymous page deactivation |
@@ -1806,14 +1524,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) | |||
1806 | if (!total_swap_pages) | 1524 | if (!total_swap_pages) |
1807 | return 0; | 1525 | return 0; |
1808 | 1526 | ||
1809 | if (!scanning_global_lru(mz)) | 1527 | if (!mem_cgroup_disabled()) |
1810 | return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, | 1528 | return mem_cgroup_inactive_anon_is_low(lruvec); |
1811 | mz->zone); | ||
1812 | 1529 | ||
1813 | return inactive_anon_is_low_global(mz->zone); | 1530 | return inactive_anon_is_low_global(lruvec_zone(lruvec)); |
1814 | } | 1531 | } |
1815 | #else | 1532 | #else |
1816 | static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) | 1533 | static inline int inactive_anon_is_low(struct lruvec *lruvec) |
1817 | { | 1534 | { |
1818 | return 0; | 1535 | return 0; |
1819 | } | 1536 | } |
@@ -1831,7 +1548,7 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1831 | 1548 | ||
1832 | /** | 1549 | /** |
1833 | * inactive_file_is_low - check if file pages need to be deactivated | 1550 | * inactive_file_is_low - check if file pages need to be deactivated |
1834 | * @mz: memory cgroup and zone to check | 1551 | * @lruvec: LRU vector to check |
1835 | * | 1552 | * |
1836 | * When the system is doing streaming IO, memory pressure here | 1553 | * When the system is doing streaming IO, memory pressure here |
1837 | * ensures that active file pages get deactivated, until more | 1554 | * ensures that active file pages get deactivated, until more |
@@ -1843,44 +1560,39 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1843 | * This uses a different ratio than the anonymous pages, because | 1560 | * This uses a different ratio than the anonymous pages, because |
1844 | * the page cache uses a use-once replacement algorithm. | 1561 | * the page cache uses a use-once replacement algorithm. |
1845 | */ | 1562 | */ |
1846 | static int inactive_file_is_low(struct mem_cgroup_zone *mz) | 1563 | static int inactive_file_is_low(struct lruvec *lruvec) |
1847 | { | 1564 | { |
1848 | if (!scanning_global_lru(mz)) | 1565 | if (!mem_cgroup_disabled()) |
1849 | return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, | 1566 | return mem_cgroup_inactive_file_is_low(lruvec); |
1850 | mz->zone); | ||
1851 | 1567 | ||
1852 | return inactive_file_is_low_global(mz->zone); | 1568 | return inactive_file_is_low_global(lruvec_zone(lruvec)); |
1853 | } | 1569 | } |
1854 | 1570 | ||
1855 | static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) | 1571 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
1856 | { | 1572 | { |
1857 | if (file) | 1573 | if (is_file_lru(lru)) |
1858 | return inactive_file_is_low(mz); | 1574 | return inactive_file_is_low(lruvec); |
1859 | else | 1575 | else |
1860 | return inactive_anon_is_low(mz); | 1576 | return inactive_anon_is_low(lruvec); |
1861 | } | 1577 | } |
1862 | 1578 | ||
1863 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1579 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1864 | struct mem_cgroup_zone *mz, | 1580 | struct lruvec *lruvec, struct scan_control *sc) |
1865 | struct scan_control *sc, int priority) | ||
1866 | { | 1581 | { |
1867 | int file = is_file_lru(lru); | ||
1868 | |||
1869 | if (is_active_lru(lru)) { | 1582 | if (is_active_lru(lru)) { |
1870 | if (inactive_list_is_low(mz, file)) | 1583 | if (inactive_list_is_low(lruvec, lru)) |
1871 | shrink_active_list(nr_to_scan, mz, sc, priority, file); | 1584 | shrink_active_list(nr_to_scan, lruvec, sc, lru); |
1872 | return 0; | 1585 | return 0; |
1873 | } | 1586 | } |
1874 | 1587 | ||
1875 | return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); | 1588 | return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); |
1876 | } | 1589 | } |
1877 | 1590 | ||
1878 | static int vmscan_swappiness(struct mem_cgroup_zone *mz, | 1591 | static int vmscan_swappiness(struct scan_control *sc) |
1879 | struct scan_control *sc) | ||
1880 | { | 1592 | { |
1881 | if (global_reclaim(sc)) | 1593 | if (global_reclaim(sc)) |
1882 | return vm_swappiness; | 1594 | return vm_swappiness; |
1883 | return mem_cgroup_swappiness(mz->mem_cgroup); | 1595 | return mem_cgroup_swappiness(sc->target_mem_cgroup); |
1884 | } | 1596 | } |
1885 | 1597 | ||
1886 | /* | 1598 | /* |
@@ -1889,19 +1601,21 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz, | |||
1889 | * by looking at the fraction of the pages scanned we did rotate back | 1601 | * by looking at the fraction of the pages scanned we did rotate back |
1890 | * onto the active list instead of evict. | 1602 | * onto the active list instead of evict. |
1891 | * | 1603 | * |
1892 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1604 | * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan |
1605 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | ||
1893 | */ | 1606 | */ |
1894 | static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | 1607 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1895 | unsigned long *nr, int priority) | 1608 | unsigned long *nr) |
1896 | { | 1609 | { |
1897 | unsigned long anon, file, free; | 1610 | unsigned long anon, file, free; |
1898 | unsigned long anon_prio, file_prio; | 1611 | unsigned long anon_prio, file_prio; |
1899 | unsigned long ap, fp; | 1612 | unsigned long ap, fp; |
1900 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1613 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1901 | u64 fraction[2], denominator; | 1614 | u64 fraction[2], denominator; |
1902 | enum lru_list lru; | 1615 | enum lru_list lru; |
1903 | int noswap = 0; | 1616 | int noswap = 0; |
1904 | bool force_scan = false; | 1617 | bool force_scan = false; |
1618 | struct zone *zone = lruvec_zone(lruvec); | ||
1905 | 1619 | ||
1906 | /* | 1620 | /* |
1907 | * If the zone or memcg is small, nr[l] can be 0. This | 1621 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1913,7 +1627,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1913 | * latencies, so it's better to scan a minimum amount there as | 1627 | * latencies, so it's better to scan a minimum amount there as |
1914 | * well. | 1628 | * well. |
1915 | */ | 1629 | */ |
1916 | if (current_is_kswapd() && mz->zone->all_unreclaimable) | 1630 | if (current_is_kswapd() && zone->all_unreclaimable) |
1917 | force_scan = true; | 1631 | force_scan = true; |
1918 | if (!global_reclaim(sc)) | 1632 | if (!global_reclaim(sc)) |
1919 | force_scan = true; | 1633 | force_scan = true; |
@@ -1927,16 +1641,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1927 | goto out; | 1641 | goto out; |
1928 | } | 1642 | } |
1929 | 1643 | ||
1930 | anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + | 1644 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + |
1931 | zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); | 1645 | get_lru_size(lruvec, LRU_INACTIVE_ANON); |
1932 | file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + | 1646 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + |
1933 | zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); | 1647 | get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1934 | 1648 | ||
1935 | if (global_reclaim(sc)) { | 1649 | if (global_reclaim(sc)) { |
1936 | free = zone_page_state(mz->zone, NR_FREE_PAGES); | 1650 | free = zone_page_state(zone, NR_FREE_PAGES); |
1937 | /* If we have very few page cache pages, | 1651 | /* If we have very few page cache pages, |
1938 | force-scan anon pages. */ | 1652 | force-scan anon pages. */ |
1939 | if (unlikely(file + free <= high_wmark_pages(mz->zone))) { | 1653 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1940 | fraction[0] = 1; | 1654 | fraction[0] = 1; |
1941 | fraction[1] = 0; | 1655 | fraction[1] = 0; |
1942 | denominator = 1; | 1656 | denominator = 1; |
@@ -1948,8 +1662,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1948 | * With swappiness at 100, anonymous and file have the same priority. | 1662 | * With swappiness at 100, anonymous and file have the same priority. |
1949 | * This scanning priority is essentially the inverse of IO cost. | 1663 | * This scanning priority is essentially the inverse of IO cost. |
1950 | */ | 1664 | */ |
1951 | anon_prio = vmscan_swappiness(mz, sc); | 1665 | anon_prio = vmscan_swappiness(sc); |
1952 | file_prio = 200 - vmscan_swappiness(mz, sc); | 1666 | file_prio = 200 - anon_prio; |
1953 | 1667 | ||
1954 | /* | 1668 | /* |
1955 | * OK, so we have swap space and a fair amount of page cache | 1669 | * OK, so we have swap space and a fair amount of page cache |
@@ -1962,7 +1676,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1962 | * | 1676 | * |
1963 | * anon in [0], file in [1] | 1677 | * anon in [0], file in [1] |
1964 | */ | 1678 | */ |
1965 | spin_lock_irq(&mz->zone->lru_lock); | 1679 | spin_lock_irq(&zone->lru_lock); |
1966 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1680 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1967 | reclaim_stat->recent_scanned[0] /= 2; | 1681 | reclaim_stat->recent_scanned[0] /= 2; |
1968 | reclaim_stat->recent_rotated[0] /= 2; | 1682 | reclaim_stat->recent_rotated[0] /= 2; |
@@ -1978,12 +1692,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1978 | * proportional to the fraction of recently scanned pages on | 1692 | * proportional to the fraction of recently scanned pages on |
1979 | * each list that were recently referenced and in active use. | 1693 | * each list that were recently referenced and in active use. |
1980 | */ | 1694 | */ |
1981 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); | 1695 | ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); |
1982 | ap /= reclaim_stat->recent_rotated[0] + 1; | 1696 | ap /= reclaim_stat->recent_rotated[0] + 1; |
1983 | 1697 | ||
1984 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1698 | fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); |
1985 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1699 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1986 | spin_unlock_irq(&mz->zone->lru_lock); | 1700 | spin_unlock_irq(&zone->lru_lock); |
1987 | 1701 | ||
1988 | fraction[0] = ap; | 1702 | fraction[0] = ap; |
1989 | fraction[1] = fp; | 1703 | fraction[1] = fp; |
@@ -1993,9 +1707,9 @@ out: | |||
1993 | int file = is_file_lru(lru); | 1707 | int file = is_file_lru(lru); |
1994 | unsigned long scan; | 1708 | unsigned long scan; |
1995 | 1709 | ||
1996 | scan = zone_nr_lru_pages(mz, lru); | 1710 | scan = get_lru_size(lruvec, lru); |
1997 | if (priority || noswap) { | 1711 | if (sc->priority || noswap || !vmscan_swappiness(sc)) { |
1998 | scan >>= priority; | 1712 | scan >>= sc->priority; |
1999 | if (!scan && force_scan) | 1713 | if (!scan && force_scan) |
2000 | scan = SWAP_CLUSTER_MAX; | 1714 | scan = SWAP_CLUSTER_MAX; |
2001 | scan = div64_u64(scan * fraction[file], denominator); | 1715 | scan = div64_u64(scan * fraction[file], denominator); |
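
Putting the pieces of get_scan_count() together: swappiness sets anon_prio/file_prio, the recent scanned/rotated ratios turn those into ap/fp, and each list's share of its (size >> priority) batch is fraction[file]/denominator of it. The standalone model below reproduces that arithmetic with made-up inputs; the no-swap, low-free-memory and force-scan special cases are omitted, and struct recent_stat is an illustrative stand-in for reclaim_stat:

#include <stdio.h>

/* Illustrative stand-in for one lruvec's reclaim_stat. */
struct recent_stat {
	unsigned long long scanned[2];	/* [0] = anon, [1] = file */
	unsigned long long rotated[2];
};

/*
 * Model of the proportional split:
 *   ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1)
 *   fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1)
 *   nr[lru] = (lru_size >> priority) * fraction[file] / (ap + fp + 1)
 */
static void scan_count(const unsigned long long lru_size[4], int swappiness,
		       int priority, const struct recent_stat *rs,
		       unsigned long long nr[4])
{
	unsigned long long anon_prio = swappiness;
	unsigned long long file_prio = 200 - swappiness;
	unsigned long long ap, fp, denominator;

	ap = anon_prio * (rs->scanned[0] + 1) / (rs->rotated[0] + 1);
	fp = file_prio * (rs->scanned[1] + 1) / (rs->rotated[1] + 1);
	denominator = ap + fp + 1;

	for (int lru = 0; lru < 4; lru++) {
		int file = lru >= 2;	/* nr[0..1] anon, nr[2..3] file */

		nr[lru] = (lru_size[lru] >> priority) *
			  (file ? fp : ap) / denominator;
	}
}

int main(void)
{
	unsigned long long lru_size[4] = { 1 << 20, 1 << 19, 1 << 21, 1 << 20 };
	unsigned long long nr[4];
	struct recent_stat rs = {
		.scanned = { 1000, 4000 },
		.rotated = { 900, 400 },
	};

	scan_count(lru_size, 60, 10, &rs, nr);	/* swappiness 60, priority 10 */
	for (int lru = 0; lru < 4; lru++)
		printf("nr[%d] = %llu\n", lru, nr[lru]);
	return 0;
}
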
@@ -2004,14 +1718,25 @@ out: | |||
2004 | } | 1718 | } |
2005 | } | 1719 | } |
2006 | 1720 | ||
1721 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | ||
1722 | static bool in_reclaim_compaction(struct scan_control *sc) | ||
1723 | { | ||
1724 | if (COMPACTION_BUILD && sc->order && | ||
1725 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | ||
1726 | sc->priority < DEF_PRIORITY - 2)) | ||
1727 | return true; | ||
1728 | |||
1729 | return false; | ||
1730 | } | ||
1731 | |||
2007 | /* | 1732 | /* |
2008 | * Reclaim/compaction depends on a number of pages being freed. To avoid | 1733 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
2009 | * disruption to the system, a small number of order-0 pages continue to be | 1734 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
2010 | * rotated and reclaimed in the normal fashion. However, by the time we get | 1735 | * true if more pages should be reclaimed such that when the page allocator |
2011 | * back to the allocator and call try_to_compact_zone(), we ensure that | 1736 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
2012 | * there are enough free pages for it to be likely successful | 1737 | * It will give up earlier than that if there is difficulty reclaiming pages. |
2013 | */ | 1738 | */ |
2014 | static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | 1739 | static inline bool should_continue_reclaim(struct lruvec *lruvec, |
2015 | unsigned long nr_reclaimed, | 1740 | unsigned long nr_reclaimed, |
2016 | unsigned long nr_scanned, | 1741 | unsigned long nr_scanned, |
2017 | struct scan_control *sc) | 1742 | struct scan_control *sc) |
@@ -2020,7 +1745,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2020 | unsigned long inactive_lru_pages; | 1745 | unsigned long inactive_lru_pages; |
2021 | 1746 | ||
2022 | /* If not in reclaim/compaction mode, stop */ | 1747 | /* If not in reclaim/compaction mode, stop */ |
2023 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | 1748 | if (!in_reclaim_compaction(sc)) |
2024 | return false; | 1749 | return false; |
2025 | 1750 | ||
2026 | /* Consider stopping depending on scan and reclaim activity */ | 1751 | /* Consider stopping depending on scan and reclaim activity */ |
@@ -2051,15 +1776,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2051 | * inactive lists are large enough, continue reclaiming | 1776 | * inactive lists are large enough, continue reclaiming |
2052 | */ | 1777 | */ |
2053 | pages_for_compaction = (2UL << sc->order); | 1778 | pages_for_compaction = (2UL << sc->order); |
2054 | inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); | 1779 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
2055 | if (nr_swap_pages > 0) | 1780 | if (nr_swap_pages > 0) |
2056 | inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); | 1781 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
2057 | if (sc->nr_reclaimed < pages_for_compaction && | 1782 | if (sc->nr_reclaimed < pages_for_compaction && |
2058 | inactive_lru_pages > pages_for_compaction) | 1783 | inactive_lru_pages > pages_for_compaction) |
2059 | return true; | 1784 | return true; |
2060 | 1785 | ||
2061 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1786 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2062 | switch (compaction_suitable(mz->zone, sc->order)) { | 1787 | switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { |
2063 | case COMPACT_PARTIAL: | 1788 | case COMPACT_PARTIAL: |
2064 | case COMPACT_CONTINUE: | 1789 | case COMPACT_CONTINUE: |
2065 | return false; | 1790 | return false; |
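should_continue_reclaim() in the hunk above keeps reclaim going for a high-order request while fewer than 2 << order pages have been reclaimed and the inactive lists could still supply at least that many. A small standalone sketch of just that threshold test, with example numbers rather than real zone state:

/*
 * Sketch of the "keep reclaiming for compaction?" threshold: reclaim
 * continues while fewer than 2 << order pages have been reclaimed and
 * the inactive lists could still supply that many.  The inputs are
 * example values, not taken from a real zone.
 */
#include <stdbool.h>
#include <stdio.h>

static bool continue_for_compaction(int order, unsigned long nr_reclaimed,
				    unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* an order-9 request (2MB on x86) wants 1024 free pages first */
	printf("%d\n", continue_for_compaction(9, 300, 5000));  /* 1: keep going */
	printf("%d\n", continue_for_compaction(9, 1500, 5000)); /* 0: enough reclaimed */
	return 0;
}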
@@ -2071,8 +1796,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | |||
2071 | /* | 1796 | /* |
2072 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1797 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2073 | */ | 1798 | */ |
2074 | static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, | 1799 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) |
2075 | struct scan_control *sc) | ||
2076 | { | 1800 | { |
2077 | unsigned long nr[NR_LRU_LISTS]; | 1801 | unsigned long nr[NR_LRU_LISTS]; |
2078 | unsigned long nr_to_scan; | 1802 | unsigned long nr_to_scan; |
@@ -2084,7 +1808,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, | |||
2084 | restart: | 1808 | restart: |
2085 | nr_reclaimed = 0; | 1809 | nr_reclaimed = 0; |
2086 | nr_scanned = sc->nr_scanned; | 1810 | nr_scanned = sc->nr_scanned; |
2087 | get_scan_count(mz, sc, nr, priority); | 1811 | get_scan_count(lruvec, sc, nr); |
2088 | 1812 | ||
2089 | blk_start_plug(&plug); | 1813 | blk_start_plug(&plug); |
2090 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1814 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -2096,7 +1820,7 @@ restart: | |||
2096 | nr[lru] -= nr_to_scan; | 1820 | nr[lru] -= nr_to_scan; |
2097 | 1821 | ||
2098 | nr_reclaimed += shrink_list(lru, nr_to_scan, | 1822 | nr_reclaimed += shrink_list(lru, nr_to_scan, |
2099 | mz, sc, priority); | 1823 | lruvec, sc); |
2100 | } | 1824 | } |
2101 | } | 1825 | } |
2102 | /* | 1826 | /* |
@@ -2107,12 +1831,8 @@ restart: | |||
2107 | * with multiple processes reclaiming pages, the total | 1831 | * with multiple processes reclaiming pages, the total |
2108 | * freeing target can get unreasonably large. | 1832 | * freeing target can get unreasonably large. |
2109 | */ | 1833 | */ |
2110 | if (nr_reclaimed >= nr_to_reclaim) | 1834 | if (nr_reclaimed >= nr_to_reclaim && |
2111 | nr_to_reclaim = 0; | 1835 | sc->priority < DEF_PRIORITY) |
2112 | else | ||
2113 | nr_to_reclaim -= nr_reclaimed; | ||
2114 | |||
2115 | if (!nr_to_reclaim && priority < DEF_PRIORITY) | ||
2116 | break; | 1836 | break; |
2117 | } | 1837 | } |
2118 | blk_finish_plug(&plug); | 1838 | blk_finish_plug(&plug); |
@@ -2122,35 +1842,33 @@ restart: | |||
2122 | * Even if we did not try to evict anon pages at all, we want to | 1842 | * Even if we did not try to evict anon pages at all, we want to |
2123 | * rebalance the anon lru active/inactive ratio. | 1843 | * rebalance the anon lru active/inactive ratio. |
2124 | */ | 1844 | */ |
2125 | if (inactive_anon_is_low(mz)) | 1845 | if (inactive_anon_is_low(lruvec)) |
2126 | shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); | 1846 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
1847 | sc, LRU_ACTIVE_ANON); | ||
2127 | 1848 | ||
2128 | /* reclaim/compaction might need reclaim to continue */ | 1849 | /* reclaim/compaction might need reclaim to continue */ |
2129 | if (should_continue_reclaim(mz, nr_reclaimed, | 1850 | if (should_continue_reclaim(lruvec, nr_reclaimed, |
2130 | sc->nr_scanned - nr_scanned, sc)) | 1851 | sc->nr_scanned - nr_scanned, sc)) |
2131 | goto restart; | 1852 | goto restart; |
2132 | 1853 | ||
2133 | throttle_vm_writeout(sc->gfp_mask); | 1854 | throttle_vm_writeout(sc->gfp_mask); |
2134 | } | 1855 | } |
2135 | 1856 | ||
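shrink_lruvec() drains the per-list scan budgets returned by get_scan_count() in SWAP_CLUSTER_MAX sized batches, round-robin across the evictable lists, and with this patch only breaks out early once the reclaim target is met at a priority below DEF_PRIORITY. A condensed, self-contained sketch of that batching loop; the budgets and the "half of each batch gets freed" assumption are made up.

/*
 * Condensed sketch of the batched draining in shrink_lruvec(): the
 * per-list budgets in nr[] are consumed SWAP_CLUSTER_MAX pages at a
 * time, round-robin, until the budgets are empty (the real code can
 * also stop early once enough has been reclaimed at raised priority).
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL
enum lru { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, NR_LRU };

int main(void)
{
	unsigned long nr[NR_LRU] = { 100, 0, 400, 90 };	/* invented scan budgets */
	unsigned long nr_reclaimed = 0;

	while (nr[INACTIVE_ANON] || nr[INACTIVE_FILE] || nr[ACTIVE_FILE]) {
		for (int lru = 0; lru < NR_LRU; lru++) {
			if (!nr[lru])
				continue;
			unsigned long batch = nr[lru] < SWAP_CLUSTER_MAX ?
					      nr[lru] : SWAP_CLUSTER_MAX;
			nr[lru] -= batch;
			nr_reclaimed += batch / 2;	/* pretend half got freed */
		}
	}
	printf("reclaimed %lu pages\n", nr_reclaimed);
	return 0;
}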
2136 | static void shrink_zone(int priority, struct zone *zone, | 1857 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
2137 | struct scan_control *sc) | ||
2138 | { | 1858 | { |
2139 | struct mem_cgroup *root = sc->target_mem_cgroup; | 1859 | struct mem_cgroup *root = sc->target_mem_cgroup; |
2140 | struct mem_cgroup_reclaim_cookie reclaim = { | 1860 | struct mem_cgroup_reclaim_cookie reclaim = { |
2141 | .zone = zone, | 1861 | .zone = zone, |
2142 | .priority = priority, | 1862 | .priority = sc->priority, |
2143 | }; | 1863 | }; |
2144 | struct mem_cgroup *memcg; | 1864 | struct mem_cgroup *memcg; |
2145 | 1865 | ||
2146 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 1866 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2147 | do { | 1867 | do { |
2148 | struct mem_cgroup_zone mz = { | 1868 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2149 | .mem_cgroup = memcg, | 1869 | |
2150 | .zone = zone, | 1870 | shrink_lruvec(lruvec, sc); |
2151 | }; | ||
2152 | 1871 | ||
2153 | shrink_mem_cgroup_zone(priority, &mz, sc); | ||
2154 | /* | 1872 | /* |
2155 | * Limit reclaim has historically picked one memcg and | 1873 | * Limit reclaim has historically picked one memcg and |
2156 | * scanned it with decreasing priority levels until | 1874 | * scanned it with decreasing priority levels until |
@@ -2226,8 +1944,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2226 | * the caller that it should consider retrying the allocation instead of | 1944 | * the caller that it should consider retrying the allocation instead of |
2227 | * further reclaim. | 1945 | * further reclaim. |
2228 | */ | 1946 | */ |
2229 | static bool shrink_zones(int priority, struct zonelist *zonelist, | 1947 | static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) |
2230 | struct scan_control *sc) | ||
2231 | { | 1948 | { |
2232 | struct zoneref *z; | 1949 | struct zoneref *z; |
2233 | struct zone *zone; | 1950 | struct zone *zone; |
@@ -2254,7 +1971,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2254 | if (global_reclaim(sc)) { | 1971 | if (global_reclaim(sc)) { |
2255 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1972 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2256 | continue; | 1973 | continue; |
2257 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1974 | if (zone->all_unreclaimable && |
1975 | sc->priority != DEF_PRIORITY) | ||
2258 | continue; /* Let kswapd poll it */ | 1976 | continue; /* Let kswapd poll it */ |
2259 | if (COMPACTION_BUILD) { | 1977 | if (COMPACTION_BUILD) { |
2260 | /* | 1978 | /* |
@@ -2286,7 +2004,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2286 | /* need some check to avoid more shrink_zone() */ | 2004 |
2287 | } | 2005 | } |
2288 | 2006 | ||
2289 | shrink_zone(priority, zone, sc); | 2007 | shrink_zone(zone, sc); |
2290 | } | 2008 | } |
2291 | 2009 | ||
2292 | return aborted_reclaim; | 2010 | return aborted_reclaim; |
@@ -2337,7 +2055,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2337 | struct scan_control *sc, | 2055 | struct scan_control *sc, |
2338 | struct shrink_control *shrink) | 2056 | struct shrink_control *shrink) |
2339 | { | 2057 | { |
2340 | int priority; | ||
2341 | unsigned long total_scanned = 0; | 2058 | unsigned long total_scanned = 0; |
2342 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2059 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2343 | struct zoneref *z; | 2060 | struct zoneref *z; |
@@ -2350,11 +2067,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2350 | if (global_reclaim(sc)) | 2067 | if (global_reclaim(sc)) |
2351 | count_vm_event(ALLOCSTALL); | 2068 | count_vm_event(ALLOCSTALL); |
2352 | 2069 | ||
2353 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2070 | do { |
2354 | sc->nr_scanned = 0; | 2071 | sc->nr_scanned = 0; |
2355 | if (!priority) | 2072 | aborted_reclaim = shrink_zones(zonelist, sc); |
2356 | disable_swap_token(sc->target_mem_cgroup); | ||
2357 | aborted_reclaim = shrink_zones(priority, zonelist, sc); | ||
2358 | 2073 | ||
2359 | /* | 2074 | /* |
2360 | * Don't shrink slabs when reclaiming memory from | 2075 | * Don't shrink slabs when reclaiming memory from |
@@ -2396,7 +2111,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2396 | 2111 | ||
2397 | /* Take a nap, wait for some writeback to complete */ | 2112 | /* Take a nap, wait for some writeback to complete */ |
2398 | if (!sc->hibernation_mode && sc->nr_scanned && | 2113 | if (!sc->hibernation_mode && sc->nr_scanned && |
2399 | priority < DEF_PRIORITY - 2) { | 2114 | sc->priority < DEF_PRIORITY - 2) { |
2400 | struct zone *preferred_zone; | 2115 | struct zone *preferred_zone; |
2401 | 2116 | ||
2402 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2117 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
@@ -2404,7 +2119,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2404 | &preferred_zone); | 2119 | &preferred_zone); |
2405 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2120 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
2406 | } | 2121 | } |
2407 | } | 2122 | } while (--sc->priority >= 0); |
2408 | 2123 | ||
2409 | out: | 2124 | out: |
2410 | delayacct_freepages_end(); | 2125 | delayacct_freepages_end(); |
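Several hunks in this patch replace a local priority counter with sc->priority initialised to DEF_PRIORITY and a do/while that decrements it, as in do_try_to_free_pages() above. A tiny standalone check that the two loop shapes visit the same priorities; only DEF_PRIORITY matches the kernel value, the rest is scaffolding.

/*
 * The old "for (priority = DEF_PRIORITY; priority >= 0; priority--)"
 * and the new "do { ... } while (--sc.priority >= 0)" visit the same
 * priorities, 12 down to 0, as long as sc.priority starts at
 * DEF_PRIORITY.  Quick check of that equivalence.
 */
#include <assert.h>
#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
	int old_visits = 0, new_visits = 0;

	for (int priority = DEF_PRIORITY; priority >= 0; priority--)
		old_visits++;

	struct { int priority; } sc = { .priority = DEF_PRIORITY };
	do {
		new_visits++;
	} while (--sc.priority >= 0);

	assert(old_visits == new_visits);	/* both bodies run 13 times */
	printf("both loop shapes ran %d times\n", new_visits);
	return 0;
}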
@@ -2431,6 +2146,83 @@ out: | |||
2431 | return 0; | 2146 | return 0; |
2432 | } | 2147 | } |
2433 | 2148 | ||
2149 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2150 | { | ||
2151 | struct zone *zone; | ||
2152 | unsigned long pfmemalloc_reserve = 0; | ||
2153 | unsigned long free_pages = 0; | ||
2154 | int i; | ||
2155 | bool wmark_ok; | ||
2156 | |||
2157 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2158 | zone = &pgdat->node_zones[i]; | ||
2159 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2160 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2161 | } | ||
2162 | |||
2163 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2164 | |||
2165 | /* kswapd must be awake if processes are being throttled */ | ||
2166 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2167 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2168 | (enum zone_type)ZONE_NORMAL); | ||
2169 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2170 | } | ||
2171 | |||
2172 | return wmark_ok; | ||
2173 | } | ||
2174 | |||
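pfmemalloc_watermark_ok() above sums the min watermarks of the zones up to ZONE_NORMAL and treats the node as needing throttling once free pages drop to half of that reserve or less. A userspace sketch of the same arithmetic; the zone sizes are invented and the struct is a stand-in, not a kernel type.

/*
 * Sketch of the pfmemalloc reserve check: a node is considered healthy
 * while its free pages exceed half the summed min watermarks of
 * ZONE_DMA..ZONE_NORMAL.  Zone sizes below are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	unsigned long min_wmark_pages;
	unsigned long nr_free_pages;
};

static bool pfmemalloc_ok(const struct fake_zone *zones, int nr_zones)
{
	unsigned long reserve = 0, free_pages = 0;

	for (int i = 0; i < nr_zones; i++) {
		reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}
	return free_pages > reserve / 2;
}

int main(void)
{
	struct fake_zone node[] = {
		{ .min_wmark_pages = 128,  .nr_free_pages = 90 },  /* "DMA"    */
		{ .min_wmark_pages = 2048, .nr_free_pages = 400 }, /* "NORMAL" */
	};

	printf("wmark_ok = %d\n", pfmemalloc_ok(node, 2)); /* 0: would throttle */
	return 0;
}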
2175 | /* | ||
2176 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2177 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2178 | * depleted. kswapd will continue to make progress and wake the processes | ||
2179 | * when the low watermark is reached | ||
2180 | */ | ||
2181 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2182 | nodemask_t *nodemask) | ||
2183 | { | ||
2184 | struct zone *zone; | ||
2185 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2186 | pg_data_t *pgdat; | ||
2187 | |||
2188 | /* | ||
2189 | * Kernel threads should not be throttled as they may be indirectly | ||
2190 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2191 | * progress. kjournald for example may enter direct reclaim while | ||
2192 | * committing a transaction where throttling it could force other | ||
2193 | * processes to block on log_wait_commit(). | ||
2194 | */ | ||
2195 | if (current->flags & PF_KTHREAD) | ||
2196 | return; | ||
2197 | |||
2198 | /* Check if the pfmemalloc reserves are ok */ | ||
2199 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2200 | pgdat = zone->zone_pgdat; | ||
2201 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2202 | return; | ||
2203 | |||
2204 | /* Account for the throttling */ | ||
2205 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | ||
2206 | |||
2207 | /* | ||
2208 | * If the caller cannot enter the filesystem, it's possible that it | ||
2209 | * is due to the caller holding an FS lock or performing a journal | ||
2210 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2211 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2212 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2213 | * second before continuing. | ||
2214 | */ | ||
2215 | if (!(gfp_mask & __GFP_FS)) { | ||
2216 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2217 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2218 | return; | ||
2219 | } | ||
2220 | |||
2221 | /* Throttle until kswapd wakes the process */ | ||
2222 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2223 | pfmemalloc_watermark_ok(pgdat)); | ||
2224 | } | ||
2225 | |||
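throttle_direct_reclaim() above reduces to three outcomes: kernel threads are never throttled, callers that cannot enter the filesystem wait with a timeout of about a second, and everyone else sleeps killably until kswapd signals that the reserve is healthy again. A sketch of just that branch structure; the enum and helper are illustrative names, not kernel definitions.

/*
 * Branch structure of the direct-reclaim throttling decision, reduced
 * to a pure function.  The enum and the parameters are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

enum throttle_action {
	NO_THROTTLE,		/* kernel thread, or reserves are fine */
	THROTTLE_TIMEOUT,	/* cannot enter FS: wait at most ~1s */
	THROTTLE_UNTIL_WOKEN,	/* wait killably until kswapd wakes us */
};

static enum throttle_action
throttle_decision(bool is_kthread, bool can_enter_fs, bool wmark_ok)
{
	if (is_kthread)		/* may be needed to clean pages for reclaim */
		return NO_THROTTLE;
	if (wmark_ok)		/* pfmemalloc reserve still above half */
		return NO_THROTTLE;
	if (!can_enter_fs)	/* may hold an FS lock kswapd could need */
		return THROTTLE_TIMEOUT;
	return THROTTLE_UNTIL_WOKEN;
}

int main(void)
{
	printf("%d\n", throttle_decision(false, true,  false)); /* 2 */
	printf("%d\n", throttle_decision(false, false, false)); /* 1 */
	printf("%d\n", throttle_decision(true,  true,  false)); /* 0 */
	return 0;
}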
2434 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2226 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2435 | gfp_t gfp_mask, nodemask_t *nodemask) | 2227 | gfp_t gfp_mask, nodemask_t *nodemask) |
2436 | { | 2228 | { |
@@ -2442,6 +2234,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2442 | .may_unmap = 1, | 2234 | .may_unmap = 1, |
2443 | .may_swap = 1, | 2235 | .may_swap = 1, |
2444 | .order = order, | 2236 | .order = order, |
2237 | .priority = DEF_PRIORITY, | ||
2445 | .target_mem_cgroup = NULL, | 2238 | .target_mem_cgroup = NULL, |
2446 | .nodemask = nodemask, | 2239 | .nodemask = nodemask, |
2447 | }; | 2240 | }; |
@@ -2449,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2449 | .gfp_mask = sc.gfp_mask, | 2242 | .gfp_mask = sc.gfp_mask, |
2450 | }; | 2243 | }; |
2451 | 2244 | ||
2245 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2246 | |||
2247 | /* | ||
2248 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2249 | * that the page allocator does not consider triggering OOM | ||
2250 | */ | ||
2251 | if (fatal_signal_pending(current)) | ||
2252 | return 1; | ||
2253 | |||
2452 | trace_mm_vmscan_direct_reclaim_begin(order, | 2254 | trace_mm_vmscan_direct_reclaim_begin(order, |
2453 | sc.may_writepage, | 2255 | sc.may_writepage, |
2454 | gfp_mask); | 2256 | gfp_mask); |
@@ -2460,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2460 | return nr_reclaimed; | 2262 | return nr_reclaimed; |
2461 | } | 2263 | } |
2462 | 2264 | ||
2463 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2265 | #ifdef CONFIG_MEMCG |
2464 | 2266 | ||
2465 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2267 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2466 | gfp_t gfp_mask, bool noswap, | 2268 | gfp_t gfp_mask, bool noswap, |
@@ -2474,17 +2276,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2474 | .may_unmap = 1, | 2276 | .may_unmap = 1, |
2475 | .may_swap = !noswap, | 2277 | .may_swap = !noswap, |
2476 | .order = 0, | 2278 | .order = 0, |
2279 | .priority = 0, | ||
2477 | .target_mem_cgroup = memcg, | 2280 | .target_mem_cgroup = memcg, |
2478 | }; | 2281 | }; |
2479 | struct mem_cgroup_zone mz = { | 2282 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2480 | .mem_cgroup = memcg, | ||
2481 | .zone = zone, | ||
2482 | }; | ||
2483 | 2283 | ||
2484 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2284 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2485 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2285 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
2486 | 2286 | ||
2487 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, | 2287 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, |
2488 | sc.may_writepage, | 2288 | sc.may_writepage, |
2489 | sc.gfp_mask); | 2289 | sc.gfp_mask); |
2490 | 2290 | ||
@@ -2495,7 +2295,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2495 | * will pick up pages from other mem cgroup's as well. We hack | 2295 | * will pick up pages from other mem cgroup's as well. We hack |
2496 | * the priority and make it zero. | 2296 | * the priority and make it zero. |
2497 | */ | 2297 | */ |
2498 | shrink_mem_cgroup_zone(0, &mz, &sc); | 2298 | shrink_lruvec(lruvec, &sc); |
2499 | 2299 | ||
2500 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2300 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2501 | 2301 | ||
@@ -2516,6 +2316,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2516 | .may_swap = !noswap, | 2316 | .may_swap = !noswap, |
2517 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2317 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2518 | .order = 0, | 2318 | .order = 0, |
2319 | .priority = DEF_PRIORITY, | ||
2519 | .target_mem_cgroup = memcg, | 2320 | .target_mem_cgroup = memcg, |
2520 | .nodemask = NULL, /* we don't care the placement */ | 2321 | .nodemask = NULL, /* we don't care the placement */ |
2521 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2322 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
@@ -2546,8 +2347,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2546 | } | 2347 | } |
2547 | #endif | 2348 | #endif |
2548 | 2349 | ||
2549 | static void age_active_anon(struct zone *zone, struct scan_control *sc, | 2350 | static void age_active_anon(struct zone *zone, struct scan_control *sc) |
2550 | int priority) | ||
2551 | { | 2351 | { |
2552 | struct mem_cgroup *memcg; | 2352 | struct mem_cgroup *memcg; |
2553 | 2353 | ||
@@ -2556,14 +2356,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc, | |||
2556 | 2356 | ||
2557 | memcg = mem_cgroup_iter(NULL, NULL, NULL); | 2357 | memcg = mem_cgroup_iter(NULL, NULL, NULL); |
2558 | do { | 2358 | do { |
2559 | struct mem_cgroup_zone mz = { | 2359 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2560 | .mem_cgroup = memcg, | ||
2561 | .zone = zone, | ||
2562 | }; | ||
2563 | 2360 | ||
2564 | if (inactive_anon_is_low(&mz)) | 2361 | if (inactive_anon_is_low(lruvec)) |
2565 | shrink_active_list(SWAP_CLUSTER_MAX, &mz, | 2362 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2566 | sc, priority, 0); | 2363 | sc, LRU_ACTIVE_ANON); |
2567 | 2364 | ||
2568 | memcg = mem_cgroup_iter(NULL, memcg, NULL); | 2365 | memcg = mem_cgroup_iter(NULL, memcg, NULL); |
2569 | } while (memcg); | 2366 | } while (memcg); |
@@ -2598,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2598 | return balanced_pages >= (present_pages >> 2); | 2395 | return balanced_pages >= (present_pages >> 2); |
2599 | } | 2396 | } |
2600 | 2397 | ||
2601 | /* is kswapd sleeping prematurely? */ | 2398 | /* |
2602 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2399 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2400 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2401 | * | ||
2402 | * Returns true if kswapd is ready to sleep | ||
2403 | */ | ||
2404 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2603 | int classzone_idx) | 2405 | int classzone_idx) |
2604 | { | 2406 | { |
2605 | int i; | 2407 | int i; |
@@ -2608,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2608 | 2410 | ||
2609 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2411 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2610 | if (remaining) | 2412 | if (remaining) |
2611 | return true; | 2413 | return false; |
2414 | |||
2415 | /* | ||
2416 | * There is a potential race between when kswapd checks its watermarks | ||
2417 | * and a process gets throttled. There is also a potential race if | ||
2418 | * processes get throttled, kswapd wakes, a large process exits, thereby | ||
2419 | * balancing the zones, which causes kswapd to miss a wakeup. If kswapd | ||
2420 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | ||
2421 | * so wake them now if necessary. If memory is still short, they will wake | ||
2422 | * kswapd and get throttled again | ||
2423 | */ | ||
2424 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2425 | wake_up(&pgdat->pfmemalloc_wait); | ||
2426 | return false; | ||
2427 | } | ||
2612 | 2428 | ||
2613 | /* Check the watermark levels */ | 2429 | /* Check the watermark levels */ |
2614 | for (i = 0; i <= classzone_idx; i++) { | 2430 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2641,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2641 | * must be balanced | 2457 | * must be balanced |
2642 | */ | 2458 | */ |
2643 | if (order) | 2459 | if (order) |
2644 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2460 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2645 | else | 2461 | else |
2646 | return !all_zones_ok; | 2462 | return all_zones_ok; |
2647 | } | 2463 | } |
2648 | 2464 | ||
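The rename of sleeping_prematurely() to prepare_kswapd_sleep() in the hunk above inverts the predicate ("premature to sleep" becomes "ready to sleep"), which is why every return value and both call sites in kswapd_try_to_sleep() flip sense. A trivial standalone check of that relationship; both helpers below are stand-ins, not the kernel functions.

/*
 * Stand-ins demonstrating that the renamed helper returns the logical
 * negation of the old one for the same zone state.
 */
#include <assert.h>
#include <stdbool.h>

static bool balanced;	/* pretend pgdat_balanced() result */

static bool sleeping_prematurely(void) { return !balanced; }
static bool prepare_kswapd_sleep(void) { return balanced; }

int main(void)
{
	for (int b = 0; b <= 1; b++) {
		balanced = b;
		/* new call sites test the opposite of the old ones */
		assert(prepare_kswapd_sleep() == !sleeping_prematurely());
	}
	return 0;
}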
2649 | /* | 2465 | /* |
@@ -2672,7 +2488,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2672 | { | 2488 | { |
2673 | int all_zones_ok; | 2489 | int all_zones_ok; |
2674 | unsigned long balanced; | 2490 | unsigned long balanced; |
2675 | int priority; | ||
2676 | int i; | 2491 | int i; |
2677 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2492 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2678 | unsigned long total_scanned; | 2493 | unsigned long total_scanned; |
@@ -2696,18 +2511,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2696 | }; | 2511 | }; |
2697 | loop_again: | 2512 | loop_again: |
2698 | total_scanned = 0; | 2513 | total_scanned = 0; |
2514 | sc.priority = DEF_PRIORITY; | ||
2699 | sc.nr_reclaimed = 0; | 2515 | sc.nr_reclaimed = 0; |
2700 | sc.may_writepage = !laptop_mode; | 2516 | sc.may_writepage = !laptop_mode; |
2701 | count_vm_event(PAGEOUTRUN); | 2517 | count_vm_event(PAGEOUTRUN); |
2702 | 2518 | ||
2703 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2519 | do { |
2704 | unsigned long lru_pages = 0; | 2520 | unsigned long lru_pages = 0; |
2705 | int has_under_min_watermark_zone = 0; | 2521 | int has_under_min_watermark_zone = 0; |
2706 | 2522 | ||
2707 | /* The swap token gets in the way of swapout... */ | ||
2708 | if (!priority) | ||
2709 | disable_swap_token(NULL); | ||
2710 | |||
2711 | all_zones_ok = 1; | 2523 | all_zones_ok = 1; |
2712 | balanced = 0; | 2524 | balanced = 0; |
2713 | 2525 | ||
@@ -2721,14 +2533,15 @@ loop_again: | |||
2721 | if (!populated_zone(zone)) | 2533 | if (!populated_zone(zone)) |
2722 | continue; | 2534 | continue; |
2723 | 2535 | ||
2724 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2536 | if (zone->all_unreclaimable && |
2537 | sc.priority != DEF_PRIORITY) | ||
2725 | continue; | 2538 | continue; |
2726 | 2539 | ||
2727 | /* | 2540 | /* |
2728 | * Do some background aging of the anon list, to give | 2541 | * Do some background aging of the anon list, to give |
2729 | * pages a chance to be referenced before reclaiming. | 2542 | * pages a chance to be referenced before reclaiming. |
2730 | */ | 2543 | */ |
2731 | age_active_anon(zone, &sc, priority); | 2544 | age_active_anon(zone, &sc); |
2732 | 2545 | ||
2733 | /* | 2546 | /* |
2734 | * If the number of buffer_heads in the machine | 2547 | * If the number of buffer_heads in the machine |
@@ -2776,7 +2589,8 @@ loop_again: | |||
2776 | if (!populated_zone(zone)) | 2589 | if (!populated_zone(zone)) |
2777 | continue; | 2590 | continue; |
2778 | 2591 | ||
2779 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2592 | if (zone->all_unreclaimable && |
2593 | sc.priority != DEF_PRIORITY) | ||
2780 | continue; | 2594 | continue; |
2781 | 2595 | ||
2782 | sc.nr_scanned = 0; | 2596 | sc.nr_scanned = 0; |
@@ -2820,7 +2634,7 @@ loop_again: | |||
2820 | !zone_watermark_ok_safe(zone, testorder, | 2634 | !zone_watermark_ok_safe(zone, testorder, |
2821 | high_wmark_pages(zone) + balance_gap, | 2635 | high_wmark_pages(zone) + balance_gap, |
2822 | end_zone, 0)) { | 2636 | end_zone, 0)) { |
2823 | shrink_zone(priority, zone, &sc); | 2637 | shrink_zone(zone, &sc); |
2824 | 2638 | ||
2825 | reclaim_state->reclaimed_slab = 0; | 2639 | reclaim_state->reclaimed_slab = 0; |
2826 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | 2640 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
@@ -2863,7 +2677,7 @@ loop_again: | |||
2863 | * consider it to be no longer congested. It's | 2677 | * consider it to be no longer congested. It's |
2864 | * possible there are dirty pages backed by | 2678 | * possible there are dirty pages backed by |
2865 | * congested BDIs but as pressure is relieved, | 2679 | * congested BDIs but as pressure is relieved, |
2866 | * spectulatively avoid congestion waits | 2680 | * speculatively avoid congestion waits |
2867 | */ | 2681 | */ |
2868 | zone_clear_flag(zone, ZONE_CONGESTED); | 2682 | zone_clear_flag(zone, ZONE_CONGESTED); |
2869 | if (i <= *classzone_idx) | 2683 | if (i <= *classzone_idx) |
@@ -2871,13 +2685,23 @@ loop_again: | |||
2871 | } | 2685 | } |
2872 | 2686 | ||
2873 | } | 2687 | } |
2688 | |||
2689 | /* | ||
2690 | * If the low watermark is met there is no need for processes | ||
2691 | * to be throttled on pfmemalloc_wait as they should now be | ||
2692 | * able to safely make forward progress. Wake them | ||
2693 | */ | ||
2694 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2695 | pfmemalloc_watermark_ok(pgdat)) | ||
2696 | wake_up(&pgdat->pfmemalloc_wait); | ||
2697 | |||
2874 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2698 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2875 | break; /* kswapd: all done */ | 2699 | break; /* kswapd: all done */ |
2876 | /* | 2700 | /* |
2877 | * OK, kswapd is getting into trouble. Take a nap, then take | 2701 | * OK, kswapd is getting into trouble. Take a nap, then take |
2878 | * another pass across the zones. | 2702 | * another pass across the zones. |
2879 | */ | 2703 | */ |
2880 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { | 2704 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { |
2881 | if (has_under_min_watermark_zone) | 2705 | if (has_under_min_watermark_zone) |
2882 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | 2706 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); |
2883 | else | 2707 | else |
@@ -2892,7 +2716,7 @@ loop_again: | |||
2892 | */ | 2716 | */ |
2893 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2717 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
2894 | break; | 2718 | break; |
2895 | } | 2719 | } while (--sc.priority >= 0); |
2896 | out: | 2720 | out: |
2897 | 2721 | ||
2898 | /* | 2722 | /* |
@@ -2942,7 +2766,8 @@ out: | |||
2942 | if (!populated_zone(zone)) | 2766 | if (!populated_zone(zone)) |
2943 | continue; | 2767 | continue; |
2944 | 2768 | ||
2945 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2769 | if (zone->all_unreclaimable && |
2770 | sc.priority != DEF_PRIORITY) | ||
2946 | continue; | 2771 | continue; |
2947 | 2772 | ||
2948 | /* Would compaction fail due to lack of free memory? */ | 2773 | /* Would compaction fail due to lack of free memory? */ |
@@ -2971,7 +2796,7 @@ out: | |||
2971 | } | 2796 | } |
2972 | 2797 | ||
2973 | /* | 2798 | /* |
2974 | * Return the order we were reclaiming at so sleeping_prematurely() | 2799 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2975 | * makes a decision on the order we were last reclaiming at. However, | 2800 | * makes a decision on the order we were last reclaiming at. However, |
2976 | * if another caller entered the allocator slow path while kswapd | 2801 | * if another caller entered the allocator slow path while kswapd |
2977 | * was awake, order will remain at the higher level | 2802 | * was awake, order will remain at the higher level |
@@ -2991,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2991 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2816 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2992 | 2817 | ||
2993 | /* Try to sleep for a short interval */ | 2818 | /* Try to sleep for a short interval */ |
2994 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2819 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2995 | remaining = schedule_timeout(HZ/10); | 2820 | remaining = schedule_timeout(HZ/10); |
2996 | finish_wait(&pgdat->kswapd_wait, &wait); | 2821 | finish_wait(&pgdat->kswapd_wait, &wait); |
2997 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2822 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -3001,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
3001 | * After a short sleep, check if it was a premature sleep. If not, then | 2826 | * After a short sleep, check if it was a premature sleep. If not, then |
3002 | * go fully to sleep until explicitly woken up. | 2827 | * go fully to sleep until explicitly woken up. |
3003 | */ | 2828 | */ |
3004 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2829 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
3005 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2830 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
3006 | 2831 | ||
3007 | /* | 2832 | /* |
@@ -3013,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
3013 | * them before going back to sleep. | 2838 | * them before going back to sleep. |
3014 | */ | 2839 | */ |
3015 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
3016 | schedule(); | 2841 | |
2842 | if (!kthread_should_stop()) | ||
2843 | schedule(); | ||
2844 | |||
3017 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2845 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
3018 | } else { | 2846 | } else { |
3019 | if (remaining) | 2847 | if (remaining) |
@@ -3209,6 +3037,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3209 | .nr_to_reclaim = nr_to_reclaim, | 3037 | .nr_to_reclaim = nr_to_reclaim, |
3210 | .hibernation_mode = 1, | 3038 | .hibernation_mode = 1, |
3211 | .order = 0, | 3039 | .order = 0, |
3040 | .priority = DEF_PRIORITY, | ||
3212 | }; | 3041 | }; |
3213 | struct shrink_control shrink = { | 3042 | struct shrink_control shrink = { |
3214 | .gfp_mask = sc.gfp_mask, | 3043 | .gfp_mask = sc.gfp_mask, |
@@ -3279,14 +3108,17 @@ int kswapd_run(int nid) | |||
3279 | } | 3108 | } |
3280 | 3109 | ||
3281 | /* | 3110 | /* |
3282 | * Called by memory hotplug when all memory in a node is offlined. | 3111 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3112 | * hold lock_memory_hotplug(). | ||
3283 | */ | 3113 | */ |
3284 | void kswapd_stop(int nid) | 3114 | void kswapd_stop(int nid) |
3285 | { | 3115 | { |
3286 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 3116 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
3287 | 3117 | ||
3288 | if (kswapd) | 3118 | if (kswapd) { |
3289 | kthread_stop(kswapd); | 3119 | kthread_stop(kswapd); |
3120 | NODE_DATA(nid)->kswapd = NULL; | ||
3121 | } | ||
3290 | } | 3122 | } |
3291 | 3123 | ||
3292 | static int __init kswapd_init(void) | 3124 | static int __init kswapd_init(void) |
@@ -3386,7 +3218,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3386 | const unsigned long nr_pages = 1 << order; | 3218 | const unsigned long nr_pages = 1 << order; |
3387 | struct task_struct *p = current; | 3219 | struct task_struct *p = current; |
3388 | struct reclaim_state reclaim_state; | 3220 | struct reclaim_state reclaim_state; |
3389 | int priority; | ||
3390 | struct scan_control sc = { | 3221 | struct scan_control sc = { |
3391 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3222 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
3392 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3223 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
@@ -3395,6 +3226,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3395 | SWAP_CLUSTER_MAX), | 3226 | SWAP_CLUSTER_MAX), |
3396 | .gfp_mask = gfp_mask, | 3227 | .gfp_mask = gfp_mask, |
3397 | .order = order, | 3228 | .order = order, |
3229 | .priority = ZONE_RECLAIM_PRIORITY, | ||
3398 | }; | 3230 | }; |
3399 | struct shrink_control shrink = { | 3231 | struct shrink_control shrink = { |
3400 | .gfp_mask = sc.gfp_mask, | 3232 | .gfp_mask = sc.gfp_mask, |
@@ -3417,11 +3249,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3417 | * Free memory by calling shrink zone with increasing | 3249 | * Free memory by calling shrink zone with increasing |
3418 | * priorities until we have enough memory freed. | 3250 | * priorities until we have enough memory freed. |
3419 | */ | 3251 | */ |
3420 | priority = ZONE_RECLAIM_PRIORITY; | ||
3421 | do { | 3252 | do { |
3422 | shrink_zone(priority, zone, &sc); | 3253 | shrink_zone(zone, &sc); |
3423 | priority--; | 3254 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
3424 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); | ||
3425 | } | 3255 | } |
3426 | 3256 | ||
3427 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 3257 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
@@ -3536,7 +3366,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) | |||
3536 | if (mapping_unevictable(page_mapping(page))) | 3366 | if (mapping_unevictable(page_mapping(page))) |
3537 | return 0; | 3367 | return 0; |
3538 | 3368 | ||
3539 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | 3369 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) |
3540 | return 0; | 3370 | return 0; |
3541 | 3371 | ||
3542 | return 1; | 3372 | return 1; |
@@ -3572,6 +3402,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3572 | zone = pagezone; | 3402 | zone = pagezone; |
3573 | spin_lock_irq(&zone->lru_lock); | 3403 | spin_lock_irq(&zone->lru_lock); |
3574 | } | 3404 | } |
3405 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
3575 | 3406 | ||
3576 | if (!PageLRU(page) || !PageUnevictable(page)) | 3407 | if (!PageLRU(page) || !PageUnevictable(page)) |
3577 | continue; | 3408 | continue; |
@@ -3581,11 +3412,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3581 | 3412 | ||
3582 | VM_BUG_ON(PageActive(page)); | 3413 | VM_BUG_ON(PageActive(page)); |
3583 | ClearPageUnevictable(page); | 3414 | ClearPageUnevictable(page); |
3584 | __dec_zone_state(zone, NR_UNEVICTABLE); | 3415 | del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); |
3585 | lruvec = mem_cgroup_lru_move_lists(zone, page, | 3416 | add_page_to_lru_list(page, lruvec, lru); |
3586 | LRU_UNEVICTABLE, lru); | ||
3587 | list_move(&page->lru, &lruvec->lists[lru]); | ||
3588 | __inc_zone_state(zone, NR_INACTIVE_ANON + lru); | ||
3589 | pgrescued++; | 3417 | pgrescued++; |
3590 | } | 3418 | } |
3591 | } | 3419 | } |
diff --git a/mm/vmstat.c b/mm/vmstat.c index f600557a7659..df7a6748231d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |||
613 | "Reclaimable", | 613 | "Reclaimable", |
614 | "Movable", | 614 | "Movable", |
615 | "Reserve", | 615 | "Reserve", |
616 | #ifdef CONFIG_CMA | ||
617 | "CMA", | ||
618 | #endif | ||
616 | "Isolate", | 619 | "Isolate", |
617 | }; | 620 | }; |
618 | 621 | ||
@@ -738,16 +741,17 @@ const char * const vmstat_text[] = { | |||
738 | "pgmajfault", | 741 | "pgmajfault", |
739 | 742 | ||
740 | TEXTS_FOR_ZONES("pgrefill") | 743 | TEXTS_FOR_ZONES("pgrefill") |
741 | TEXTS_FOR_ZONES("pgsteal") | 744 | TEXTS_FOR_ZONES("pgsteal_kswapd") |
745 | TEXTS_FOR_ZONES("pgsteal_direct") | ||
742 | TEXTS_FOR_ZONES("pgscan_kswapd") | 746 | TEXTS_FOR_ZONES("pgscan_kswapd") |
743 | TEXTS_FOR_ZONES("pgscan_direct") | 747 | TEXTS_FOR_ZONES("pgscan_direct") |
748 | "pgscan_direct_throttle", | ||
744 | 749 | ||
745 | #ifdef CONFIG_NUMA | 750 | #ifdef CONFIG_NUMA |
746 | "zone_reclaim_failed", | 751 | "zone_reclaim_failed", |
747 | #endif | 752 | #endif |
748 | "pginodesteal", | 753 | "pginodesteal", |
749 | "slabs_scanned", | 754 | "slabs_scanned", |
750 | "kswapd_steal", | ||
751 | "kswapd_inodesteal", | 755 | "kswapd_inodesteal", |
752 | "kswapd_low_wmark_hit_quickly", | 756 | "kswapd_low_wmark_hit_quickly", |
753 | "kswapd_high_wmark_hit_quickly", | 757 | "kswapd_high_wmark_hit_quickly", |
@@ -1220,7 +1224,6 @@ module_init(setup_vmstat) | |||
1220 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1224 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
1221 | #include <linux/debugfs.h> | 1225 | #include <linux/debugfs.h> |
1222 | 1226 | ||
1223 | static struct dentry *extfrag_debug_root; | ||
1224 | 1227 | ||
1225 | /* | 1228 | /* |
1226 | * Return an index indicating how much of the available free memory is | 1229 | * Return an index indicating how much of the available free memory is |
@@ -1358,19 +1361,24 @@ static const struct file_operations extfrag_file_ops = { | |||
1358 | 1361 | ||
1359 | static int __init extfrag_debug_init(void) | 1362 | static int __init extfrag_debug_init(void) |
1360 | { | 1363 | { |
1364 | struct dentry *extfrag_debug_root; | ||
1365 | |||
1361 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); | 1366 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); |
1362 | if (!extfrag_debug_root) | 1367 | if (!extfrag_debug_root) |
1363 | return -ENOMEM; | 1368 | return -ENOMEM; |
1364 | 1369 | ||
1365 | if (!debugfs_create_file("unusable_index", 0444, | 1370 | if (!debugfs_create_file("unusable_index", 0444, |
1366 | extfrag_debug_root, NULL, &unusable_file_ops)) | 1371 | extfrag_debug_root, NULL, &unusable_file_ops)) |
1367 | return -ENOMEM; | 1372 | goto fail; |
1368 | 1373 | ||
1369 | if (!debugfs_create_file("extfrag_index", 0444, | 1374 | if (!debugfs_create_file("extfrag_index", 0444, |
1370 | extfrag_debug_root, NULL, &extfrag_file_ops)) | 1375 | extfrag_debug_root, NULL, &extfrag_file_ops)) |
1371 | return -ENOMEM; | 1376 | goto fail; |
1372 | 1377 | ||
1373 | return 0; | 1378 | return 0; |
1379 | fail: | ||
1380 | debugfs_remove_recursive(extfrag_debug_root); | ||
1381 | return -ENOMEM; | ||
1374 | } | 1382 | } |
1375 | 1383 | ||
1376 | module_init(extfrag_debug_init); | 1384 | module_init(extfrag_debug_init); |
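The extfrag_debug_init() change above replaces the early returns, which leaked the freshly created debugfs directory on a partial failure, with a single fail label that tears the whole tree down via debugfs_remove_recursive(). The same cleanup-on-error shape is sketched below with plain stdio handles standing in for the debugfs entries; the names and resources are placeholders only.

/*
 * Userspace analogue of the "goto fail" cleanup added above: on any
 * partial failure, one exit path undoes everything created so far.
 */
#include <stdio.h>

static FILE *dir_marker, *unusable, *extfrag;

static int fake_extfrag_init(void)
{
	dir_marker = tmpfile();		/* stands in for the debugfs dir */
	if (!dir_marker)
		return -1;

	unusable = tmpfile();		/* stands in for "unusable_index" */
	if (!unusable)
		goto fail;

	extfrag = tmpfile();		/* stands in for "extfrag_index" */
	if (!extfrag)
		goto fail;

	return 0;

fail:	/* single exit path releases whatever was created before the error */
	if (unusable)
		fclose(unusable);
	fclose(dir_marker);
	return -1;
}

int main(void)
{
	printf("init: %d\n", fake_extfrag_init());
	return 0;
}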