Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 34
-rw-r--r--  mm/Makefile | 20
-rw-r--r--  mm/backing-dev.c | 78
-rw-r--r--  mm/bootmem.c | 138
-rw-r--r--  mm/bounce.c | 8
-rw-r--r--  mm/cleancache.c | 6
-rw-r--r--  mm/compaction.c | 595
-rw-r--r--  mm/fadvise.c | 18
-rw-r--r--  mm/filemap.c | 146
-rw-r--r--  mm/filemap_xip.c | 10
-rw-r--r--  mm/frontswap.c | 344
-rw-r--r--  mm/highmem.c | 12
-rw-r--r--  mm/huge_memory.c | 29
-rw-r--r--  mm/hugetlb.c | 234
-rw-r--r--  mm/hugetlb_cgroup.c | 418
-rw-r--r--  mm/hwpoison-inject.c | 2
-rw-r--r--  mm/internal.h | 51
-rw-r--r--  mm/madvise.c | 29
-rw-r--r--  mm/memblock.c | 181
-rw-r--r--  mm/memcontrol.c | 1167
-rw-r--r--  mm/memory-failure.c | 45
-rw-r--r--  mm/memory.c | 93
-rw-r--r--  mm/memory_hotplug.c | 42
-rw-r--r--  mm/mempolicy.c | 102
-rw-r--r--  mm/mempool.c | 12
-rw-r--r--  mm/migrate.c | 106
-rw-r--r--  mm/mmap.c | 161
-rw-r--r--  mm/mmu_notifier.c | 45
-rw-r--r--  mm/mmzone.c | 14
-rw-r--r--  mm/mremap.c | 28
-rw-r--r--  mm/nobootmem.c | 151
-rw-r--r--  mm/nommu.c | 20
-rw-r--r--  mm/oom_kill.c | 278
-rw-r--r--  mm/page-writeback.c | 111
-rw-r--r--  mm/page_alloc.c | 779
-rw-r--r--  mm/page_cgroup.c | 6
-rw-r--r--  mm/page_io.c | 157
-rw-r--r--  mm/page_isolation.c | 108
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 1
-rw-r--r--  mm/percpu.c | 22
-rw-r--r--  mm/pgtable-generic.c | 4
-rw-r--r--  mm/process_vm_access.c | 16
-rw-r--r--  mm/readahead.c | 40
-rw-r--r--  mm/rmap.c | 6
-rw-r--r--  mm/shmem.c | 572
-rw-r--r--  mm/slab.c | 622
-rw-r--r--  mm/slab.h | 33
-rw-r--r--  mm/slab_common.c | 120
-rw-r--r--  mm/slob.c | 152
-rw-r--r--  mm/slub.c | 479
-rw-r--r--  mm/sparse.c | 62
-rw-r--r--  mm/swap.c | 181
-rw-r--r--  mm/swap_state.c | 7
-rw-r--r--  mm/swapfile.c | 244
-rw-r--r--  mm/thrash.c | 155
-rw-r--r--  mm/truncate.c | 25
-rw-r--r--  mm/util.c | 30
-rw-r--r--  mm/vmalloc.c | 59
-rw-r--r--  mm/vmscan.c | 936
-rw-r--r--  mm/vmstat.c | 18
61 files changed, 5833 insertions, 3730 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407f1225..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -198,7 +202,7 @@ config COMPACTION
198config MIGRATION 202config MIGRATION
199 bool "Page migration" 203 bool "Page migration"
200 def_bool y 204 def_bool y
201 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION 205 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
202 help 206 help
203 Allows the migration of the physical location of pages of processes 207 Allows the migration of the physical location of pages of processes
204 while the virtual addresses are not changed. This is useful in 208 while the virtual addresses are not changed. This is useful in
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
@@ -349,6 +354,16 @@ choice
349 benefit. 354 benefit.
350endchoice 355endchoice
351 356
357config CROSS_MEMORY_ATTACH
358 bool "Cross Memory Support"
359 depends on MMU
360 default y
361 help
362 Enabling this option adds the system calls process_vm_readv and
363 process_vm_writev which allow a process with the correct privileges
 364 to directly read from or write to another process's address space.
365 See the man page for more details.
366
352# 367#
353# UP and nommu archs use km based percpu allocator 368# UP and nommu archs use km based percpu allocator
354# 369#
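
[Editor's note] The CROSS_MEMORY_ATTACH help text above describes the process_vm_readv()/process_vm_writev() system calls. For reference, here is a minimal user-space sketch of process_vm_readv(); it assumes the target PID and a valid remote address are already known (for example, published by the peer over a pipe), and error handling is trimmed to the bare minimum.

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <sys/uio.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		if (argc != 3) {
			fprintf(stderr, "usage: %s <pid> <remote-addr>\n", argv[0]);
			return 1;
		}

		pid_t pid = (pid_t)atol(argv[1]);
		void *remote_addr = (void *)(uintptr_t)strtoull(argv[2], NULL, 0);
		char buf[256];

		struct iovec local  = { .iov_base = buf,         .iov_len = sizeof(buf) };
		struct iovec remote = { .iov_base = remote_addr, .iov_len = sizeof(buf) };

		/* One local and one remote segment; the flags argument must be 0. */
		ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
		if (n < 0) {
			perror("process_vm_readv");
			return 1;
		}
		printf("read %zd bytes from pid %ld\n", n, (long)pid);
		return 0;
	}

The caller needs the same privileges it would need to ptrace-attach to the target process.
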
@@ -379,3 +394,20 @@ config CLEANCACHE
379 in a negligible performance hit. 394 in a negligible performance hit.
380 395
381 If unsure, say Y to enable cleancache 396 If unsure, say Y to enable cleancache
397
398config FRONTSWAP
399 bool "Enable frontswap to cache swap pages if tmem is present"
400 depends on SWAP
401 default n
402 help
403 Frontswap is so named because it can be thought of as the opposite
404 of a "backing" store for a swap device. The data is stored into
405 "transcendent memory", memory that is not directly accessible or
406 addressable by the kernel and is of unknown and possibly
407 time-varying size. When space in transcendent memory is available,
408 a significant swap I/O reduction may be achieved. When none is
409 available, all frontswap calls are reduced to a single pointer-
410 compare-against-NULL resulting in a negligible performance hit
411 and swap data is stored as normal on the matching swap device.
412
413 If unsure, say Y to enable frontswap.
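
[Editor's note] The FRONTSWAP help text's claim that, with no backend present, "all frontswap calls are reduced to a single pointer-compare-against-NULL" is easiest to see as a pattern. The sketch below illustrates that pattern only; the type and function names are invented for illustration and are not the kernel's actual frontswap symbols.

	/* Illustration of the "pointer compare against NULL" fast path described
	 * in the help text above. Names are invented; this is not kernel code. */
	struct tmem_backend_ops {
		int (*store)(unsigned type, unsigned long offset, void *page);
		int (*load)(unsigned type, unsigned long offset, void *page);
	};

	static struct tmem_backend_ops *backend_ops;	/* NULL until a backend registers */

	static int example_frontswap_store(unsigned type, unsigned long offset, void *page)
	{
		if (!backend_ops)	/* no transcendent memory: fall back immediately */
			return -1;	/* caller writes the page to the real swap device */
		return backend_ops->store(type, offset, page);
	}
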
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00ef2a0e..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,19 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o \ 8 vmalloc.o pagewalk.o pgtable-generic.o
9 process_vm_access.o 9
10ifdef CONFIG_CROSS_MEMORY_ATTACH
11mmu-$(CONFIG_MMU) += process_vm_access.o
12endif
10 13
11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
12 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
13 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
14 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
15 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
16 $(mmu-y) 19 compaction.o $(mmu-y)
20
17obj-y += init-mm.o 21obj-y += init-mm.o
18 22
19ifdef CONFIG_NO_BOOTMEM 23ifdef CONFIG_NO_BOOTMEM
@@ -25,14 +29,14 @@ endif
25obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
26 30
27obj-$(CONFIG_BOUNCE) += bounce.o 31obj-$(CONFIG_BOUNCE) += bounce.o
28obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 32obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
33obj-$(CONFIG_FRONTSWAP) += frontswap.o
29obj-$(CONFIG_HAS_DMA) += dmapool.o 34obj-$(CONFIG_HAS_DMA) += dmapool.o
30obj-$(CONFIG_HUGETLBFS) += hugetlb.o 35obj-$(CONFIG_HUGETLBFS) += hugetlb.o
31obj-$(CONFIG_NUMA) += mempolicy.o 36obj-$(CONFIG_NUMA) += mempolicy.o
32obj-$(CONFIG_SPARSEMEM) += sparse.o 37obj-$(CONFIG_SPARSEMEM) += sparse.o
33obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 38obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
34obj-$(CONFIG_SLOB) += slob.o 39obj-$(CONFIG_SLOB) += slob.o
35obj-$(CONFIG_COMPACTION) += compaction.o
36obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 40obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
37obj-$(CONFIG_KSM) += ksm.o 41obj-$(CONFIG_KSM) += ksm.o
38obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 42obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
@@ -45,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
45obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
46obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
47obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
48obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
49obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
50obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
51obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
52obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
53obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..b41823cc05e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 39LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 40LIST_HEAD(bdi_pending_list);
41 41
42static struct task_struct *sync_supers_tsk;
43static struct timer_list sync_supers_timer;
44
45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long);
47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{ 43{
50 if (wb1 < wb2) { 44 if (wb1 < wb2) {
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void)
250{ 244{
251 int err; 245 int err;
252 246
253 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
254 BUG_ON(IS_ERR(sync_supers_tsk));
255
256 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
257 bdi_arm_supers_timer();
258
259 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
260 if (!err) 248 if (!err)
261 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
270 return wb_has_dirty_io(&bdi->wb); 258 return wb_has_dirty_io(&bdi->wb);
271} 259}
272 260
273/*
274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
275 * or we risk deadlocking on ->s_umount. The longer term solution would be
276 * to implement sync_supers_bdi() or similar and simply do it from the
277 * bdi writeback thread individually.
278 */
279static int bdi_sync_supers(void *unused)
280{
281 set_user_nice(current, 0);
282
283 while (!kthread_should_stop()) {
284 set_current_state(TASK_INTERRUPTIBLE);
285 schedule();
286
287 /*
288 * Do this periodically, like kupdated() did before.
289 */
290 sync_supers();
291 }
292
293 return 0;
294}
295
296void bdi_arm_supers_timer(void)
297{
298 unsigned long next;
299
300 if (!dirty_writeback_interval)
301 return;
302
303 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
304 mod_timer(&sync_supers_timer, round_jiffies_up(next));
305}
306
307static void sync_supers_timer_fn(unsigned long unused)
308{
309 wake_up_process(sync_supers_tsk);
310 bdi_arm_supers_timer();
311}
312
313static void wakeup_timer_fn(unsigned long data) 261static void wakeup_timer_fn(unsigned long data)
314{ 262{
315 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 263 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
@@ -677,7 +625,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 625
678 bdi->min_ratio = 0; 626 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 627 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 628 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 629 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 630 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 631 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +648,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 648 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 649 bdi->avg_write_bandwidth = INIT_BW;
702 650
703 err = prop_local_init_percpu(&bdi->completions); 651 err = fprop_local_init_percpu(&bdi->completions);
704 652
705 if (err) { 653 if (err) {
706err: 654err:
@@ -744,7 +692,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 692 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 693 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 694
747 prop_local_destroy_percpu(&bdi->completions); 695 fprop_local_destroy_percpu(&bdi->completions);
748} 696}
749EXPORT_SYMBOL(bdi_destroy); 697EXPORT_SYMBOL(bdi_destroy);
750 698
@@ -886,3 +834,23 @@ out:
886 return ret; 834 return ret;
887} 835}
888EXPORT_SYMBOL(wait_iff_congested); 836EXPORT_SYMBOL(wait_iff_congested);
837
838int pdflush_proc_obsolete(struct ctl_table *table, int write,
839 void __user *buffer, size_t *lenp, loff_t *ppos)
840{
841 char kbuf[] = "0\n";
842
843 if (*ppos) {
844 *lenp = 0;
845 return 0;
846 }
847
848 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
849 return -EFAULT;
850 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
851 table->procname);
852
853 *lenp = 2;
854 *ppos += *lenp;
855 return 2;
856}
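
[Editor's note] pdflush_proc_obsolete() above is meant to stand in as the handler for sysctl knobs that no longer do anything. A hedged sketch of how such a handler could be wired into a ctl_table entry follows; the entry name and mode are illustrative, and the real table lives in kernel/sysctl.c.

	/* Sketch only: a sysctl entry whose handler is pdflush_proc_obsolete().
	 * Field values are illustrative, not copied from kernel/sysctl.c. */
	static struct ctl_table vm_table_sketch[] = {
		{
			.procname	= "nr_pdflush_threads",
			.mode		= 0444,			/* reads always report "0" */
			.proc_handler	= pdflush_proc_obsolete,
		},
		{ }					/* table terminator */
	};
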
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
77 */ 77 */
78static void __init link_bootmem(bootmem_data_t *bdata) 78static void __init link_bootmem(bootmem_data_t *bdata)
79{ 79{
80 struct list_head *iter; 80 bootmem_data_t *ent;
81 81
82 list_for_each(iter, &bdata_list) { 82 list_for_each_entry(ent, &bdata_list, list) {
83 bootmem_data_t *ent; 83 if (bdata->node_min_pfn < ent->node_min_pfn) {
84 84 list_add_tail(&bdata->list, &ent->list);
85 ent = list_entry(iter, bootmem_data_t, list); 85 return;
86 if (bdata->node_min_pfn < ent->node_min_pfn) 86 }
87 break;
88 } 87 }
89 list_add_tail(&bdata->list, iter); 88
89 list_add_tail(&bdata->list, &bdata_list);
90} 90}
91 91
92/* 92/*
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 } else { 203 } else {
204 unsigned long off = 0; 204 unsigned long off = 0;
205 205
206 while (vec && off < BITS_PER_LONG) { 206 vec >>= start & (BITS_PER_LONG - 1);
207 while (vec) {
207 if (vec & 1) { 208 if (vec & 1) {
208 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
209 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
467 return ALIGN(base + off, align) - base; 468 return ALIGN(base + off, align) - base;
468} 469}
469 470
470static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 471static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
471 unsigned long size, unsigned long align, 472 unsigned long size, unsigned long align,
472 unsigned long goal, unsigned long limit) 473 unsigned long goal, unsigned long limit)
473{ 474{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
588 p_bdata = bootmem_arch_preferred_node(bdata, size, align, 589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
589 goal, limit); 590 goal, limit);
590 if (p_bdata) 591 if (p_bdata)
591 return alloc_bootmem_core(p_bdata, size, align, 592 return alloc_bootmem_bdata(p_bdata, size, align,
592 goal, limit); 593 goal, limit);
593 } 594 }
594#endif 595#endif
595 return NULL; 596 return NULL;
596} 597}
597 598
598static void * __init ___alloc_bootmem_nopanic(unsigned long size, 599static void * __init alloc_bootmem_core(unsigned long size,
599 unsigned long align, 600 unsigned long align,
600 unsigned long goal, 601 unsigned long goal,
601 unsigned long limit) 602 unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
603 bootmem_data_t *bdata; 604 bootmem_data_t *bdata;
604 void *region; 605 void *region;
605 606
606restart:
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
608 if (region) 608 if (region)
609 return region; 609 return region;
@@ -614,11 +614,25 @@ restart:
614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
615 break; 615 break;
616 616
617 region = alloc_bootmem_core(bdata, size, align, goal, limit); 617 region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
618 if (region) 618 if (region)
619 return region; 619 return region;
620 } 620 }
621 621
622 return NULL;
623}
624
625static void * __init ___alloc_bootmem_nopanic(unsigned long size,
626 unsigned long align,
627 unsigned long goal,
628 unsigned long limit)
629{
630 void *ptr;
631
632restart:
633 ptr = alloc_bootmem_core(size, align, goal, limit);
634 if (ptr)
635 return ptr;
622 if (goal) { 636 if (goal) {
623 goal = 0; 637 goal = 0;
624 goto restart; 638 goto restart;
@@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
684 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
685} 699}
686 700
687static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 701void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
688 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
689 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
690{ 704{
691 void *ptr; 705 void *ptr;
692 706
693 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); 707again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
694 if (ptr) 710 if (ptr)
695 return ptr; 711 return ptr;
696 712
697 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
698 if (ptr) 718 if (ptr)
699 return ptr; 719 return ptr;
700 720
701 return ___alloc_bootmem(size, align, goal, limit); 721 ptr = alloc_bootmem_core(size, align, goal, limit);
722 if (ptr)
723 return ptr;
724
725 if (goal) {
726 goal = 0;
727 goto again;
728 }
729
730 return NULL;
731}
732
733void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
734 unsigned long align, unsigned long goal)
735{
736 if (WARN_ON_ONCE(slab_is_available()))
737 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
738
739 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
740}
741
742void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
743 unsigned long align, unsigned long goal,
744 unsigned long limit)
745{
746 void *ptr;
747
748 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
749 if (ptr)
750 return ptr;
751
752 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
753 panic("Out of memory");
754 return NULL;
702} 755}
703 756
704/** 757/**
@@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
722 if (WARN_ON_ONCE(slab_is_available())) 775 if (WARN_ON_ONCE(slab_is_available()))
723 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 776 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
724 777
725 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 778 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
726} 779}
727 780
728void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 781void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
743 unsigned long new_goal; 796 unsigned long new_goal;
744 797
745 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 798 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
746 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 799 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
747 new_goal, 0); 800 new_goal, 0);
748 if (ptr) 801 if (ptr)
749 return ptr; 802 return ptr;
@@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
754 807
755} 808}
756 809
757#ifdef CONFIG_SPARSEMEM
758/**
759 * alloc_bootmem_section - allocate boot memory from a specific section
760 * @size: size of the request in bytes
761 * @section_nr: sparse map section to allocate from
762 *
763 * Return NULL on failure.
764 */
765void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr)
767{
768 bootmem_data_t *bdata;
769 unsigned long pfn, goal;
770
771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT;
773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
774
775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
776}
777#endif
778
779void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
780 unsigned long align, unsigned long goal)
781{
782 void *ptr;
783
784 if (WARN_ON_ONCE(slab_is_available()))
785 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
786
787 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
788 if (ptr)
789 return ptr;
790
791 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
792 if (ptr)
793 return ptr;
794
795 return __alloc_bootmem_nopanic(size, align, goal);
796}
797
798#ifndef ARCH_LOW_ADDRESS_LIMIT 810#ifndef ARCH_LOW_ADDRESS_LIMIT
799#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 811#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
800#endif 812#endif
@@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
839 if (WARN_ON_ONCE(slab_is_available())) 851 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 852 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841 853
842 return ___alloc_bootmem_node(pgdat->bdata, size, align, 854 return ___alloc_bootmem_node(pgdat, size, align,
843 goal, ARCH_LOW_ADDRESS_LIMIT); 855 goal, ARCH_LOW_ADDRESS_LIMIT);
844} 856}
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
24 24
25static mempool_t *page_pool, *isa_page_pool; 25static mempool_t *page_pool, *isa_page_pool;
26 26
27#ifdef CONFIG_HIGHMEM 27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
29{ 29{
30#ifndef CONFIG_MEMORY_HOTPLUG 30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn) 31 if (max_pfn <= max_low_pfn)
32 return 0; 32 return 0;
33#endif 33#endif
34 34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
37 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38 38
39 return 0; 39 return 0;
40} 40}
41 41
42__initcall(init_emergency_pool); 42__initcall(init_emergency_pool);
43#endif
43 44
45#ifdef CONFIG_HIGHMEM
44/* 46/*
45 * highmem version, map in to vec 47 * highmem version, map in to vec
46 */ 48 */
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
80static int cleancache_get_key(struct inode *inode, 80static int cleancache_get_key(struct inode *inode,
81 struct cleancache_filekey *key) 81 struct cleancache_filekey *key)
82{ 82{
83 int (*fhfn)(struct dentry *, __u32 *fh, int *, int); 83 int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
84 int len = 0, maxlen = CLEANCACHE_KEY_MAX; 84 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
85 struct super_block *sb = inode->i_sb; 85 struct super_block *sb = inode->i_sb;
86 86
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
88 if (sb->s_export_op != NULL) { 88 if (sb->s_export_op != NULL) {
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 struct dentry d; 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 d.d_inode = inode;
93 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
94 if (len <= 0 || len == 255) 92 if (len <= 0 || len == 255)
95 return -1; 93 return -1;
96 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
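
[Editor's note] The cleancache change above follows the ->encode_fh() prototype switch from a dentry to an inode (plus an optional parent inode). For context, here is a hedged sketch of a filesystem-side implementation under the new prototype; the "myfs" naming and the two-word ino/generation handle are illustrative, not taken from a specific filesystem.

	/* Sketch of encode_fh() under the new inode-based prototype. */
	static int myfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
				  struct inode *parent)
	{
		/* parent is optional and unused in this minimal handle */
		if (*max_len < 2) {
			*max_len = 2;
			return 255;		/* convention: handle buffer too small */
		}
		fh[0] = inode->i_ino;
		fh[1] = inode->i_generation;
		*max_len = 2;
		return 1;			/* fileid type understood by the decoder */
	}
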
diff --git a/mm/compaction.c b/mm/compaction.c
index 74a8c825ff28..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,30 +16,11 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20
19#define CREATE_TRACE_POINTS 21#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h> 22#include <trace/events/compaction.h>
21 23
22/*
23 * compact_control is used to track pages being migrated and the free pages
24 * they are being migrated to during memory compaction. The free_pfn starts
25 * at the end of a zone and migrate_pfn begins at the start. Movable pages
26 * are moved to the end of a zone during a compaction run and the run
27 * completes when free_pfn <= migrate_pfn
28 */
29struct compact_control {
30 struct list_head freepages; /* List of free pages to migrate to */
31 struct list_head migratepages; /* List of pages being migrated */
32 unsigned long nr_freepages; /* Number of isolated free pages */
33 unsigned long nr_migratepages; /* Number of pages to migrate */
34 unsigned long free_pfn; /* isolate_freepages search base */
35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
37
38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone;
41};
42
43static unsigned long release_freepages(struct list_head *freelist) 24static unsigned long release_freepages(struct list_head *freelist)
44{ 25{
45 struct page *page, *next; 26 struct page *page, *next;
@@ -54,24 +35,76 @@ static unsigned long release_freepages(struct list_head *freelist)
54 return count; 35 return count;
55} 36}
56 37
57/* Isolate free pages onto a private freelist. Must hold zone->lock */ 38static void map_pages(struct list_head *list)
58static unsigned long isolate_freepages_block(struct zone *zone,
59 unsigned long blockpfn,
60 struct list_head *freelist)
61{ 39{
62 unsigned long zone_end_pfn, end_pfn; 40 struct page *page;
63 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor;
65 41
66 /* Get the last PFN we should scan for free pages at */ 42 list_for_each_entry(page, list, lru) {
67 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 43 arch_alloc_page(page, 0);
68 end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); 44 kernel_map_pages(page, 1, 1);
45 }
46}
69 47
70 /* Find the first usable PFN in the block to initialse page cursor */ 48static inline bool migrate_async_suitable(int migratetype)
71 for (; blockpfn < end_pfn; blockpfn++) { 49{
72 if (pfn_valid_within(blockpfn)) 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
73 break; 51}
52
53/*
54 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or
56 * if the lock is contended. For async compaction, back out in the event
57 * if contention is severe. For sync compaction, schedule.
58 *
59 * Returns true if the lock is held.
60 * Returns false if the lock is released and compaction should abort
61 */
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc)
64{
65 if (need_resched() || spin_is_contended(lock)) {
66 if (locked) {
67 spin_unlock_irqrestore(lock, *flags);
68 locked = false;
69 }
70
71 /* async aborts if taking too long or contended */
72 if (!cc->sync) {
73 if (cc->contended)
74 *cc->contended = true;
75 return false;
76 }
77
78 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
74 } 81 }
82
83 if (!locked)
84 spin_lock_irqsave(lock, *flags);
85 return true;
86}
87
88static inline bool compact_trylock_irqsave(spinlock_t *lock,
89 unsigned long *flags, struct compact_control *cc)
90{
91 return compact_checklock_irqsave(lock, flags, false, cc);
92}
93
94/*
95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
97 * pages inside of the pageblock (even though it may still end up isolating
98 * some pages).
99 */
100static unsigned long isolate_freepages_block(unsigned long blockpfn,
101 unsigned long end_pfn,
102 struct list_head *freelist,
103 bool strict)
104{
105 int nr_scanned = 0, total_isolated = 0;
106 struct page *cursor;
107
75 cursor = pfn_to_page(blockpfn); 108 cursor = pfn_to_page(blockpfn);
76 109
77 /* Isolate free pages. This assumes the block is valid */ 110 /* Isolate free pages. This assumes the block is valid */
@@ -79,15 +112,23 @@ static unsigned long isolate_freepages_block(struct zone *zone,
79 int isolated, i; 112 int isolated, i;
80 struct page *page = cursor; 113 struct page *page = cursor;
81 114
82 if (!pfn_valid_within(blockpfn)) 115 if (!pfn_valid_within(blockpfn)) {
116 if (strict)
117 return 0;
83 continue; 118 continue;
119 }
84 nr_scanned++; 120 nr_scanned++;
85 121
86 if (!PageBuddy(page)) 122 if (!PageBuddy(page)) {
123 if (strict)
124 return 0;
87 continue; 125 continue;
126 }
88 127
89 /* Found a free page, break it into order-0 pages */ 128 /* Found a free page, break it into order-0 pages */
90 isolated = split_free_page(page); 129 isolated = split_free_page(page);
130 if (!isolated && strict)
131 return 0;
91 total_isolated += isolated; 132 total_isolated += isolated;
92 for (i = 0; i < isolated; i++) { 133 for (i = 0; i < isolated; i++) {
93 list_add(&page->lru, freelist); 134 list_add(&page->lru, freelist);
@@ -105,118 +146,75 @@ static unsigned long isolate_freepages_block(struct zone *zone,
105 return total_isolated; 146 return total_isolated;
106} 147}
107 148
108/* Returns true if the page is within a block suitable for migration to */ 149/**
109static bool suitable_migration_target(struct page *page) 150 * isolate_freepages_range() - isolate free pages.
110{ 151 * @start_pfn: The first PFN to start isolating.
111 152 * @end_pfn: The one-past-last PFN.
112 int migratetype = get_pageblock_migratetype(page); 153 *
113 154 * Non-free pages, invalid PFNs, or zone boundaries within the
 114 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 155 * [start_pfn, end_pfn) range are considered errors, and cause the function to
115 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 156 * undo its actions and return zero.
116 return false; 157 *
 117 158 * Otherwise, the function returns the one-past-the-last PFN of the isolated page
 118 /* If the page is a large free page, then allow migration */ 159 * (which may be greater than end_pfn if end fell in the middle of
119 if (PageBuddy(page) && page_order(page) >= pageblock_order) 160 * a free page).
120 return true;
121
122 /* If the block is MIGRATE_MOVABLE, allow migration */
123 if (migratetype == MIGRATE_MOVABLE)
124 return true;
125
126 /* Otherwise skip the block */
127 return false;
128}
129
130/*
131 * Based on information in the current compact_control, find blocks
132 * suitable for isolating free pages from and then isolate them.
133 */ 161 */
134static void isolate_freepages(struct zone *zone, 162unsigned long
135 struct compact_control *cc) 163isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
136{ 164{
137 struct page *page; 165 unsigned long isolated, pfn, block_end_pfn, flags;
138 unsigned long high_pfn, low_pfn, pfn; 166 struct zone *zone = NULL;
139 unsigned long flags; 167 LIST_HEAD(freelist);
140 int nr_freepages = cc->nr_freepages;
141 struct list_head *freelist = &cc->freepages;
142 168
143 /* 169 if (pfn_valid(start_pfn))
144 * Initialise the free scanner. The starting point is where we last 170 zone = page_zone(pfn_to_page(start_pfn));
145 * scanned from (or the end of the zone if starting). The low point
146 * is the end of the pageblock the migration scanner is using.
147 */
148 pfn = cc->free_pfn;
149 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
150 171
151 /* 172 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
152 * Take care that if the migration scanner is at the end of the zone 173 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
153 * that the free scanner does not accidentally move to the next zone 174 break;
154 * in the next isolation cycle.
155 */
156 high_pfn = min(low_pfn, pfn);
157
158 /*
159 * Isolate free pages until enough are available to migrate the
160 * pages on cc->migratepages. We stop searching if the migrate
161 * and free page scanners meet or enough free pages are isolated.
162 */
163 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
164 pfn -= pageblock_nr_pages) {
165 unsigned long isolated;
166
167 if (!pfn_valid(pfn))
168 continue;
169 175
170 /* 176 /*
171 * Check for overlapping nodes/zones. It's possible on some 177 * On subsequent iterations ALIGN() is actually not needed,
172 * configurations to have a setup like 178 * but we keep it that we not to complicate the code.
173 * node0 node1 node0
174 * i.e. it's possible that all pages within a zones range of
175 * pages do not belong to a single zone.
176 */ 179 */
177 page = pfn_to_page(pfn); 180 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
178 if (page_zone(page) != zone) 181 block_end_pfn = min(block_end_pfn, end_pfn);
179 continue;
180 182
181 /* Check the block is suitable for migration */ 183 spin_lock_irqsave(&zone->lock, flags);
182 if (!suitable_migration_target(page)) 184 isolated = isolate_freepages_block(pfn, block_end_pfn,
183 continue; 185 &freelist, true);
186 spin_unlock_irqrestore(&zone->lock, flags);
184 187
185 /* 188 /*
186 * Found a block suitable for isolating free pages from. Now 189 * In strict mode, isolate_freepages_block() returns 0 if
187 * we disabled interrupts, double check things are ok and 190 * there are any holes in the block (ie. invalid PFNs or
188 * isolate the pages. This is to minimise the time IRQs 191 * non-free pages).
189 * are disabled
190 */ 192 */
191 isolated = 0; 193 if (!isolated)
192 spin_lock_irqsave(&zone->lock, flags); 194 break;
193 if (suitable_migration_target(page)) {
194 isolated = isolate_freepages_block(zone, pfn, freelist);
195 nr_freepages += isolated;
196 }
197 spin_unlock_irqrestore(&zone->lock, flags);
198 195
199 /* 196 /*
200 * Record the highest PFN we isolated pages from. When next 197 * If we managed to isolate pages, it is always (1 << n) *
201 * looking for free pages, the search will restart here as 198 * pageblock_nr_pages for some non-negative n. (Max order
202 * page migration may have returned some pages to the allocator 199 * page may span two pageblocks).
203 */ 200 */
204 if (isolated)
205 high_pfn = max(high_pfn, pfn);
206 } 201 }
207 202
208 /* split_free_page does not map the pages */ 203 /* split_free_page does not map the pages */
209 list_for_each_entry(page, freelist, lru) { 204 map_pages(&freelist);
210 arch_alloc_page(page, 0); 205
211 kernel_map_pages(page, 1, 1); 206 if (pfn < end_pfn) {
207 /* Loop terminated early, cleanup. */
208 release_freepages(&freelist);
209 return 0;
212 } 210 }
213 211
214 cc->free_pfn = high_pfn; 212 /* We don't use freelists for anything. */
215 cc->nr_freepages = nr_freepages; 213 return pfn;
216} 214}
217 215
218/* Update the number of anon and file isolated pages in the zone */ 216/* Update the number of anon and file isolated pages in the zone */
219static void acct_isolated(struct zone *zone, struct compact_control *cc) 217static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
220{ 218{
221 struct page *page; 219 struct page *page;
222 unsigned int count[2] = { 0, }; 220 unsigned int count[2] = { 0, };
@@ -224,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
224 list_for_each_entry(page, &cc->migratepages, lru) 222 list_for_each_entry(page, &cc->migratepages, lru)
225 count[!!page_is_file_cache(page)]++; 223 count[!!page_is_file_cache(page)]++;
226 224
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 225 /* If locked we can use the interrupt unsafe versions */
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 226 if (locked) {
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
229 } else {
230 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
231 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
232 }
229} 233}
230 234
231/* Similar to reclaim, but different enough that they don't share logic */ 235/* Similar to reclaim, but different enough that they don't share logic */
@@ -243,37 +247,36 @@ static bool too_many_isolated(struct zone *zone)
243 return isolated > (inactive + active) / 2; 247 return isolated > (inactive + active) / 2;
244} 248}
245 249
246/* possible outcome of isolate_migratepages */ 250/**
247typedef enum { 251 * isolate_migratepages_range() - isolate all migrate-able pages in range.
248 ISOLATE_ABORT, /* Abort compaction now */ 252 * @zone: Zone pages are in.
249 ISOLATE_NONE, /* No pages isolated, continue scanning */ 253 * @cc: Compaction control structure.
250 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 254 * @low_pfn: The first PFN of the range.
251} isolate_migrate_t; 255 * @end_pfn: The one-past-the-last PFN of the range.
252 256 *
253/* 257 * Isolate all pages that can be migrated from the range specified by
254 * Isolate all pages that can be migrated from the block pointed to by 258 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
 255 * the migrate scanner within compact_control. 259 * pending, otherwise PFN of the first page that was not scanned
 260 * (which may be less than, equal to or greater than end_pfn).
261 *
262 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
263 * zero.
264 *
265 * Apart from cc->migratepages and cc->nr_migratetypes this function
266 * does not modify any cc's fields, in particular it does not modify
267 * (or read for that matter) cc->migrate_pfn.
256 */ 268 */
257static isolate_migrate_t isolate_migratepages(struct zone *zone, 269unsigned long
258 struct compact_control *cc) 270isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
271 unsigned long low_pfn, unsigned long end_pfn)
259{ 272{
260 unsigned long low_pfn, end_pfn;
261 unsigned long last_pageblock_nr = 0, pageblock_nr; 273 unsigned long last_pageblock_nr = 0, pageblock_nr;
262 unsigned long nr_scanned = 0, nr_isolated = 0; 274 unsigned long nr_scanned = 0, nr_isolated = 0;
263 struct list_head *migratelist = &cc->migratepages; 275 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; 276 isolate_mode_t mode = 0;
265 277 struct lruvec *lruvec;
266 /* Do not scan outside zone boundaries */ 278 unsigned long flags;
267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 279 bool locked;
268
269 /* Only scan within a pageblock boundary */
270 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
271
272 /* Do not cross the free scanner or scan within a memory hole */
273 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
274 cc->migrate_pfn = end_pfn;
275 return ISOLATE_NONE;
276 }
277 280
278 /* 281 /*
279 * Ensure that there are not too many pages isolated from the LRU 282 * Ensure that there are not too many pages isolated from the LRU
@@ -283,35 +286,32 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
283 while (unlikely(too_many_isolated(zone))) { 286 while (unlikely(too_many_isolated(zone))) {
284 /* async migration should just abort */ 287 /* async migration should just abort */
285 if (!cc->sync) 288 if (!cc->sync)
286 return ISOLATE_ABORT; 289 return 0;
287 290
288 congestion_wait(BLK_RW_ASYNC, HZ/10); 291 congestion_wait(BLK_RW_ASYNC, HZ/10);
289 292
290 if (fatal_signal_pending(current)) 293 if (fatal_signal_pending(current))
291 return ISOLATE_ABORT; 294 return 0;
292 } 295 }
293 296
294 /* Time to isolate some pages for migration */ 297 /* Time to isolate some pages for migration */
295 cond_resched(); 298 cond_resched();
296 spin_lock_irq(&zone->lru_lock); 299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
297 for (; low_pfn < end_pfn; low_pfn++) { 301 for (; low_pfn < end_pfn; low_pfn++) {
298 struct page *page; 302 struct page *page;
299 bool locked = true;
300 303
301 /* give a chance to irqs before checking need_resched() */ 304 /* give a chance to irqs before checking need_resched() */
302 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
303 spin_unlock_irq(&zone->lru_lock); 306 spin_unlock_irqrestore(&zone->lru_lock, flags);
304 locked = false; 307 locked = false;
305 } 308 }
306 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 309
307 if (locked) 310 /* Check if it is ok to still hold the lock */
308 spin_unlock_irq(&zone->lru_lock); 311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
309 cond_resched(); 312 locked, cc);
310 spin_lock_irq(&zone->lru_lock); 313 if (!locked)
311 if (fatal_signal_pending(current)) 314 break;
312 break;
313 } else if (!locked)
314 spin_lock_irq(&zone->lru_lock);
315 315
316 /* 316 /*
317 * migrate_pfn does not necessarily start aligned to a 317 * migrate_pfn does not necessarily start aligned to a
@@ -351,7 +351,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
351 */ 351 */
352 pageblock_nr = low_pfn >> pageblock_order; 352 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 353 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { 354 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 355 low_pfn += pageblock_nr_pages;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
357 last_pageblock_nr = pageblock_nr; 357 last_pageblock_nr = pageblock_nr;
@@ -374,14 +374,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
374 if (!cc->sync) 374 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 375 mode |= ISOLATE_ASYNC_MIGRATE;
376 376
377 lruvec = mem_cgroup_page_lruvec(page, zone);
378
377 /* Try isolate the page */ 379 /* Try isolate the page */
378 if (__isolate_lru_page(page, mode, 0) != 0) 380 if (__isolate_lru_page(page, mode) != 0)
379 continue; 381 continue;
380 382
381 VM_BUG_ON(PageTransCompound(page)); 383 VM_BUG_ON(PageTransCompound(page));
382 384
383 /* Successfully isolated */ 385 /* Successfully isolated */
384 del_page_from_lru_list(zone, page, page_lru(page)); 386 del_page_from_lru_list(page, lruvec, page_lru(page));
385 list_add(&page->lru, migratelist); 387 list_add(&page->lru, migratelist);
386 cc->nr_migratepages++; 388 cc->nr_migratepages++;
387 nr_isolated++; 389 nr_isolated++;
@@ -393,14 +395,167 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
393 } 395 }
394 } 396 }
395 397
396 acct_isolated(zone, cc); 398 acct_isolated(zone, locked, cc);
397 399
398 spin_unlock_irq(&zone->lru_lock); 400 if (locked)
399 cc->migrate_pfn = low_pfn; 401 spin_unlock_irqrestore(&zone->lru_lock, flags);
400 402
401 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
402 404
403 return ISOLATE_SUCCESS; 405 return low_pfn;
406}
407
408#endif /* CONFIG_COMPACTION || CONFIG_CMA */
409#ifdef CONFIG_COMPACTION
410
411/* Returns true if the page is within a block suitable for migration to */
412static bool suitable_migration_target(struct page *page)
413{
414
415 int migratetype = get_pageblock_migratetype(page);
416
417 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
418 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
419 return false;
420
421 /* If the page is a large free page, then allow migration */
422 if (PageBuddy(page) && page_order(page) >= pageblock_order)
423 return true;
424
425 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
426 if (migrate_async_suitable(migratetype))
427 return true;
428
429 /* Otherwise skip the block */
430 return false;
431}
432
433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/*
448 * Based on information in the current compact_control, find blocks
449 * suitable for isolating free pages from and then isolate them.
450 */
451static void isolate_freepages(struct zone *zone,
452 struct compact_control *cc)
453{
454 struct page *page;
455 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
456 unsigned long flags;
457 int nr_freepages = cc->nr_freepages;
458 struct list_head *freelist = &cc->freepages;
459
460 /*
461 * Initialise the free scanner. The starting point is where we last
462 * scanned from (or the end of the zone if starting). The low point
463 * is the end of the pageblock the migration scanner is using.
464 */
465 pfn = cc->free_pfn;
466 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
467
468 /*
469 * Take care that if the migration scanner is at the end of the zone
470 * that the free scanner does not accidentally move to the next zone
471 * in the next isolation cycle.
472 */
473 high_pfn = min(low_pfn, pfn);
474
475 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
476
477 /*
478 * Isolate free pages until enough are available to migrate the
479 * pages on cc->migratepages. We stop searching if the migrate
480 * and free page scanners meet or enough free pages are isolated.
481 */
482 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
483 pfn -= pageblock_nr_pages) {
484 unsigned long isolated;
485
486 if (!pfn_valid(pfn))
487 continue;
488
489 /*
490 * Check for overlapping nodes/zones. It's possible on some
491 * configurations to have a setup like
492 * node0 node1 node0
493 * i.e. it's possible that all pages within a zones range of
494 * pages do not belong to a single zone.
495 */
496 page = pfn_to_page(pfn);
497 if (page_zone(page) != zone)
498 continue;
499
500 /* Check the block is suitable for migration */
501 if (!suitable_migration_target(page))
502 continue;
503
504 /*
505 * Found a block suitable for isolating free pages from. Now
506 * we disabled interrupts, double check things are ok and
507 * isolate the pages. This is to minimise the time IRQs
508 * are disabled
509 */
510 isolated = 0;
511
512 /*
 513 * The zone lock must be held to isolate freepages.
 514 * Unfortunately this is a very coarse lock and can be
515 * heavily contended if there are parallel allocations
516 * or parallel compactions. For async compaction do not
517 * spin on the lock
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
521 if (suitable_migration_target(page)) {
522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
523 isolated = isolate_freepages_block(pfn, end_pfn,
524 freelist, false);
525 nr_freepages += isolated;
526 }
527 spin_unlock_irqrestore(&zone->lock, flags);
528
529 /*
530 * Record the highest PFN we isolated pages from. When next
531 * looking for free pages, the search will restart here as
532 * page migration may have returned some pages to the allocator
533 */
534 if (isolated) {
535 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 }
547 }
548
549 /* split_free_page does not map the pages */
550 map_pages(freelist);
551
552 cc->free_pfn = high_pfn;
553 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
404} 559}
405 560
406/* 561/*
@@ -449,6 +604,44 @@ static void update_nr_listpages(struct compact_control *cc)
449 cc->nr_freepages = nr_freepages; 604 cc->nr_freepages = nr_freepages;
450} 605}
451 606
607/* possible outcome of isolate_migratepages */
608typedef enum {
609 ISOLATE_ABORT, /* Abort compaction now */
610 ISOLATE_NONE, /* No pages isolated, continue scanning */
611 ISOLATE_SUCCESS, /* Pages isolated, migrate */
612} isolate_migrate_t;
613
614/*
615 * Isolate all pages that can be migrated from the block pointed to by
616 * the migrate scanner within compact_control.
617 */
618static isolate_migrate_t isolate_migratepages(struct zone *zone,
619 struct compact_control *cc)
620{
621 unsigned long low_pfn, end_pfn;
622
623 /* Do not scan outside zone boundaries */
624 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
625
626 /* Only scan within a pageblock boundary */
627 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
628
629 /* Do not cross the free scanner or scan within a memory hole */
630 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
631 cc->migrate_pfn = end_pfn;
632 return ISOLATE_NONE;
633 }
634
635 /* Perform the isolation */
636 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
637 if (!low_pfn)
638 return ISOLATE_ABORT;
639
640 cc->migrate_pfn = low_pfn;
641
642 return ISOLATE_SUCCESS;
643}
644
452static int compact_finished(struct zone *zone, 645static int compact_finished(struct zone *zone,
453 struct compact_control *cc) 646 struct compact_control *cc)
454{ 647{
@@ -458,8 +651,26 @@ static int compact_finished(struct zone *zone,
458 if (fatal_signal_pending(current)) 651 if (fatal_signal_pending(current))
459 return COMPACT_PARTIAL; 652 return COMPACT_PARTIAL;
460 653
461 /* Compaction run completes if the migrate and free scanner meet */ 654 /*
462 if (cc->free_pfn <= cc->migrate_pfn) 655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) {
662 /* We started partway through; restart at the end. */
663 unsigned long free_pfn = start_free_pfn(zone);
664 zone->compact_cached_free_pfn = free_pfn;
665 cc->free_pfn = free_pfn;
666 cc->wrapped = 1;
667 return COMPACT_CONTINUE;
668 }
669 return COMPACT_COMPLETE;
670 }
671
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
463 return COMPACT_COMPLETE; 674 return COMPACT_COMPLETE;
464 675
465 /* 676 /*
@@ -557,8 +768,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
557 768
558 /* Setup to move all movable pages to the end of the zone */ 769 /* Setup to move all movable pages to the end of the zone */
559 cc->migrate_pfn = zone->zone_start_pfn; 770 cc->migrate_pfn = zone->zone_start_pfn;
560 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 771
561 cc->free_pfn &= ~(pageblock_nr_pages-1); 772 if (cc->order > 0) {
773 /* Incremental compaction. Start where the last one stopped. */
774 cc->free_pfn = zone->compact_cached_free_pfn;
775 cc->start_free_pfn = cc->free_pfn;
776 } else {
777 /* Order == -1 starts at the end of the zone. */
778 cc->free_pfn = start_free_pfn(zone);
779 }
562 780
563 migrate_prep_local(); 781 migrate_prep_local();
564 782
@@ -594,8 +812,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
594 if (err) { 812 if (err) {
595 putback_lru_pages(&cc->migratepages); 813 putback_lru_pages(&cc->migratepages);
596 cc->nr_migratepages = 0; 814 cc->nr_migratepages = 0;
815 if (err == -ENOMEM) {
816 ret = COMPACT_PARTIAL;
817 goto out;
818 }
597 } 819 }
598
599 } 820 }
600 821
601out: 822out:
@@ -608,7 +829,7 @@ out:
608 829
609static unsigned long compact_zone_order(struct zone *zone, 830static unsigned long compact_zone_order(struct zone *zone,
610 int order, gfp_t gfp_mask, 831 int order, gfp_t gfp_mask,
611 bool sync) 832 bool sync, bool *contended)
612{ 833{
613 struct compact_control cc = { 834 struct compact_control cc = {
614 .nr_freepages = 0, 835 .nr_freepages = 0,
@@ -617,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
617 .migratetype = allocflags_to_migratetype(gfp_mask), 838 .migratetype = allocflags_to_migratetype(gfp_mask),
618 .zone = zone, 839 .zone = zone,
619 .sync = sync, 840 .sync = sync,
841 .contended = contended,
620 }; 842 };
621 INIT_LIST_HEAD(&cc.freepages); 843 INIT_LIST_HEAD(&cc.freepages);
622 INIT_LIST_HEAD(&cc.migratepages); 844 INIT_LIST_HEAD(&cc.migratepages);
@@ -638,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
638 */ 860 */
639unsigned long try_to_compact_pages(struct zonelist *zonelist, 861unsigned long try_to_compact_pages(struct zonelist *zonelist,
640 int order, gfp_t gfp_mask, nodemask_t *nodemask, 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
641 bool sync) 863 bool sync, bool *contended)
642{ 864{
643 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
644 int may_enter_fs = gfp_mask & __GFP_FS; 866 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -662,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
662 nodemask) { 884 nodemask) {
663 int status; 885 int status;
664 886
665 status = compact_zone_order(zone, order, gfp_mask, sync); 887 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended);
666 rc = max(status, rc); 889 rc = max(status, rc);
667 890
668 /* If a normal allocation would succeed, stop compacting */ 891 /* If a normal allocation would succeed, stop compacting */
@@ -698,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
698 if (cc->order > 0) { 921 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order, 922 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0); 923 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed) 924 if (ok && cc->order >= zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1; 925 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */ 926 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync) 927 else if (!ok && cc->sync)
@@ -795,3 +1018,5 @@ void compaction_unregister_node(struct node *node)
795 return device_remove_file(&node->dev, &dev_attr_compact); 1018 return device_remove_file(&node->dev, &dev_attr_compact);
796} 1019}
797#endif /* CONFIG_SYSFS && CONFIG_NUMA */ 1020#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1021
1022#endif /* CONFIG_COMPACTION */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
 111 start_index, 106 * Ignore return value because fadvise() shall return
 112 nrpages); 107 * success even if the filesystem can't retrieve a hint.
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..384344575c37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/syscalls.h>
33#include <linux/cpuset.h> 32#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
@@ -1413,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1413 retval = filemap_write_and_wait_range(mapping, pos, 1412 retval = filemap_write_and_wait_range(mapping, pos,
1414 pos + iov_length(iov, nr_segs) - 1); 1413 pos + iov_length(iov, nr_segs) - 1);
1415 if (!retval) { 1414 if (!retval) {
1416 struct blk_plug plug;
1417
1418 blk_start_plug(&plug);
1419 retval = mapping->a_ops->direct_IO(READ, iocb, 1415 retval = mapping->a_ops->direct_IO(READ, iocb,
1420 iov, pos, nr_segs); 1416 iov, pos, nr_segs);
1421 blk_finish_plug(&plug);
1422 } 1417 }
1423 if (retval > 0) { 1418 if (retval > 0) {
1424 *ppos = pos + retval; 1419 *ppos = pos + retval;
@@ -1478,44 +1473,6 @@ out:
1478} 1473}
1479EXPORT_SYMBOL(generic_file_aio_read); 1474EXPORT_SYMBOL(generic_file_aio_read);
1480 1475
1481static ssize_t
1482do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr)
1484{
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL;
1487
1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0;
1490}
1491
1492SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493{
1494 ssize_t ret;
1495 struct file *file;
1496
1497 ret = -EBADF;
1498 file = fget(fd);
1499 if (file) {
1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len);
1506 }
1507 fput(file);
1508 }
1509 return ret;
1510}
1511#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513{
1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515}
1516SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517#endif
1518
1519#ifdef CONFIG_MMU 1476#ifdef CONFIG_MMU
1520/** 1477/**
1521 * page_cache_read - adds requested page to the page cache if not already there 1478 * page_cache_read - adds requested page to the page cache if not already there
@@ -1751,8 +1708,35 @@ page_not_uptodate:
1751} 1708}
1752EXPORT_SYMBOL(filemap_fault); 1709EXPORT_SYMBOL(filemap_fault);
1753 1710
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{
1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1715 int ret = VM_FAULT_LOCKED;
1716
1717 sb_start_pagefault(inode->i_sb);
1718 file_update_time(vma->vm_file);
1719 lock_page(page);
1720 if (page->mapping != inode->i_mapping) {
1721 unlock_page(page);
1722 ret = VM_FAULT_NOPAGE;
1723 goto out;
1724 }
1725 /*
1726 * We mark the page dirty already here so that when freeze is in
1727 * progress, we are guaranteed that writeback during freezing will
1728 * see the dirty page and writeprotect it again.
1729 */
1730 set_page_dirty(page);
1731out:
1732 sb_end_pagefault(inode->i_sb);
1733 return ret;
1734}
1735EXPORT_SYMBOL(filemap_page_mkwrite);
1736
1754const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1755 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite,
1756}; 1740};
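
Filesystems that use the generic fault path can pick up the same freeze protection by pointing their vm_operations at the new helper; a minimal sketch for a hypothetical filesystem (foofs is not part of this patch):

	static const struct vm_operations_struct foofs_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= filemap_page_mkwrite,	/* handles sb_start_pagefault() and dirtying */
	};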
1757 1741
1758/* This is used for a general mmap of a disk file */ 1742/* This is used for a general mmap of a disk file */
@@ -1938,71 +1922,6 @@ struct page *read_cache_page(struct address_space *mapping,
1938} 1922}
1939EXPORT_SYMBOL(read_cache_page); 1923EXPORT_SYMBOL(read_cache_page);
1940 1924
1941/*
1942 * The logic we want is
1943 *
1944 * if suid or (sgid and xgrp)
1945 * remove privs
1946 */
1947int should_remove_suid(struct dentry *dentry)
1948{
1949 umode_t mode = dentry->d_inode->i_mode;
1950 int kill = 0;
1951
1952 /* suid always must be killed */
1953 if (unlikely(mode & S_ISUID))
1954 kill = ATTR_KILL_SUID;
1955
1956 /*
1957 * sgid without any exec bits is just a mandatory locking mark; leave
1958 * it alone. If some exec bits are set, it's a real sgid; kill it.
1959 */
1960 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1961 kill |= ATTR_KILL_SGID;
1962
1963 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1964 return kill;
1965
1966 return 0;
1967}
1968EXPORT_SYMBOL(should_remove_suid);
1969
1970static int __remove_suid(struct dentry *dentry, int kill)
1971{
1972 struct iattr newattrs;
1973
1974 newattrs.ia_valid = ATTR_FORCE | kill;
1975 return notify_change(dentry, &newattrs);
1976}
1977
1978int file_remove_suid(struct file *file)
1979{
1980 struct dentry *dentry = file->f_path.dentry;
1981 struct inode *inode = dentry->d_inode;
1982 int killsuid;
1983 int killpriv;
1984 int error = 0;
1985
1986 /* Fast path for nothing security related */
1987 if (IS_NOSEC(inode))
1988 return 0;
1989
1990 killsuid = should_remove_suid(dentry);
1991 killpriv = security_inode_need_killpriv(dentry);
1992
1993 if (killpriv < 0)
1994 return killpriv;
1995 if (killpriv)
1996 error = security_inode_killpriv(dentry);
1997 if (!error && killsuid)
1998 error = __remove_suid(dentry, killsuid);
1999 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2000 inode->i_flags |= S_NOSEC;
2001
2002 return error;
2003}
2004EXPORT_SYMBOL(file_remove_suid);
2005
2006static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1925static size_t __iovec_copy_from_user_inatomic(char *vaddr,
2007 const struct iovec *iov, size_t base, size_t bytes) 1926 const struct iovec *iov, size_t base, size_t bytes)
2008{ 1927{
@@ -2511,8 +2430,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511 count = ocount; 2430 count = ocount;
2512 pos = *ppos; 2431 pos = *ppos;
2513 2432
2514 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2515
2516 /* We can write back this queue in page reclaim */ 2433 /* We can write back this queue in page reclaim */
2517 current->backing_dev_info = mapping->backing_dev_info; 2434 current->backing_dev_info = mapping->backing_dev_info;
2518 written = 0; 2435 written = 0;
@@ -2528,7 +2445,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2528 if (err) 2445 if (err)
2529 goto out; 2446 goto out;
2530 2447
2531 file_update_time(file); 2448 err = file_update_time(file);
2449 if (err)
2450 goto out;
2532 2451
2533 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2452 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2534 if (unlikely(file->f_flags & O_DIRECT)) { 2453 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2604,13 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2604{ 2523{
2605 struct file *file = iocb->ki_filp; 2524 struct file *file = iocb->ki_filp;
2606 struct inode *inode = file->f_mapping->host; 2525 struct inode *inode = file->f_mapping->host;
2607 struct blk_plug plug;
2608 ssize_t ret; 2526 ssize_t ret;
2609 2527
2610 BUG_ON(iocb->ki_pos != pos); 2528 BUG_ON(iocb->ki_pos != pos);
2611 2529
2530 sb_start_write(inode->i_sb);
2612 mutex_lock(&inode->i_mutex); 2531 mutex_lock(&inode->i_mutex);
2613 blk_start_plug(&plug);
2614 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2532 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2615 mutex_unlock(&inode->i_mutex); 2533 mutex_unlock(&inode->i_mutex);
2616 2534
@@ -2621,7 +2539,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2621 if (err < 0 && ret > 0) 2539 if (err < 0 && ret > 0)
2622 ret = err; 2540 ret = err;
2623 } 2541 }
2624 blk_finish_plug(&plug); 2542 sb_end_write(inode->i_sb);
2625 return ret; 2543 return ret;
2626} 2544}
2627EXPORT_SYMBOL(generic_file_aio_write); 2545EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb31132229..13e013b1270c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -304,6 +304,7 @@ out:
304 304
305static const struct vm_operations_struct xip_file_vm_ops = { 305static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 306 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite,
307}; 308};
308 309
309int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 310int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
401 loff_t pos; 402 loff_t pos;
402 ssize_t ret; 403 ssize_t ret;
403 404
405 sb_start_write(inode->i_sb);
406
404 mutex_lock(&inode->i_mutex); 407 mutex_lock(&inode->i_mutex);
405 408
406 if (!access_ok(VERIFY_READ, buf, len)) { 409 if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
411 pos = *ppos; 414 pos = *ppos;
412 count = len; 415 count = len;
413 416
414 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
415
416 /* We can write back this queue in page reclaim */ 417 /* We can write back this queue in page reclaim */
417 current->backing_dev_info = mapping->backing_dev_info; 418 current->backing_dev_info = mapping->backing_dev_info;
418 419
@@ -426,7 +427,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
426 if (ret) 427 if (ret)
427 goto out_backing; 428 goto out_backing;
428 429
429 file_update_time(filp); 430 ret = file_update_time(filp);
431 if (ret)
432 goto out_backing;
430 433
431 ret = __xip_file_write (filp, buf, count, pos, ppos); 434 ret = __xip_file_write (filp, buf, count, pos, ppos);
432 435
@@ -434,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
434 current->backing_dev_info = NULL; 437 current->backing_dev_info = NULL;
435 out_up: 438 out_up:
436 mutex_unlock(&inode->i_mutex); 439 mutex_unlock(&inode->i_mutex);
440 sb_end_write(inode->i_sb);
437 return ret; 441 return ret;
438} 442}
439EXPORT_SYMBOL_GPL(xip_file_write); 443EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..6b3e71a2cd48
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mman.h>
15#include <linux/swap.h>
16#include <linux/swapops.h>
17#include <linux/security.h>
18#include <linux/module.h>
19#include <linux/debugfs.h>
20#include <linux/frontswap.h>
21#include <linux/swapfile.h>
22
23/*
24 * frontswap_ops is set by frontswap_register_ops to contain the pointers
25 * to the frontswap "backend" implementation functions.
26 */
27static struct frontswap_ops frontswap_ops __read_mostly;
28
29/*
30 * This global enablement flag reduces overhead on systems where frontswap_ops
31 * has not been registered, so is preferred to the slower alternative: a
32 * function call that checks a non-global.
33 */
34bool frontswap_enabled __read_mostly;
35EXPORT_SYMBOL(frontswap_enabled);
36
37/*
38 * If enabled, frontswap_store will return failure even on success. As
39 * a result, the swap subsystem will always write the page to swap, in
40 * effect converting frontswap into a writethrough cache. In this mode,
41 * there is no direct reduction in swap writes, but a frontswap backend
42 * can unilaterally "reclaim" any pages in use with no data loss, thus
 43 * providing increased control over maximum memory usage due to frontswap.
44 */
45static bool frontswap_writethrough_enabled __read_mostly;
46
47#ifdef CONFIG_DEBUG_FS
48/*
49 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
50 * properly configured). These are for information only so are not protected
51 * against increment races.
52 */
53static u64 frontswap_loads;
54static u64 frontswap_succ_stores;
55static u64 frontswap_failed_stores;
56static u64 frontswap_invalidates;
57
58static inline void inc_frontswap_loads(void) {
59 frontswap_loads++;
60}
61static inline void inc_frontswap_succ_stores(void) {
62 frontswap_succ_stores++;
63}
64static inline void inc_frontswap_failed_stores(void) {
65 frontswap_failed_stores++;
66}
67static inline void inc_frontswap_invalidates(void) {
68 frontswap_invalidates++;
69}
70#else
71static inline void inc_frontswap_loads(void) { }
72static inline void inc_frontswap_succ_stores(void) { }
73static inline void inc_frontswap_failed_stores(void) { }
74static inline void inc_frontswap_invalidates(void) { }
75#endif
76/*
 77 * Register operations for frontswap, returning the previous ops, thus allowing
78 * detection of multiple backends and possible nesting.
79 */
80struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
81{
82 struct frontswap_ops old = frontswap_ops;
83
84 frontswap_ops = *ops;
85 frontswap_enabled = true;
86 return old;
87}
88EXPORT_SYMBOL(frontswap_register_ops);
89
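
A backend registers by filling in a struct frontswap_ops and handing it to frontswap_register_ops(); a minimal sketch, assuming the hook signatures implied by the calls below (the my_* names are hypothetical, and the trivial store always fails):

	#include <linux/module.h>
	#include <linux/frontswap.h>

	static void my_init(unsigned type) { /* per-swap-device setup */ }
	static int my_store(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* failure: swap falls back to the real device */
	}
	static int my_load(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* nothing stored, nothing to load */
	}
	static void my_invalidate_page(unsigned type, pgoff_t offset) { }
	static void my_invalidate_area(unsigned type) { }

	static struct frontswap_ops my_ops = {
		.init		 = my_init,
		.store		 = my_store,
		.load		 = my_load,
		.invalidate_page = my_invalidate_page,
		.invalidate_area = my_invalidate_area,
	};

	static int __init my_backend_init(void)
	{
		struct frontswap_ops old = frontswap_register_ops(&my_ops);

		(void)old;	/* a real backend would chain to any previously registered ops */
		frontswap_writethrough(false);	/* keep the default behaviour (see above) */
		return 0;
	}
	module_init(my_backend_init);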
90/*
91 * Enable/disable frontswap writethrough (see above).
92 */
93void frontswap_writethrough(bool enable)
94{
95 frontswap_writethrough_enabled = enable;
96}
97EXPORT_SYMBOL(frontswap_writethrough);
98
99/*
100 * Called when a swap device is swapon'd.
101 */
102void __frontswap_init(unsigned type)
103{
104 struct swap_info_struct *sis = swap_info[type];
105
106 BUG_ON(sis == NULL);
107 if (sis->frontswap_map == NULL)
108 return;
109 frontswap_ops.init(type);
110}
111EXPORT_SYMBOL(__frontswap_init);
112
113static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
114{
115 frontswap_clear(sis, offset);
116 atomic_dec(&sis->frontswap_pages);
117}
118
119/*
120 * "Store" data from a page to frontswap and associate it with the page's
121 * swaptype and offset. Page must be locked and in the swap cache.
122 * If frontswap already contains a page with matching swaptype and
123 * offset, the frontswap implementation may either overwrite the data and
124 * return success or invalidate the page from frontswap and return failure.
125 */
126int __frontswap_store(struct page *page)
127{
128 int ret = -1, dup = 0;
129 swp_entry_t entry = { .val = page_private(page), };
130 int type = swp_type(entry);
131 struct swap_info_struct *sis = swap_info[type];
132 pgoff_t offset = swp_offset(entry);
133
134 BUG_ON(!PageLocked(page));
135 BUG_ON(sis == NULL);
136 if (frontswap_test(sis, offset))
137 dup = 1;
138 ret = frontswap_ops.store(type, offset, page);
139 if (ret == 0) {
140 frontswap_set(sis, offset);
141 inc_frontswap_succ_stores();
142 if (!dup)
143 atomic_inc(&sis->frontswap_pages);
144 } else {
 145 /*
 146 * A failed dup always results in an automatic invalidate of
 147 * the (older) page from frontswap.
 148 */
149 inc_frontswap_failed_stores();
150 if (dup)
151 __frontswap_clear(sis, offset);
152 }
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Get" data from frontswap associated with swaptype and offset that were
162 * specified when the data was put to frontswap and use it to fill the
163 * specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "get" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 frontswap_ops.invalidate_page(type, offset);
194 __frontswap_clear(sis, offset);
195 inc_frontswap_invalidates();
196 }
197}
198EXPORT_SYMBOL(__frontswap_invalidate_page);
199
200/*
201 * Invalidate all data from frontswap associated with all offsets for the
202 * specified swaptype.
203 */
204void __frontswap_invalidate_area(unsigned type)
205{
206 struct swap_info_struct *sis = swap_info[type];
207
208 BUG_ON(sis == NULL);
209 if (sis->frontswap_map == NULL)
210 return;
211 frontswap_ops.invalidate_area(type);
212 atomic_set(&sis->frontswap_pages, 0);
213 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
214}
215EXPORT_SYMBOL(__frontswap_invalidate_area);
216
217static unsigned long __frontswap_curr_pages(void)
218{
219 int type;
220 unsigned long totalpages = 0;
221 struct swap_info_struct *si = NULL;
222
223 assert_spin_locked(&swap_lock);
224 for (type = swap_list.head; type >= 0; type = si->next) {
225 si = swap_info[type];
226 totalpages += atomic_read(&si->frontswap_pages);
227 }
228 return totalpages;
229}
230
231static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
232 int *swapid)
233{
234 int ret = -EINVAL;
235 struct swap_info_struct *si = NULL;
236 int si_frontswap_pages;
237 unsigned long total_pages_to_unuse = total;
238 unsigned long pages = 0, pages_to_unuse = 0;
239 int type;
240
241 assert_spin_locked(&swap_lock);
242 for (type = swap_list.head; type >= 0; type = si->next) {
243 si = swap_info[type];
244 si_frontswap_pages = atomic_read(&si->frontswap_pages);
245 if (total_pages_to_unuse < si_frontswap_pages) {
246 pages = pages_to_unuse = total_pages_to_unuse;
247 } else {
248 pages = si_frontswap_pages;
249 pages_to_unuse = 0; /* unuse all */
250 }
251 /* ensure there is enough RAM to fetch pages from frontswap */
252 if (security_vm_enough_memory_mm(current->mm, pages)) {
253 ret = -ENOMEM;
254 continue;
255 }
256 vm_unacct_memory(pages);
257 *unused = pages_to_unuse;
258 *swapid = type;
259 ret = 0;
260 break;
261 }
262
263 return ret;
264}
265
266static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse,
268 int *type)
269{
270 unsigned long total_pages = 0, total_pages_to_unuse;
271
272 assert_spin_locked(&swap_lock);
273
274 total_pages = __frontswap_curr_pages();
275 if (total_pages <= target_pages) {
276 /* Nothing to do */
277 *pages_to_unuse = 0;
278 return 0;
279 }
280 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
282}
283
284/*
285 * Frontswap, like a true swap device, may unnecessarily retain pages
286 * under certain circumstances; "shrink" frontswap is essentially a
287 * "partial swapoff" and works by calling try_to_unuse to attempt to
 288 * unuse enough frontswap pages to -- subject to memory
289 * constraints -- reduce the number of pages in frontswap to the
290 * number given in the parameter target_pages.
291 */
292void frontswap_shrink(unsigned long target_pages)
293{
294 unsigned long pages_to_unuse = 0;
295 int type, ret;
296
297 /*
298 * we don't want to hold swap_lock while doing a very
299 * lengthy try_to_unuse, but swap_list may change
300 * so restart scan from swap_list.head each time
301 */
302 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
304 spin_unlock(&swap_lock);
305 if (ret == 0 && pages_to_unuse)
306 try_to_unuse(type, true, pages_to_unuse);
307 return;
308}
309EXPORT_SYMBOL(frontswap_shrink);
310
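
As a usage example, a backend or management policy could halve its footprint under pressure (a hypothetical policy, not part of this patch):

	static void my_backend_relieve_pressure(void)
	{
		unsigned long cur = frontswap_curr_pages();

		if (cur)
			frontswap_shrink(cur / 2);	/* try to unuse roughly half of the stored pages */
	}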
311/*
312 * Count and return the number of frontswap pages across all
313 * swap devices. This is exported so that backend drivers can
314 * determine current usage without reading debugfs.
315 */
316unsigned long frontswap_curr_pages(void)
317{
318 unsigned long totalpages = 0;
319
320 spin_lock(&swap_lock);
321 totalpages = __frontswap_curr_pages();
322 spin_unlock(&swap_lock);
323
324 return totalpages;
325}
326EXPORT_SYMBOL(frontswap_curr_pages);
327
328static int __init init_frontswap(void)
329{
330#ifdef CONFIG_DEBUG_FS
331 struct dentry *root = debugfs_create_dir("frontswap", NULL);
332 if (root == NULL)
333 return -ENXIO;
334 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
335 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
336 debugfs_create_u64("failed_stores", S_IRUGO, root,
337 &frontswap_failed_stores);
338 debugfs_create_u64("invalidates", S_IRUGO,
339 root, &frontswap_invalidates);
340#endif
341 return 0;
342}
343
344module_init(init_frontswap);
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..57c4b9309015 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
636 unsigned long haddr, pmd_t *pmd, 636 unsigned long haddr, pmd_t *pmd,
637 struct page *page) 637 struct page *page)
638{ 638{
639 int ret = 0;
640 pgtable_t pgtable; 639 pgtable_t pgtable;
641 640
642 VM_BUG_ON(!PageCompound(page)); 641 VM_BUG_ON(!PageCompound(page));
643 pgtable = pte_alloc_one(mm, haddr); 642 pgtable = pte_alloc_one(mm, haddr);
644 if (unlikely(!pgtable)) { 643 if (unlikely(!pgtable))
645 mem_cgroup_uncharge_page(page);
646 put_page(page);
647 return VM_FAULT_OOM; 644 return VM_FAULT_OOM;
648 }
649 645
650 clear_huge_page(page, haddr, HPAGE_PMD_NR); 646 clear_huge_page(page, haddr, HPAGE_PMD_NR);
651 __SetPageUptodate(page); 647 __SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
675 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
676 } 672 }
677 673
678 return ret; 674 return 0;
679} 675}
680 676
681static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
724 put_page(page); 720 put_page(page);
725 goto out; 721 goto out;
726 } 722 }
723 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
724 page))) {
725 mem_cgroup_uncharge_page(page);
726 put_page(page);
727 goto out;
728 }
727 729
728 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); 730 return 0;
729 } 731 }
730out: 732out:
731 /* 733 /*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 count_vm_event(THP_FAULT_FALLBACK); 952 count_vm_event(THP_FAULT_FALLBACK);
951 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 953 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
952 pmd, orig_pmd, page, haddr); 954 pmd, orig_pmd, page, haddr);
955 if (ret & VM_FAULT_OOM)
956 split_huge_page(page);
953 put_page(page); 957 put_page(page);
954 goto out; 958 goto out;
955 } 959 }
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
957 961
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 962 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 963 put_page(new_page);
964 split_huge_page(page);
960 put_page(page); 965 put_page(page);
961 ret |= VM_FAULT_OOM; 966 ret |= VM_FAULT_OOM;
962 goto out; 967 goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
968 spin_lock(&mm->page_table_lock); 973 spin_lock(&mm->page_table_lock);
969 put_page(page); 974 put_page(page);
970 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 975 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock);
971 mem_cgroup_uncharge_page(new_page); 977 mem_cgroup_uncharge_page(new_page);
972 put_page(new_page); 978 put_page(new_page);
979 goto out;
973 } else { 980 } else {
974 pmd_t entry; 981 pmd_t entry;
975 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
1224{ 1231{
1225 int i; 1232 int i;
1226 struct zone *zone = page_zone(page); 1233 struct zone *zone = page_zone(page);
1234 struct lruvec *lruvec;
1227 int tail_count = 0; 1235 int tail_count = 0;
1228 1236
1229 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1237 /* prevent PageLRU to go away from under us, and freeze lru stats */
1230 spin_lock_irq(&zone->lru_lock); 1238 spin_lock_irq(&zone->lru_lock);
1239 lruvec = mem_cgroup_page_lruvec(page, zone);
1240
1231 compound_lock(page); 1241 compound_lock(page);
1232 /* complete memcg works before add pages to LRU */ 1242 /* complete memcg works before add pages to LRU */
1233 mem_cgroup_split_huge_fixup(page); 1243 mem_cgroup_split_huge_fixup(page);
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
1302 BUG_ON(!PageDirty(page_tail)); 1312 BUG_ON(!PageDirty(page_tail));
1303 BUG_ON(!PageSwapBacked(page_tail)); 1313 BUG_ON(!PageSwapBacked(page_tail));
1304 1314
1305 1315 lru_add_page_tail(page, page_tail, lruvec);
1306 lru_add_page_tail(zone, page, page_tail);
1307 } 1316 }
1308 atomic_sub(tail_count, &page->_count); 1317 atomic_sub(tail_count, &page->_count);
1309 BUG_ON(atomic_read(&page->_count) <= 0); 1318 BUG_ON(atomic_read(&page->_count) <= 0);
1310 1319
1311 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1320 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1312 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1321 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1313 1322
1314 ClearPageCompound(page); 1323 ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b8ce6f450956..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
273 273
274 /* Locate each segment we overlap with, and count that overlap. */ 274 /* Locate each segment we overlap with, and count that overlap. */
275 list_for_each_entry(rg, head, link) { 275 list_for_each_entry(rg, head, link) {
276 int seg_from; 276 long seg_from;
277 int seg_to; 277 long seg_to;
278 278
279 if (rg->to <= f) 279 if (rg->to <= f)
280 continue; 280 continue;
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
532 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
533 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
534{ 534{
535 struct page *page; 535 struct page *page = NULL;
536 struct mempolicy *mpol; 536 struct mempolicy *mpol;
537 nodemask_t *nodemask; 537 nodemask_t *nodemask;
538 struct zonelist *zonelist; 538 struct zonelist *zonelist;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
 1944 * page[2].lru.next for storing cgroup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
@@ -2157,6 +2192,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2157 kref_get(&reservations->refs); 2192 kref_get(&reservations->refs);
2158} 2193}
2159 2194
2195static void resv_map_put(struct vm_area_struct *vma)
2196{
2197 struct resv_map *reservations = vma_resv_map(vma);
2198
2199 if (!reservations)
2200 return;
2201 kref_put(&reservations->refs, resv_map_release);
2202}
2203
2160static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2204static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2161{ 2205{
2162 struct hstate *h = hstate_vma(vma); 2206 struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2217,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2173 reserve = (end - start) - 2217 reserve = (end - start) -
2174 region_count(&reservations->regions, start, end); 2218 region_count(&reservations->regions, start, end);
2175 2219
2176 kref_put(&reservations->refs, resv_map_release); 2220 resv_map_put(vma);
2177 2221
2178 if (reserve) { 2222 if (reserve) {
2179 hugetlb_acct_memory(h, -reserve); 2223 hugetlb_acct_memory(h, -reserve);
@@ -2213,6 +2257,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2213 } 2257 }
2214 entry = pte_mkyoung(entry); 2258 entry = pte_mkyoung(entry);
2215 entry = pte_mkhuge(entry); 2259 entry = pte_mkhuge(entry);
2260 entry = arch_make_huge_pte(entry, vma, page, writable);
2216 2261
2217 return entry; 2262 return entry;
2218} 2263}
@@ -2298,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2298 return 0; 2343 return 0;
2299} 2344}
2300 2345
2301void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2302 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2303{ 2349{
2350 int force_flush = 0;
2304 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2305 unsigned long address; 2352 unsigned long address;
2306 pte_t *ptep; 2353 pte_t *ptep;
2307 pte_t pte; 2354 pte_t pte;
2308 struct page *page; 2355 struct page *page;
2309 struct page *tmp;
2310 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2311 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2312 2358
2313 /*
2314 * A page gathering list, protected by per file i_mmap_mutex. The
2315 * lock is used to avoid list corruption from multiple unmapping
2316 * of the same page since we are using page->lru.
2317 */
2318 LIST_HEAD(page_list);
2319
2320 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2321 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2322 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2323 2362
2363 tlb_start_vma(tlb, vma);
2324 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2325 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2326 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2327 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2360,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2360 } 2401 }
2361 2402
2362 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2363 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2364 set_page_dirty(page); 2406 set_page_dirty(page);
2365 list_add(&page->lru, &page_list);
2366 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2367 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2368 if (ref_page) 2413 if (ref_page)
2369 break; 2414 break;
2370 } 2415 }
2371 flush_tlb_range(vma, start, end);
2372 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2373 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
 2374 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2418 * mmu_gather ran out of room to batch pages, so we break out of
 2375 page_remove_rmap(page); 2419 * the PTE lock to avoid doing the potentially expensive TLB invalidate
2376 list_del(&page->lru); 2420 * and page-free while holding it.
2377 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2378 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2379} 2449}
2380 2450
2381void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2382 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2383{ 2453{
2384 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2385 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2386 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2387} 2462}
2388 2463
2389/* 2464/*
@@ -2428,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2428 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2429 */ 2504 */
2430 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2431 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2432 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2433 page);
2434 } 2508 }
2435 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2436 2510
@@ -2486,6 +2560,7 @@ retry_avoidcopy:
2486 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2487 2561
2488 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2489 page_cache_release(old_page); 2564 page_cache_release(old_page);
2490 2565
2491 /* 2566 /*
@@ -2498,7 +2573,6 @@ retry_avoidcopy:
2498 if (outside_reserve) { 2573 if (outside_reserve) {
2499 BUG_ON(huge_pte_none(pte)); 2574 BUG_ON(huge_pte_none(pte));
2500 if (unmap_ref_private(mm, vma, old_page, address)) { 2575 if (unmap_ref_private(mm, vma, old_page, address)) {
2501 BUG_ON(page_count(old_page) != 1);
2502 BUG_ON(huge_pte_none(pte)); 2576 BUG_ON(huge_pte_none(pte));
2503 spin_lock(&mm->page_table_lock); 2577 spin_lock(&mm->page_table_lock);
2504 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2578 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
@@ -2515,7 +2589,10 @@ retry_avoidcopy:
2515 2589
2516 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2517 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2518 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2519 } 2596 }
2520 2597
2521 /* 2598 /*
@@ -2633,7 +2710,11 @@ retry:
2633 goto out; 2710 goto out;
2634 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2635 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2636 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2637 goto out; 2718 goto out;
2638 } 2719 }
2639 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2670,7 +2751,7 @@ retry:
2670 */ 2751 */
2671 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2672 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2673 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2674 goto backout_unlocked; 2755 goto backout_unlocked;
2675 } 2756 }
2676 } 2757 }
@@ -2743,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2743 return 0; 2824 return 0;
2744 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2745 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2746 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2747 } 2828 }
2748 2829
2749 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2791,6 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2791 * so no worry about deadlock. 2872 * so no worry about deadlock.
2792 */ 2873 */
2793 page = pte_page(entry); 2874 page = pte_page(entry);
2875 get_page(page);
2794 if (page != pagecache_page) 2876 if (page != pagecache_page)
2795 lock_page(page); 2877 lock_page(page);
2796 2878
@@ -2822,6 +2904,7 @@ out_page_table_lock:
2822 } 2904 }
2823 if (page != pagecache_page) 2905 if (page != pagecache_page)
2824 unlock_page(page); 2906 unlock_page(page);
2907 put_page(page);
2825 2908
2826out_mutex: 2909out_mutex:
2827 mutex_unlock(&hugetlb_instantiation_mutex); 2910 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2948,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2948 } 3031 }
2949 } 3032 }
2950 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2951 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2952 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2953 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2954} 3042}
2955 3043
2956int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
@@ -2989,12 +3077,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2989 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3077 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2990 } 3078 }
2991 3079
2992 if (chg < 0) 3080 if (chg < 0) {
2993 return chg; 3081 ret = chg;
3082 goto out_err;
3083 }
2994 3084
2995 /* There must be enough pages in the subpool for the mapping */ 3085 /* There must be enough pages in the subpool for the mapping */
2996 if (hugepage_subpool_get_pages(spool, chg)) 3086 if (hugepage_subpool_get_pages(spool, chg)) {
2997 return -ENOSPC; 3087 ret = -ENOSPC;
3088 goto out_err;
3089 }
2998 3090
2999 /* 3091 /*
3000 * Check enough hugepages are available for the reservation. 3092 * Check enough hugepages are available for the reservation.
@@ -3003,7 +3095,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3003 ret = hugetlb_acct_memory(h, chg); 3095 ret = hugetlb_acct_memory(h, chg);
3004 if (ret < 0) { 3096 if (ret < 0) {
3005 hugepage_subpool_put_pages(spool, chg); 3097 hugepage_subpool_put_pages(spool, chg);
3006 return ret; 3098 goto out_err;
3007 } 3099 }
3008 3100
3009 /* 3101 /*
@@ -3020,6 +3112,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3020 if (!vma || vma->vm_flags & VM_MAYSHARE) 3112 if (!vma || vma->vm_flags & VM_MAYSHARE)
3021 region_add(&inode->i_mapping->private_list, from, to); 3113 region_add(&inode->i_mapping->private_list, from, to);
3022 return 0; 3114 return 0;
3115out_err:
3116 if (vma)
3117 resv_map_put(vma);
3118 return ret;
3023} 3119}
3024 3120
3025void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3121void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
105{
106 struct hugetlb_cgroup *h_cgroup;
107
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
109 kfree(h_cgroup);
110}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot be moved off the
116 * active list or uncharged from the cgroup, so there is no need to take
117 * a page reference or test whether the page is active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
132 * We can have pages on the active list that belong to no cgroup,
133 * i.e. hugepages with fewer than 3 pages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
195 * We don't charge any cgroup if the compound page has fewer
196 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
216/* Should be called with hugetlb_lock held */
217void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
218 struct hugetlb_cgroup *h_cg,
219 struct page *page)
220{
221 if (hugetlb_cgroup_disabled() || !h_cg)
222 return;
223
224 set_hugetlb_cgroup(page, h_cg);
225 return;
226}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
297	/* This function does all the necessary parsing; reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
332static char *mem_fmt(char *buf, int size, unsigned long hsize)
333{
334 if (hsize >= (1UL << 30))
335 snprintf(buf, size, "%luGB", hsize >> 30);
336 else if (hsize >= (1UL << 20))
337 snprintf(buf, size, "%luMB", hsize >> 20);
338 else
339 snprintf(buf, size, "%luKB", hsize >> 10);
340 return buf;
341}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
372	/* Add the failcnt file */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
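In the new controller above, each per-hstate control file packs the hstate index and the resource attribute into cft->private with MEMFILE_PRIVATE(), and mem_fmt() turns huge_page_size() into the size prefix, so the files end up with names such as hugetlb.2MB.limit_in_bytes. A small standalone sketch of the packing and unpacking follows; the RES_LIMIT value used here is illustrative, the real constants come from the res_counter code.

#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

#define RES_LIMIT 1     /* illustrative; the real enum lives in res_counter.h */

int main(void)
{
        int private = MEMFILE_PRIVATE(1, RES_LIMIT);    /* hstate 1, limit attribute */

        assert(MEMFILE_IDX(private) == 1);              /* recover the hstate index */
        assert(MEMFILE_ATTR(private) == RES_LIMIT);     /* recover the attribute */

        /* hugetlb_cgroup_file_init() builds names like this for a 2 MB hstate */
        printf("hugetlb.%s.limit_in_bytes -> private=0x%x\n", "2MB", private);
        return 0;
}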
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2189af491783..b8c91b342e24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -100,6 +100,46 @@ extern void prep_compound_page(struct page *page, unsigned long order);
100extern bool is_free_buddy_page(struct page *page); 100extern bool is_free_buddy_page(struct page *page);
101#endif 101#endif
102 102
103#if defined CONFIG_COMPACTION || defined CONFIG_CMA
104
105/*
106 * in mm/compaction.c
107 */
108/*
109 * compact_control is used to track pages being migrated and the free pages
110 * they are being migrated to during memory compaction. The free_pfn starts
111 * at the end of a zone and migrate_pfn begins at the start. Movable pages
112 * are moved to the end of a zone during a compaction run and the run
113 * completes when free_pfn <= migrate_pfn
114 */
115struct compact_control {
116 struct list_head freepages; /* List of free pages to migrate to */
117 struct list_head migratepages; /* List of pages being migrated */
118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
129
130 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone;
133 bool *contended; /* True if a lock was contended */
134};
135
136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
138unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn);
141
142#endif
103 143
104/* 144/*
105 * function for dealing with page's order in buddy system. 145 * function for dealing with page's order in buddy system.
@@ -131,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
131 * to determine if it's being mapped into a LOCKED vma. 171 * to determine if it's being mapped into a LOCKED vma.
132 * If so, mark page as mlocked. 172 * If so, mark page as mlocked.
133 */ 173 */
134static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page)
135{ 176{
136 VM_BUG_ON(PageLRU(page)); 177 VM_BUG_ON(PageLRU(page));
137 178
@@ -189,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
189 struct vm_area_struct *vma); 230 struct vm_area_struct *vma);
190#endif 231#endif
191#else /* !CONFIG_MMU */ 232#else /* !CONFIG_MMU */
192static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 233static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
193{ 234{
194 return 0; 235 return 0;
195} 236}
@@ -309,3 +350,9 @@ extern u64 hwpoison_filter_flags_mask;
309extern u64 hwpoison_filter_flags_value; 350extern u64 hwpoison_filter_flags_value;
310extern u64 hwpoison_filter_memcg; 351extern u64 hwpoison_filter_memcg;
311extern u32 hwpoison_filter_enable; 352extern u32 hwpoison_filter_enable;
353
354extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
355 unsigned long, unsigned long,
356 unsigned long, unsigned long);
357
358extern void set_pageblock_order(void);
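The compact_control comment above describes two scanners that converge: migrate_pfn walks up from the start of the zone collecting movable pages while free_pfn walks down from the end collecting free target pages, and a run completes once they meet. The toy userspace loop below only illustrates that convergence; the pfn bounds and the movable/free predicates are invented and bear no relation to the real isolate_*_range() scanners.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: pretend every 5th pfn is movable and every 3rd is free. */
static bool pfn_movable(unsigned long pfn) { return pfn % 5 == 0; }
static bool pfn_free(unsigned long pfn)    { return pfn % 3 == 0; }

int main(void)
{
        unsigned long migrate_pfn = 0;    /* scans upward from the zone start */
        unsigned long free_pfn = 1024;    /* scans downward from the zone end */
        unsigned long migrated = 0;

        while (migrate_pfn < free_pfn) {
                /* isolate_migratepages_range() analogue: next movable page */
                while (migrate_pfn < free_pfn && !pfn_movable(migrate_pfn))
                        migrate_pfn++;
                /* isolate_freepages_range() analogue: next free target page */
                while (free_pfn > migrate_pfn && !pfn_free(free_pfn))
                        free_pfn--;
                if (migrate_pfn >= free_pfn)
                        break;          /* scanners met: compaction run done */
                migrated++;             /* "migrate" one page to the free target */
                migrate_pfn++;
                free_pfn--;
        }
        printf("scanners met at pfn %lu after %lu moves\n", migrate_pfn, migrated);
        return 0;
}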
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,11 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h> 12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h>
18#include <linux/file.h>
16 19
17/* 20/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 21 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma,
200 struct vm_area_struct **prev, 203 struct vm_area_struct **prev,
201 unsigned long start, unsigned long end) 204 unsigned long start, unsigned long end)
202{ 205{
203 struct address_space *mapping; 206 loff_t offset;
204 loff_t offset, endoff;
205 int error; 207 int error;
208 struct file *f;
206 209
207 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 210 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
208 211
209 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 212 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 if (!vma->vm_file || !vma->vm_file->f_mapping 215 f = vma->vm_file;
213 || !vma->vm_file->f_mapping->host) { 216
217 if (!f || !f->f_mapping || !f->f_mapping->host) {
214 return -EINVAL; 218 return -EINVAL;
215 } 219 }
216 220
217 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 221 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
218 return -EACCES; 222 return -EACCES;
219 223
220 mapping = vma->vm_file->f_mapping;
221
222 offset = (loff_t)(start - vma->vm_start) 224 offset = (loff_t)(start - vma->vm_start)
223 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
224 endoff = (loff_t)(end - vma->vm_start - 1)
225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
226 226
227 /* vmtruncate_range needs to take i_mutex */ 227 /*
228 * Filesystem's fallocate may need to take i_mutex. We need to
229 * explicitly grab a reference because the vma (and hence the
230 * vma's reference to the file) can go away as soon as we drop
231 * mmap_sem.
232 */
233 get_file(f);
228 up_read(&current->mm->mmap_sem); 234 up_read(&current->mm->mmap_sem);
229 error = vmtruncate_range(mapping->host, offset, endoff); 235 error = do_fallocate(f,
236 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
237 offset, end - start);
238 fput(f);
230 down_read(&current->mm->mmap_sem); 239 down_read(&current->mm->mmap_sem);
231 return error; 240 return error;
232} 241}
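After this change MADV_REMOVE on a shared file mapping is serviced by the filesystem's fallocate with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE instead of vmtruncate_range(). A short userspace illustration of the call is sketched below; the path and sizes are arbitrary and most error handling is trimmed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 4 * 1024 * 1024;
        int fd = open("/tmp/madv-remove-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0 || ftruncate(fd, len) < 0)
                return 1;

        /* MADV_REMOVE only applies to shared, writable file mappings */
        char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        map[0] = 'x';                   /* dirty the first page */

        /* Frees the backing blocks for the range; later reads return zeroes */
        if (madvise(map, len / 2, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");

        printf("first byte after hole punch: %d\n", map[0]);
        munmap(map, len);
        close(fd);
        return 0;
}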
diff --git a/mm/memblock.c b/mm/memblock.c
index 99f285599501..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
37 37
38int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
39static int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
40static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0;
40 42
41/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
42static inline const char *memblock_type_name(struct memblock_type *type) 44static inline const char *memblock_type_name(struct memblock_type *type)
@@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
141 MAX_NUMNODES); 143 MAX_NUMNODES);
142} 144}
143 145
144/*
145 * Free memblock.reserved.regions
146 */
147int __init_memblock memblock_free_reserved_regions(void)
148{
149 if (memblock.reserved.regions == memblock_reserved_init_regions)
150 return 0;
151
152 return memblock_free(__pa(memblock.reserved.regions),
153 sizeof(struct memblock_region) * memblock.reserved.max);
154}
155
156/*
157 * Reserve memblock.reserved.regions
158 */
159int __init_memblock memblock_reserve_reserved_regions(void)
160{
161 if (memblock.reserved.regions == memblock_reserved_init_regions)
162 return 0;
163
164 return memblock_reserve(__pa(memblock.reserved.regions),
165 sizeof(struct memblock_region) * memblock.reserved.max);
166}
167
168static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 146static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
169{ 147{
170 type->total_size -= type->regions[r].size; 148 type->total_size -= type->regions[r].size;
@@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
182 } 160 }
183} 161}
184 162
185static int __init_memblock memblock_double_array(struct memblock_type *type) 163phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
164 phys_addr_t *addr)
165{
166 if (memblock.reserved.regions == memblock_reserved_init_regions)
167 return 0;
168
169 *addr = __pa(memblock.reserved.regions);
170
171 return PAGE_ALIGN(sizeof(struct memblock_region) *
172 memblock.reserved.max);
173}
174
175/**
176 * memblock_double_array - double the size of the memblock regions array
177 * @type: memblock type of the regions array being doubled
178 * @new_area_start: starting address of memory range to avoid overlap with
179 * @new_area_size: size of memory range to avoid overlap with
180 *
181 * Double the size of the @type regions array. If memblock is being used to
182 * allocate memory for a new reserved regions array and there is a previously
183 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
184 * waiting to be reserved, ensure the memory used by the new array does
185 * not overlap.
186 *
187 * RETURNS:
188 * 0 on success, -1 on failure.
189 */
190static int __init_memblock memblock_double_array(struct memblock_type *type,
191 phys_addr_t new_area_start,
192 phys_addr_t new_area_size)
186{ 193{
187 struct memblock_region *new_array, *old_array; 194 struct memblock_region *new_array, *old_array;
195 phys_addr_t old_alloc_size, new_alloc_size;
188 phys_addr_t old_size, new_size, addr; 196 phys_addr_t old_size, new_size, addr;
189 int use_slab = slab_is_available(); 197 int use_slab = slab_is_available();
198 int *in_slab;
190 199
191 /* We don't allow resizing until we know about the reserved regions 200 /* We don't allow resizing until we know about the reserved regions
192 * of memory that aren't suitable for allocation 201 * of memory that aren't suitable for allocation
@@ -197,36 +206,62 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
197 /* Calculate new doubled size */ 206 /* Calculate new doubled size */
198 old_size = type->max * sizeof(struct memblock_region); 207 old_size = type->max * sizeof(struct memblock_region);
199 new_size = old_size << 1; 208 new_size = old_size << 1;
209 /*
210	 * We need to allocate the new array aligned to PAGE_SIZE,
211	 * so we can free it completely later.
212 */
213 old_alloc_size = PAGE_ALIGN(old_size);
214 new_alloc_size = PAGE_ALIGN(new_size);
215
216 /* Retrieve the slab flag */
217 if (type == &memblock.memory)
218 in_slab = &memblock_memory_in_slab;
219 else
220 in_slab = &memblock_reserved_in_slab;
200 221
201 /* Try to find some space for it. 222 /* Try to find some space for it.
202 * 223 *
203 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
204 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
205 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
206 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
207 * 228 *
208 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
209 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
210 * active for memory hotplug operations 231 * is active for memory hotplug operations
211 */ 232 */
212 if (use_slab) { 233 if (use_slab) {
213 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
214 addr = new_array ? __pa(new_array) : 0; 235 addr = new_array ? __pa(new_array) : 0;
215 } else 236 } else {
216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 237 /* only exclude range when trying to double reserved.regions */
238 if (type != &memblock.reserved)
239 new_area_start = new_area_size = 0;
240
241 addr = memblock_find_in_range(new_area_start + new_area_size,
242 memblock.current_limit,
243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE);
248
249 new_array = addr ? __va(addr) : 0;
250 }
217 if (!addr) { 251 if (!addr) {
218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 252 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
219 memblock_type_name(type), type->max, type->max * 2); 253 memblock_type_name(type), type->max, type->max * 2);
220 return -1; 254 return -1;
221 } 255 }
222 new_array = __va(addr);
223 256
224 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
225 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
226 260
227 /* Found space, we now need to move the array over before 261 /*
228 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
229 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
230 */ 265 */
231 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
232 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -234,21 +269,22 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
234 type->regions = new_array; 269 type->regions = new_array;
235 type->max <<= 1; 270 type->max <<= 1;
236 271
237 /* If we use SLAB that's it, we are done */ 272 /* Free old array. We needn't free it if the array is the static one */
238 if (use_slab) 273 if (*in_slab)
239 return 0; 274 kfree(old_array);
240 275 else if (old_array != memblock_memory_init_regions &&
241 /* Add the new reserved region now. Should not fail ! */ 276 old_array != memblock_reserved_init_regions)
242 BUG_ON(memblock_reserve(addr, new_size)); 277 memblock_free(__pa(old_array), old_alloc_size);
243 278
244 /* If the array wasn't our static init one, then free it. We only do 279 /*
245 * that before SLAB is available as later on, we don't know whether 280 * Reserve the new array if that comes from the memblock. Otherwise, we
246 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal 281 * needn't do it
247 * anyways
248 */ 282 */
249 if (old_array != memblock_memory_init_regions && 283 if (!use_slab)
250 old_array != memblock_reserved_init_regions) 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
251 memblock_free(__pa(old_array), old_size); 285
286 /* Update slab flag */
287 *in_slab = use_slab;
252 288
253 return 0; 289 return 0;
254} 290}
@@ -330,6 +366,9 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
330 phys_addr_t end = base + memblock_cap_size(base, &size); 366 phys_addr_t end = base + memblock_cap_size(base, &size);
331 int i, nr_new; 367 int i, nr_new;
332 368
369 if (!size)
370 return 0;
371
333 /* special case for empty array */ 372 /* special case for empty array */
334 if (type->regions[0].size == 0) { 373 if (type->regions[0].size == 0) {
335 WARN_ON(type->cnt != 1 || type->total_size); 374 WARN_ON(type->cnt != 1 || type->total_size);
@@ -384,7 +423,7 @@ repeat:
384 */ 423 */
385 if (!insert) { 424 if (!insert) {
386 while (type->cnt + nr_new > type->max) 425 while (type->cnt + nr_new > type->max)
387 if (memblock_double_array(type) < 0) 426 if (memblock_double_array(type, obase, size) < 0)
388 return -ENOMEM; 427 return -ENOMEM;
389 insert = true; 428 insert = true;
390 goto repeat; 429 goto repeat;
@@ -430,9 +469,12 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
430 469
431 *start_rgn = *end_rgn = 0; 470 *start_rgn = *end_rgn = 0;
432 471
472 if (!size)
473 return 0;
474
433 /* we'll create at most two more regions */ 475 /* we'll create at most two more regions */
434 while (type->cnt + 2 > type->max) 476 while (type->cnt + 2 > type->max)
435 if (memblock_double_array(type) < 0) 477 if (memblock_double_array(type, base, size) < 0)
436 return -ENOMEM; 478 return -ENOMEM;
437 479
438 for (i = 0; i < type->cnt; i++) { 480 for (i = 0; i < type->cnt; i++) {
@@ -514,7 +556,6 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
514 (unsigned long long)base, 556 (unsigned long long)base,
515 (unsigned long long)base + size, 557 (unsigned long long)base + size,
516 (void *)_RET_IP_); 558 (void *)_RET_IP_);
517 BUG_ON(0 == size);
518 559
519 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 560 return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
520} 561}
@@ -523,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
523 * __next_free_mem_range - next function for for_each_free_mem_range() 564 * __next_free_mem_range - next function for for_each_free_mem_range()
524 * @idx: pointer to u64 loop variable 565 * @idx: pointer to u64 loop variable
525 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 566 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
526 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 567 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
527 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 568 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
528 * @p_nid: ptr to int for nid of the range, can be %NULL 569 * @out_nid: ptr to int for nid of the range, can be %NULL
529 * 570 *
530 * Find the first free area from *@idx which matches @nid, fill the out 571 * Find the first free area from *@idx which matches @nid, fill the out
531 * parameters, and update *@idx for the next iteration. The lower 32bit of 572 * parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -599,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
599 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 640 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
600 * @idx: pointer to u64 loop variable 641 * @idx: pointer to u64 loop variable
601 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 642 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
602 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 643 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
603 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 644 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
604 * @p_nid: ptr to int for nid of the range, can be %NULL 645 * @out_nid: ptr to int for nid of the range, can be %NULL
605 * 646 *
606 * Reverse of __next_free_mem_range(). 647 * Reverse of __next_free_mem_range().
607 */ 648 */
@@ -850,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
850 return memblock_search(&memblock.memory, addr) != -1; 891 return memblock_search(&memblock.memory, addr) != -1;
851} 892}
852 893
894/**
895 * memblock_is_region_memory - check if a region is a subset of memory
896 * @base: base of region to check
897 * @size: size of region to check
898 *
899 * Check if the region [@base, @base+@size) is a subset of a memory block.
900 *
901 * RETURNS:
902 * 0 if false, non-zero if true
903 */
853int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 904int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
854{ 905{
855 int idx = memblock_search(&memblock.memory, base); 906 int idx = memblock_search(&memblock.memory, base);
@@ -862,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
862 memblock.memory.regions[idx].size) >= end; 913 memblock.memory.regions[idx].size) >= end;
863} 914}
864 915
916/**
917 * memblock_is_region_reserved - check if a region intersects reserved memory
918 * @base: base of region to check
919 * @size: size of region to check
920 *
921 * Check if the region [@base, @base+@size) intersects a reserved memory block.
922 *
923 * RETURNS:
924 * 0 if false, non-zero if true
925 */
865int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 926int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
866{ 927{
867 memblock_cap_size(base, &size); 928 memblock_cap_size(base, &size);
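memblock_double_array() now page-aligns both array sizes and, when growing memblock.reserved, first searches above new_area_start + new_area_size and only falls back to the range below new_area_start, so the doubled array cannot overlap the region that is still waiting to be reserved. The sketch below walks through that two-step search with plain integers; find_range() is a simplified stand-in for memblock_find_in_range(), not the real allocator.

#include <stdio.h>

/* Stand-in for memblock_find_in_range(): top-down fit, 0 means failure. */
static unsigned long find_range(unsigned long start, unsigned long end,
                                unsigned long size)
{
        if (end > start && end - start >= size)
                return end - size;      /* memblock allocates from the top */
        return 0;
}

static unsigned long place_new_array(unsigned long new_area_start,
                                     unsigned long new_area_size,
                                     unsigned long current_limit,
                                     unsigned long new_alloc_size)
{
        unsigned long addr;

        /* 1) try above the pending reservation, up to the current limit */
        addr = find_range(new_area_start + new_area_size, current_limit,
                          new_alloc_size);
        /* 2) otherwise try below it, capped at min(new_area_start, limit) */
        if (!addr && new_area_size)
                addr = find_range(0,
                                  new_area_start < current_limit ?
                                        new_area_start : current_limit,
                                  new_alloc_size);
        return addr;
}

int main(void)
{
        /* Pending reservation occupies [0x8000, 0x9000); need 0x1000 bytes. */
        unsigned long addr = place_new_array(0x8000, 0x1000, 0x9000, 0x1000);

        printf("new array placed at %#lx\n", addr);     /* lands below 0x8000 */
        return 0;
}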
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7d698df4a067..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,21 +59,21 @@
59 59
60struct cgroup_subsys mem_cgroup_subsys __read_mostly; 60struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
68/* for remember boot option*/ 68/* for remember boot option*/
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
73#endif 73#endif
74 74
75#else 75#else
76#define do_swap_account (0) 76#define do_swap_account 0
77#endif 77#endif
78 78
79 79
@@ -87,19 +87,32 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
93}; 92};
94 93
94static const char * const mem_cgroup_stat_names[] = {
95 "cache",
96 "rss",
97 "mapped_file",
98 "swap",
99};
100
95enum mem_cgroup_events_index { 101enum mem_cgroup_events_index {
96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
101 MEM_CGROUP_EVENTS_NSTATS, 106 MEM_CGROUP_EVENTS_NSTATS,
102}; 107};
108
109static const char * const mem_cgroup_events_names[] = {
110 "pgpgin",
111 "pgpgout",
112 "pgfault",
113 "pgmajfault",
114};
115
103/* 116/*
104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 117 * Per memcg event counter is incremented at every pagein/pageout. With THP,
105 * it will be incremated by the number of pages. This counter is used for 118 * it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
112 MEM_CGROUP_TARGET_NUMAINFO, 125 MEM_CGROUP_TARGET_NUMAINFO,
113 MEM_CGROUP_NTARGETS, 126 MEM_CGROUP_NTARGETS,
114}; 127};
115#define THRESHOLDS_EVENTS_TARGET (128) 128#define THRESHOLDS_EVENTS_TARGET 128
116#define SOFTLIMIT_EVENTS_TARGET (1024) 129#define SOFTLIMIT_EVENTS_TARGET 1024
117#define NUMAINFO_EVENTS_TARGET (1024) 130#define NUMAINFO_EVENTS_TARGET 1024
118 131
119struct mem_cgroup_stat_cpu { 132struct mem_cgroup_stat_cpu {
120 long count[MEM_CGROUP_STAT_NSTATS]; 133 long count[MEM_CGROUP_STAT_NSTATS];
121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 134 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
135 unsigned long nr_page_events;
122 unsigned long targets[MEM_CGROUP_NTARGETS]; 136 unsigned long targets[MEM_CGROUP_NTARGETS];
123}; 137};
124 138
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
138 152
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 153 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 154
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 155 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 156 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 157 /* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
182 195
183/* For threshold */ 196/* For threshold */
184struct mem_cgroup_threshold_ary { 197struct mem_cgroup_threshold_ary {
185 /* An array index points to threshold just below usage. */ 198 /* An array index points to threshold just below or equal to usage. */
186 int current_threshold; 199 int current_threshold;
187 /* Size of entries[] */ 200 /* Size of entries[] */
188 unsigned int size; 201 unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
245 */ 258 */
246 struct rcu_head rcu_freeing; 259 struct rcu_head rcu_freeing;
247 /* 260 /*
248 * But when using vfree(), that cannot be done at 261 * We also need some space for a worker in deferred freeing.
249 * interrupt time, so we must then queue the work. 262 * By the time we call it, rcu_freeing is no longer in use.
250 */ 263 */
251 struct work_struct work_freeing; 264 struct work_struct work_freeing;
252 }; 265 };
@@ -305,7 +318,7 @@ struct mem_cgroup {
305 /* 318 /*
306 * percpu counter. 319 * percpu counter.
307 */ 320 */
308 struct mem_cgroup_stat_cpu *stat; 321 struct mem_cgroup_stat_cpu __percpu *stat;
309 /* 322 /*
310 * used when a cpu is offlined or other synchronizations 323 * used when a cpu is offlined or other synchronizations
311 * See mem_cgroup_read_stat(). 324 * See mem_cgroup_read_stat().
@@ -360,14 +373,12 @@ static bool move_file(void)
360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 373 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
361 * limit reclaim to prevent infinite loops, if they ever occur. 374 * limit reclaim to prevent infinite loops, if they ever occur.
362 */ 375 */
363#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 376#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
364#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 377#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
365 378
366enum charge_type { 379enum charge_type {
367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
368 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
369 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
370 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
371 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
372 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
373 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -377,8 +388,8 @@ enum charge_type {
377#define _MEM (0) 388#define _MEM (0)
378#define _MEMSWAP (1) 389#define _MEMSWAP (1)
379#define _OOM_TYPE (2) 390#define _OOM_TYPE (2)
380#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 391#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
381#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 392#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
382#define MEMFILE_ATTR(val) ((val) & 0xffff) 393#define MEMFILE_ATTR(val) ((val) & 0xffff)
383/* Used for OOM notifier */ 394/* Used for OOM notifier */
384#define OOM_CONTROL (0) 395#define OOM_CONTROL (0)
@@ -394,8 +405,14 @@ enum charge_type {
394static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
395static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
396 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
397/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
398#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
399#include <net/sock.h> 416#include <net/sock.h>
400#include <net/ip.h> 417#include <net/ip.h>
401 418
@@ -404,6 +421,7 @@ void sock_update_memcg(struct sock *sk)
404{ 421{
405 if (mem_cgroup_sockets_enabled) { 422 if (mem_cgroup_sockets_enabled) {
406 struct mem_cgroup *memcg; 423 struct mem_cgroup *memcg;
424 struct cg_proto *cg_proto;
407 425
408 BUG_ON(!sk->sk_prot->proto_cgroup); 426 BUG_ON(!sk->sk_prot->proto_cgroup);
409 427
@@ -423,9 +441,10 @@ void sock_update_memcg(struct sock *sk)
423 441
424 rcu_read_lock(); 442 rcu_read_lock();
425 memcg = mem_cgroup_from_task(current); 443 memcg = mem_cgroup_from_task(current);
426 if (!mem_cgroup_is_root(memcg)) { 444 cg_proto = sk->sk_prot->proto_cgroup(memcg);
445 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
427 mem_cgroup_get(memcg); 446 mem_cgroup_get(memcg);
428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 447 sk->sk_cgrp = cg_proto;
429 } 448 }
430 rcu_read_unlock(); 449 rcu_read_unlock();
431 } 450 }
@@ -452,7 +471,20 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
452} 471}
453EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
454#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
455#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg)
478{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
480 return;
481 static_key_slow_dec(&memcg_socket_limit_enabled);
482}
483#else
484static void disarm_sock_keys(struct mem_cgroup *memcg)
485{
486}
487#endif
456 488
457static void drain_all_stock_async(struct mem_cgroup *memcg); 489static void drain_all_stock_async(struct mem_cgroup *memcg);
458 490
@@ -675,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
675 bool charge) 707 bool charge)
676{ 708{
677 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
678 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
679} 711}
680 712
681static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -718,12 +750,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
718 nr_pages = -nr_pages; /* for event */ 750 nr_pages = -nr_pages; /* for event */
719 } 751 }
720 752
721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 753 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
722 754
723 preempt_enable(); 755 preempt_enable();
724} 756}
725 757
726unsigned long 758unsigned long
759mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
760{
761 struct mem_cgroup_per_zone *mz;
762
763 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
764 return mz->lru_size[lru];
765}
766
767static unsigned long
727mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 768mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
728 unsigned int lru_mask) 769 unsigned int lru_mask)
729{ 770{
@@ -770,7 +811,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
770{ 811{
771 unsigned long val, next; 812 unsigned long val, next;
772 813
773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 814 val = __this_cpu_read(memcg->stat->nr_page_events);
774 next = __this_cpu_read(memcg->stat->targets[target]); 815 next = __this_cpu_read(memcg->stat->targets[target]);
775 /* from time_after() in jiffies.h */ 816 /* from time_after() in jiffies.h */
776 if ((long)next - (long)val < 0) { 817 if ((long)next - (long)val < 0) {
@@ -827,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
827 868
828struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
829{ 870{
830 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
831 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
832 css);
833} 873}
834 874
835struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -842,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
842 if (unlikely(!p)) 882 if (unlikely(!p))
843 return NULL; 883 return NULL;
844 884
845 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
846 struct mem_cgroup, css);
847} 886}
848 887
849struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -929,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
929 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
930 if (css) { 969 if (css) {
931 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
932 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
933 struct mem_cgroup, css);
934 } else 972 } else
935 id = 0; 973 id = 0;
936 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1013,7 +1051,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1013/** 1051/**
1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1052 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1015 * @zone: zone of the wanted lruvec 1053 * @zone: zone of the wanted lruvec
1016 * @mem: memcg of the wanted lruvec 1054 * @memcg: memcg of the wanted lruvec
1017 * 1055 *
1018 * Returns the lru list vector holding pages for the given @zone and 1056 * Returns the lru list vector holding pages for the given @zone and
1019 * @mem. This can be the global zone lruvec, if the memory controller 1057 * @mem. This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1084,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1046 */ 1084 */
1047 1085
1048/** 1086/**
1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1087 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1050 * @zone: zone of the page
1051 * @page: the page 1088 * @page: the page
1052 * @lru: current lru 1089 * @zone: zone of the page
1053 *
1054 * This function accounts for @page being added to @lru, and returns
1055 * the lruvec for the given @zone and the memcg @page is charged to.
1056 *
1057 * The callsite is then responsible for physically linking the page to
1058 * the returned lruvec->lists[@lru].
1059 */ 1090 */
1060struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1091struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1061 enum lru_list lru)
1062{ 1092{
1063 struct mem_cgroup_per_zone *mz; 1093 struct mem_cgroup_per_zone *mz;
1064 struct mem_cgroup *memcg; 1094 struct mem_cgroup *memcg;
@@ -1071,7 +1101,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1071 memcg = pc->mem_cgroup; 1101 memcg = pc->mem_cgroup;
1072 1102
1073 /* 1103 /*
1074 * Surreptitiously switch any uncharged page to root: 1104 * Surreptitiously switch any uncharged offlist page to root:
1075 * an uncharged page off lru does nothing to secure 1105 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal. 1106 * its former mem_cgroup from sudden removal.
1077 * 1107 *
@@ -1079,85 +1109,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1079 * under page_cgroup lock: between them, they make all uses 1109 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe. 1110 * of pc->mem_cgroup safe.
1081 */ 1111 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1112 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1113 pc->mem_cgroup = memcg = root_mem_cgroup;
1084 1114
1085 mz = page_cgroup_zoneinfo(memcg, page); 1115 mz = page_cgroup_zoneinfo(memcg, page);
1086 /* compound_order() is stabilized through lru_lock */
1087 mz->lru_size[lru] += 1 << compound_order(page);
1088 return &mz->lruvec; 1116 return &mz->lruvec;
1089} 1117}
1090 1118
1091/** 1119/**
1092 * mem_cgroup_lru_del_list - account for removing an lru page 1120 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1093 * @page: the page 1121 * @lruvec: mem_cgroup per zone lru vector
1094 * @lru: target lru 1122 * @lru: index of lru list the page is sitting on
1123 * @nr_pages: positive when adding or negative when removing
1095 * 1124 *
1096 * This function accounts for @page being removed from @lru. 1125 * This function must be called when a page is added to or removed from an
1097 * 1126 * lru list.
1098 * The callsite is then responsible for physically unlinking
1099 * @page->lru.
1100 */ 1127 */
1101void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1128void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1129 int nr_pages)
1102{ 1130{
1103 struct mem_cgroup_per_zone *mz; 1131 struct mem_cgroup_per_zone *mz;
1104 struct mem_cgroup *memcg; 1132 unsigned long *lru_size;
1105 struct page_cgroup *pc;
1106 1133
1107 if (mem_cgroup_disabled()) 1134 if (mem_cgroup_disabled())
1108 return; 1135 return;
1109 1136
1110 pc = lookup_page_cgroup(page); 1137 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1111 memcg = pc->mem_cgroup; 1138 lru_size = mz->lru_size + lru;
1112 VM_BUG_ON(!memcg); 1139 *lru_size += nr_pages;
1113 mz = page_cgroup_zoneinfo(memcg, page); 1140 VM_BUG_ON((long)(*lru_size) < 0);
1114 /* huge page split is done under lru_lock. so, we have no races. */
1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1116 mz->lru_size[lru] -= 1 << compound_order(page);
1117}
1118
1119void mem_cgroup_lru_del(struct page *page)
1120{
1121 mem_cgroup_lru_del_list(page, page_lru(page));
1122}
1123
1124/**
1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1126 * @zone: zone of the page
1127 * @page: the page
1128 * @from: current lru
1129 * @to: target lru
1130 *
1131 * This function accounts for @page being moved between the lrus @from
1132 * and @to, and returns the lruvec for the given @zone and the memcg
1133 * @page is charged to.
1134 *
1135 * The callsite is then responsible for physically relinking
1136 * @page->lru to the returned lruvec->lists[@to].
1137 */
1138struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1139 struct page *page,
1140 enum lru_list from,
1141 enum lru_list to)
1142{
1143 /* XXX: Optimize this, especially for @from == @to */
1144 mem_cgroup_lru_del_list(page, from);
1145 return mem_cgroup_lru_add_list(zone, page, to);
1146} 1141}
1147 1142
1148/* 1143/*
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1144 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1145 * hierarchy subtree
1151 */ 1146 */
1147bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1148 struct mem_cgroup *memcg)
1149{
1150 if (root_memcg == memcg)
1151 return true;
1152 if (!root_memcg->use_hierarchy || !memcg)
1153 return false;
1154 return css_is_ancestor(&memcg->css, &root_memcg->css);
1155}
1156
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1157static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1158 struct mem_cgroup *memcg)
1154{ 1159{
1155 if (root_memcg != memcg) { 1160 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1161
1160 return true; 1162 rcu_read_lock();
1163 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1164 rcu_read_unlock();
1165 return ret;
1161} 1166}
1162 1167
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1168int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
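The hunk above folds mem_cgroup_lru_add_list() and mem_cgroup_lru_del_list() into a single mem_cgroup_update_lru_size(lruvec, lru, nr_pages); callers pass a positive count when linking pages to an lru list and a negative count when unlinking them. The standalone sketch below mirrors that signed-delta bookkeeping; the struct and lru indices are simplified placeholders rather than the kernel's types.

#include <assert.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, NR_LRU_LISTS };

struct toy_lruvec {
        long lru_size[NR_LRU_LISTS];
};

/* Same contract as the new helper: one call per add or remove, signed count. */
static void update_lru_size(struct toy_lruvec *lruvec, enum lru_list lru,
                            int nr_pages)
{
        lruvec->lru_size[lru] += nr_pages;
        assert(lruvec->lru_size[lru] >= 0);     /* mirrors the VM_BUG_ON */
}

int main(void)
{
        struct toy_lruvec lruvec = { { 0 } };

        update_lru_size(&lruvec, LRU_INACTIVE_ANON, 512);   /* pages linked */
        update_lru_size(&lruvec, LRU_INACTIVE_ANON, -512);  /* pages unlinked */
        printf("inactive anon size: %ld\n", lruvec.lru_size[LRU_INACTIVE_ANON]);
        return 0;
}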
@@ -1195,19 +1200,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1195 return ret; 1200 return ret;
1196} 1201}
1197 1202
1198int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1203int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1199{ 1204{
1200 unsigned long inactive_ratio; 1205 unsigned long inactive_ratio;
1201 int nid = zone_to_nid(zone);
1202 int zid = zone_idx(zone);
1203 unsigned long inactive; 1206 unsigned long inactive;
1204 unsigned long active; 1207 unsigned long active;
1205 unsigned long gb; 1208 unsigned long gb;
1206 1209
1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1210 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1208 BIT(LRU_INACTIVE_ANON)); 1211 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1210 BIT(LRU_ACTIVE_ANON));
1211 1212
1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1213 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1213 if (gb) 1214 if (gb)
@@ -1218,55 +1219,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1218 return inactive * inactive_ratio < active; 1219 return inactive * inactive_ratio < active;
1219} 1220}
1220 1221
1221int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1222int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1222{ 1223{
1223 unsigned long active; 1224 unsigned long active;
1224 unsigned long inactive; 1225 unsigned long inactive;
1225 int zid = zone_idx(zone);
1226 int nid = zone_to_nid(zone);
1227 1226
1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1227 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1229 BIT(LRU_INACTIVE_FILE)); 1228 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1231 BIT(LRU_ACTIVE_FILE));
1232 1229
1233 return (active > inactive); 1230 return (active > inactive);
1234} 1231}
1235 1232
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{
1249 struct page_cgroup *pc;
1250 struct mem_cgroup_per_zone *mz;
1251
1252 if (mem_cgroup_disabled())
1253 return NULL;
1254
1255 pc = lookup_page_cgroup(page);
1256 if (!PageCgroupUsed(pc))
1257 return NULL;
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat;
1262}
1263
1264#define mem_cgroup_from_res_counter(counter, member) \ 1233#define mem_cgroup_from_res_counter(counter, member) \
1265 container_of(counter, struct mem_cgroup, member) 1234 container_of(counter, struct mem_cgroup, member)
1266 1235
1267/** 1236/**
1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1237 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1269 * @mem: the memory cgroup 1238 * @memcg: the memory cgroup
1270 * 1239 *
1271 * Returns the maximum amount of memory @mem can be charged with, in 1240 * Returns the maximum amount of memory @mem can be charged with, in
1272 * pages. 1241 * pages.
@@ -1486,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1486/* 1455/*
1487 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1488 */ 1457 */
1489u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1490{ 1459{
1491 u64 limit; 1460 u64 limit;
1492 u64 memsw; 1461 u64 memsw;
@@ -1502,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1502 return min(limit, memsw); 1471 return min(limit, memsw);
1503} 1472}
1504 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1505static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1506 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1507 unsigned long flags) 1543 unsigned long flags)
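The mem_cgroup_out_of_memory() added above is, at heart, a max-scan: walk every task in the memcg hierarchy, score it with oom_badness() against the group's page limit, and remember the highest scorer, with shortcuts for tasks that should be selected or must abort the scan outright. The stand-alone C below models only that selection loop; toy_task, rss_pages and the scaling are illustrative stand-ins, not the kernel's task_struct or oom_badness().

/* Toy model of the memcg OOM victim scan -- not the kernel implementation. */
#include <stdio.h>

struct toy_task {
	const char *name;
	unsigned long rss_pages;   /* stand-in for what oom_badness() weighs */
	int oom_disabled;          /* stand-in for OOM_SCAN_CONTINUE cases   */
};

/* Return the task with the highest badness score, or NULL if none qualifies. */
static struct toy_task *pick_victim(struct toy_task *tasks, int n,
				    unsigned long totalpages)
{
	struct toy_task *chosen = NULL;
	unsigned long chosen_points = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long points;

		if (tasks[i].oom_disabled)
			continue;                 /* skip, like OOM_SCAN_CONTINUE */
		points = tasks[i].rss_pages * 1000 / totalpages;
		if (!chosen || points > chosen_points) {
			chosen = &tasks[i];       /* new best candidate */
			chosen_points = points;
		}
	}
	return chosen;
}

int main(void)
{
	struct toy_task tasks[] = {
		{ "worker",       2000, 0 },
		{ "cache-filler", 30000, 0 },
		{ "protected",    50000, 1 },
	};
	struct toy_task *victim = pick_victim(tasks, 3, 65536);

	printf("would kill: %s\n", victim ? victim->name : "(nobody)");
	return 0;
}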
@@ -1540,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 1576
1541/** 1577/**
1542 * test_mem_cgroup_node_reclaimable 1578 * test_mem_cgroup_node_reclaimable
1543 * @mem: the target memcg 1579 * @memcg: the target memcg
1544 * @nid: the node ID to be checked. 1580 * @nid: the node ID to be checked.
1545 * @noswap : specify true here if the user wants file only information. 1581 * @noswap : specify true here if the user wants file only information.
1546 * 1582 *
@@ -1634,7 +1670,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1634 * unused nodes. But scan_nodes is lazily updated and may not contain 1670 * unused nodes. But scan_nodes is lazily updated and may not contain
1635 * enough new information. We need to do double check. 1671 * enough new information. We need to do double check.
1636 */ 1672 */
1637bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1673static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1638{ 1674{
1639 int nid; 1675 int nid;
1640 1676
@@ -1669,7 +1705,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1669 return 0; 1705 return 0;
1670} 1706}
1671 1707
1672bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1708static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1673{ 1709{
1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1710 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1675} 1711}
@@ -1843,7 +1879,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1843/* 1879/*
1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1880 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1845 */ 1881 */
1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1882static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1883 int order)
1847{ 1884{
1848 struct oom_wait_info owait; 1885 struct oom_wait_info owait;
1849 bool locked, need_to_kill; 1886 bool locked, need_to_kill;
@@ -1930,7 +1967,7 @@ again:
1930 return; 1967 return;
1931 /* 1968 /*
1932 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */ 1973 */
@@ -1952,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1952 /* 1989 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1955 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1956 */ 1993 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958} 1995}
@@ -1992,7 +2029,7 @@ struct memcg_stock_pcp {
1992 unsigned int nr_pages; 2029 unsigned int nr_pages;
1993 struct work_struct work; 2030 struct work_struct work;
1994 unsigned long flags; 2031 unsigned long flags;
1995#define FLUSHING_CACHED_CHARGE (0) 2032#define FLUSHING_CACHED_CHARGE 0
1996}; 2033};
1997static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2034static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1998static DEFINE_MUTEX(percpu_charge_mutex); 2035static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2176,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2139 int i; 2176 int i;
2140 2177
2141 spin_lock(&memcg->pcp_counter_lock); 2178 spin_lock(&memcg->pcp_counter_lock);
2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2179 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2143 long x = per_cpu(memcg->stat->count[i], cpu); 2180 long x = per_cpu(memcg->stat->count[i], cpu);
2144 2181
2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2182 per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2165,7 +2202,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2165 if (action == CPU_ONLINE) 2202 if (action == CPU_ONLINE)
2166 return NOTIFY_OK; 2203 return NOTIFY_OK;
2167 2204
2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2205 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2169 return NOTIFY_OK; 2206 return NOTIFY_OK;
2170 2207
2171 for_each_mem_cgroup(iter) 2208 for_each_mem_cgroup(iter)
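The notifier hunk above fixes a condition that was always true: (action != CPU_DEAD) || action != CPU_DEAD_FROZEN holds for every possible action, since no value equals both constants, so the callback always returned early and the per-memcg drain below it never ran. The corrected && form is the De Morgan complement of "action is one of the two DEAD events". A throwaway comparison, using made-up constant values rather than the real notifier codes, makes the difference visible:

#include <stdio.h>

enum { CPU_ONLINE = 1, CPU_DEAD = 2, CPU_DEAD_FROZEN = 3 };

/* Old test: true for every action, so the drain code never ran. */
static int skip_buggy(int action)
{
	return (action != CPU_DEAD) || (action != CPU_DEAD_FROZEN);
}

/* New test: skip only actions that are neither DEAD notification. */
static int skip_fixed(int action)
{
	return action != CPU_DEAD && action != CPU_DEAD_FROZEN;
}

int main(void)
{
	int actions[] = { CPU_ONLINE, CPU_DEAD, CPU_DEAD_FROZEN };
	int i;

	for (i = 0; i < 3; i++)
		printf("action=%d  old-skip=%d  new-skip=%d\n",
		       actions[i], skip_buggy(actions[i]), skip_fixed(actions[i]));
	return 0;
}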
@@ -2299,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2299 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2300 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2301 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2302 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2303 */ 2340 */
2304 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2305 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2427,6 +2464,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2427} 2464}
2428 2465
2429/* 2466/*
2467 * Cancel charges in this cgroup....doesn't propagate to parent cgroup.
2468 * This is useful when moving usage to parent cgroup.
2469 */
2470static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2471 unsigned int nr_pages)
2472{
2473 unsigned long bytes = nr_pages * PAGE_SIZE;
2474
2475 if (mem_cgroup_is_root(memcg))
2476 return;
2477
2478 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2479 if (do_swap_account)
2480 res_counter_uncharge_until(&memcg->memsw,
2481 memcg->memsw.parent, bytes);
2482}
2483
2484/*
2430 * A helper function to get mem_cgroup from ID. must be called under 2485 * A helper function to get mem_cgroup from ID. must be called under
2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2486 * rcu_read_lock(). The caller must check css_is_removed() or some if
2432 * it's concern. (dropping refcnt from swap can be called against removed 2487 * it's concern. (dropping refcnt from swap can be called against removed
@@ -2442,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2442 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2443 if (!css) 2498 if (!css)
2444 return NULL; 2499 return NULL;
2445 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2446} 2501}
2447 2502
2448struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2476,20 +2531,17 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2476static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2531static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2477 struct page *page, 2532 struct page *page,
2478 unsigned int nr_pages, 2533 unsigned int nr_pages,
2479 struct page_cgroup *pc,
2480 enum charge_type ctype, 2534 enum charge_type ctype,
2481 bool lrucare) 2535 bool lrucare)
2482{ 2536{
2537 struct page_cgroup *pc = lookup_page_cgroup(page);
2483 struct zone *uninitialized_var(zone); 2538 struct zone *uninitialized_var(zone);
2539 struct lruvec *lruvec;
2484 bool was_on_lru = false; 2540 bool was_on_lru = false;
2485 bool anon; 2541 bool anon;
2486 2542
2487 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2488 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2489 unlock_page_cgroup(pc);
2490 __mem_cgroup_cancel_charge(memcg, nr_pages);
2491 return;
2492 }
2493 /* 2545 /*
2494 * we don't need page_cgroup_lock about tail pages, because they are not 2546 * we don't need page_cgroup_lock about tail pages, because they are not
2495 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2503,8 +2555,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2503 zone = page_zone(page); 2555 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock); 2556 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) { 2557 if (PageLRU(page)) {
2558 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2506 ClearPageLRU(page); 2559 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page)); 2560 del_page_from_lru_list(page, lruvec, page_lru(page));
2508 was_on_lru = true; 2561 was_on_lru = true;
2509 } 2562 }
2510 } 2563 }
@@ -2522,14 +2575,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2522 2575
2523 if (lrucare) { 2576 if (lrucare) {
2524 if (was_on_lru) { 2577 if (was_on_lru) {
2578 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2525 VM_BUG_ON(PageLRU(page)); 2579 VM_BUG_ON(PageLRU(page));
2526 SetPageLRU(page); 2580 SetPageLRU(page);
2527 add_page_to_lru_list(zone, page, page_lru(page)); 2581 add_page_to_lru_list(page, lruvec, page_lru(page));
2528 } 2582 }
2529 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2530 } 2584 }
2531 2585
2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2533 anon = true; 2587 anon = true;
2534 else 2588 else
2535 anon = false; 2589 anon = false;
@@ -2547,7 +2601,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2547 2601
2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2602#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2549 2603
2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2604#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2551/* 2605/*
2552 * Because tail pages are not marked as "used", set it. We're under 2606 * Because tail pages are not marked as "used", set it. We're under
2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2607 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2632,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2578 * @pc: page_cgroup of the page. 2632 * @pc: page_cgroup of the page.
2579 * @from: mem_cgroup which the page is moved from. 2633 * @from: mem_cgroup which the page is moved from.
2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2634 * @to: mem_cgroup which the page is moved to. @from != @to.
2581 * @uncharge: whether we should call uncharge and css_put against @from.
2582 * 2635 *
2583 * The caller must confirm following. 2636 * The caller must confirm following.
2584 * - page is not on LRU (isolate_page() is useful.) 2637 * - page is not on LRU (isolate_page() is useful.)
2585 * - compound_lock is held when nr_pages > 1 2638 * - compound_lock is held when nr_pages > 1
2586 * 2639 *
2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2640 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2641 * from old cgroup.
2589 * true, this function does "uncharge" from old cgroup, but it doesn't if
2590 * @uncharge is false, so a caller should do "uncharge".
2591 */ 2642 */
2592static int mem_cgroup_move_account(struct page *page, 2643static int mem_cgroup_move_account(struct page *page,
2593 unsigned int nr_pages, 2644 unsigned int nr_pages,
2594 struct page_cgroup *pc, 2645 struct page_cgroup *pc,
2595 struct mem_cgroup *from, 2646 struct mem_cgroup *from,
2596 struct mem_cgroup *to, 2647 struct mem_cgroup *to)
2597 bool uncharge)
2598{ 2648{
2599 unsigned long flags; 2649 unsigned long flags;
2600 int ret; 2650 int ret;
@@ -2628,9 +2678,6 @@ static int mem_cgroup_move_account(struct page *page,
2628 preempt_enable(); 2678 preempt_enable();
2629 } 2679 }
2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2680 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2631 if (uncharge)
2632 /* This is not "cancel", but cancel_charge does all we need. */
2633 __mem_cgroup_cancel_charge(from, nr_pages);
2634 2681
2635 /* caller should have done css_get */ 2682 /* caller should have done css_get */
2636 pc->mem_cgroup = to; 2683 pc->mem_cgroup = to;
@@ -2661,18 +2708,15 @@ out:
2661 2708
2662static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2663 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2664 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2665 gfp_t gfp_mask)
2666{ 2712{
2667 struct cgroup *cg = child->css.cgroup;
2668 struct cgroup *pcg = cg->parent;
2669 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2670 unsigned int nr_pages; 2714 unsigned int nr_pages;
2671 unsigned long uninitialized_var(flags); 2715 unsigned long uninitialized_var(flags);
2672 int ret; 2716 int ret;
2673 2717
2674 /* Is ROOT ? */ 2718 /* Is ROOT ? */
2675 if (!pcg) 2719 if (mem_cgroup_is_root(child))
2676 return -EINVAL; 2720 return -EINVAL;
2677 2721
2678 ret = -EBUSY; 2722 ret = -EBUSY;
@@ -2683,21 +2727,23 @@ static int mem_cgroup_move_parent(struct page *page,
2683 2727
2684 nr_pages = hpage_nr_pages(page); 2728 nr_pages = hpage_nr_pages(page);
2685 2729
2686 parent = mem_cgroup_from_cont(pcg); 2730 parent = parent_mem_cgroup(child);
2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2731 /*
2688 if (ret) 2732 * If no parent, move charges to root cgroup.
2689 goto put_back; 2733 */
2734 if (!parent)
2735 parent = root_mem_cgroup;
2690 2736
2691 if (nr_pages > 1) 2737 if (nr_pages > 1)
2692 flags = compound_lock_irqsave(page); 2738 flags = compound_lock_irqsave(page);
2693 2739
2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2740 ret = mem_cgroup_move_account(page, nr_pages,
2695 if (ret) 2741 pc, child, parent);
2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2742 if (!ret)
2743 __mem_cgroup_cancel_local_charge(child, nr_pages);
2697 2744
2698 if (nr_pages > 1) 2745 if (nr_pages > 1)
2699 compound_unlock_irqrestore(page, flags); 2746 compound_unlock_irqrestore(page, flags);
2700put_back:
2701 putback_lru_page(page); 2747 putback_lru_page(page);
2702put: 2748put:
2703 put_page(page); 2749 put_page(page);
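With mem_cgroup_move_parent() reworked as above, the child no longer pre-charges the parent and rolls the charge back on failure; the page's accounting is moved first, and the child's local charge is then dropped with res_counter_uncharge_until(), which walks the counter chain only up to, and not including, the parent, so the parent stays charged. A small user-space model of that idea, with invented toy_* types rather than the kernel's res_counter, is sketched here:

#include <stdio.h>

/* Toy hierarchical counter: charging a node also charges its ancestors. */
struct toy_counter {
	const char *name;
	long usage;
	struct toy_counter *parent;
};

static void toy_charge(struct toy_counter *c, long nr)
{
	for (; c; c = c->parent)
		c->usage += nr;
}

/* Uncharge from @c up the chain, stopping before @until (kept charged). */
static void toy_uncharge_until(struct toy_counter *c,
			       struct toy_counter *until, long nr)
{
	for (; c && c != until; c = c->parent)
		c->usage -= nr;
}

int main(void)
{
	struct toy_counter root  = { "root",  0, NULL };
	struct toy_counter child = { "child", 0, &root };

	toy_charge(&child, 1);                /* page charged to the child      */
	toy_uncharge_until(&child, &root, 1); /* move it to the parent: only    */
					      /* the child's usage drops        */
	printf("%s=%ld %s=%ld\n", child.name, child.usage, root.name, root.usage);
	return 0;
}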
@@ -2716,7 +2762,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2716{ 2762{
2717 struct mem_cgroup *memcg = NULL; 2763 struct mem_cgroup *memcg = NULL;
2718 unsigned int nr_pages = 1; 2764 unsigned int nr_pages = 1;
2719 struct page_cgroup *pc;
2720 bool oom = true; 2765 bool oom = true;
2721 int ret; 2766 int ret;
2722 2767
@@ -2730,11 +2775,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2730 oom = false; 2775 oom = false;
2731 } 2776 }
2732 2777
2733 pc = lookup_page_cgroup(page);
2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2778 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2735 if (ret == -ENOMEM) 2779 if (ret == -ENOMEM)
2736 return ret; 2780 return ret;
2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); 2781 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
2738 return 0; 2782 return 0;
2739} 2783}
2740 2784
@@ -2747,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2747 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2748 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2749 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2750 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2751}
2752
2753static void
2754__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2755 enum charge_type ctype);
2756
2757int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2758 gfp_t gfp_mask)
2759{
2760 struct mem_cgroup *memcg = NULL;
2761 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2762 int ret;
2763
2764 if (mem_cgroup_disabled())
2765 return 0;
2766 if (PageCompound(page))
2767 return 0;
2768
2769 if (unlikely(!mm))
2770 mm = &init_mm;
2771 if (!page_is_file_cache(page))
2772 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2773
2774 if (!PageSwapCache(page))
2775 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2776 else { /* page is swapcache/shmem */
2777 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2778 if (!ret)
2779 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2780 }
2781 return ret;
2782} 2795}
2783 2796
2784/* 2797/*
@@ -2787,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2787 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2788 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2789 */ 2802 */
2790int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2791 struct page *page, 2804 struct page *page,
2792 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2793{ 2807{
2794 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2795 int ret; 2810 int ret;
2796 2811
2797 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2798
2799 if (mem_cgroup_disabled())
2800 return 0;
2801
2802 if (!do_swap_account)
2803 goto charge_cur_mm;
2804 /* 2813 /*
2805 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2806 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2807 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2808 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2809 */ 2819 */
2810 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2811 goto charge_cur_mm; 2823 goto charge_cur_mm;
2812 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2813 if (!memcg) 2825 if (!memcg)
@@ -2819,28 +2831,55 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2819 ret = 0; 2831 ret = 0;
2820 return ret; 2832 return ret;
2821charge_cur_mm: 2833charge_cur_mm:
2822 if (unlikely(!mm))
2823 mm = &init_mm;
2824 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2825 if (ret == -EINTR) 2835 if (ret == -EINTR)
2826 ret = 0; 2836 ret = 0;
2827 return ret; 2837 return ret;
2828} 2838}
2829 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
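The split above keeps the three-step protocol the swap-in path already used: reserve the charge (try), then either commit it once the fault is known to have won the race for the pte, or cancel it and return the reservation. A compact user-space model of that reserve/commit/cancel flow against a plain counter follows; the toy_* names are invented and stand in for the memcg API only loosely:

#include <stdbool.h>
#include <stdio.h>

struct toy_counter { long usage, limit; };

/* Step 1: reserve the charge; fail if it would push usage past the limit. */
static bool toy_try_charge(struct toy_counter *c, long nr)
{
	if (c->usage + nr > c->limit)
		return false;
	c->usage += nr;
	return true;
}

/* Step 2a: the reservation simply becomes the real charge. */
static void toy_commit_charge(struct toy_counter *c, long nr)
{
	(void)c;
	(void)nr;
}

/* Step 2b: a racing thread already handled the fault -- give it back. */
static void toy_cancel_charge(struct toy_counter *c, long nr)
{
	c->usage -= nr;
}

int main(void)
{
	struct toy_counter memcg = { .usage = 3, .limit = 4 };
	bool lost_race = true;          /* pretend pte_same() failed */

	if (toy_try_charge(&memcg, 1)) {
		if (lost_race)
			toy_cancel_charge(&memcg, 1);
		else
			toy_commit_charge(&memcg, 1);
	}
	printf("usage=%ld limit=%ld\n", memcg.usage, memcg.limit);
	return 0;
}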
2830static void 2872static void
2831__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2832 enum charge_type ctype) 2874 enum charge_type ctype)
2833{ 2875{
2834 struct page_cgroup *pc;
2835
2836 if (mem_cgroup_disabled()) 2876 if (mem_cgroup_disabled())
2837 return; 2877 return;
2838 if (!memcg) 2878 if (!memcg)
2839 return; 2879 return;
2840 cgroup_exclude_rmdir(&memcg->css); 2880 cgroup_exclude_rmdir(&memcg->css);
2841 2881
2842 pc = lookup_page_cgroup(page); 2882 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2844 /* 2883 /*
2845 * Now swap is on-memory. This means this page may be 2884 * Now swap is on-memory. This means this page may be
2846 * counted both as mem and swap....double count. 2885 * counted both as mem and swap....double count.
@@ -2850,24 +2889,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2850 */ 2889 */
2851 if (do_swap_account && PageSwapCache(page)) { 2890 if (do_swap_account && PageSwapCache(page)) {
2852 swp_entry_t ent = {.val = page_private(page)}; 2891 swp_entry_t ent = {.val = page_private(page)};
2853 struct mem_cgroup *swap_memcg; 2892 mem_cgroup_uncharge_swap(ent);
2854 unsigned short id;
2855
2856 id = swap_cgroup_record(ent, 0);
2857 rcu_read_lock();
2858 swap_memcg = mem_cgroup_lookup(id);
2859 if (swap_memcg) {
2860 /*
2861 * This recorded memcg can be obsolete one. So, avoid
2862 * calling css_tryget
2863 */
2864 if (!mem_cgroup_is_root(swap_memcg))
2865 res_counter_uncharge(&swap_memcg->memsw,
2866 PAGE_SIZE);
2867 mem_cgroup_swap_statistics(swap_memcg, false);
2868 mem_cgroup_put(swap_memcg);
2869 }
2870 rcu_read_unlock();
2871 } 2893 }
2872 /* 2894 /*
2873 * At swapin, we may charge account against cgroup which has no tasks. 2895 * At swapin, we may charge account against cgroup which has no tasks.
@@ -2881,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2881 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2882{ 2904{
2883 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2884 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2885} 2907}
2886 2908
2887void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2888{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2889 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2890 return; 2917 return 0;
2891 if (!memcg) 2918 if (PageCompound(page))
2892 return; 2919 return 0;
2893 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2894} 2930}
2895 2931
2896static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2950,7 +2986,8 @@ direct_uncharge:
2950 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2951 */ 2987 */
2952static struct mem_cgroup * 2988static struct mem_cgroup *
2953__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2954{ 2991{
2955 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2956 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2960,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2960 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2961 return NULL; 2998 return NULL;
2962 2999
2963 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2964 return NULL;
2965 3001
2966 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2967 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2984,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2984 anon = PageAnon(page); 3020 anon = PageAnon(page);
2985 3021
2986 switch (ctype) { 3022 switch (ctype) {
2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2988 /* 3024 /*
2989 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2994,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2994 /* fallthrough */ 3030 /* fallthrough */
2995 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2996 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2997 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2998 goto unlock_out; 3043 goto unlock_out;
2999 break; 3044 break;
3000 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -3028,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3028 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
3029 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
3030 } 3075 }
3031 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
3032 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3033 3083
3034 return memcg; 3084 return memcg;
@@ -3044,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3044 if (page_mapped(page)) 3094 if (page_mapped(page))
3045 return; 3095 return;
3046 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3047 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3048} 3100}
3049 3101
3050void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3051{ 3103{
3052 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3053 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3054 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3055} 3107}
3056 3108
3057/* 3109/*
@@ -3115,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3115 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3116 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3117 3169
3118 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3119 3171
3120 /* 3172 /*
3121 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3126,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3126} 3178}
3127#endif 3179#endif
3128 3180
3129#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3130/* 3182/*
3131 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3132 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3160,7 +3212,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3160 * @entry: swap entry to be moved 3212 * @entry: swap entry to be moved
3161 * @from: mem_cgroup which the entry is moved from 3213 * @from: mem_cgroup which the entry is moved from
3162 * @to: mem_cgroup which the entry is moved to 3214 * @to: mem_cgroup which the entry is moved to
3163 * @need_fixup: whether we should fixup res_counters and refcounts.
3164 * 3215 *
3165 * It succeeds only when the swap_cgroup's record for this entry is the same 3216 * It succeeds only when the swap_cgroup's record for this entry is the same
3166 * as the mem_cgroup's id of @from. 3217 * as the mem_cgroup's id of @from.
@@ -3171,7 +3222,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3171 * both res and memsw, and called css_get(). 3222 * both res and memsw, and called css_get().
3172 */ 3223 */
3173static int mem_cgroup_move_swap_account(swp_entry_t entry, 3224static int mem_cgroup_move_swap_account(swp_entry_t entry,
3174 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3225 struct mem_cgroup *from, struct mem_cgroup *to)
3175{ 3226{
3176 unsigned short old_id, new_id; 3227 unsigned short old_id, new_id;
3177 3228
@@ -3190,24 +3241,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3190 * swap-in, the refcount of @to might be decreased to 0. 3241 * swap-in, the refcount of @to might be decreased to 0.
3191 */ 3242 */
3192 mem_cgroup_get(to); 3243 mem_cgroup_get(to);
3193 if (need_fixup) {
3194 if (!mem_cgroup_is_root(from))
3195 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3196 mem_cgroup_put(from);
3197 /*
3198 * we charged both to->res and to->memsw, so we should
3199 * uncharge to->res.
3200 */
3201 if (!mem_cgroup_is_root(to))
3202 res_counter_uncharge(&to->res, PAGE_SIZE);
3203 }
3204 return 0; 3244 return 0;
3205 } 3245 }
3206 return -EINVAL; 3246 return -EINVAL;
3207} 3247}
3208#else 3248#else
3209static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3249static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3210 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3250 struct mem_cgroup *from, struct mem_cgroup *to)
3211{ 3251{
3212 return -EINVAL; 3252 return -EINVAL;
3213} 3253}
@@ -3217,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3217 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3218 * page belongs to. 3258 * page belongs to.
3219 */ 3259 */
3220int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3221 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3222{ 3262{
3223 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3224 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3225 enum charge_type ctype; 3265 enum charge_type ctype;
3226 int ret = 0;
3227 3266
3228 *memcgp = NULL; 3267 *memcgp = NULL;
3229 3268
3230 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3231 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3232 return 0; 3271 return;
3233 3272
3234 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3235 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3274,39 +3313,25 @@ int mem_cgroup_prepare_migration(struct page *page,
3274 * we return here. 3313 * we return here.
3275 */ 3314 */
3276 if (!memcg) 3315 if (!memcg)
3277 return 0; 3316 return;
3278 3317
3279 *memcgp = memcg; 3318 *memcgp = memcg;
3280 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3281 css_put(&memcg->css);/* drop extra refcnt */
3282 if (ret) {
3283 if (PageAnon(page)) {
3284 lock_page_cgroup(pc);
3285 ClearPageCgroupMigration(pc);
3286 unlock_page_cgroup(pc);
3287 /*
3288 * The old page may be fully unmapped while we kept it.
3289 */
3290 mem_cgroup_uncharge_page(page);
3291 }
3292 /* we'll need to revisit this error code (we have -EINTR) */
3293 return -ENOMEM;
3294 }
3295 /* 3319 /*
3296 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3297 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
3298 * page. In the case new page is migrated but not remapped, new page's 3322 * page. In the case new page is migrated but not remapped, new page's
3299 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3300 */ 3324 */
3301 pc = lookup_page_cgroup(newpage);
3302 if (PageAnon(page)) 3325 if (PageAnon(page))
3303 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3304 else if (page_is_file_cache(page))
3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3306 else 3327 else
3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); 3329 /*
3309 return ret; 3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3310} 3335}
3311 3336
3312/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3328,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3328 used = newpage; 3353 used = newpage;
3329 unused = oldpage; 3354 unused = oldpage;
3330 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3331 /* 3362 /*
3332 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3333 * of the page goes down to zero, temporarily. 3364 * of the page goes down to zero, temporarily.
@@ -3337,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3337 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3338 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3339 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3340 anon = PageAnon(used);
3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3344 3371
3345 /* 3372 /*
3346 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3369,7 +3396,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3369void mem_cgroup_replace_page_cache(struct page *oldpage, 3396void mem_cgroup_replace_page_cache(struct page *oldpage,
3370 struct page *newpage) 3397 struct page *newpage)
3371{ 3398{
3372 struct mem_cgroup *memcg; 3399 struct mem_cgroup *memcg = NULL;
3373 struct page_cgroup *pc; 3400 struct page_cgroup *pc;
3374 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3401 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3375 3402
@@ -3379,20 +3406,25 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3379 pc = lookup_page_cgroup(oldpage); 3406 pc = lookup_page_cgroup(oldpage);
3380 /* fix accounting on old pages */ 3407 /* fix accounting on old pages */
3381 lock_page_cgroup(pc); 3408 lock_page_cgroup(pc);
3382 memcg = pc->mem_cgroup; 3409 if (PageCgroupUsed(pc)) {
3383 mem_cgroup_charge_statistics(memcg, false, -1); 3410 memcg = pc->mem_cgroup;
3384 ClearPageCgroupUsed(pc); 3411 mem_cgroup_charge_statistics(memcg, false, -1);
3412 ClearPageCgroupUsed(pc);
3413 }
3385 unlock_page_cgroup(pc); 3414 unlock_page_cgroup(pc);
3386 3415
3387 if (PageSwapBacked(oldpage)) 3416 /*
3388 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3417 * When called from shmem_replace_page(), in some cases the
3389 3418 * oldpage has already been charged, and in some cases not.
3419 */
3420 if (!memcg)
3421 return;
3390 /* 3422 /*
3391 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3393 * LRU while we overwrite pc->mem_cgroup. 3425 * LRU while we overwrite pc->mem_cgroup.
3394 */ 3426 */
3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); 3427 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
3396} 3428}
3397 3429
3398#ifdef CONFIG_DEBUG_VM 3430#ifdef CONFIG_DEBUG_VM
@@ -3461,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3461 /* 3493 /*
3462 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3463 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3464 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3465 */ 3497 */
3466 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3467 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3522,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3522 /* 3554 /*
3523 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3524 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3525 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3526 */ 3558 */
3527 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3528 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3654,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3654} 3686}
3655 3687
3656/* 3688/*
3657 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3658 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3659 */ 3693 */
3660static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3661 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3662{ 3696{
3663 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3665,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3665 struct list_head *list; 3699 struct list_head *list;
3666 struct page *busy; 3700 struct page *busy;
3667 struct zone *zone; 3701 struct zone *zone;
3668 int ret = 0;
3669 3702
3670 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3671 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3679,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3679 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3680 struct page *page; 3713 struct page *page;
3681 3714
3682 ret = 0;
3683 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3684 if (list_empty(list)) { 3716 if (list_empty(list)) {
3685 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3696,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3696 3728
3697 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3698 3730
3699 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3700 if (ret == -ENOMEM || ret == -EINTR)
3701 break;
3702
3703 if (ret == -EBUSY || ret == -EINVAL) {
3704 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3705 busy = page; 3733 busy = page;
3706 cond_resched(); 3734 cond_resched();
3707 } else 3735 } else
3708 busy = NULL; 3736 busy = NULL;
3709 } 3737 }
3710 3738 return !list_empty(list);
3711 if (!ret && !list_empty(list))
3712 return -EBUSY;
3713 return ret;
3714} 3739}
3715 3740
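Returning bool from mem_cgroup_force_empty_list() reduces its contract to "did anything remain on the list?", leaving the retry policy entirely to the caller. The intended usage is a drain-until-empty loop of roughly this shape; drain_one_list() below is a placeholder, not the real function:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder: pretend each pass frees a few entries and reports leftovers. */
static bool drain_one_list(int *remaining)
{
	if (*remaining > 0)
		*remaining -= 2;
	if (*remaining < 0)
		*remaining = 0;
	return *remaining != 0;     /* true == caller must retry */
}

int main(void)
{
	int remaining = 7;
	bool busy;

	do {
		busy = drain_one_list(&remaining);
		/* the kernel would cond_resched() and re-drain LRUs here */
	} while (busy);

	printf("list empty, %d entries left\n", remaining);
	return 0;
}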
3716/* 3741/*
@@ -3735,9 +3760,6 @@ move_account:
3735 ret = -EBUSY; 3760 ret = -EBUSY;
3736 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3737 goto out; 3762 goto out;
3738 ret = -EINTR;
3739 if (signal_pending(current))
3740 goto out;
3741 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3742 lru_add_drain_all(); 3764 lru_add_drain_all();
3743 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3758,12 +3780,9 @@ move_account:
3758 } 3780 }
3759 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3760 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3761 /* it seems parent cgroup doesn't have enough mem */
3762 if (ret == -ENOMEM)
3763 goto try_to_free;
3764 cond_resched(); 3783 cond_resched();
3765 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3766 } while (memcg->res.usage > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3767out: 3786out:
3768 css_put(&memcg->css); 3787 css_put(&memcg->css);
3769 return ret; 3788 return ret;
@@ -3778,7 +3797,7 @@ try_to_free:
3778 lru_add_drain_all(); 3797 lru_add_drain_all();
3779 /* try to free all pages in this cgroup */ 3798 /* try to free all pages in this cgroup */
3780 shrink = 1; 3799 shrink = 1;
3781 while (nr_retries && memcg->res.usage > 0) { 3800 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3782 int progress; 3801 int progress;
3783 3802
3784 if (signal_pending(current)) { 3803 if (signal_pending(current)) {
@@ -3799,7 +3818,7 @@ try_to_free:
3799 goto move_account; 3818 goto move_account;
3800} 3819}
3801 3820
3802int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3821static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3803{ 3822{
3804 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3823 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3805} 3824}
@@ -3822,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3822 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3823 3842
3824 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3825 /* 3848 /*
3826 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3827 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3838,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3838 retval = -EBUSY; 3861 retval = -EBUSY;
3839 } else 3862 } else
3840 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3841 cgroup_unlock(); 3866 cgroup_unlock();
3842 3867
3843 return retval; 3868 return retval;
@@ -3874,19 +3899,26 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3874 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3875 3900
3876 if (swap) 3901 if (swap)
3877 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3878 3903
3879 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3880} 3905}
3881 3906
3882static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3907static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3908 struct file *file, char __user *buf,
3909 size_t nbytes, loff_t *ppos)
3883{ 3910{
3884 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3911 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3912 char str[64];
3885 u64 val; 3913 u64 val;
3886 int type, name; 3914 int type, name, len;
3887 3915
3888 type = MEMFILE_TYPE(cft->private); 3916 type = MEMFILE_TYPE(cft->private);
3889 name = MEMFILE_ATTR(cft->private); 3917 name = MEMFILE_ATTR(cft->private);
3918
3919 if (!do_swap_account && type == _MEMSWAP)
3920 return -EOPNOTSUPP;
3921
3890 switch (type) { 3922 switch (type) {
3891 case _MEM: 3923 case _MEM:
3892 if (name == RES_USAGE) 3924 if (name == RES_USAGE)
@@ -3903,7 +3935,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3903 default: 3935 default:
3904 BUG(); 3936 BUG();
3905 } 3937 }
3906 return val; 3938
3939 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3940 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3907} 3941}
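mem_cgroup_read() now formats the counter value into a small buffer and lets simple_read_from_buffer() handle offset and length bookkeeping, instead of returning a raw u64 for the cgroup core to print. The same pattern is easy to model in user space: format once, then serve the caller's (offset, length) window out of the formatted string. The helper below only mimics that windowing logic and is a sketch, not the kernel function:

#include <stdio.h>
#include <string.h>

/* Copy at most nbytes of str starting at *ppos into buf; advance *ppos. */
static long read_from_buffer(char *buf, size_t nbytes, long *ppos,
			     const char *str, size_t len)
{
	size_t pos = (size_t)*ppos;
	size_t n;

	if (pos >= len)
		return 0;                    /* EOF */
	n = len - pos;
	if (n > nbytes)
		n = nbytes;
	memcpy(buf, str + pos, n);
	*ppos += (long)n;
	return (long)n;
}

int main(void)
{
	unsigned long long val = 123456789ULL;  /* e.g. usage_in_bytes */
	char str[64], out[8];
	int len = snprintf(str, sizeof(str), "%llu\n", val);
	long pos = 0, n;

	/* A reader with a tiny buffer still sees the whole value, in pieces. */
	while ((n = read_from_buffer(out, sizeof(out) - 1, &pos, str, (size_t)len)) > 0) {
		out[n] = '\0';
		printf("chunk: \"%s\"\n", out);
	}
	return 0;
}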
3908/* 3942/*
3909 * The user of this function is... 3943 * The user of this function is...
@@ -3919,6 +3953,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3919 3953
3920 type = MEMFILE_TYPE(cft->private); 3954 type = MEMFILE_TYPE(cft->private);
3921 name = MEMFILE_ATTR(cft->private); 3955 name = MEMFILE_ATTR(cft->private);
3956
3957 if (!do_swap_account && type == _MEMSWAP)
3958 return -EOPNOTSUPP;
3959
3922 switch (name) { 3960 switch (name) {
3923 case RES_LIMIT: 3961 case RES_LIMIT:
3924 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3962 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3984,12 +4022,15 @@ out:
3984 4022
3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 4023static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3986{ 4024{
3987 struct mem_cgroup *memcg; 4025 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3988 int type, name; 4026 int type, name;
3989 4027
3990 memcg = mem_cgroup_from_cont(cont);
3991 type = MEMFILE_TYPE(event); 4028 type = MEMFILE_TYPE(event);
3992 name = MEMFILE_ATTR(event); 4029 name = MEMFILE_ATTR(event);
4030
4031 if (!do_swap_account && type == _MEMSWAP)
4032 return -EOPNOTSUPP;
4033
3993 switch (name) { 4034 switch (name) {
3994 case RES_MAX_USAGE: 4035 case RES_MAX_USAGE:
3995 if (type == _MEM) 4036 if (type == _MEM)
@@ -4041,103 +4082,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4041} 4082}
4042#endif 4083#endif
4043 4084
4044
4045/* For read statistics */
4046enum {
4047 MCS_CACHE,
4048 MCS_RSS,
4049 MCS_FILE_MAPPED,
4050 MCS_PGPGIN,
4051 MCS_PGPGOUT,
4052 MCS_SWAP,
4053 MCS_PGFAULT,
4054 MCS_PGMAJFAULT,
4055 MCS_INACTIVE_ANON,
4056 MCS_ACTIVE_ANON,
4057 MCS_INACTIVE_FILE,
4058 MCS_ACTIVE_FILE,
4059 MCS_UNEVICTABLE,
4060 NR_MCS_STAT,
4061};
4062
4063struct mcs_total_stat {
4064 s64 stat[NR_MCS_STAT];
4065};
4066
4067struct {
4068 char *local_name;
4069 char *total_name;
4070} memcg_stat_strings[NR_MCS_STAT] = {
4071 {"cache", "total_cache"},
4072 {"rss", "total_rss"},
4073 {"mapped_file", "total_mapped_file"},
4074 {"pgpgin", "total_pgpgin"},
4075 {"pgpgout", "total_pgpgout"},
4076 {"swap", "total_swap"},
4077 {"pgfault", "total_pgfault"},
4078 {"pgmajfault", "total_pgmajfault"},
4079 {"inactive_anon", "total_inactive_anon"},
4080 {"active_anon", "total_active_anon"},
4081 {"inactive_file", "total_inactive_file"},
4082 {"active_file", "total_active_file"},
4083 {"unevictable", "total_unevictable"}
4084};
4085
4086
4087static void
4088mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4089{
4090 s64 val;
4091
4092 /* per cpu stat */
4093 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4094 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4095 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4096 s->stat[MCS_RSS] += val * PAGE_SIZE;
4097 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4098 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4099 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4100 s->stat[MCS_PGPGIN] += val;
4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4102 s->stat[MCS_PGPGOUT] += val;
4103 if (do_swap_account) {
4104 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4105 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4106 }
4107 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4108 s->stat[MCS_PGFAULT] += val;
4109 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4110 s->stat[MCS_PGMAJFAULT] += val;
4111
4112 /* per zone stat */
4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4114 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4116 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4117 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4118 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4119 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4120 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4121 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4122 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4123}
4124
4125static void
4126mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4127{
4128 struct mem_cgroup *iter;
4129
4130 for_each_mem_cgroup_tree(iter, memcg)
4131 mem_cgroup_get_local_stat(iter, s);
4132}
4133
4134#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4135static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4087 struct seq_file *m)
4136{ 4088{
4137 int nid; 4089 int nid;
4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4090 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4139 unsigned long node_nr; 4091 unsigned long node_nr;
4140 struct cgroup *cont = m->private;
4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4092 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4142 4093
4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4094 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4178,64 +4129,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4178} 4129}
4179#endif /* CONFIG_NUMA */ 4130#endif /* CONFIG_NUMA */
4180 4131
4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4132static const char * const mem_cgroup_lru_names[] = {
4182 struct cgroup_map_cb *cb) 4133 "inactive_anon",
4183{ 4134 "active_anon",
4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4135 "inactive_file",
4185 struct mcs_total_stat mystat; 4136 "active_file",
4186 int i; 4137 "unevictable",
4138};
4187 4139
4188 memset(&mystat, 0, sizeof(mystat)); 4140static inline void mem_cgroup_lru_names_not_uptodate(void)
4189 mem_cgroup_get_local_stat(memcg, &mystat); 4141{
4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4143}
4190 4144
4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4146 struct seq_file *m)
4147{
4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4149 struct mem_cgroup *mi;
4150 unsigned int i;
4191 4151
4192 for (i = 0; i < NR_MCS_STAT; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4193 if (i == MCS_SWAP && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4194 continue; 4154 continue;
4195 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4196 } 4157 }
4197 4158
4159 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4160 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4161 mem_cgroup_read_events(memcg, i));
4162
4163 for (i = 0; i < NR_LRU_LISTS; i++)
4164 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4165 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4166
4198 /* Hierarchical information */ 4167 /* Hierarchical information */
4199 { 4168 {
4200 unsigned long long limit, memsw_limit; 4169 unsigned long long limit, memsw_limit;
4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4170 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4202 cb->fill(cb, "hierarchical_memory_limit", limit); 4171 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4203 if (do_swap_account) 4172 if (do_swap_account)
4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4173 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4174 memsw_limit);
4205 } 4175 }
4206 4176
4207 memset(&mystat, 0, sizeof(mystat)); 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4208 mem_cgroup_get_total_stat(memcg, &mystat); 4178 long long val = 0;
4209 for (i = 0; i < NR_MCS_STAT; i++) { 4179
4210 if (i == MCS_SWAP && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4211 continue; 4181 continue;
4212 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4182 for_each_mem_cgroup_tree(mi, memcg)
4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4184 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4185 }
4186
4187 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4188 unsigned long long val = 0;
4189
4190 for_each_mem_cgroup_tree(mi, memcg)
4191 val += mem_cgroup_read_events(mi, i);
4192 seq_printf(m, "total_%s %llu\n",
4193 mem_cgroup_events_names[i], val);
4194 }
4195
4196 for (i = 0; i < NR_LRU_LISTS; i++) {
4197 unsigned long long val = 0;
4198
4199 for_each_mem_cgroup_tree(mi, memcg)
4200 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4201 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4213 } 4202 }
4214 4203
4215#ifdef CONFIG_DEBUG_VM 4204#ifdef CONFIG_DEBUG_VM
4216 { 4205 {
4217 int nid, zid; 4206 int nid, zid;
4218 struct mem_cgroup_per_zone *mz; 4207 struct mem_cgroup_per_zone *mz;
4208 struct zone_reclaim_stat *rstat;
4219 unsigned long recent_rotated[2] = {0, 0}; 4209 unsigned long recent_rotated[2] = {0, 0};
4220 unsigned long recent_scanned[2] = {0, 0}; 4210 unsigned long recent_scanned[2] = {0, 0};
4221 4211
4222 for_each_online_node(nid) 4212 for_each_online_node(nid)
4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4213 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4214 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4215 rstat = &mz->lruvec.reclaim_stat;
4225 4216
4226 recent_rotated[0] += 4217 recent_rotated[0] += rstat->recent_rotated[0];
4227 mz->reclaim_stat.recent_rotated[0]; 4218 recent_rotated[1] += rstat->recent_rotated[1];
4228 recent_rotated[1] += 4219 recent_scanned[0] += rstat->recent_scanned[0];
4229 mz->reclaim_stat.recent_rotated[1]; 4220 recent_scanned[1] += rstat->recent_scanned[1];
4230 recent_scanned[0] +=
4231 mz->reclaim_stat.recent_scanned[0];
4232 recent_scanned[1] +=
4233 mz->reclaim_stat.recent_scanned[1];
4234 } 4221 }
4235 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4222 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4236 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4223 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4237 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4224 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4238 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4225 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4239 } 4226 }
4240#endif 4227#endif
4241 4228
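The hunk above converts memcg's stat file from the cgroup_map_cb callback interface to a plain seq_file printer: local counters, event counters and per-LRU page counts are emitted first, then hierarchical totals are produced by walking every group in the subtree with for_each_mem_cgroup_tree() and summing before printing a "total_"-prefixed line. Below is a minimal userspace sketch of that aggregation pattern only; the tree type, field names and stat_names table are invented for illustration and are not the kernel's.

    #include <stdio.h>

    #define NSTATS 3

    /* Hypothetical stand-in for a memcg subtree node. */
    struct group {
        long stat[NSTATS];          /* local counters */
        struct group *child, *next; /* first child / next sibling */
    };

    static const char * const stat_names[NSTATS] = {
        "cache", "rss", "swap",
    };

    /* Sum one counter over the whole subtree, the way the
     * for_each_mem_cgroup_tree() loops accumulate totals. */
    static long long subtree_total(const struct group *g, int i)
    {
        long long val = 0;

        for (; g; g = g->next)
            val += g->stat[i] + subtree_total(g->child, i);
        return val;
    }

    static void stat_show(const struct group *root)
    {
        int i;

        for (i = 0; i < NSTATS; i++)        /* local values first */
            printf("%s %ld\n", stat_names[i], root->stat[i]);
        for (i = 0; i < NSTATS; i++)        /* then hierarchical totals */
            printf("total_%s %lld\n", stat_names[i], subtree_total(root, i));
    }

    int main(void)
    {
        struct group leaf = { { 1, 2, 3 }, NULL, NULL };
        struct group root = { { 10, 20, 30 }, &leaf, NULL };

        stat_show(&root);
        return 0;
    }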
@@ -4297,7 +4284,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4297 usage = mem_cgroup_usage(memcg, swap); 4284 usage = mem_cgroup_usage(memcg, swap);
4298 4285
4299 /* 4286 /*
4300 * current_threshold points to threshold just below usage. 4287 * current_threshold points to threshold just below or equal to usage.
4301 * If it's not true, a threshold was crossed after last 4288 * If it's not true, a threshold was crossed after last
4302 * call of __mem_cgroup_threshold(). 4289 * call of __mem_cgroup_threshold().
4303 */ 4290 */
@@ -4423,14 +4410,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4423 /* Find current threshold */ 4410 /* Find current threshold */
4424 new->current_threshold = -1; 4411 new->current_threshold = -1;
4425 for (i = 0; i < size; i++) { 4412 for (i = 0; i < size; i++) {
4426 if (new->entries[i].threshold < usage) { 4413 if (new->entries[i].threshold <= usage) {
4427 /* 4414 /*
4428 * new->current_threshold will not be used until 4415 * new->current_threshold will not be used until
4429 * rcu_assign_pointer(), so it's safe to increment 4416 * rcu_assign_pointer(), so it's safe to increment
4430 * it here. 4417 * it here.
4431 */ 4418 */
4432 ++new->current_threshold; 4419 ++new->current_threshold;
4433 } 4420 } else
4421 break;
4434 } 4422 }
4435 4423
4436 /* Free old spare buffer and save old primary buffer as spare */ 4424 /* Free old spare buffer and save old primary buffer as spare */
@@ -4499,7 +4487,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4499 continue; 4487 continue;
4500 4488
4501 new->entries[j] = thresholds->primary->entries[i]; 4489 new->entries[j] = thresholds->primary->entries[i];
4502 if (new->entries[j].threshold < usage) { 4490 if (new->entries[j].threshold <= usage) {
4503 /* 4491 /*
4504 * new->current_threshold will not be used 4492 * new->current_threshold will not be used
4505 * until rcu_assign_pointer(), so it's safe to increment 4493 * until rcu_assign_pointer(), so it's safe to increment
@@ -4513,6 +4501,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4513swap_buffers: 4501swap_buffers:
4514 /* Swap primary and spare array */ 4502 /* Swap primary and spare array */
4515 thresholds->spare = thresholds->primary; 4503 thresholds->spare = thresholds->primary;
4504 /* If all events are unregistered, free the spare array */
4505 if (!new) {
4506 kfree(thresholds->spare);
4507 thresholds->spare = NULL;
4508 }
4509
4516 rcu_assign_pointer(thresholds->primary, new); 4510 rcu_assign_pointer(thresholds->primary, new);
4517 4511
4518 /* To be sure that nobody uses thresholds */ 4512 /* To be sure that nobody uses thresholds */
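The two threshold hunks change the comparison used while rebuilding the sorted thresholds array from "<" to "<=" and stop scanning as soon as a larger entry is found, so current_threshold ends up indexing the largest threshold that is less than or equal to the current usage; the unregister path additionally frees the spare array once the last event is gone. A small self-contained sketch of that index search (the array layout and names are illustrative only):

    #include <stdio.h>

    /* Return the index of the largest entry <= usage in a sorted array,
     * or -1 if every entry is above usage (mirrors current_threshold). */
    static int find_current_threshold(const unsigned long long *entries,
                                      int size, unsigned long long usage)
    {
        int i, current_threshold = -1;

        for (i = 0; i < size; i++) {
            if (entries[i] <= usage)
                ++current_threshold;   /* still at or below usage */
            else
                break;                 /* sorted: nothing further can match */
        }
        return current_threshold;
    }

    int main(void)
    {
        unsigned long long thresholds[] = { 4096, 8192, 16384 };

        /* usage exactly at a threshold now selects that entry (index 1) */
        printf("%d\n", find_current_threshold(thresholds, 3, 8192));
        return 0;
    }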
@@ -4607,46 +4601,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4607 return 0; 4601 return 0;
4608} 4602}
4609 4603
4610#ifdef CONFIG_NUMA 4604#ifdef CONFIG_MEMCG_KMEM
4611static const struct file_operations mem_control_numa_stat_file_operations = { 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4612 .read = seq_read,
4613 .llseek = seq_lseek,
4614 .release = single_release,
4615};
4616
4617static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4618{ 4606{
4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4607 return mem_cgroup_sockets_init(memcg, ss);
4620
4621 file->f_op = &mem_control_numa_stat_file_operations;
4622 return single_open(file, mem_control_numa_stat_show, cont);
4623}
4624#endif /* CONFIG_NUMA */
4625
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4627static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4628{
4629 /*
4630 * Part of this would be better living in a separate allocation
4631 * function, leaving us with just the cgroup tree population work.
4632 * We, however, depend on state such as network's proto_list that
4633 * is only initialized after cgroup creation. I found the less
4634 * cumbersome way to deal with it to defer it all to populate time
4635 */
4636 return mem_cgroup_sockets_init(cont, ss);
4637}; 4608};
4638 4609
4639static void kmem_cgroup_destroy(struct cgroup *cont) 4610static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4640{ 4611{
4641 mem_cgroup_sockets_destroy(cont); 4612 mem_cgroup_sockets_destroy(memcg);
4642} 4613}
4643#else 4614#else
4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4615static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4645{ 4616{
4646 return 0; 4617 return 0;
4647} 4618}
4648 4619
4649static void kmem_cgroup_destroy(struct cgroup *cont) 4620static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4650{ 4621{
4651} 4622}
4652#endif 4623#endif
@@ -4655,7 +4626,7 @@ static struct cftype mem_cgroup_files[] = {
4655 { 4626 {
4656 .name = "usage_in_bytes", 4627 .name = "usage_in_bytes",
4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4628 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4658 .read_u64 = mem_cgroup_read, 4629 .read = mem_cgroup_read,
4659 .register_event = mem_cgroup_usage_register_event, 4630 .register_event = mem_cgroup_usage_register_event,
4660 .unregister_event = mem_cgroup_usage_unregister_event, 4631 .unregister_event = mem_cgroup_usage_unregister_event,
4661 }, 4632 },
@@ -4663,29 +4634,29 @@ static struct cftype mem_cgroup_files[] = {
4663 .name = "max_usage_in_bytes", 4634 .name = "max_usage_in_bytes",
4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4635 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4665 .trigger = mem_cgroup_reset, 4636 .trigger = mem_cgroup_reset,
4666 .read_u64 = mem_cgroup_read, 4637 .read = mem_cgroup_read,
4667 }, 4638 },
4668 { 4639 {
4669 .name = "limit_in_bytes", 4640 .name = "limit_in_bytes",
4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4641 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4671 .write_string = mem_cgroup_write, 4642 .write_string = mem_cgroup_write,
4672 .read_u64 = mem_cgroup_read, 4643 .read = mem_cgroup_read,
4673 }, 4644 },
4674 { 4645 {
4675 .name = "soft_limit_in_bytes", 4646 .name = "soft_limit_in_bytes",
4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4647 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4677 .write_string = mem_cgroup_write, 4648 .write_string = mem_cgroup_write,
4678 .read_u64 = mem_cgroup_read, 4649 .read = mem_cgroup_read,
4679 }, 4650 },
4680 { 4651 {
4681 .name = "failcnt", 4652 .name = "failcnt",
4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4653 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4683 .trigger = mem_cgroup_reset, 4654 .trigger = mem_cgroup_reset,
4684 .read_u64 = mem_cgroup_read, 4655 .read = mem_cgroup_read,
4685 }, 4656 },
4686 { 4657 {
4687 .name = "stat", 4658 .name = "stat",
4688 .read_map = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4689 }, 4660 },
4690 { 4661 {
4691 .name = "force_empty", 4662 .name = "force_empty",
@@ -4717,18 +4688,14 @@ static struct cftype mem_cgroup_files[] = {
4717#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4718 { 4689 {
4719 .name = "numa_stat", 4690 .name = "numa_stat",
4720 .open = mem_control_numa_stat_open, 4691 .read_seq_string = memcg_numa_stat_show,
4721 .mode = S_IRUGO,
4722 }, 4692 },
4723#endif 4693#endif
4724}; 4694#ifdef CONFIG_MEMCG_SWAP
4725
4726#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4727static struct cftype memsw_cgroup_files[] = {
4728 { 4695 {
4729 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4731 .read_u64 = mem_cgroup_read, 4698 .read = mem_cgroup_read,
4732 .register_event = mem_cgroup_usage_register_event, 4699 .register_event = mem_cgroup_usage_register_event,
4733 .unregister_event = mem_cgroup_usage_unregister_event, 4700 .unregister_event = mem_cgroup_usage_unregister_event,
4734 }, 4701 },
@@ -4736,41 +4703,28 @@ static struct cftype memsw_cgroup_files[] = {
4736 .name = "memsw.max_usage_in_bytes", 4703 .name = "memsw.max_usage_in_bytes",
4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4704 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4738 .trigger = mem_cgroup_reset, 4705 .trigger = mem_cgroup_reset,
4739 .read_u64 = mem_cgroup_read, 4706 .read = mem_cgroup_read,
4740 }, 4707 },
4741 { 4708 {
4742 .name = "memsw.limit_in_bytes", 4709 .name = "memsw.limit_in_bytes",
4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4710 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4744 .write_string = mem_cgroup_write, 4711 .write_string = mem_cgroup_write,
4745 .read_u64 = mem_cgroup_read, 4712 .read = mem_cgroup_read,
4746 }, 4713 },
4747 { 4714 {
4748 .name = "memsw.failcnt", 4715 .name = "memsw.failcnt",
4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4716 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4750 .trigger = mem_cgroup_reset, 4717 .trigger = mem_cgroup_reset,
4751 .read_u64 = mem_cgroup_read, 4718 .read = mem_cgroup_read,
4752 }, 4719 },
4753};
4754
4755static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4756{
4757 if (!do_swap_account)
4758 return 0;
4759 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4760 ARRAY_SIZE(memsw_cgroup_files));
4761};
4762#else
4763static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4764{
4765 return 0;
4766}
4767#endif 4720#endif
4721 { }, /* terminate */
4722};
4768 4723
4769static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4724static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4770{ 4725{
4771 struct mem_cgroup_per_node *pn; 4726 struct mem_cgroup_per_node *pn;
4772 struct mem_cgroup_per_zone *mz; 4727 struct mem_cgroup_per_zone *mz;
4773 enum lru_list lru;
4774 int zone, tmp = node; 4728 int zone, tmp = node;
4775 /* 4729 /*
4776 * This routine is called against possible nodes. 4730 * This routine is called against possible nodes.
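In the cftype hunks above, the per-controller file tables stop being registered by hand from a populate callback: the memsw entries are folded into mem_cgroup_files, the array gains an empty "{ }" terminator, and further down the subsystem points at it through .base_cftypes so the cgroup core can walk it. A sketch of the sentinel-terminated walk that replaces the ARRAY_SIZE()-based registration, with invented structure and handler names:

    #include <stdio.h>

    /* Hypothetical file descriptor; an empty entry marks the end. */
    struct ftype {
        const char *name;
        void (*show)(void);
    };

    static void show_stat(void)  { printf("stat\n"); }
    static void show_usage(void) { printf("usage_in_bytes\n"); }

    static const struct ftype files[] = {
        { "stat",           show_stat  },
        { "usage_in_bytes", show_usage },
        { NULL, NULL },  /* terminator, written as { } in the kernel table */
    };

    static void register_files(const struct ftype *ft)
    {
        for (; ft->name; ft++)          /* walk until the empty sentinel */
            printf("registering %s\n", ft->name);
    }

    int main(void)
    {
        register_files(files);
        return 0;
    }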
@@ -4788,8 +4742,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4788 4742
4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4743 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4790 mz = &pn->zoneinfo[zone]; 4744 mz = &pn->zoneinfo[zone];
4791 for_each_lru(lru) 4745 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4793 mz->usage_in_excess = 0; 4746 mz->usage_in_excess = 0;
4794 mz->on_tree = false; 4747 mz->on_tree = false;
4795 mz->memcg = memcg; 4748 mz->memcg = memcg;
@@ -4832,23 +4785,40 @@ out_free:
4832} 4785}
4833 4786
4834/* 4787/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4788 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid 4789 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4790 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */ 4791 */
4839static void vfree_work(struct work_struct *work) 4792static void free_work(struct work_struct *work)
4840{ 4793{
4841 struct mem_cgroup *memcg; 4794 struct mem_cgroup *memcg;
4795 int size = sizeof(struct mem_cgroup);
4842 4796
4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4797 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg); 4798 /*
4799 * We need to make sure that (at least for now), the jump label
4800 * destruction code runs outside of the cgroup lock. This is because
4801 * get_online_cpus(), which is called from the static_branch update,
4802 * can't be called inside the cgroup_lock. cpusets are the ones
4803 * enforcing this dependency, so if they ever change, we might as well.
4804 *
4805 * schedule_work() will guarantee this happens. Be careful if you need
4806 * to move this code around, and make sure it is outside
4807 * the cgroup_lock.
4808 */
4809 disarm_sock_keys(memcg);
4810 if (size < PAGE_SIZE)
4811 kfree(memcg);
4812 else
4813 vfree(memcg);
4845} 4814}
4846static void vfree_rcu(struct rcu_head *rcu_head) 4815
4816static void free_rcu(struct rcu_head *rcu_head)
4847{ 4817{
4848 struct mem_cgroup *memcg; 4818 struct mem_cgroup *memcg;
4849 4819
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4820 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4821 INIT_WORK(&memcg->work_freeing, free_work);
4852 schedule_work(&memcg->work_freeing); 4822 schedule_work(&memcg->work_freeing);
4853} 4823}
4854 4824
@@ -4874,10 +4844,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4874 free_mem_cgroup_per_zone_info(memcg, node); 4844 free_mem_cgroup_per_zone_info(memcg, node);
4875 4845
4876 free_percpu(memcg->stat); 4846 free_percpu(memcg->stat);
4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4847 call_rcu(&memcg->rcu_freeing, free_rcu);
4878 kfree_rcu(memcg, rcu_freeing);
4879 else
4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4881} 4848}
4882 4849
4883static void mem_cgroup_get(struct mem_cgroup *memcg) 4850static void mem_cgroup_get(struct mem_cgroup *memcg)
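The freeing hunks unify the two RCU paths into one: __mem_cgroup_free() always goes through call_rcu(), the RCU callback only queues a work item, and the work function (now free_work()) runs in process context where it may sleep (disarm_sock_keys() ends up in get_online_cpus()) and then picks kfree() or vfree() depending on whether the object was small enough to come from kmalloc(). A condensed kernel-style sketch of that defer-to-workqueue pattern follows; the object and its fields are placeholders, not the real struct mem_cgroup.

    #include <linux/mm.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/workqueue.h>

    struct obj {                        /* placeholder for the real object */
        struct rcu_head rcu_freeing;
        struct work_struct work_freeing;
    };

    static void obj_free_work(struct work_struct *work)
    {
        struct obj *o = container_of(work, struct obj, work_freeing);

        /* Process context: safe to sleep, e.g. to disarm static keys
         * that need get_online_cpus(). */
        if (sizeof(*o) < PAGE_SIZE)
            kfree(o);                   /* small objects came from kmalloc() */
        else
            vfree(o);                   /* large ones came from vzalloc() */
    }

    static void obj_free_rcu(struct rcu_head *rcu)
    {
        struct obj *o = container_of(rcu, struct obj, rcu_freeing);

        /* RCU callbacks run in softirq context; defer the real work. */
        INIT_WORK(&o->work_freeing, obj_free_work);
        schedule_work(&o->work_freeing);
    }

    /* After the last reference is dropped:
     *      call_rcu(&o->rcu_freeing, obj_free_rcu);
     */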
@@ -4911,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4911} 4878}
4912EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4913 4880
4914#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4915static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4916{ 4883{
4917 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5016,6 +4983,17 @@ mem_cgroup_create(struct cgroup *cont)
5016 memcg->move_charge_at_immigrate = 0; 4983 memcg->move_charge_at_immigrate = 0;
5017 mutex_init(&memcg->thresholds_lock); 4984 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock); 4985 spin_lock_init(&memcg->move_lock);
4986
4987 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
4988 if (error) {
4989 /*
4990 * We call put now because our (and parent's) refcnts
4991 * are already in place. mem_cgroup_put() will internally
4992 * call __mem_cgroup_free, so return directly
4993 */
4994 mem_cgroup_put(memcg);
4995 return ERR_PTR(error);
4996 }
5019 return &memcg->css; 4997 return &memcg->css;
5020free_out: 4998free_out:
5021 __mem_cgroup_free(memcg); 4999 __mem_cgroup_free(memcg);
@@ -5033,28 +5011,11 @@ static void mem_cgroup_destroy(struct cgroup *cont)
5033{ 5011{
5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5012 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5035 5013
5036 kmem_cgroup_destroy(cont); 5014 kmem_cgroup_destroy(memcg);
5037 5015
5038 mem_cgroup_put(memcg); 5016 mem_cgroup_put(memcg);
5039} 5017}
5040 5018
5041static int mem_cgroup_populate(struct cgroup_subsys *ss,
5042 struct cgroup *cont)
5043{
5044 int ret;
5045
5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5047 ARRAY_SIZE(mem_cgroup_files));
5048
5049 if (!ret)
5050 ret = register_memsw_files(cont, ss);
5051
5052 if (!ret)
5053 ret = register_kmem_files(cont, ss);
5054
5055 return ret;
5056}
5057
5058#ifdef CONFIG_MMU 5019#ifdef CONFIG_MMU
5059/* Handlers for move charge at task migration. */ 5020/* Handlers for move charge at task migration. */
5060#define PRECHARGE_COUNT_AT_ONCE 256 5021#define PRECHARGE_COUNT_AT_ONCE 256
@@ -5147,7 +5108,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5147 return NULL; 5108 return NULL;
5148 if (PageAnon(page)) { 5109 if (PageAnon(page)) {
5149 /* we don't move shared anon */ 5110 /* we don't move shared anon */
5150 if (!move_anon() || page_mapcount(page) > 2) 5111 if (!move_anon())
5151 return NULL; 5112 return NULL;
5152 } else if (!move_file()) 5113 } else if (!move_file())
5153 /* we ignore mapcount for file pages */ 5114 /* we ignore mapcount for file pages */
@@ -5158,32 +5119,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5158 return page; 5119 return page;
5159} 5120}
5160 5121
5122#ifdef CONFIG_SWAP
5161static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5123static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5124 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5163{ 5125{
5164 int usage_count;
5165 struct page *page = NULL; 5126 struct page *page = NULL;
5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5127 swp_entry_t ent = pte_to_swp_entry(ptent);
5167 5128
5168 if (!move_anon() || non_swap_entry(ent)) 5129 if (!move_anon() || non_swap_entry(ent))
5169 return NULL; 5130 return NULL;
5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5131 /*
5171 if (usage_count > 1) { /* we don't move shared anon */ 5132 * Because lookup_swap_cache() updates some statistics counter,
5172 if (page) 5133 * we call find_get_page() with swapper_space directly.
5173 put_page(page); 5134 */
5174 return NULL; 5135 page = find_get_page(&swapper_space, ent.val);
5175 }
5176 if (do_swap_account) 5136 if (do_swap_account)
5177 entry->val = ent.val; 5137 entry->val = ent.val;
5178 5138
5179 return page; 5139 return page;
5180} 5140}
5141#else
5142static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5143 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5144{
5145 return NULL;
5146}
5147#endif
5181 5148
5182static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5149static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5150 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5184{ 5151{
5185 struct page *page = NULL; 5152 struct page *page = NULL;
5186 struct inode *inode;
5187 struct address_space *mapping; 5153 struct address_space *mapping;
5188 pgoff_t pgoff; 5154 pgoff_t pgoff;
5189 5155
@@ -5192,7 +5158,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5192 if (!move_file()) 5158 if (!move_file())
5193 return NULL; 5159 return NULL;
5194 5160
5195 inode = vma->vm_file->f_path.dentry->d_inode;
5196 mapping = vma->vm_file->f_mapping; 5161 mapping = vma->vm_file->f_mapping;
5197 if (pte_none(ptent)) 5162 if (pte_none(ptent))
5198 pgoff = linear_page_index(vma, addr); 5163 pgoff = linear_page_index(vma, addr);
@@ -5481,7 +5446,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5481 * part of thp split is not executed yet. 5446 * part of thp split is not executed yet.
5482 */ 5447 */
5483 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5448 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5484 if (!mc.precharge) { 5449 if (mc.precharge < HPAGE_PMD_NR) {
5485 spin_unlock(&vma->vm_mm->page_table_lock); 5450 spin_unlock(&vma->vm_mm->page_table_lock);
5486 return 0; 5451 return 0;
5487 } 5452 }
@@ -5491,8 +5456,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5491 if (!isolate_lru_page(page)) { 5456 if (!isolate_lru_page(page)) {
5492 pc = lookup_page_cgroup(page); 5457 pc = lookup_page_cgroup(page);
5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5458 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5494 pc, mc.from, mc.to, 5459 pc, mc.from, mc.to)) {
5495 false)) {
5496 mc.precharge -= HPAGE_PMD_NR; 5460 mc.precharge -= HPAGE_PMD_NR;
5497 mc.moved_charge += HPAGE_PMD_NR; 5461 mc.moved_charge += HPAGE_PMD_NR;
5498 } 5462 }
@@ -5522,7 +5486,7 @@ retry:
5522 goto put; 5486 goto put;
5523 pc = lookup_page_cgroup(page); 5487 pc = lookup_page_cgroup(page);
5524 if (!mem_cgroup_move_account(page, 1, pc, 5488 if (!mem_cgroup_move_account(page, 1, pc,
5525 mc.from, mc.to, false)) { 5489 mc.from, mc.to)) {
5526 mc.precharge--; 5490 mc.precharge--;
5527 /* we uncharge from mc.from later. */ 5491 /* we uncharge from mc.from later. */
5528 mc.moved_charge++; 5492 mc.moved_charge++;
@@ -5533,8 +5497,7 @@ put: /* get_mctgt_type() gets the page */
5533 break; 5497 break;
5534 case MC_TARGET_SWAP: 5498 case MC_TARGET_SWAP:
5535 ent = target.ent; 5499 ent = target.ent;
5536 if (!mem_cgroup_move_swap_account(ent, 5500 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5537 mc.from, mc.to, false)) {
5538 mc.precharge--; 5501 mc.precharge--;
5539 /* we fixup refcnts and charges later. */ 5502 /* we fixup refcnts and charges later. */
5540 mc.moved_swap++; 5503 mc.moved_swap++;
@@ -5610,7 +5573,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5610 if (mm) { 5573 if (mm) {
5611 if (mc.to) 5574 if (mc.to)
5612 mem_cgroup_move_charge(mm); 5575 mem_cgroup_move_charge(mm);
5613 put_swap_token(mm);
5614 mmput(mm); 5576 mmput(mm);
5615 } 5577 }
5616 if (mc.to) 5578 if (mc.to)
@@ -5638,15 +5600,16 @@ struct cgroup_subsys mem_cgroup_subsys = {
5638 .create = mem_cgroup_create, 5600 .create = mem_cgroup_create,
5639 .pre_destroy = mem_cgroup_pre_destroy, 5601 .pre_destroy = mem_cgroup_pre_destroy,
5640 .destroy = mem_cgroup_destroy, 5602 .destroy = mem_cgroup_destroy,
5641 .populate = mem_cgroup_populate,
5642 .can_attach = mem_cgroup_can_attach, 5603 .can_attach = mem_cgroup_can_attach,
5643 .cancel_attach = mem_cgroup_cancel_attach, 5604 .cancel_attach = mem_cgroup_cancel_attach,
5644 .attach = mem_cgroup_move_task, 5605 .attach = mem_cgroup_move_task,
5606 .base_cftypes = mem_cgroup_files,
5645 .early_init = 0, 5607 .early_init = 0,
5646 .use_id = 1, 5608 .use_id = 1,
5609 .__DEPRECATED_clear_css_refs = true,
5647}; 5610};
5648 5611
5649#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5650static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5651{ 5614{
5652 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97cc2733551a..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
345 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
346 * wrong earlier. 346 * wrong earlier.
347 */ 347 */
348static void kill_procs(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn, 349 int fail, struct page *page, unsigned long pfn,
350 int flags) 350 int flags)
351{ 351{
352 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
353 353
354 list_for_each_entry_safe (tk, next, to_kill, nd) { 354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) { 355 if (forcekill) {
356 /* 356 /*
357 * In case something went wrong with munmapping 357 * In case something went wrong with munmapping
358 * make sure the process doesn't catch the 358 * make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
858 struct address_space *mapping; 858 struct address_space *mapping;
859 LIST_HEAD(tokill); 859 LIST_HEAD(tokill);
860 int ret; 860 int ret;
861 int kill = 1; 861 int kill = 1, forcekill;
862 struct page *hpage = compound_head(p); 862 struct page *hpage = compound_head(p);
863 struct page *ppage; 863 struct page *ppage;
864 864
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 * be called inside page lock (it's recommended but not enforced). 888 * be called inside page lock (it's recommended but not enforced).
889 */ 889 */
890 mapping = page_mapping(hpage); 890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping && 891 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) { 892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) { 893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage); 894 SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
965 * Now that the dirty bit has been propagated to the 965 * Now that the dirty bit has been propagated to the
966 * struct page and all unmaps done we can decide if 966 * struct page and all unmaps done we can decide if
967 * killing is needed or not. Only kill when the page 967 * killing is needed or not. Only kill when the page
968 * was dirty, otherwise the tokill list is merely 968 * was dirty or the process is not restartable,
969 * otherwise the tokill list is merely
969 * freed. When there was a problem unmapping earlier 970 * freed. When there was a problem unmapping earlier
970 * use a more force-full uncatchable kill to prevent 971 * use a more force-full uncatchable kill to prevent
971 * any accesses to the poisoned memory. 972 * any accesses to the poisoned memory.
972 */ 973 */
973 kill_procs(&tokill, !!PageDirty(ppage), trapno, 974 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
975 kill_procs(&tokill, forcekill, trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags); 976 ret != SWAP_SUCCESS, p, pfn, flags);
975 977
976 return ret; 978 return ret;
@@ -1388,23 +1390,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1388 */ 1390 */
1389 if (!get_page_unless_zero(compound_head(p))) { 1391 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) { 1392 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn); 1393 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1394 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) { 1395 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn); 1396 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1395 /* Set hwpoison bit while page is still isolated */ 1397 /* Set hwpoison bit while page is still isolated */
1396 SetPageHWPoison(p); 1398 SetPageHWPoison(p);
1397 ret = 0; 1399 ret = 0;
1398 } else { 1400 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1401 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags); 1402 __func__, pfn, p->flags);
1401 ret = -EIO; 1403 ret = -EIO;
1402 } 1404 }
1403 } else { 1405 } else {
1404 /* Not a free page */ 1406 /* Not a free page */
1405 ret = 1; 1407 ret = 1;
1406 } 1408 }
1407 unset_migratetype_isolate(p); 1409 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1408 unlock_memory_hotplug(); 1410 unlock_memory_hotplug();
1409 return ret; 1411 return ret;
1410} 1412}
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1414 int ret; 1416 int ret;
1415 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418 1419
1419 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0) 1421 if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1429 } 1430 }
1430 1431
1431 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1432 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1433 list_add(&hpage->lru, &pagelist); 1434 MIGRATE_SYNC);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1435 put_page(hpage);
1435 true);
1436 if (ret) { 1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret; 1439 return ret;
1446 } 1440 }
1447done: 1441done:
1448 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1452 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
1561 page_is_file_cache(page)); 1556 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist); 1557 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1558 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC); 1559 false, MIGRATE_SYNC);
1565 if (ret) { 1560 if (ret) {
1566 putback_lru_pages(&pagelist); 1561 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1562 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..57361708d1a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1225 next = pmd_addr_end(addr, end); 1234 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) { 1235 if (pmd_trans_huge(*pmd)) {
1227 if (next - addr != HPAGE_PMD_SIZE) { 1236 if (next - addr != HPAGE_PMD_SIZE) {
1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1237#ifdef CONFIG_DEBUG_VM
1238 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1239 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1240 __func__, addr, end,
1241 vma->vm_start,
1242 vma->vm_end);
1243 BUG();
1244 }
1245#endif
1229 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma->vm_mm, pmd);
1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1231 goto next; 1248 goto next;
@@ -1295,7 +1312,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
1295 1312
1296static void unmap_single_vma(struct mmu_gather *tlb, 1313static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr, 1314 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted, 1315 unsigned long end_addr,
1299 struct zap_details *details) 1316 struct zap_details *details)
1300{ 1317{
1301 unsigned long start = max(vma->vm_start, start_addr); 1318 unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,8 +1324,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1307 if (end <= vma->vm_start) 1324 if (end <= vma->vm_start)
1308 return; 1325 return;
1309 1326
1310 if (vma->vm_flags & VM_ACCOUNT) 1327 if (vma->vm_file)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT; 1328 uprobe_munmap(vma, start, end);
1312 1329
1313 if (unlikely(is_pfn_mapping(vma))) 1330 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0); 1331 untrack_pfn_vma(vma, 0, 0);
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1326 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1328 */ 1345 */
1329 if (vma->vm_file) 1346 if (vma->vm_file) {
1330 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1331 } else 1351 } else
1332 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1333 } 1353 }
@@ -1339,8 +1359,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1339 * @vma: the starting vma 1359 * @vma: the starting vma
1340 * @start_addr: virtual address at which to start unmapping 1360 * @start_addr: virtual address at which to start unmapping
1341 * @end_addr: virtual address at which to end unmapping 1361 * @end_addr: virtual address at which to end unmapping
1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1343 * @details: details of nonlinear truncation or shared cache invalidation
1344 * 1362 *
1345 * Unmap all pages in the vma list. 1363 * Unmap all pages in the vma list.
1346 * 1364 *
@@ -1355,40 +1373,40 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1355 */ 1373 */
1356void unmap_vmas(struct mmu_gather *tlb, 1374void unmap_vmas(struct mmu_gather *tlb,
1357 struct vm_area_struct *vma, unsigned long start_addr, 1375 struct vm_area_struct *vma, unsigned long start_addr,
1358 unsigned long end_addr, unsigned long *nr_accounted, 1376 unsigned long end_addr)
1359 struct zap_details *details)
1360{ 1377{
1361 struct mm_struct *mm = vma->vm_mm; 1378 struct mm_struct *mm = vma->vm_mm;
1362 1379
1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1380 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1381 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, 1382 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1366 details);
1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1383 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1368} 1384}
1369 1385
1370/** 1386/**
1371 * zap_page_range - remove user pages in a given range 1387 * zap_page_range - remove user pages in a given range
1372 * @vma: vm_area_struct holding the applicable pages 1388 * @vma: vm_area_struct holding the applicable pages
1373 * @address: starting address of pages to zap 1389 * @start: starting address of pages to zap
1374 * @size: number of bytes to zap 1390 * @size: number of bytes to zap
1375 * @details: details of nonlinear truncation or shared cache invalidation 1391 * @details: details of nonlinear truncation or shared cache invalidation
1376 * 1392 *
1377 * Caller must protect the VMA list 1393 * Caller must protect the VMA list
1378 */ 1394 */
1379void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1395void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1380 unsigned long size, struct zap_details *details) 1396 unsigned long size, struct zap_details *details)
1381{ 1397{
1382 struct mm_struct *mm = vma->vm_mm; 1398 struct mm_struct *mm = vma->vm_mm;
1383 struct mmu_gather tlb; 1399 struct mmu_gather tlb;
1384 unsigned long end = address + size; 1400 unsigned long end = start + size;
1385 unsigned long nr_accounted = 0;
1386 1401
1387 lru_add_drain(); 1402 lru_add_drain();
1388 tlb_gather_mmu(&tlb, mm, 0); 1403 tlb_gather_mmu(&tlb, mm, 0);
1389 update_hiwater_rss(mm); 1404 update_hiwater_rss(mm);
1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1405 mmu_notifier_invalidate_range_start(mm, start, end);
1391 tlb_finish_mmu(&tlb, address, end); 1406 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1407 unmap_single_vma(&tlb, vma, start, end, details);
1408 mmu_notifier_invalidate_range_end(mm, start, end);
1409 tlb_finish_mmu(&tlb, start, end);
1392} 1410}
1393 1411
1394/** 1412/**
@@ -1406,13 +1424,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
1406 struct mm_struct *mm = vma->vm_mm; 1424 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb; 1425 struct mmu_gather tlb;
1408 unsigned long end = address + size; 1426 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410 1427
1411 lru_add_drain(); 1428 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0); 1429 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm); 1430 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end); 1431 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); 1432 unmap_single_vma(&tlb, vma, address, end, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end); 1433 mmu_notifier_invalidate_range_end(mm, address, end);
1417 tlb_finish_mmu(&tlb, address, end); 1434 tlb_finish_mmu(&tlb, address, end);
1418} 1435}
@@ -2633,6 +2650,9 @@ reuse:
2633 if (!page_mkwrite) { 2650 if (!page_mkwrite) {
2634 wait_on_page_locked(dirty_page); 2651 wait_on_page_locked(dirty_page);
2635 set_page_dirty_balance(dirty_page, page_mkwrite); 2652 set_page_dirty_balance(dirty_page, page_mkwrite);
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2636 } 2656 }
2637 put_page(dirty_page); 2657 put_page(dirty_page);
2638 if (page_mkwrite) { 2658 if (page_mkwrite) {
@@ -2650,10 +2670,6 @@ reuse:
2650 } 2670 }
2651 } 2671 }
2652 2672
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2656
2657 return ret; 2673 return ret;
2658 } 2674 }
2659 2675
@@ -2911,7 +2927,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2911 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2927 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2912 page = lookup_swap_cache(entry); 2928 page = lookup_swap_cache(entry);
2913 if (!page) { 2929 if (!page) {
2914 grab_swap_token(mm); /* Contend for token _before_ read-in */
2915 page = swapin_readahead(entry, 2930 page = swapin_readahead(entry,
2916 GFP_HIGHUSER_MOVABLE, vma, address); 2931 GFP_HIGHUSER_MOVABLE, vma, address);
2917 if (!page) { 2932 if (!page) {
@@ -2941,6 +2956,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2941 } 2956 }
2942 2957
2943 locked = lock_page_or_retry(page, mm, flags); 2958 locked = lock_page_or_retry(page, mm, flags);
2959
2944 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2960 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2945 if (!locked) { 2961 if (!locked) {
2946 ret |= VM_FAULT_RETRY; 2962 ret |= VM_FAULT_RETRY;
@@ -3322,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3322 3338
3323 if (dirty_page) { 3339 if (dirty_page) {
3324 struct address_space *mapping = page->mapping; 3340 struct address_space *mapping = page->mapping;
3341 int dirtied = 0;
3325 3342
3326 if (set_page_dirty(dirty_page)) 3343 if (set_page_dirty(dirty_page))
3327 page_mkwrite = 1; 3344 dirtied = 1;
3328 unlock_page(dirty_page); 3345 unlock_page(dirty_page);
3329 put_page(dirty_page); 3346 put_page(dirty_page);
3330 if (page_mkwrite && mapping) { 3347 if ((dirtied || page_mkwrite) && mapping) {
3331 /* 3348 /*
3332 * Some device drivers do not set page.mapping but still 3349 * Some device drivers do not set page.mapping but still
3333 * dirty their pages 3350 * dirty their pages
@@ -3336,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3336 } 3353 }
3337 3354
3338 /* file_update_time outside page_lock */ 3355 /* file_update_time outside page_lock */
3339 if (vma->vm_file) 3356 if (vma->vm_file && !page_mkwrite)
3340 file_update_time(vma->vm_file); 3357 file_update_time(vma->vm_file);
3341 } else { 3358 } else {
3342 unlock_page(vmf.page); 3359 unlock_page(vmf.page);
@@ -3489,6 +3506,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3489 if (unlikely(is_vm_hugetlb_page(vma))) 3506 if (unlikely(is_vm_hugetlb_page(vma)))
3490 return hugetlb_fault(mm, vma, address, flags); 3507 return hugetlb_fault(mm, vma, address, flags);
3491 3508
3509retry:
3492 pgd = pgd_offset(mm, address); 3510 pgd = pgd_offset(mm, address);
3493 pud = pud_alloc(mm, pgd, address); 3511 pud = pud_alloc(mm, pgd, address);
3494 if (!pud) 3512 if (!pud)
@@ -3502,13 +3520,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3502 pmd, flags); 3520 pmd, flags);
3503 } else { 3521 } else {
3504 pmd_t orig_pmd = *pmd; 3522 pmd_t orig_pmd = *pmd;
3523 int ret;
3524
3505 barrier(); 3525 barrier();
3506 if (pmd_trans_huge(orig_pmd)) { 3526 if (pmd_trans_huge(orig_pmd)) {
3507 if (flags & FAULT_FLAG_WRITE && 3527 if (flags & FAULT_FLAG_WRITE &&
3508 !pmd_write(orig_pmd) && 3528 !pmd_write(orig_pmd) &&
3509 !pmd_trans_splitting(orig_pmd)) 3529 !pmd_trans_splitting(orig_pmd)) {
3510 return do_huge_pmd_wp_page(mm, vma, address, 3530 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3511 pmd, orig_pmd); 3531 orig_pmd);
3532 /*
3533 * If COW results in an oom, the huge pmd will
3534 * have been split, so retry the fault on the
3535 * pte for a smaller charge.
3536 */
3537 if (unlikely(ret & VM_FAULT_OOM))
3538 goto retry;
3539 return ret;
3540 }
3512 return 0; 3541 return 0;
3513 } 3542 }
3514 } 3543 }
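handle_mm_fault() gains a retry: label before the page-table walk: if the huge-pmd copy-on-write path returns VM_FAULT_OOM, the huge pmd has already been split on the failure path, so the fault is retried and falls through to the normal pte path with a smaller charge. A stripped-down sketch of that control flow, with placeholder handlers and return codes:

    #include <stdio.h>

    enum { FAULT_OK, FAULT_OOM };       /* stand-ins for 0 / VM_FAULT_OOM */

    /* Pretend the huge-page COW handler hits OOM and splits the mapping. */
    static int huge_cow(int *is_huge)
    {
        *is_huge = 0;                   /* the huge mapping has been split */
        return FAULT_OOM;
    }

    static int pte_fault(void)
    {
        return FAULT_OK;                /* normal, smaller-charge path */
    }

    static int handle_fault(void)
    {
        int is_huge = 1;
        int ret;

    retry:
        if (is_huge) {
            ret = huge_cow(&is_huge);
            if (ret == FAULT_OOM)
                goto retry;             /* now split: retry at pte granularity */
            return ret;
        }
        return pte_fault();
    }

    int main(void)
    {
        printf("fault result: %d\n", handle_fault());
        return 0;
    }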
@@ -3912,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3912 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3913 } 3942 }
3914 } 3943 }
3915 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3916} 3945}
3917 3946
3918#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6629fafd6ce4..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
74 res->end = start + size - 1; 74 res->end = start + size - 1;
75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76 if (request_resource(&iomem_resource, res) < 0) { 76 if (request_resource(&iomem_resource, res) < 0) {
77 printk("System RAM resource %llx - %llx cannot be added\n", 77 printk("System RAM resource %pR cannot be added\n", res);
78 (unsigned long long)res->start, (unsigned long long)res->end);
79 kfree(res); 78 kfree(res);
80 res = NULL; 79 res = NULL;
81 } 80 }
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
502 online_pages_range); 501 online_pages_range);
503 if (ret) { 502 if (ret) {
504 mutex_unlock(&zonelists_mutex); 503 mutex_unlock(&zonelists_mutex);
505 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 504 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
506 nr_pages, pfn); 505 (unsigned long long) pfn << PAGE_SHIFT,
506 (((unsigned long long) pfn + nr_pages)
507 << PAGE_SHIFT) - 1);
507 memory_notify(MEM_CANCEL_ONLINE, &arg); 508 memory_notify(MEM_CANCEL_ONLINE, &arg);
508 unlock_memory_hotplug(); 509 unlock_memory_hotplug();
509 return ret; 510 return ret;
@@ -511,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
511 512
512 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
513 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
514 if (need_zonelists_rebuild) 515 if (onlined_pages) {
515 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
516 else 517 if (need_zonelists_rebuild)
517 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
518 522
519 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
520 524
521 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
522 526
523 if (onlined_pages) { 527 if (onlined_pages)
524 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
525 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
526 }
527 529
528 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
529 531
@@ -561,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
561 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
562 */ 564 */
563 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
564 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
565 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
566 568
567 return pgdat; 569 return pgdat;
@@ -617,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
617 pgdat = hotadd_new_pgdat(nid, start); 619 pgdat = hotadd_new_pgdat(nid, start);
618 ret = -ENOMEM; 620 ret = -ENOMEM;
619 if (!pgdat) 621 if (!pgdat)
620 goto out; 622 goto error;
621 new_pgdat = 1; 623 new_pgdat = 1;
622 } 624 }
623 625
@@ -891,7 +893,7 @@ static int __ref offline_pages(unsigned long start_pfn,
891 nr_pages = end_pfn - start_pfn; 893 nr_pages = end_pfn - start_pfn;
892 894
893 /* set above range as isolated */ 895 /* set above range as isolated */
894 ret = start_isolate_page_range(start_pfn, end_pfn); 896 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
895 if (ret) 897 if (ret)
896 goto out; 898 goto out;
897 899
@@ -956,7 +958,7 @@ repeat:
956 We cannot do rollback at this point. */ 958 We cannot do rollback at this point. */
957 offline_isolated_pages(start_pfn, end_pfn); 959 offline_isolated_pages(start_pfn, end_pfn);
958 /* reset pagetype flags and makes migrate type to be MOVABLE */ 960 /* reset pagetype flags and makes migrate type to be MOVABLE */
959 undo_isolate_page_range(start_pfn, end_pfn); 961 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
960 /* removal success */ 962 /* removal success */
961 zone->present_pages -= offlined_pages; 963 zone->present_pages -= offlined_pages;
962 zone->zone_pgdat->node_present_pages -= offlined_pages; 964 zone->zone_pgdat->node_present_pages -= offlined_pages;
@@ -964,6 +966,9 @@ repeat:
964 966
965 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
966 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
967 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
968 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
969 kswapd_stop(node); 974 kswapd_stop(node);
@@ -977,11 +982,12 @@ repeat:
977 return 0; 982 return 0;
978 983
979failed_removal: 984failed_removal:
980 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 985 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
981 start_pfn, end_pfn); 986 (unsigned long long) start_pfn << PAGE_SHIFT,
987 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
982 memory_notify(MEM_CANCEL_OFFLINE, &arg); 988 memory_notify(MEM_CANCEL_OFFLINE, &arg);
983 /* pushback to free area */ 989 /* pushback to free area */
984 undo_isolate_page_range(start_pfn, end_pfn); 990 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
985 991
986out: 992out:
987 unlock_memory_hotplug(); 993 unlock_memory_hotplug();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cfb6c8678754..bd92431d4c49 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
390{ 390{
391 if (!pol) 391 if (!pol)
392 return; 392 return;
393 if (!mpol_store_user_nodemask(pol) && step == 0 && 393 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 return; 395 return;
396 396
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/* Apply policy to a single VMA */
611static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
612{
613 int err = 0;
614 struct mempolicy *old = vma->vm_policy;
615
616 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
617 vma->vm_start, vma->vm_end, vma->vm_pgoff,
618 vma->vm_ops, vma->vm_file,
619 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
620
621 if (vma->vm_ops && vma->vm_ops->set_policy)
622 err = vma->vm_ops->set_policy(vma, new);
623 if (!err) {
624 mpol_get(new);
625 vma->vm_policy = new;
626 mpol_put(old);
627 }
628 return err;
629}
630
631/* Step 2: apply policy to a range and do splits. */ 610/* Step 2: apply policy to a range and do splits. */
632static int mbind_range(struct mm_struct *mm, unsigned long start, 611static int mbind_range(struct mm_struct *mm, unsigned long start,
633 unsigned long end, struct mempolicy *new_pol) 612 unsigned long end, struct mempolicy *new_pol)
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
676 if (err) 655 if (err)
677 goto out; 656 goto out;
678 } 657 }
679 err = policy_vma(vma, new_pol); 658
680 if (err) 659 /*
681 goto out; 660 * Apply policy to a single VMA. The reference counting of
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
682 } 675 }
683 676
684 out: 677 out:
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
957 * 950 *
958 * Returns the number of page that could not be moved. 951 * Returns the number of page that could not be moved.
959 */ 952 */
960int do_migrate_pages(struct mm_struct *mm, 953int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
961 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 954 const nodemask_t *to, int flags)
962{ 955{
963 int busy = 0; 956 int busy = 0;
964 int err; 957 int err;
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
970 963
971 down_read(&mm->mmap_sem); 964 down_read(&mm->mmap_sem);
972 965
973 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 966 err = migrate_vmas(mm, from, to, flags);
974 if (err) 967 if (err)
975 goto out; 968 goto out;
976 969
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
1005 * moved to an empty node, then there is nothing left worth migrating. 998 * moved to an empty node, then there is nothing left worth migrating.
1006 */ 999 */
1007 1000
1008 tmp = *from_nodes; 1001 tmp = *from;
1009 while (!nodes_empty(tmp)) { 1002 while (!nodes_empty(tmp)) {
1010 int s,d; 1003 int s,d;
1011 int source = -1; 1004 int source = -1;
1012 int dest = 0; 1005 int dest = 0;
1013 1006
1014 for_each_node_mask(s, tmp) { 1007 for_each_node_mask(s, tmp) {
1015 d = node_remap(s, *from_nodes, *to_nodes); 1008
1009 /*
1010 * do_migrate_pages() tries to maintain the relative
1011 * node relationship of the pages established between
1012 * threads and memory areas.
1013 *
1014 * However if the number of source nodes is not equal to
1015 * the number of destination nodes we can not preserve
1016 * this node relative relationship. In that case, skip
1017 * copying memory from a node that is in the destination
1018 * mask.
1019 *
1020 * Example: [2,3,4] -> [3,4,5] moves everything.
1021 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1022 */
1023
1024 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1025 (node_isset(s, *to)))
1026 continue;
1027
1028 d = node_remap(s, *from, *to);
1016 if (s == d) 1029 if (s == d)
1017 continue; 1030 continue;
1018 1031
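The do_migrate_pages() hunk documents, and then implements, the rule that when the source and destination nodemasks have different weights the relative node relationship cannot be preserved, so any source node that is also in the destination mask is skipped (the comment's own example: [0-7] -> [3,4,5] moves only 0,1,2,6,7). A small standalone sketch of that selection using plain bitmasks instead of nodemask_t:

    #include <stdio.h>

    static int weight(unsigned long mask)
    {
        int w = 0;

        for (; mask; mask >>= 1)
            w += mask & 1;
        return w;
    }

    int main(void)
    {
        unsigned long from = 0xff;                           /* nodes 0-7 */
        unsigned long to   = (1UL << 3) | (1UL << 4) | (1UL << 5);
        int s;

        for (s = 0; s < 8; s++) {
            if (!(from & (1UL << s)))
                continue;
            /* Unequal weights: don't drain a node that is also a target. */
            if (weight(from) != weight(to) && (to & (1UL << s)))
                continue;
            printf("would migrate from node %d\n", s);       /* 0,1,2,6,7 */
        }
        return 0;
    }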
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
1072{ 1085{
1073} 1086}
1074 1087
1075int do_migrate_pages(struct mm_struct *mm, 1088int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1076 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 1089 const nodemask_t *to, int flags)
1077{ 1090{
1078 return -ENOSYS; 1091 return -ENOSYS;
1079} 1092}
@@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1164 if (!list_empty(&pagelist)) { 1177 if (!list_empty(&pagelist)) {
1165 nr_failed = migrate_pages(&pagelist, new_vma_page, 1178 nr_failed = migrate_pages(&pagelist, new_vma_page,
1166 (unsigned long)vma, 1179 (unsigned long)vma,
1167 false, true); 1180 false, MIGRATE_SYNC);
1168 if (nr_failed) 1181 if (nr_failed)
1169 putback_lru_pages(&pagelist); 1182 putback_lru_pages(&pagelist);
1170 } 1183 }
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1334 * userid as the target process. 1347 * userid as the target process.
1335 */ 1348 */
1336 tcred = __task_cred(task); 1349 tcred = __task_cred(task);
1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1350 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1338 cred->uid != tcred->suid && cred->uid != tcred->uid && 1351 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1339 !capable(CAP_SYS_NICE)) { 1352 !capable(CAP_SYS_NICE)) {
1340 rcu_read_unlock(); 1353 rcu_read_unlock();
1341 err = -EPERM; 1354 err = -EPERM;
@@ -1361,11 +1374,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1361 1374
1362 mm = get_task_mm(task); 1375 mm = get_task_mm(task);
1363 put_task_struct(task); 1376 put_task_struct(task);
1364 if (mm) 1377
1365 err = do_migrate_pages(mm, old, new, 1378 if (!mm) {
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL; 1379 err = -EINVAL;
1380 goto out;
1381 }
1382
1383 err = do_migrate_pages(mm, old, new,
1384 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1369 1385
1370 mmput(mm); 1386 mmput(mm);
1371out: 1387out:
@@ -1586,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1586 * task can change it's policy. The system default policy requires no 1602 * task can change it's policy. The system default policy requires no
1587 * such protection. 1603 * such protection.
1588 */ 1604 */
1589unsigned slab_node(struct mempolicy *policy) 1605unsigned slab_node(void)
1590{ 1606{
1607 struct mempolicy *policy;
1608
1609 if (in_interrupt())
1610 return numa_node_id();
1611
1612 policy = current->mempolicy;
1591 if (!policy || policy->flags & MPOL_F_LOCAL) 1613 if (!policy || policy->flags & MPOL_F_LOCAL)
1592 return numa_node_id(); 1614 return numa_node_id();
1593 1615
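
The do_migrate_pages() hunk above pairs each source node with a destination via node_remap() and, when the two masks differ in weight, skips sources that already sit in the destination mask. Below is a minimal user-space sketch of that pairing rule; plain unsigned bitmasks stand in for nodemask_t, and weight()/remap() are illustrative helpers, not kernel API.

/* sketch only: mimics the skip rule and the node_remap() pairing */
#include <stdio.h>

#define MAX_NODES 16

static int weight(unsigned int mask)
{
        int w = 0;
        for (int n = 0; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        w++;
        return w;
}

/* map the nth set bit of 'from' onto the (n mod weight(to))-th set bit of 'to' */
static int remap(int s, unsigned int from, unsigned int to)
{
        int idx = 0, i;

        for (i = 0; i < s; i++)
                if (from & (1u << i))
                        idx++;
        idx %= weight(to);
        for (i = 0; i < MAX_NODES; i++) {
                if (!(to & (1u << i)))
                        continue;
                if (idx-- == 0)
                        return i;
        }
        return s;
}

int main(void)
{
        unsigned int from = 0xff;                               /* nodes 0-7 */
        unsigned int to   = (1u << 3) | (1u << 4) | (1u << 5);  /* nodes 3-5 */

        for (int s = 0; s < MAX_NODES; s++) {
                if (!(from & (1u << s)))
                        continue;
                /* the new rule: with unequal weights, skip sources already in 'to' */
                if (weight(from) != weight(to) && (to & (1u << s)))
                        continue;
                printf("move node %d -> node %d\n", s, remap(s, from, to));
        }
        return 0;
}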
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f352..54990476c049 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
64 mempool_free_t *free_fn, void *pool_data) 64 mempool_free_t *free_fn, void *pool_data)
65{ 65{
66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
67 GFP_KERNEL, NUMA_NO_NODE);
67} 68}
68EXPORT_SYMBOL(mempool_create); 69EXPORT_SYMBOL(mempool_create);
69 70
70mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
71 mempool_free_t *free_fn, void *pool_data, int node_id) 72 mempool_free_t *free_fn, void *pool_data,
73 gfp_t gfp_mask, int node_id)
72{ 74{
73 mempool_t *pool; 75 mempool_t *pool;
74 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
75 if (!pool) 77 if (!pool)
76 return NULL; 78 return NULL;
77 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
78 GFP_KERNEL, node_id); 80 gfp_mask, node_id);
79 if (!pool->elements) { 81 if (!pool->elements) {
80 kfree(pool); 82 kfree(pool);
81 return NULL; 83 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
93 while (pool->curr_nr < pool->min_nr) { 95 while (pool->curr_nr < pool->min_nr) {
94 void *element; 96 void *element;
95 97
96 element = pool->alloc(GFP_KERNEL, pool->pool_data); 98 element = pool->alloc(gfp_mask, pool->pool_data);
97 if (unlikely(!element)) { 99 if (unlikely(!element)) {
98 mempool_destroy(pool); 100 mempool_destroy(pool);
99 return NULL; 101 return NULL;
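
mempool_create_node() now threads a caller-supplied gfp_mask through every allocation made while the pool is built. The user-space sketch below shows that shape only: an opaque ctx argument stands in for gfp_mask, libc malloc/free back the pool, and all type and function names are illustrative.

#include <stdlib.h>
#include <stdio.h>

typedef void *(*alloc_fn_t)(void *ctx, void *pool_data);
typedef void (*free_fn_t)(void *element, void *pool_data);

struct pool {
        int min_nr, curr_nr;
        void **elements;
        void *pool_data;
        alloc_fn_t alloc;
        free_fn_t free;
};

static void pool_destroy(struct pool *p)
{
        while (p->curr_nr > 0)
                p->free(p->elements[--p->curr_nr], p->pool_data);
        free(p->elements);
        free(p);
}

static struct pool *pool_create_node(int min_nr, alloc_fn_t alloc_fn,
                                     free_fn_t free_fn, void *pool_data,
                                     void *ctx)
{
        struct pool *p = calloc(1, sizeof(*p));

        if (!p)
                return NULL;
        p->elements = calloc(min_nr, sizeof(void *));
        if (!p->elements) {
                free(p);
                return NULL;
        }
        p->min_nr = min_nr;
        p->pool_data = pool_data;
        p->alloc = alloc_fn;
        p->free = free_fn;

        /* pre-fill with min_nr elements, every allocation using the same ctx */
        while (p->curr_nr < p->min_nr) {
                void *element = p->alloc(ctx, p->pool_data);

                if (!element) {
                        pool_destroy(p);
                        return NULL;
                }
                p->elements[p->curr_nr++] = element;
        }
        return p;
}

static void *demo_alloc(void *ctx, void *pool_data) { (void)ctx; (void)pool_data; return malloc(64); }
static void demo_free(void *element, void *pool_data) { (void)pool_data; free(element); }

int main(void)
{
        struct pool *p = pool_create_node(4, demo_alloc, demo_free, NULL, NULL);

        printf("pool %s, %d elements\n", p ? "created" : "failed", p ? p->curr_nr : 0);
        if (p)
                pool_destroy(p);
        return 0;
}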
diff --git a/mm/migrate.c b/mm/migrate.c
index 51c08a0c6f68..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -436,7 +437,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
436 * is actually a signal that all of the page has become dirty. 437 * is actually a signal that all of the page has become dirty.
437 * Whereas only part of our page may be dirty. 438 * Whereas only part of our page may be dirty.
438 */ 439 */
439 __set_page_dirty_nobuffers(newpage); 440 if (PageSwapBacked(page))
441 SetPageDirty(newpage);
442 else
443 __set_page_dirty_nobuffers(newpage);
440 } 444 }
441 445
442 mlock_migrate_page(newpage, page); 446 mlock_migrate_page(newpage, page);
@@ -679,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
679{ 683{
680 int rc = -EAGAIN; 684 int rc = -EAGAIN;
681 int remap_swapcache = 1; 685 int remap_swapcache = 1;
682 int charge = 0;
683 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
684 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
685 688
@@ -721,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
721 } 724 }
722 725
723 /* charge against new page */ 726 /* charge against new page */
724 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
725 if (charge == -ENOMEM) {
726 rc = -ENOMEM;
727 goto unlock;
728 }
729 BUG_ON(charge);
730 728
731 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
732 /* 730 /*
@@ -816,8 +814,7 @@ skip_unmap:
816 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
817 815
818uncharge: 816uncharge:
819 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
820 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
821unlock: 818unlock:
822 unlock_page(page); 819 unlock_page(page);
823out: 820out:
@@ -928,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
928 925
929 if (anon_vma) 926 if (anon_vma)
930 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
931 unlock_page(hpage);
932 928
933out: 929 if (!rc)
934 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
935 list_del(&hpage->lru);
936 put_page(hpage);
937 }
938 931
932 unlock_page(hpage);
933out:
939 put_page(new_hpage); 934 put_page(new_hpage);
940
941 if (result) { 935 if (result) {
942 if (rc) 936 if (rc)
943 *result = rc; 937 *result = rc;
@@ -1013,48 +1007,32 @@ out:
1013 return nr_failed + retry; 1007 return nr_failed + retry;
1014} 1008}
1015 1009
1016int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1017 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1018 enum migrate_mode mode) 1012 enum migrate_mode mode)
1019{ 1013{
1020 int retry = 1; 1014 int pass, rc;
1021 int nr_failed = 0; 1015
1022 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1023 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1024 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1025 int rc; 1019 mode);
1026 1020 switch (rc) {
1027 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1028 retry = 0; 1022 goto out;
1029 1023 case -EAGAIN:
1030 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1031 cond_resched(); 1025 cond_resched();
1032 1026 break;
1033 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1034 private, page, pass > 2, offlining, 1028 goto out;
1035 mode); 1029 default:
1036 1030 rc = -EIO;
1037 switch(rc) { 1031 goto out;
1038 case -ENOMEM:
1039 goto out;
1040 case -EAGAIN:
1041 retry++;
1042 break;
1043 case 0:
1044 break;
1045 default:
1046 /* Permanent failure */
1047 nr_failed++;
1048 break;
1049 }
1050 } 1032 }
1051 } 1033 }
1052 rc = 0;
1053out: 1034out:
1054 if (rc) 1035 return rc;
1055 return rc;
1056
1057 return nr_failed + retry;
1058} 1036}
1059 1037
1060#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
@@ -1371,8 +1349,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1371 * userid as the target process. 1349 * userid as the target process.
1372 */ 1350 */
1373 tcred = __task_cred(task); 1351 tcred = __task_cred(task);
1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1352 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1375 cred->uid != tcred->suid && cred->uid != tcred->uid && 1353 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1376 !capable(CAP_SYS_NICE)) { 1354 !capable(CAP_SYS_NICE)) {
1377 rcu_read_unlock(); 1355 rcu_read_unlock();
1378 err = -EPERM; 1356 err = -EPERM;
@@ -1388,14 +1366,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1388 mm = get_task_mm(task); 1366 mm = get_task_mm(task);
1389 put_task_struct(task); 1367 put_task_struct(task);
1390 1368
1391 if (mm) { 1369 if (!mm)
1392 if (nodes) 1370 return -EINVAL;
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages, 1371
1394 nodes, status, flags); 1372 if (nodes)
1395 else 1373 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1396 err = do_pages_stat(mm, nr_pages, pages, status); 1374 nodes, status, flags);
1397 } else 1375 else
1398 err = -EINVAL; 1376 err = do_pages_stat(mm, nr_pages, pages, status);
1399 1377
1400 mmput(mm); 1378 mmput(mm);
1401 return err; 1379 return err;
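
migrate_huge_pages() is replaced above by migrate_huge_page(), which retries a single hugepage for up to ten passes, backing off only on -EAGAIN and folding any other failure into -EIO. A user-space sketch of that retry shape follows; try_once() stands in for unmap_and_move_huge_page() and sched_yield() for cond_resched().

#include <errno.h>
#include <sched.h>
#include <stdio.h>

static int try_once(int pass)
{
        /* pretend the first two passes hit transient contention */
        return pass < 2 ? -EAGAIN : 0;
}

static int migrate_one(void)
{
        int pass, rc = -EAGAIN;

        for (pass = 0; pass < 10; pass++) {
                rc = try_once(pass);
                switch (rc) {
                case -ENOMEM:
                        return rc;              /* hard failure, give up */
                case -EAGAIN:
                        sched_yield();          /* cond_resched() stand-in */
                        break;                  /* try again */
                case 0:
                        return 0;               /* done */
                default:
                        return -EIO;            /* permanent failure */
                }
        }
        return rc;                              /* still -EAGAIN after ten passes */
}

int main(void)
{
        printf("migrate_one() -> %d\n", migrate_one());
        return 0;
}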
diff --git a/mm/mmap.c b/mm/mmap.c
index a7bf6a31c9f6..ae18a48e7e4e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h>
33 34
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
@@ -240,6 +241,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
240 return next; 241 return next;
241} 242}
242 243
244static unsigned long do_brk(unsigned long addr, unsigned long len);
245
243SYSCALL_DEFINE1(brk, unsigned long, brk) 246SYSCALL_DEFINE1(brk, unsigned long, brk)
244{ 247{
245 unsigned long rlim, retval; 248 unsigned long rlim, retval;
@@ -544,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end);
544 547
545 if (file) { 548 if (file) {
546 mapping = file->f_mapping; 549 mapping = file->f_mapping;
547 if (!(vma->vm_flags & VM_NONLINEAR)) 550 if (!(vma->vm_flags & VM_NONLINEAR)) {
548 root = &mapping->i_mmap; 551 root = &mapping->i_mmap;
552 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
553
554 if (adjust_next)
555 uprobe_munmap(next, next->vm_start,
556 next->vm_end);
557 }
558
549 mutex_lock(&mapping->i_mmap_mutex); 559 mutex_lock(&mapping->i_mmap_mutex);
550 if (insert) { 560 if (insert) {
551 /* 561 /*
@@ -615,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end);
615 if (mapping) 625 if (mapping)
616 mutex_unlock(&mapping->i_mmap_mutex); 626 mutex_unlock(&mapping->i_mmap_mutex);
617 627
628 if (root) {
629 uprobe_mmap(vma);
630
631 if (adjust_next)
632 uprobe_mmap(next);
633 }
634
618 if (remove_next) { 635 if (remove_next) {
619 if (file) { 636 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end);
620 fput(file); 638 fput(file);
621 if (next->vm_flags & VM_EXECUTABLE) 639 if (next->vm_flags & VM_EXECUTABLE)
622 removed_exe_file_vma(mm); 640 removed_exe_file_vma(mm);
@@ -636,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end);
636 goto again; 654 goto again;
637 } 655 }
638 } 656 }
657 if (insert && file)
658 uprobe_mmap(insert);
639 659
640 validate_mm(mm); 660 validate_mm(mm);
641 661
@@ -923,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
923 const unsigned long stack_flags 943 const unsigned long stack_flags
924 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
925 945
946 mm->total_vm += pages;
947
926 if (file) { 948 if (file) {
927 mm->shared_vm += pages; 949 mm->shared_vm += pages;
928 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -958,8 +980,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
958 struct mm_struct * mm = current->mm; 980 struct mm_struct * mm = current->mm;
959 struct inode *inode; 981 struct inode *inode;
960 vm_flags_t vm_flags; 982 vm_flags_t vm_flags;
961 int error;
962 unsigned long reqprot = prot;
963 983
964 /* 984 /*
965 * Does the application expect PROT_READ to imply PROT_EXEC? 985 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1081,13 +1101,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1081 } 1101 }
1082 } 1102 }
1083 1103
1084 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1085 if (error)
1086 return error;
1087
1088 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1104 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1089} 1105}
1090EXPORT_SYMBOL(do_mmap_pgoff);
1091 1106
1092SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1107SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1093 unsigned long, prot, unsigned long, flags, 1108 unsigned long, prot, unsigned long, flags,
@@ -1120,10 +1135,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1120 1135
1121 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1136 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1122 1137
1123 down_write(&current->mm->mmap_sem); 1138 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1124 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1125 up_write(&current->mm->mmap_sem);
1126
1127 if (file) 1139 if (file)
1128 fput(file); 1140 fput(file);
1129out: 1141out:
@@ -1337,13 +1349,16 @@ munmap_back:
1337out: 1349out:
1338 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1339 1351
1340 mm->total_vm += len >> PAGE_SHIFT;
1341 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1342 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1343 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
1344 mm->locked_vm += (len >> PAGE_SHIFT); 1355 mm->locked_vm += (len >> PAGE_SHIFT);
1345 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1356 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1346 make_pages_present(addr, addr + len); 1357 make_pages_present(addr, addr + len);
1358
1359 if (file)
1360 uprobe_mmap(vma);
1361
1347 return addr; 1362 return addr;
1348 1363
1349unmap_and_free_vma: 1364unmap_and_free_vma:
@@ -1579,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1579 if (addr & ~PAGE_MASK) 1594 if (addr & ~PAGE_MASK)
1580 return -EINVAL; 1595 return -EINVAL;
1581 1596
1582 return arch_rebalance_pgtables(addr, len); 1597 addr = arch_rebalance_pgtables(addr, len);
1598 error = security_mmap_addr(addr);
1599 return error ? error : addr;
1583} 1600}
1584 1601
1585EXPORT_SYMBOL(get_unmapped_area); 1602EXPORT_SYMBOL(get_unmapped_area);
@@ -1589,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1589{ 1606{
1590 struct vm_area_struct *vma = NULL; 1607 struct vm_area_struct *vma = NULL;
1591 1608
1592 if (mm) { 1609 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1593 /* Check the cache first. */ 1610 return NULL;
1594 /* (Cache hit rate is typically around 35%.) */ 1611
1595 vma = mm->mmap_cache; 1612 /* Check the cache first. */
1596 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1613 /* (Cache hit rate is typically around 35%.) */
1597 struct rb_node * rb_node; 1614 vma = mm->mmap_cache;
1598 1615 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1599 rb_node = mm->mm_rb.rb_node; 1616 struct rb_node *rb_node;
1600 vma = NULL; 1617
1601 1618 rb_node = mm->mm_rb.rb_node;
1602 while (rb_node) { 1619 vma = NULL;
1603 struct vm_area_struct * vma_tmp; 1620
1604 1621 while (rb_node) {
1605 vma_tmp = rb_entry(rb_node, 1622 struct vm_area_struct *vma_tmp;
1606 struct vm_area_struct, vm_rb); 1623
1607 1624 vma_tmp = rb_entry(rb_node,
1608 if (vma_tmp->vm_end > addr) { 1625 struct vm_area_struct, vm_rb);
1609 vma = vma_tmp; 1626
1610 if (vma_tmp->vm_start <= addr) 1627 if (vma_tmp->vm_end > addr) {
1611 break; 1628 vma = vma_tmp;
1612 rb_node = rb_node->rb_left; 1629 if (vma_tmp->vm_start <= addr)
1613 } else 1630 break;
1614 rb_node = rb_node->rb_right; 1631 rb_node = rb_node->rb_left;
1615 } 1632 } else
1616 if (vma) 1633 rb_node = rb_node->rb_right;
1617 mm->mmap_cache = vma;
1618 } 1634 }
1635 if (vma)
1636 mm->mmap_cache = vma;
1619 } 1637 }
1620 return vma; 1638 return vma;
1621} 1639}
@@ -1689,7 +1707,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1689 return -ENOMEM; 1707 return -ENOMEM;
1690 1708
1691 /* Ok, everything looks good - let it rip */ 1709 /* Ok, everything looks good - let it rip */
1692 mm->total_vm += grow;
1693 if (vma->vm_flags & VM_LOCKED) 1710 if (vma->vm_flags & VM_LOCKED)
1694 mm->locked_vm += grow; 1711 mm->locked_vm += grow;
1695 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1712 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1768,7 +1785,7 @@ int expand_downwards(struct vm_area_struct *vma,
1768 return -ENOMEM; 1785 return -ENOMEM;
1769 1786
1770 address &= PAGE_MASK; 1787 address &= PAGE_MASK;
1771 error = security_file_mmap(NULL, 0, 0, 0, address, 1); 1788 error = security_mmap_addr(address);
1772 if (error) 1789 if (error)
1773 return error; 1790 return error;
1774 1791
@@ -1862,15 +1879,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1862 */ 1879 */
1863static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1880static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1864{ 1881{
1882 unsigned long nr_accounted = 0;
1883
1865 /* Update high watermark before we lower total_vm */ 1884 /* Update high watermark before we lower total_vm */
1866 update_hiwater_vm(mm); 1885 update_hiwater_vm(mm);
1867 do { 1886 do {
1868 long nrpages = vma_pages(vma); 1887 long nrpages = vma_pages(vma);
1869 1888
1870 mm->total_vm -= nrpages; 1889 if (vma->vm_flags & VM_ACCOUNT)
1890 nr_accounted += nrpages;
1871 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1891 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1872 vma = remove_vma(vma); 1892 vma = remove_vma(vma);
1873 } while (vma); 1893 } while (vma);
1894 vm_unacct_memory(nr_accounted);
1874 validate_mm(mm); 1895 validate_mm(mm);
1875} 1896}
1876 1897
@@ -1885,13 +1906,11 @@ static void unmap_region(struct mm_struct *mm,
1885{ 1906{
1886 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1907 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1887 struct mmu_gather tlb; 1908 struct mmu_gather tlb;
1888 unsigned long nr_accounted = 0;
1889 1909
1890 lru_add_drain(); 1910 lru_add_drain();
1891 tlb_gather_mmu(&tlb, mm, 0); 1911 tlb_gather_mmu(&tlb, mm, 0);
1892 update_hiwater_rss(mm); 1912 update_hiwater_rss(mm);
1893 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1913 unmap_vmas(&tlb, vma, start, end);
1894 vm_unacct_memory(nr_accounted);
1895 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 1914 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1896 next ? next->vm_start : 0); 1915 next ? next->vm_start : 0);
1897 tlb_finish_mmu(&tlb, start, end); 1916 tlb_finish_mmu(&tlb, start, end);
@@ -2106,20 +2125,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2106 return 0; 2125 return 0;
2107} 2126}
2108 2127
2109EXPORT_SYMBOL(do_munmap); 2128int vm_munmap(unsigned long start, size_t len)
2110
2111SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2112{ 2129{
2113 int ret; 2130 int ret;
2114 struct mm_struct *mm = current->mm; 2131 struct mm_struct *mm = current->mm;
2115 2132
2116 profile_munmap(addr);
2117
2118 down_write(&mm->mmap_sem); 2133 down_write(&mm->mmap_sem);
2119 ret = do_munmap(mm, addr, len); 2134 ret = do_munmap(mm, start, len);
2120 up_write(&mm->mmap_sem); 2135 up_write(&mm->mmap_sem);
2121 return ret; 2136 return ret;
2122} 2137}
2138EXPORT_SYMBOL(vm_munmap);
2139
2140SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2141{
2142 profile_munmap(addr);
2143 return vm_munmap(addr, len);
2144}
2123 2145
2124static inline void verify_mm_writelocked(struct mm_struct *mm) 2146static inline void verify_mm_writelocked(struct mm_struct *mm)
2125{ 2147{
@@ -2136,7 +2158,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2136 * anonymous maps. eventually we may be able to do some 2158 * anonymous maps. eventually we may be able to do some
2137 * brk-specific accounting here. 2159 * brk-specific accounting here.
2138 */ 2160 */
2139unsigned long do_brk(unsigned long addr, unsigned long len) 2161static unsigned long do_brk(unsigned long addr, unsigned long len)
2140{ 2162{
2141 struct mm_struct * mm = current->mm; 2163 struct mm_struct * mm = current->mm;
2142 struct vm_area_struct * vma, * prev; 2164 struct vm_area_struct * vma, * prev;
@@ -2149,10 +2171,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2149 if (!len) 2171 if (!len)
2150 return addr; 2172 return addr;
2151 2173
2152 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2153 if (error)
2154 return error;
2155
2156 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2174 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2157 2175
2158 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2176 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2232,7 +2250,17 @@ out:
2232 return addr; 2250 return addr;
2233} 2251}
2234 2252
2235EXPORT_SYMBOL(do_brk); 2253unsigned long vm_brk(unsigned long addr, unsigned long len)
2254{
2255 struct mm_struct *mm = current->mm;
2256 unsigned long ret;
2257
2258 down_write(&mm->mmap_sem);
2259 ret = do_brk(addr, len);
2260 up_write(&mm->mmap_sem);
2261 return ret;
2262}
2263EXPORT_SYMBOL(vm_brk);
2236 2264
2237/* Release all mmaps. */ 2265/* Release all mmaps. */
2238void exit_mmap(struct mm_struct *mm) 2266void exit_mmap(struct mm_struct *mm)
@@ -2264,8 +2292,7 @@ void exit_mmap(struct mm_struct *mm)
2264 tlb_gather_mmu(&tlb, mm, 1); 2292 tlb_gather_mmu(&tlb, mm, 1);
2265 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2293 /* update_hiwater_rss(mm) here? but nobody should be looking */
2266 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2294 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2267 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2295 unmap_vmas(&tlb, vma, 0, -1);
2268 vm_unacct_memory(nr_accounted);
2269 2296
2270 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2297 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2271 tlb_finish_mmu(&tlb, 0, -1); 2298 tlb_finish_mmu(&tlb, 0, -1);
@@ -2274,10 +2301,14 @@ void exit_mmap(struct mm_struct *mm)
2274 * Walk the list again, actually closing and freeing it, 2301 * Walk the list again, actually closing and freeing it,
2275 * with preemption enabled, without holding any MM locks. 2302 * with preemption enabled, without holding any MM locks.
2276 */ 2303 */
2277 while (vma) 2304 while (vma) {
2305 if (vma->vm_flags & VM_ACCOUNT)
2306 nr_accounted += vma_pages(vma);
2278 vma = remove_vma(vma); 2307 vma = remove_vma(vma);
2308 }
2309 vm_unacct_memory(nr_accounted);
2279 2310
2280 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2311 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2281} 2312}
2282 2313
2283/* Insert vm structure into process list sorted by address 2314/* Insert vm structure into process list sorted by address
@@ -2311,6 +2342,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2311 if ((vma->vm_flags & VM_ACCOUNT) && 2342 if ((vma->vm_flags & VM_ACCOUNT) &&
2312 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2343 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2313 return -ENOMEM; 2344 return -ENOMEM;
2345
2314 vma_link(mm, vma, prev, rb_link, rb_parent); 2346 vma_link(mm, vma, prev, rb_link, rb_parent);
2315 return 0; 2347 return 0;
2316} 2348}
@@ -2380,6 +2412,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2380 new_vma->vm_pgoff = pgoff; 2412 new_vma->vm_pgoff = pgoff;
2381 if (new_vma->vm_file) { 2413 if (new_vma->vm_file) {
2382 get_file(new_vma->vm_file); 2414 get_file(new_vma->vm_file);
2415
2383 if (vma->vm_flags & VM_EXECUTABLE) 2416 if (vma->vm_flags & VM_EXECUTABLE)
2384 added_exe_file_vma(mm); 2417 added_exe_file_vma(mm);
2385 } 2418 }
@@ -2484,10 +2517,6 @@ int install_special_mapping(struct mm_struct *mm,
2484 vma->vm_ops = &special_mapping_vmops; 2517 vma->vm_ops = &special_mapping_vmops;
2485 vma->vm_private_data = pages; 2518 vma->vm_private_data = pages;
2486 2519
2487 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2488 if (ret)
2489 goto out;
2490
2491 ret = insert_vm_struct(mm, vma); 2520 ret = insert_vm_struct(mm, vma);
2492 if (ret) 2521 if (ret)
2493 goto out; 2522 goto out;
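
Several mmap.c entry points are reshaped the same way in the diff above: the internal helper (do_munmap(), do_brk()) keeps assuming mmap_sem is held, and a thin exported wrapper (vm_munmap(), vm_brk()) takes and drops the lock around the call. The user-space sketch below shows only that wrapper pattern, with a pthread rwlock standing in for mmap_sem and made-up names.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;  /* mmap_sem stand-in */
static unsigned long mapped_len;

/* caller must hold map_lock for writing, like do_munmap()/do_brk() */
static int do_op(unsigned long len)
{
        mapped_len += len;
        return 0;
}

/* exported wrapper: takes the lock, calls the helper, drops the lock */
int vm_op(unsigned long len)
{
        int ret;

        pthread_rwlock_wrlock(&map_lock);
        ret = do_op(len);
        pthread_rwlock_unlock(&map_lock);
        return ret;
}

int main(void)
{
        vm_op(4096);
        printf("mapped_len = %lu\n", mapped_len);
        return 0;
}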
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
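
The mmu_notifier changes move the ->release callbacks so they run under the RCU read lock before any unhashing, and the list manipulation then happens separately under mmu_notifier_mm->lock. The sketch below only models that ordering in user space: a pthread rwlock stands in for RCU and a mutex for the spinlock, so it says nothing about real RCU semantics.

#include <pthread.h>
#include <stdio.h>

struct notifier {
        struct notifier *next;
        void (*release)(struct notifier *n);
};

static pthread_rwlock_t read_side = PTHREAD_RWLOCK_INITIALIZER;  /* RCU stand-in */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;    /* ->lock stand-in */
static struct notifier *head;

static void release_all(void)
{
        struct notifier *n;

        /* 1) run the callbacks first, holding only the read-side lock */
        pthread_rwlock_rdlock(&read_side);
        for (n = head; n; n = n->next)
                if (n->release)
                        n->release(n);
        pthread_rwlock_unlock(&read_side);

        /* 2) then unhash everything under the list lock */
        pthread_mutex_lock(&list_lock);
        head = NULL;
        pthread_mutex_unlock(&list_lock);
}

static void say_bye(struct notifier *n) { (void)n; puts("release called"); }

int main(void)
{
        struct notifier n = { .next = NULL, .release = say_bye };

        head = &n;
        release_all();
        return 0;
}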
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
86 return 1; 86 return 1;
87} 87}
88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
89
90void lruvec_init(struct lruvec *lruvec, struct zone *zone)
91{
92 enum lru_list lru;
93
94 memset(lruvec, 0, sizeof(struct lruvec));
95
96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98
99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone;
101#endif
102}
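
lruvec_init() is new common code: zero the structure, then give every LRU list an empty head. The user-space sketch below mirrors that loop with a tiny circular-list type in place of struct list_head; the enum values are copied for flavor only and the zone pointer is left out.

#include <stdio.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *l) { l->next = l->prev = l; }

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };

struct lruvec { struct list_head lists[NR_LRU_LISTS]; };

static void lruvec_init(struct lruvec *lruvec)
{
        int lru;

        memset(lruvec, 0, sizeof(*lruvec));
        for (lru = 0; lru < NR_LRU_LISTS; lru++)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
}

int main(void)
{
        struct lruvec lv;

        lruvec_init(&lv);
        printf("list %d empty: %d\n", LRU_ACTIVE_FILE,
               lv.lists[LRU_ACTIVE_FILE].next == &lv.lists[LRU_ACTIVE_FILE]);
        return 0;
}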
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -371,10 +370,6 @@ static unsigned long mremap_to(unsigned long addr,
371 if ((addr <= new_addr) && (addr+old_len) > new_addr) 370 if ((addr <= new_addr) && (addr+old_len) > new_addr)
372 goto out; 371 goto out;
373 372
374 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
375 if (ret)
376 goto out;
377
378 ret = do_munmap(mm, new_addr, new_len); 373 ret = do_munmap(mm, new_addr, new_len);
379 if (ret) 374 if (ret)
380 goto out; 375 goto out;
@@ -432,15 +427,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
432 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 427 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
433 * This option implies MREMAP_MAYMOVE. 428 * This option implies MREMAP_MAYMOVE.
434 */ 429 */
435unsigned long do_mremap(unsigned long addr, 430SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
436 unsigned long old_len, unsigned long new_len, 431 unsigned long, new_len, unsigned long, flags,
437 unsigned long flags, unsigned long new_addr) 432 unsigned long, new_addr)
438{ 433{
439 struct mm_struct *mm = current->mm; 434 struct mm_struct *mm = current->mm;
440 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
441 unsigned long ret = -EINVAL; 436 unsigned long ret = -EINVAL;
442 unsigned long charged = 0; 437 unsigned long charged = 0;
443 438
439 down_write(&current->mm->mmap_sem);
440
444 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 441 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
445 goto out; 442 goto out;
446 443
@@ -499,7 +496,6 @@ unsigned long do_mremap(unsigned long addr,
499 goto out; 496 goto out;
500 } 497 }
501 498
502 mm->total_vm += pages;
503 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
504 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
505 mm->locked_vm += pages; 501 mm->locked_vm += pages;
@@ -530,25 +526,11 @@ unsigned long do_mremap(unsigned long addr,
530 goto out; 526 goto out;
531 } 527 }
532 528
533 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
534 if (ret)
535 goto out;
536 ret = move_vma(vma, addr, old_len, new_len, new_addr); 529 ret = move_vma(vma, addr, old_len, new_len, new_addr);
537 } 530 }
538out: 531out:
539 if (ret & ~PAGE_MASK) 532 if (ret & ~PAGE_MASK)
540 vm_unacct_memory(charged); 533 vm_unacct_memory(charged);
541 return ret;
542}
543
544SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
545 unsigned long, new_len, unsigned long, flags,
546 unsigned long, new_addr)
547{
548 unsigned long ret;
549
550 down_write(&current->mm->mmap_sem);
551 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
552 up_write(&current->mm->mmap_sem); 534 up_write(&current->mm->mmap_sem);
553 return ret; 535 return ret;
554} 536}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 24f0fc1a56d6..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,8 +82,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
82 82
83static void __init __free_pages_memory(unsigned long start, unsigned long end) 83static void __init __free_pages_memory(unsigned long start, unsigned long end)
84{ 84{
85 int i; 85 unsigned long i, start_aligned, end_aligned;
86 unsigned long start_aligned, end_aligned;
87 int order = ilog2(BITS_PER_LONG); 86 int order = ilog2(BITS_PER_LONG);
88 87
89 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 88 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
@@ -106,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
106 __free_pages_bootmem(pfn_to_page(i), 0); 105 __free_pages_bootmem(pfn_to_page(i), 0);
107} 106}
108 107
108static unsigned long __init __free_memory_core(phys_addr_t start,
109 phys_addr_t end)
110{
111 unsigned long start_pfn = PFN_UP(start);
112 unsigned long end_pfn = min_t(unsigned long,
113 PFN_DOWN(end), max_low_pfn);
114
115 if (start_pfn > end_pfn)
116 return 0;
117
118 __free_pages_memory(start_pfn, end_pfn);
119
120 return end_pfn - start_pfn;
121}
122
109unsigned long __init free_low_memory_core_early(int nodeid) 123unsigned long __init free_low_memory_core_early(int nodeid)
110{ 124{
111 unsigned long count = 0; 125 unsigned long count = 0;
112 phys_addr_t start, end; 126 phys_addr_t start, end, size;
113 u64 i; 127 u64 i;
114 128
115 /* free reserved array temporarily so that it's treated as free area */ 129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
116 memblock_free_reserved_regions(); 130 count += __free_memory_core(start, end);
117 131
118 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 132 /* free range that is used for reserved array if we allocate it */
119 unsigned long start_pfn = PFN_UP(start); 133 size = get_allocated_memblock_reserved_regions_info(&start);
120 unsigned long end_pfn = min_t(unsigned long, 134 if (size)
121 PFN_DOWN(end), max_low_pfn); 135 count += __free_memory_core(start, start + size);
122 if (start_pfn < end_pfn) {
123 __free_pages_memory(start_pfn, end_pfn);
124 count += end_pfn - start_pfn;
125 }
126 }
127 136
128 /* put region array back? */
129 memblock_reserve_reserved_regions();
130 return count; 137 return count;
131} 138}
132 139
@@ -275,6 +282,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
275 return ___alloc_bootmem(size, align, goal, limit); 282 return ___alloc_bootmem(size, align, goal, limit);
276} 283}
277 284
285void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
286 unsigned long size,
287 unsigned long align,
288 unsigned long goal,
289 unsigned long limit)
290{
291 void *ptr;
292
293again:
294 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
295 goal, limit);
296 if (ptr)
297 return ptr;
298
299 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
300 goal, limit);
301 if (ptr)
302 return ptr;
303
304 if (goal) {
305 goal = 0;
306 goto again;
307 }
308
309 return NULL;
310}
311
312void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
313 unsigned long align, unsigned long goal)
314{
315 if (WARN_ON_ONCE(slab_is_available()))
316 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
317
318 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
319}
320
321void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
322 unsigned long align, unsigned long goal,
323 unsigned long limit)
324{
325 void *ptr;
326
327 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
328 if (ptr)
329 return ptr;
330
331 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
332 panic("Out of memory");
333 return NULL;
334}
335
278/** 336/**
279 * __alloc_bootmem_node - allocate boot memory from a specific node 337 * __alloc_bootmem_node - allocate boot memory from a specific node
280 * @pgdat: node to allocate from 338 * @pgdat: node to allocate from
@@ -293,18 +351,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
293void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 351void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
294 unsigned long align, unsigned long goal) 352 unsigned long align, unsigned long goal)
295{ 353{
296 void *ptr;
297
298 if (WARN_ON_ONCE(slab_is_available())) 354 if (WARN_ON_ONCE(slab_is_available()))
299 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 355 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
300 356
301 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 357 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
302 goal, -1ULL);
303 if (ptr)
304 return ptr;
305
306 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
307 goal, -1ULL);
308} 358}
309 359
310void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 360void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -313,44 +363,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
313 return __alloc_bootmem_node(pgdat, size, align, goal); 363 return __alloc_bootmem_node(pgdat, size, align, goal);
314} 364}
315 365
316#ifdef CONFIG_SPARSEMEM
317/**
318 * alloc_bootmem_section - allocate boot memory from a specific section
319 * @size: size of the request in bytes
320 * @section_nr: sparse map section to allocate from
321 *
322 * Return NULL on failure.
323 */
324void * __init alloc_bootmem_section(unsigned long size,
325 unsigned long section_nr)
326{
327 unsigned long pfn, goal, limit;
328
329 pfn = section_nr_to_pfn(section_nr);
330 goal = pfn << PAGE_SHIFT;
331 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
332
333 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
334 SMP_CACHE_BYTES, goal, limit);
335}
336#endif
337
338void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
339 unsigned long align, unsigned long goal)
340{
341 void *ptr;
342
343 if (WARN_ON_ONCE(slab_is_available()))
344 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
345
346 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
347 goal, -1ULL);
348 if (ptr)
349 return ptr;
350
351 return __alloc_bootmem_nopanic(size, align, goal);
352}
353
354#ifndef ARCH_LOW_ADDRESS_LIMIT 366#ifndef ARCH_LOW_ADDRESS_LIMIT
355#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 367#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
356#endif 368#endif
@@ -392,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
392void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 404void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
393 unsigned long align, unsigned long goal) 405 unsigned long align, unsigned long goal)
394{ 406{
395 void *ptr;
396
397 if (WARN_ON_ONCE(slab_is_available())) 407 if (WARN_ON_ONCE(slab_is_available()))
398 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 408 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
399 409
400 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 410 return ___alloc_bootmem_node(pgdat, size, align, goal,
401 goal, ARCH_LOW_ADDRESS_LIMIT); 411 ARCH_LOW_ADDRESS_LIMIT);
402 if (ptr)
403 return ptr;
404
405 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
406 goal, ARCH_LOW_ADDRESS_LIMIT);
407} 412}
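
free_low_memory_core_early() now funnels each memblock range through __free_memory_core(), which rounds the range start up and the end down to page frames and clamps at max_low_pfn. That arithmetic is shown below as a standalone user-space function; PAGE_SHIFT and the PFN macros mirror the kernel definitions, while the numbers in main() are made up.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

static unsigned long max_low_pfn = 0x100000;    /* illustrative lowmem limit */

static unsigned long free_memory_core(unsigned long long start,
                                      unsigned long long end)
{
        unsigned long start_pfn = PFN_UP(start);
        unsigned long end_pfn = PFN_DOWN(end);

        if (end_pfn > max_low_pfn)
                end_pfn = max_low_pfn;
        if (start_pfn > end_pfn)
                return 0;

        /* the kernel would hand pages [start_pfn, end_pfn) to the buddy here */
        return end_pfn - start_pfn;
}

int main(void)
{
        /* a range that is not page aligned at either end: frees pages 2..6 */
        printf("%lu pages freed\n", free_memory_core(0x1800, 0x7400));
        return 0;
}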
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e170fceb4..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
889 unsigned long *_capabilities) 889 unsigned long *_capabilities)
890{ 890{
891 unsigned long capabilities, rlen; 891 unsigned long capabilities, rlen;
892 unsigned long reqprot = prot;
893 int ret; 892 int ret;
894 893
895 /* do the simple checks first */ 894 /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
1047 } 1046 }
1048 1047
1049 /* allow the security API to have its say */ 1048 /* allow the security API to have its say */
1050 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1049 ret = security_mmap_addr(addr);
1051 if (ret < 0) 1050 if (ret < 0)
1052 return ret; 1051 return ret;
1053 1052
@@ -1470,7 +1469,6 @@ error_getting_region:
1470 show_free_areas(0); 1469 show_free_areas(0);
1471 return -ENOMEM; 1470 return -ENOMEM;
1472} 1471}
1473EXPORT_SYMBOL(do_mmap_pgoff);
1474 1472
1475SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1473SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1476 unsigned long, prot, unsigned long, flags, 1474 unsigned long, prot, unsigned long, flags,
@@ -1488,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1488 1486
1489 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1490 1488
1491 down_write(&current->mm->mmap_sem); 1489 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1492 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1493 up_write(&current->mm->mmap_sem);
1494 1490
1495 if (file) 1491 if (file)
1496 fput(file); 1492 fput(file);
@@ -1709,16 +1705,22 @@ erase_whole_vma:
1709} 1705}
1710EXPORT_SYMBOL(do_munmap); 1706EXPORT_SYMBOL(do_munmap);
1711 1707
1712SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 1708int vm_munmap(unsigned long addr, size_t len)
1713{ 1709{
1714 int ret;
1715 struct mm_struct *mm = current->mm; 1710 struct mm_struct *mm = current->mm;
1711 int ret;
1716 1712
1717 down_write(&mm->mmap_sem); 1713 down_write(&mm->mmap_sem);
1718 ret = do_munmap(mm, addr, len); 1714 ret = do_munmap(mm, addr, len);
1719 up_write(&mm->mmap_sem); 1715 up_write(&mm->mmap_sem);
1720 return ret; 1716 return ret;
1721} 1717}
1718EXPORT_SYMBOL(vm_munmap);
1719
1720SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1721{
1722 return vm_munmap(addr, len);
1723}
1722 1724
1723/* 1725/*
1724 * release all the mappings made in a process's VM space 1726 * release all the mappings made in a process's VM space
@@ -1744,7 +1746,7 @@ void exit_mmap(struct mm_struct *mm)
1744 kleave(""); 1746 kleave("");
1745} 1747}
1746 1748
1747unsigned long do_brk(unsigned long addr, unsigned long len) 1749unsigned long vm_brk(unsigned long addr, unsigned long len)
1748{ 1750{
1749 return -ENOMEM; 1751 return -ENOMEM;
1750} 1752}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 long points;
187 long adj;
187 188
188 if (oom_unkillable_task(p, memcg, nodemask)) 189 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 190 return 0;
@@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
192 if (!p) 193 if (!p)
193 return 0; 194 return 0;
194 195
195 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 196 adj = p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) {
196 task_unlock(p); 198 task_unlock(p);
197 return 0; 199 return 0;
198 } 200 }
199 201
200 /* 202 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 203 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 204 * task's rss, pagetable and swap space use.
210 */ 205 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 206 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 207 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 208 task_unlock(p);
217 209
218 /* 210 /*
@@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 212 * implementation used by LSMs.
221 */ 213 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 214 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 215 adj -= 30;
224 216
225 /* 217 /* Normalize to oom_score_adj units */
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 218 adj *= totalpages / 1000;
227 * either completely disable oom killing or always prefer a certain 219 points += adj;
228 * task.
229 */
230 points += p->signal->oom_score_adj;
231 220
232 /* 221 /*
233 * Never return 0 for an eligible task that may be killed since it's 222 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 223 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 224 */
237 if (points <= 0) 225 return points > 0 ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 226}
241 227
242/* 228/*
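
oom_badness() now returns a raw page count instead of a 0..1000 scale: rss plus page tables plus swap entries, with oom_score_adj normalized into page units via totalpages / 1000 and a floor of 1 for any eligible task. The arithmetic is easy to check in isolation; the sketch below uses plain parameters and made-up numbers rather than task_struct.

#include <stdio.h>

static long badness(unsigned long rss, unsigned long nr_ptes,
                    unsigned long swapents, long oom_score_adj,
                    int has_cap_sys_admin, unsigned long totalpages)
{
        long points = rss + nr_ptes + swapents;
        long adj = oom_score_adj;

        if (has_cap_sys_admin)
                adj -= 30;                      /* root bonus */

        adj *= totalpages / 1000;               /* normalize adj to page units */
        points += adj;

        return points > 0 ? points : 1;         /* never 0 for an eligible task */
}

int main(void)
{
        unsigned long totalpages = 1UL << 20;   /* 4 GiB at 4 KiB pages */

        printf("plain task: %ld\n",
               badness(50000, 200, 1000, 0, 0, totalpages));
        printf("same task, oom_score_adj=-500: %ld\n",
               badness(50000, 200, 1000, -500, 0, totalpages));
        return 0;
}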
@@ -302,99 +288,116 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
302} 288}
303#endif 289#endif
304 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
305/* 337/*
306 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
307 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
308 * 340 *
309 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
310 */ 342 */
311static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
312 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
313 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
314{ 346{
315 struct task_struct *g, *p; 347 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
317 *ppoints = 0; 349 unsigned long chosen_points = 0;
318 350
351 rcu_read_lock();
319 do_each_thread(g, p) { 352 do_each_thread(g, p) {
320 unsigned int points; 353 unsigned int points;
321 354
322 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
323 continue; 356 force_kill)) {
324 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
325 continue; 358 chosen = p;
326 359 chosen_points = ULONG_MAX;
327 /* 360 /* fall through */
328 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
329 * being killed. Don't allow any other task access to the
330 * memory reserve.
331 *
332 * Note: this may have a chance of deadlock if it gets
333 * blocked waiting for another task which itself is waiting
334 * for memory. Is there a better alternative?
335 */
336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
337 if (unlikely(frozen(p)))
338 __thaw_task(p);
339 if (!force_kill)
340 return ERR_PTR(-1UL);
341 }
342 if (!p->mm)
343 continue; 362 continue;
344 363 case OOM_SCAN_ABORT:
345 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
346 /* 365 return ERR_PTR(-1UL);
347 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
348 * releasing memory, we allow the "kill" to set 367 break;
349 * TIF_MEMDIE, which will allow it to gain access to 368 };
350 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
351 * 370 if (points > chosen_points) {
352 * The loop isn't broken here, however, in case other
353 * threads are found to have already been oom killed.
354 */
355 if (p == current) {
356 chosen = p;
357 *ppoints = 1000;
358 } else if (!force_kill) {
359 /*
360 * If this task is not being ptraced on exit,
361 * then wait for it to finish before killing
362 * some other task unnecessarily.
363 */
364 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
365 return ERR_PTR(-1UL);
366 }
367 }
368
369 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) {
371 chosen = p; 371 chosen = p;
372 *ppoints = points; 372 chosen_points = points;
373 } 373 }
374 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
375 378
379 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 380 return chosen;
377} 381}
378 382
379/** 383/**
380 * dump_tasks - dump current memory state of all system tasks 384 * dump_tasks - dump current memory state of all system tasks
381 * @mem: current's memory controller, if constrained 385 * @memcg: current's memory controller, if constrained
382 * @nodemask: nodemask passed to page allocator for mempolicy ooms 386 * @nodemask: nodemask passed to page allocator for mempolicy ooms
383 * 387 *
384 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
385 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
386 * are not shown. 390 * are not shown.
387 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
388 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
389 *
390 * Call with tasklist_lock read-locked.
391 */ 393 */
392static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
393{ 395{
394 struct task_struct *p; 396 struct task_struct *p;
395 struct task_struct *task; 397 struct task_struct *task;
396 398
397 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
398 for_each_process(p) { 401 for_each_process(p) {
399 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
400 continue; 403 continue;
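
select_bad_process() is split above into a per-task policy check, oom_scan_process_thread(), and a loop that keeps the highest oom_badness() score, with OOM_SCAN_SELECT short-circuiting via a ULONG_MAX score. The sketch below reproduces that control flow in user space over made-up task records; the enum and helper names are illustrative, not kernel API.

#include <limits.h>
#include <stdio.h>

enum scan_t { SCAN_OK, SCAN_CONTINUE, SCAN_ABORT, SCAN_SELECT };

struct task { const char *name; int exiting; int unkillable; unsigned long points; };

static enum scan_t scan_one(const struct task *t)
{
        if (t->unkillable)
                return SCAN_CONTINUE;
        if (t->exiting)
                return SCAN_SELECT;     /* prefer a task already on its way out */
        return SCAN_OK;
}

static const struct task *select_bad_process(const struct task *tasks, int n)
{
        const struct task *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (scan_one(&tasks[i])) {
                case SCAN_SELECT:
                        chosen = &tasks[i];
                        chosen_points = ULONG_MAX;
                        /* fall through */
                case SCAN_CONTINUE:
                        continue;
                case SCAN_ABORT:
                        return NULL;
                case SCAN_OK:
                        break;
                }
                if (tasks[i].points > chosen_points) {
                        chosen = &tasks[i];
                        chosen_points = tasks[i].points;
                }
        }
        return chosen;
}

int main(void)
{
        struct task tasks[] = {
                { "init",   0, 1,     10 },
                { "hog",    0, 0, 900000 },
                { "editor", 0, 0,  20000 },
        };
        const struct task *victim = select_bad_process(tasks, 3);

        printf("victim: %s\n", victim ? victim->name : "(none)");
        return 0;
}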
@@ -409,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
409 continue; 412 continue;
410 } 413 }
411 414
412 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
413 task->pid, task_uid(task), task->tgid, 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
414 task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
415 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
416 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
417 task_unlock(task); 421 task_unlock(task);
418 } 422 }
423 rcu_read_unlock();
419} 424}
420 425
421static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -436,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
436} 441}
437 442
438#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
440 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
441 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
442 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
443{ 452{
444 struct task_struct *victim = p; 453 struct task_struct *victim = p;
445 struct task_struct *child; 454 struct task_struct *child;
@@ -455,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 */ 464 */
456 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
457 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
458 return; 468 return;
459 } 469 }
460 470
@@ -472,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
473 * still freeing memory. 483 * still freeing memory.
474 */ 484 */
485 read_lock(&tasklist_lock);
475 do { 486 do {
476 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
477 unsigned int child_points; 488 unsigned int child_points;
@@ -484,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
484 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
485 totalpages); 496 totalpages);
486 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
487 victim = child; 499 victim = child;
488 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
489 } 502 }
490 } 503 }
491 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
492 506
493 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
494 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
495 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
496 518
497 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm; 520 mm = victim->mm;
@@ -523,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
523 task_unlock(p); 545 task_unlock(p);
524 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525 } 547 }
548 rcu_read_unlock();
526 549
527 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
529} 553}
530#undef K 554#undef K
531 555
532/* 556/*
533 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
534 */ 558 */
535static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
536 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
537{ 561{
538 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
539 return; 563 return;
@@ -546,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
546 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
547 return; 571 return;
548 } 572 }
549 read_lock(&tasklist_lock);
550 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
551 read_unlock(&tasklist_lock);
552 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
553 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
554} 576}
555 577
556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
559{
560 unsigned long limit;
561 unsigned int points = 0;
562 struct task_struct *p;
563
564 /*
565 * If current has a pending SIGKILL, then automatically select it. The
566 * goal is to allow it to allocate so that it may quickly exit and free
567 * its memory.
568 */
569 if (fatal_signal_pending(current)) {
570 set_thread_flag(TIF_MEMDIE);
571 return;
572 }
573
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
576 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL)
579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
580 "Memory cgroup out of memory");
581 read_unlock(&tasklist_lock);
582}
583#endif
584
585static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
586 579
587int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -703,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
703 struct task_struct *p; 696 struct task_struct *p;
704 unsigned long totalpages; 697 unsigned long totalpages;
705 unsigned long freed = 0; 698 unsigned long freed = 0;
706 unsigned int points; 699 unsigned int uninitialized_var(points);
707 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
708 int killed = 0; 701 int killed = 0;
709 702
@@ -731,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
731 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
732 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
733 726
734 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
735 if (sysctl_oom_kill_allocating_task &&
736 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
737 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
739 nodemask, 732 nodemask,
740 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
741 goto out; 734 goto out;
742 } 735 }
743 736
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
745 force_kill);
746 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
747 if (!p) { 739 if (!p) {
748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
749 read_unlock(&tasklist_lock);
750 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
751 } 742 }
752 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -755,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
755 killed = 1; 746 killed = 1;
756 } 747 }
757out: 748out:
758 read_unlock(&tasklist_lock);
759
760 /* 749 /*
761 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
762 * retry to allocate memory unless "p" is current 751 * allocate memory again.
763 */ 752 */
764 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
765 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
766} 755}
767 756
768/* 757/*
@@ -777,6 +766,5 @@ void pagefault_out_of_memory(void)
777 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
778 clear_system_oom(); 767 clear_system_oom();
779 } 768 }
780 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
781 schedule_timeout_uninterruptible(1);
782} 770}
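
The oom_kill.c hunks above replace tasklist_lock-protected victim selection with plain task reference counting: the chosen task is pinned with get_task_struct() and the pin moves whenever a better candidate is found, so the lock can be dropped before the victim is signalled. Below is a standalone C sketch of that reference-swapping pattern (userspace C, not kernel code; the struct, names and badness values are made up for illustration). The invariant it demonstrates is that exactly one reference is held on whatever task the scan settles on, which is why oom_kill_process() can safely put_task_struct() the victim at the end.

/*
 * Standalone sketch of the reference-swapping pattern: the caller hands in a
 * pinned candidate, and whenever a better candidate turns up the old pin is
 * dropped and a new one taken, so exactly one reference is held at the end.
 */
#include <stdio.h>

struct task { const char *name; int refcount; int badness; };

static void get_task(struct task *t) { t->refcount++; }
static void put_task(struct task *t) { t->refcount--; }

/* 'victim' arrives with one reference already held by the caller. */
static struct task *pick_worse(struct task *victim, struct task *candidates, int n)
{
	int best = victim->badness;

	for (int i = 0; i < n; i++) {
		if (candidates[i].badness > best) {
			put_task(victim);	/* drop the pin on the old choice */
			victim = &candidates[i];
			best = victim->badness;
			get_task(victim);	/* pin the new choice */
		}
	}
	return victim;			/* still exactly one reference held */
}

int main(void)
{
	struct task others[3] = { {"a", 0, 10}, {"b", 0, 40}, {"c", 0, 25} };
	struct task first = {"first", 1, 5};	/* caller-held reference */
	struct task *v = pick_worse(&first, others, 3);

	printf("victim=%s refcount=%d\n", v->name, v->refcount);
	return 0;
}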
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8ca2e7..5ad5ce23c1e0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -204,7 +218,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
204 * Returns the global number of pages potentially available for dirty 218 * Returns the global number of pages potentially available for dirty
205 * page cache. This is the base value for the global dirty limits. 219 * page cache. This is the base value for the global dirty limits.
206 */ 220 */
207unsigned long global_dirtyable_memory(void) 221static unsigned long global_dirtyable_memory(void)
208{ 222{
209 unsigned long x; 223 unsigned long x;
210 224
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
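
The __bdi_writeout_inc() hunk above arms the period timer lazily: it is only started on the first writeout completion after it was shut down, and writeout_period_time == 0 is reserved to mean "timer not running". The standalone sketch below models the wp_next_time() helper and its wraparound guard; HZ is assumed to be 250 purely for the sketch.

#include <stdio.h>

#define SKETCH_HZ 250				/* assumed only for this sketch */
#define VM_COMPLETIONS_PERIOD_LEN (3 * SKETCH_HZ)

/* 0 means "period timer not running", so never return 0, even on wraparound. */
static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	if (!cur_time)
		return 1;
	return cur_time;
}

int main(void)
{
	unsigned long jiffies = 0UL - VM_COMPLETIONS_PERIOD_LEN;	/* about to wrap */

	printf("wrap case:   %lu\n", wp_next_time(jiffies));	/* prints 1, not 0 */
	printf("normal case: %lu\n", wp_next_time(1000));	/* prints 1750 */
	return 0;
}

The guard only matters once per jiffies wraparound, but without it a wrapped expiry of 0 would be mistaken for the timer being off.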
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
445 * On idle system, we can be called long after we scheduled because we use
446 * deferred timers so count with missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
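
writeout_period_timer is a deferred timer, so on an idle system it can fire long after its nominal expiry. The plain-C sketch below (with a made-up period length standing in for 3*HZ) shows the arithmetic writeout_period() uses to decide how many aging steps fprop_new_period() should be asked to apply in that case.

#include <stdio.h>

#define PERIOD_LEN 750UL	/* stands in for VM_COMPLETIONS_PERIOD_LEN (3*HZ) */

/* How many aging steps to apply when the deferred timer fires late. */
static unsigned long periods_to_age(unsigned long now, unsigned long scheduled)
{
	unsigned long miss_periods = (now - scheduled) / PERIOD_LEN;

	return miss_periods + 1;	/* the period that just ended, plus any missed ones */
}

int main(void)
{
	printf("fired on time:          age %lu period(s)\n",
	       periods_to_age(1750, 1750));
	printf("fired 2.5 periods late: age %lu period(s)\n",
	       periods_to_age(1750 + 2 * PERIOD_LEN + 300, 1750));
	return 0;
}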
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
918 * bdi->dirty_ratelimit = balanced_dirty_ratelimit; 946 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
919 * 947 *
920 * However to get a more stable dirty_ratelimit, the below elaborated 948 * However to get a more stable dirty_ratelimit, the below elaborated
921 * code makes use of task_ratelimit to filter out sigular points and 949 * code makes use of task_ratelimit to filter out singular points and
922 * limit the step size. 950 * limit the step size.
923 * 951 *
924 * The below code essentially only uses the relative value of 952 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
941 * feel and care are stable dirty rate and small position error. 969 * feel and care are stable dirty rate and small position error.
942 * 970 *
943 * |task_ratelimit - dirty_ratelimit| is used to limit the step size 971 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
944 * and filter out the sigular points of balanced_dirty_ratelimit. Which 972 * and filter out the singular points of balanced_dirty_ratelimit. Which
945 * keeps jumping around randomly and can even leap far away at times 973 * keeps jumping around randomly and can even leap far away at times
946 * due to the small 200ms estimation period of dirty_rate (we want to 974 * due to the small 200ms estimation period of dirty_rate (we want to
947 * keep that period small to reduce time lags). 975 * keep that period small to reduce time lags).
@@ -1504,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
1504 void __user *buffer, size_t *length, loff_t *ppos) 1532 void __user *buffer, size_t *length, loff_t *ppos)
1505{ 1533{
1506 proc_dointvec(table, write, buffer, length, ppos); 1534 proc_dointvec(table, write, buffer, length, ppos);
1507 bdi_arm_supers_timer();
1508 return 0; 1535 return 0;
1509} 1536}
1510 1537
@@ -1568,6 +1595,7 @@ void writeback_set_ratelimit(void)
1568 unsigned long background_thresh; 1595 unsigned long background_thresh;
1569 unsigned long dirty_thresh; 1596 unsigned long dirty_thresh;
1570 global_dirty_limits(&background_thresh, &dirty_thresh); 1597 global_dirty_limits(&background_thresh, &dirty_thresh);
1598 global_dirty_limit = dirty_thresh;
1571 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); 1599 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
1572 if (ratelimit_pages < 16) 1600 if (ratelimit_pages < 16)
1573 ratelimit_pages = 16; 1601 ratelimit_pages = 16;
@@ -1605,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1605 */ 1633 */
1606void __init page_writeback_init(void) 1634void __init page_writeback_init(void)
1607{ 1635{
1608 int shift;
1609
1610 writeback_set_ratelimit(); 1636 writeback_set_ratelimit();
1611 register_cpu_notifier(&ratelimit_nb); 1637 register_cpu_notifier(&ratelimit_nb);
1612 1638
1613 shift = calc_period_shift(); 1639 fprop_global_init(&writeout_completions);
1614 prop_descriptor_init(&vm_completions, shift);
1615} 1640}
1616 1641
1617/** 1642/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9e04ce..c66fb875104a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,12 +51,12 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h> 57#include <linux/memcontrol.h>
59#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61 61
62#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
@@ -218,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
218 218
219int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
220 220
221static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
222{ 227{
223 228
224 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -513,10 +518,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
513 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
514 * order is recorded in page_private(page) field. 519 * order is recorded in page_private(page) field.
515 * So when we are allocating or freeing one, we can derive the state of the 520 * So when we are allocating or freeing one, we can derive the state of the
516 * other. That is, if we allocate a small block, and both were 521 * other. That is, if we allocate a small block, and both were
517 * free, the remainder of the region must be split into blocks. 522 * free, the remainder of the region must be split into blocks.
518 * If a block is freed, and its buddy is also free, then this 523 * If a block is freed, and its buddy is also free, then this
519 * triggers coalescing into a block of larger size. 524 * triggers coalescing into a block of larger size.
520 * 525 *
521 * -- wli 526 * -- wli
522 */ 527 */
@@ -749,6 +754,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
749 __free_pages(page, order); 754 __free_pages(page, order);
750} 755}
751 756
757#ifdef CONFIG_CMA
758/* Free whole pageblock and set its migration type to MIGRATE_CMA. */

759void __init init_cma_reserved_pageblock(struct page *page)
760{
761 unsigned i = pageblock_nr_pages;
762 struct page *p = page;
763
764 do {
765 __ClearPageReserved(p);
766 set_page_count(p, 0);
767 } while (++p, --i);
768
769 set_page_refcounted(page);
770 set_pageblock_migratetype(page, MIGRATE_CMA);
771 __free_pages(page, pageblock_order);
772 totalram_pages += pageblock_nr_pages;
773}
774#endif
752 775
753/* 776/*
754 * The order of subdivision here is critical for the IO subsystem. 777 * The order of subdivision here is critical for the IO subsystem.
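
init_cma_reserved_pageblock() above turns a bootmem-reserved block into CMA pages: every page is unreserved and has its count cleared, then only the head page is given a reference before the whole high-order block is freed to the buddy allocator. The following standalone sketch models that loop with a toy page structure and a tiny, made-up block size.

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 8	/* tiny made-up value; real blocks are much larger */

struct fake_page { int reserved; int count; };

/*
 * Model of the init_cma_reserved_pageblock() loop: every page in the block
 * is unreserved and its count cleared, then only the head page gets a
 * reference before the whole block is handed to the allocator.
 */
static void init_reserved_block(struct fake_page *block)
{
	unsigned int i = PAGEBLOCK_NR_PAGES;
	struct fake_page *p = block;

	do {
		p->reserved = 0;
		p->count = 0;
	} while (++p, --i);

	block[0].count = 1;	/* set_page_refcounted() on the head page */
}

int main(void)
{
	struct fake_page block[PAGEBLOCK_NR_PAGES];

	for (int i = 0; i < PAGEBLOCK_NR_PAGES; i++)
		block[i] = (struct fake_page){ .reserved = 1, .count = -1 };

	init_reserved_block(block);
	printf("head: reserved=%d count=%d  tail: reserved=%d count=%d\n",
	       block[0].reserved, block[0].count,
	       block[1].reserved, block[1].count);
	return 0;
}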
@@ -874,11 +897,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
874 * This array describes the order lists are fallen back to when 897 * This array describes the order lists are fallen back to when
875 * the free lists for the desirable migrate type are depleted 898 * the free lists for the desirable migrate type are depleted
876 */ 899 */
877static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 900static int fallbacks[MIGRATE_TYPES][4] = {
878 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
879 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
880 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 903#ifdef CONFIG_CMA
881 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ 904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
906#else
907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
908#endif
909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
882}; 911};
883 912
884/* 913/*
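
The fallbacks[] change above switches from fixed-width rows to sentinel-terminated rows: the inner loop in __rmqueue_fallback() now runs until it meets MIGRATE_RESERVE instead of iterating MIGRATE_TYPES-1 slots, and only MIGRATE_MOVABLE is allowed to fall back into CMA. A compilable model of that walk (shortened enum names, values illustrative only):

#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_CMA, MT_RESERVE, MT_ISOLATE, MT_TYPES };

/* Sentinel-terminated fallback lists: MT_RESERVE ends each walk. */
static const int fallbacks[MT_TYPES][4] = {
	[MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
	[MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
	[MT_MOVABLE]     = { MT_CMA, MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
	[MT_CMA]         = { MT_RESERVE },
	[MT_RESERVE]     = { MT_RESERVE },
	[MT_ISOLATE]     = { MT_RESERVE },
};

static void walk_fallbacks(int start)
{
	for (int i = 0;; i++) {
		int mt = fallbacks[start][i];

		if (mt == MT_RESERVE)	/* MIGRATE_RESERVE is handled separately */
			break;
		printf("  try migratetype %d\n", mt);
	}
}

int main(void)
{
	printf("MOVABLE falls back to:\n");
	walk_fallbacks(MT_MOVABLE);
	printf("UNMOVABLE falls back to:\n");
	walk_fallbacks(MT_UNMOVABLE);
	return 0;
}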
@@ -929,7 +958,7 @@ static int move_freepages(struct zone *zone,
929 return pages_moved; 958 return pages_moved;
930} 959}
931 960
932static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
933 int migratetype) 962 int migratetype)
934{ 963{
935 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -973,12 +1002,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
973 /* Find the largest possible block of pages in the other list */ 1002 /* Find the largest possible block of pages in the other list */
974 for (current_order = MAX_ORDER-1; current_order >= order; 1003 for (current_order = MAX_ORDER-1; current_order >= order;
975 --current_order) { 1004 --current_order) {
976 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 1005 for (i = 0;; i++) {
977 migratetype = fallbacks[start_migratetype][i]; 1006 migratetype = fallbacks[start_migratetype][i];
978 1007
979 /* MIGRATE_RESERVE handled later if necessary */ 1008 /* MIGRATE_RESERVE handled later if necessary */
980 if (migratetype == MIGRATE_RESERVE) 1009 if (migratetype == MIGRATE_RESERVE)
981 continue; 1010 break;
982 1011
983 area = &(zone->free_area[current_order]); 1012 area = &(zone->free_area[current_order]);
984 if (list_empty(&area->free_list[migratetype])) 1013 if (list_empty(&area->free_list[migratetype]))
@@ -993,11 +1022,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
993 * pages to the preferred allocation list. If falling 1022 * pages to the preferred allocation list. If falling
994 * back for a reclaimable kernel allocation, be more 1023 * back for a reclaimable kernel allocation, be more
995 * aggressive about taking ownership of free pages 1024 * aggressive about taking ownership of free pages
1025 *
1026 * On the other hand, never change migration
1027 * type of MIGRATE_CMA pageblocks nor move CMA
1028 * pages on different free lists. We don't
1029 * want unmovable pages to be allocated from
1030 * MIGRATE_CMA areas.
996 */ 1031 */
997 if (unlikely(current_order >= (pageblock_order >> 1)) || 1032 if (!is_migrate_cma(migratetype) &&
998 start_migratetype == MIGRATE_RECLAIMABLE || 1033 (unlikely(current_order >= pageblock_order / 2) ||
999 page_group_by_mobility_disabled) { 1034 start_migratetype == MIGRATE_RECLAIMABLE ||
1000 unsigned long pages; 1035 page_group_by_mobility_disabled)) {
1036 int pages;
1001 pages = move_freepages_block(zone, page, 1037 pages = move_freepages_block(zone, page,
1002 start_migratetype); 1038 start_migratetype);
1003 1039
@@ -1015,11 +1051,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1015 rmv_page_order(page); 1051 rmv_page_order(page);
1016 1052
1017 /* Take ownership for orders >= pageblock_order */ 1053 /* Take ownership for orders >= pageblock_order */
1018 if (current_order >= pageblock_order) 1054 if (current_order >= pageblock_order &&
1055 !is_migrate_cma(migratetype))
1019 change_pageblock_range(page, current_order, 1056 change_pageblock_range(page, current_order,
1020 start_migratetype); 1057 start_migratetype);
1021 1058
1022 expand(zone, page, order, current_order, area, migratetype); 1059 expand(zone, page, order, current_order, area,
1060 is_migrate_cma(migratetype)
1061 ? migratetype : start_migratetype);
1023 1062
1024 trace_mm_page_alloc_extfrag(page, order, current_order, 1063 trace_mm_page_alloc_extfrag(page, order, current_order,
1025 start_migratetype, migratetype); 1064 start_migratetype, migratetype);
@@ -1061,17 +1100,17 @@ retry_reserve:
1061 return page; 1100 return page;
1062} 1101}
1063 1102
1064/* 1103/*
1065 * Obtain a specified number of elements from the buddy allocator, all under 1104 * Obtain a specified number of elements from the buddy allocator, all under
1066 * a single hold of the lock, for efficiency. Add them to the supplied list. 1105 * a single hold of the lock, for efficiency. Add them to the supplied list.
1067 * Returns the number of new pages which were placed at *list. 1106 * Returns the number of new pages which were placed at *list.
1068 */ 1107 */
1069static int rmqueue_bulk(struct zone *zone, unsigned int order, 1108static int rmqueue_bulk(struct zone *zone, unsigned int order,
1070 unsigned long count, struct list_head *list, 1109 unsigned long count, struct list_head *list,
1071 int migratetype, int cold) 1110 int migratetype, int cold)
1072{ 1111{
1073 int i; 1112 int mt = migratetype, i;
1074 1113
1075 spin_lock(&zone->lock); 1114 spin_lock(&zone->lock);
1076 for (i = 0; i < count; ++i) { 1115 for (i = 0; i < count; ++i) {
1077 struct page *page = __rmqueue(zone, order, migratetype); 1116 struct page *page = __rmqueue(zone, order, migratetype);
@@ -1091,7 +1130,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1091 list_add(&page->lru, list); 1130 list_add(&page->lru, list);
1092 else 1131 else
1093 list_add_tail(&page->lru, list); 1132 list_add_tail(&page->lru, list);
1094 set_page_private(page, migratetype); 1133 if (IS_ENABLED(CONFIG_CMA)) {
1134 mt = get_pageblock_migratetype(page);
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype;
1137 }
1138 set_page_private(page, mt);
1095 list = &page->lru; 1139 list = &page->lru;
1096 } 1140 }
1097 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
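
The rmqueue_bulk() hunk above records a different migratetype in page_private() when CMA is enabled: pages that live in CMA or isolated pageblocks keep their real pageblock type, so a later free returns them to the correct free list. A standalone model of that decision (CMA_ENABLED stands in for IS_ENABLED(CONFIG_CMA)):

#include <stdbool.h>
#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_CMA, MT_RESERVE, MT_ISOLATE };

#define CMA_ENABLED true	/* stands in for IS_ENABLED(CONFIG_CMA) */

static bool is_migrate_cma(int mt) { return mt == MT_CMA; }

/*
 * What gets recorded in page_private() for a page pulled off the free lists:
 * CMA and ISOLATE pages keep their real pageblock type so a later free goes
 * back to the right list; everything else is tagged with the migratetype the
 * caller asked for.
 */
static int private_migratetype(int requested, int pageblock_mt)
{
	int mt = requested;

	if (CMA_ENABLED) {
		mt = pageblock_mt;
		if (!is_migrate_cma(mt) && mt != MT_ISOLATE)
			mt = requested;
	}
	return mt;
}

int main(void)
{
	printf("movable request, CMA pageblock:     %d (stays MT_CMA)\n",
	       private_migratetype(MT_MOVABLE, MT_CMA));
	printf("movable request, movable pageblock: %d\n",
	       private_migratetype(MT_MOVABLE, MT_MOVABLE));
	return 0;
}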
@@ -1118,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1118 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1119 else 1163 else
1120 to_drain = pcp->count; 1164 to_drain = pcp->count;
1121 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1122 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1123 local_irq_restore(flags); 1169 local_irq_restore(flags);
1124} 1170}
1125#endif 1171#endif
@@ -1371,8 +1417,12 @@ int split_free_page(struct page *page)
1371 1417
1372 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1373 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1374 for (; page < endpage; page += pageblock_nr_pages) 1420 for (; page < endpage; page += pageblock_nr_pages) {
1375 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE);
1425 }
1376 } 1426 }
1377 1427
1378 return 1 << order; 1428 return 1 << order;
@@ -1485,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1485} 1535}
1486__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1487 1537
1488static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1489{ 1539{
1490 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1491 return 0; 1541 return false;
1492 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1493 return 0; 1543 return false;
1494 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1495 return 0; 1545 return false;
1496 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1497 return 0; 1547 return false;
1498 1548
1499 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1500} 1550}
@@ -1534,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1534 1584
1535#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1536 1586
1537static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1538{ 1588{
1539 return 0; 1589 return false;
1540} 1590}
1541 1591
1542#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1550,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1550{ 1600{
1551 /* free_pages may go negative - that's OK */ 1601
1552 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1553 int o; 1604 int o;
1554 1605
1555 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1558,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1558 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1559 min -= min / 4; 1610 min -= min / 4;
1560 1611
1561 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1562 return false; 1613 return false;
1563 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1564 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1573,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1573 return true; 1624 return true;
1574} 1625}
1575 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1576bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1577 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1578{ 1643{
@@ -1588,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1588 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1589 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1590 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1591 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1592 free_pages); 1665 free_pages);
1593} 1666}
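
zone_watermark_ok_safe() now subtracts the free pages sitting in MIGRATE_ISOLATE pageblocks before running the watermark test, since those pages cannot satisfy allocations while isolation is in progress. The sketch below shows the effect with made-up numbers; the real check also folds in lowmem_reserve and per-order terms that are omitted here.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* example value only */

/*
 * Discount MIGRATE_ISOLATE pageblocks from the free count before the
 * watermark test, so a zone whose "free" pages are mostly isolated for
 * hotplug/CMA is not reported as healthy.
 */
static bool watermark_ok(unsigned long free_pages, unsigned long mark,
			 unsigned long nr_isolated_blocks)
{
	free_pages -= nr_isolated_blocks * PAGEBLOCK_NR_PAGES;
	return free_pages > mark;
}

int main(void)
{
	/* 4096 free pages look fine against a mark of 1024 ... */
	printf("no isolation:      %d\n", watermark_ok(4096, 1024, 0));
	/* ... but not once 7 pageblocks of them are isolated. */
	printf("7 blocks isolated: %d\n", watermark_ok(4096, 1024, 7));
	return 0;
}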
@@ -1855,6 +1928,17 @@ this_zone_full:
1855 zlc_active = 0; 1928 zlc_active = 0;
1856 goto zonelist_scan; 1929 goto zonelist_scan;
1857 } 1930 }
1931
1932 if (page)
1933 /*
1934 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1935 * necessary to allocate the page. The expectation is
1936 * that the caller is taking steps that will free more
1937 * memory. The caller should avoid the page being used
1938 * for !PFMEMALLOC purposes.
1939 */
1940 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1941
1858 return page; 1942 return page;
1859} 1943}
1860 1944
@@ -2018,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2018 struct zonelist *zonelist, enum zone_type high_zoneidx, 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2019 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2020 int migratetype, bool sync_migration, 2104 int migratetype, bool sync_migration,
2021 bool *deferred_compaction, 2105 bool *contended_compaction, bool *deferred_compaction,
2022 unsigned long *did_some_progress) 2106 unsigned long *did_some_progress)
2023{ 2107{
2024 struct page *page; 2108 struct page *page;
@@ -2033,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2033 2117
2034 current->flags |= PF_MEMALLOC; 2118 current->flags |= PF_MEMALLOC;
2035 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2036 nodemask, sync_migration); 2120 nodemask, sync_migration,
2121 contended_compaction);
2037 current->flags &= ~PF_MEMALLOC; 2122 current->flags &= ~PF_MEMALLOC;
2038 if (*did_some_progress != COMPACT_SKIPPED) { 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2039 2124
@@ -2043,8 +2128,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2043 2128
2044 page = get_page_from_freelist(gfp_mask, nodemask, 2129 page = get_page_from_freelist(gfp_mask, nodemask,
2045 order, zonelist, high_zoneidx, 2130 order, zonelist, high_zoneidx,
2046 alloc_flags, preferred_zone, 2131 alloc_flags & ~ALLOC_NO_WATERMARKS,
2047 migratetype); 2132 preferred_zone, migratetype);
2048 if (page) { 2133 if (page) {
2049 preferred_zone->compact_considered = 0; 2134 preferred_zone->compact_considered = 0;
2050 preferred_zone->compact_defer_shift = 0; 2135 preferred_zone->compact_defer_shift = 0;
@@ -2079,23 +2164,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2079 struct zonelist *zonelist, enum zone_type high_zoneidx, 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2080 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2081 int migratetype, bool sync_migration, 2166 int migratetype, bool sync_migration,
2082 bool *deferred_compaction, 2167 bool *contended_compaction, bool *deferred_compaction,
2083 unsigned long *did_some_progress) 2168 unsigned long *did_some_progress)
2084{ 2169{
2085 return NULL; 2170 return NULL;
2086} 2171}
2087#endif /* CONFIG_COMPACTION */ 2172#endif /* CONFIG_COMPACTION */
2088 2173
2089/* The really slow allocator path where we enter direct reclaim */ 2174/* Perform direct synchronous page reclaim */
2090static inline struct page * 2175static int
2091__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2176__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2092 struct zonelist *zonelist, enum zone_type high_zoneidx, 2177 nodemask_t *nodemask)
2093 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2094 int migratetype, unsigned long *did_some_progress)
2095{ 2178{
2096 struct page *page = NULL;
2097 struct reclaim_state reclaim_state; 2179 struct reclaim_state reclaim_state;
2098 bool drained = false; 2180 int progress;
2099 2181
2100 cond_resched(); 2182 cond_resched();
2101 2183
@@ -2106,7 +2188,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2106 reclaim_state.reclaimed_slab = 0; 2188 reclaim_state.reclaimed_slab = 0;
2107 current->reclaim_state = &reclaim_state; 2189 current->reclaim_state = &reclaim_state;
2108 2190
2109 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2191 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2110 2192
2111 current->reclaim_state = NULL; 2193 current->reclaim_state = NULL;
2112 lockdep_clear_current_reclaim_state(); 2194 lockdep_clear_current_reclaim_state();
@@ -2114,6 +2196,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2114 2196
2115 cond_resched(); 2197 cond_resched();
2116 2198
2199 return progress;
2200}
2201
2202/* The really slow allocator path where we enter direct reclaim */
2203static inline struct page *
2204__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2205 struct zonelist *zonelist, enum zone_type high_zoneidx,
2206 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2207 int migratetype, unsigned long *did_some_progress)
2208{
2209 struct page *page = NULL;
2210 bool drained = false;
2211
2212 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2213 nodemask);
2117 if (unlikely(!(*did_some_progress))) 2214 if (unlikely(!(*did_some_progress)))
2118 return NULL; 2215 return NULL;
2119 2216
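
Direct reclaim is split above into __perform_reclaim(), which only runs reclaim and its bookkeeping, and __alloc_pages_direct_reclaim(), which retries the allocation and drains the per-cpu lists once if the first retry misses. The toy sketch below captures that retry-with-one-drain shape; the helpers are stand-ins, not the kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: reclaim makes progress, but the first retry still misses
 * because the freed pages sit on per-cpu lists until they are drained. */
static int  perform_reclaim(void)   { return 1; }
static bool try_alloc(bool drained) { return drained; }
static void drain_all_pages(void)   { puts("  draining per-cpu pages"); }

/*
 * Shape of __alloc_pages_direct_reclaim() after the split: reclaim once via
 * the helper, retry the allocation, and if it still fails drain the per-cpu
 * lists and retry exactly once more.
 */
static bool direct_reclaim_alloc(void)
{
	bool drained = false;

	if (!perform_reclaim())
		return false;
retry:
	if (try_alloc(drained))
		return true;
	if (!drained) {
		drain_all_pages();
		drained = true;
		goto retry;
	}
	return false;
}

int main(void)
{
	printf("allocation %s\n", direct_reclaim_alloc() ? "succeeded" : "failed");
	return 0;
}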
@@ -2124,8 +2221,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2124retry: 2221retry:
2125 page = get_page_from_freelist(gfp_mask, nodemask, order, 2222 page = get_page_from_freelist(gfp_mask, nodemask, order,
2126 zonelist, high_zoneidx, 2223 zonelist, high_zoneidx,
2127 alloc_flags, preferred_zone, 2224 alloc_flags & ~ALLOC_NO_WATERMARKS,
2128 migratetype); 2225 preferred_zone, migratetype);
2129 2226
2130 /* 2227 /*
2131 * If an allocation failed after direct reclaim, it could be because 2228 * If an allocation failed after direct reclaim, it could be because
@@ -2209,15 +2306,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2209 alloc_flags |= ALLOC_HARDER; 2306 alloc_flags |= ALLOC_HARDER;
2210 2307
2211 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2308 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2212 if (!in_interrupt() && 2309 if (gfp_mask & __GFP_MEMALLOC)
2213 ((current->flags & PF_MEMALLOC) || 2310 alloc_flags |= ALLOC_NO_WATERMARKS;
2214 unlikely(test_thread_flag(TIF_MEMDIE)))) 2311 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2312 alloc_flags |= ALLOC_NO_WATERMARKS;
2313 else if (!in_interrupt() &&
2314 ((current->flags & PF_MEMALLOC) ||
2315 unlikely(test_thread_flag(TIF_MEMDIE))))
2215 alloc_flags |= ALLOC_NO_WATERMARKS; 2316 alloc_flags |= ALLOC_NO_WATERMARKS;
2216 } 2317 }
2217 2318
2218 return alloc_flags; 2319 return alloc_flags;
2219} 2320}
2220 2321
2322bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2323{
2324 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2325}
2326
2221static inline struct page * 2327static inline struct page *
2222__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2328__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2223 struct zonelist *zonelist, enum zone_type high_zoneidx, 2329 struct zonelist *zonelist, enum zone_type high_zoneidx,
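
gfp_to_alloc_flags() above grows two new ways to earn ALLOC_NO_WATERMARKS (__GFP_MEMALLOC, and PF_MEMALLOC while serving a softirq), and gfp_pfmemalloc_allowed() simply reports whether a gfp mask would get that flag. The sketch below condenses the decision into a standalone predicate; it deliberately omits the in_interrupt() and TIF_MEMDIE details of the real code, and the flag values are made up.

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC  0x1u	/* illustrative bit values, not the kernel's */
#define GFP_MEMALLOC    0x2u
#define PF_MEMALLOC     0x4u

/*
 * Condensed model of the ALLOC_NO_WATERMARKS decision after this change:
 * __GFP_NOMEMALLOC always refuses, __GFP_MEMALLOC always allows, and a
 * PF_MEMALLOC task (including one running a softirq) also qualifies.
 */
static bool no_watermarks_allowed(unsigned int gfp, unsigned int task_flags,
				  bool in_softirq)
{
	if (gfp & GFP_NOMEMALLOC)
		return false;
	if (gfp & GFP_MEMALLOC)
		return true;
	if (in_softirq && (task_flags & PF_MEMALLOC))
		return true;
	return (task_flags & PF_MEMALLOC) != 0;	/* process-context reclaimer */
}

int main(void)
{
	printf("__GFP_MEMALLOC:         %d\n", no_watermarks_allowed(GFP_MEMALLOC, 0, false));
	printf("softirq w/ PF_MEMALLOC: %d\n", no_watermarks_allowed(0, PF_MEMALLOC, true));
	printf("plain allocation:       %d\n", no_watermarks_allowed(0, 0, false));
	return 0;
}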
@@ -2231,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2231 unsigned long did_some_progress; 2337 unsigned long did_some_progress;
2232 bool sync_migration = false; 2338 bool sync_migration = false;
2233 bool deferred_compaction = false; 2339 bool deferred_compaction = false;
2340 bool contended_compaction = false;
2234 2341
2235 /* 2342 /*
2236 * In the slowpath, we sanity check order to avoid ever trying to 2343 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2284,11 +2391,19 @@ rebalance:
2284 2391
2285 /* Allocate without watermarks if the context allows */ 2392 /* Allocate without watermarks if the context allows */
2286 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2393 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2394 /*
2395 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2396 * the allocation is high priority and these types of
2397 * allocations are system rather than user oriented
2398 */
2399 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2400
2287 page = __alloc_pages_high_priority(gfp_mask, order, 2401 page = __alloc_pages_high_priority(gfp_mask, order,
2288 zonelist, high_zoneidx, nodemask, 2402 zonelist, high_zoneidx, nodemask,
2289 preferred_zone, migratetype); 2403 preferred_zone, migratetype);
2290 if (page) 2404 if (page) {
2291 goto got_pg; 2405 goto got_pg;
2406 }
2292 } 2407 }
2293 2408
2294 /* Atomic allocations - we can't balance anything */ 2409 /* Atomic allocations - we can't balance anything */
@@ -2312,6 +2427,7 @@ rebalance:
2312 nodemask, 2427 nodemask,
2313 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2314 migratetype, sync_migration, 2429 migratetype, sync_migration,
2430 &contended_compaction,
2315 &deferred_compaction, 2431 &deferred_compaction,
2316 &did_some_progress); 2432 &did_some_progress);
2317 if (page) 2433 if (page)
@@ -2321,10 +2437,11 @@ rebalance:
2321 /* 2437 /*
2322 * If compaction is deferred for high-order allocations, it is because 2438 * If compaction is deferred for high-order allocations, it is because
2323 * sync compaction recently failed. If this is the case and the caller 2439
2324 * has requested the system not be heavily disrupted, fail the 2440 * requested a movable allocation that does not heavily disrupt the
2325 * allocation now instead of entering direct reclaim 2441 * system then fail the allocation instead of entering direct reclaim.
2326 */ 2442 */
2327 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2443 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD))
2328 goto nopage; 2445 goto nopage;
2329 2446
2330 /* Try direct reclaim and then allocating */ 2447 /* Try direct reclaim and then allocating */
@@ -2395,6 +2512,7 @@ rebalance:
2395 nodemask, 2512 nodemask,
2396 alloc_flags, preferred_zone, 2513 alloc_flags, preferred_zone,
2397 migratetype, sync_migration, 2514 migratetype, sync_migration,
2515 &contended_compaction,
2398 &deferred_compaction, 2516 &deferred_compaction,
2399 &did_some_progress); 2517 &did_some_progress);
2400 if (page) 2518 if (page)
@@ -2407,8 +2525,8 @@ nopage:
2407got_pg: 2525got_pg:
2408 if (kmemcheck_enabled) 2526 if (kmemcheck_enabled)
2409 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2527 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2410 return page;
2411 2528
2529 return page;
2412} 2530}
2413 2531
2414/* 2532/*
@@ -2974,7 +3092,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2974 user_zonelist_order = oldval; 3092 user_zonelist_order = oldval;
2975 } else if (oldval != user_zonelist_order) { 3093 } else if (oldval != user_zonelist_order) {
2976 mutex_lock(&zonelists_mutex); 3094 mutex_lock(&zonelists_mutex);
2977 build_all_zonelists(NULL); 3095 build_all_zonelists(NULL, NULL);
2978 mutex_unlock(&zonelists_mutex); 3096 mutex_unlock(&zonelists_mutex);
2979 } 3097 }
2980 } 3098 }
@@ -3353,14 +3471,21 @@ static void setup_zone_pageset(struct zone *zone);
3353DEFINE_MUTEX(zonelists_mutex); 3471DEFINE_MUTEX(zonelists_mutex);
3354 3472
3355/* return values int ....just for stop_machine() */ 3473/* return values int ....just for stop_machine() */
3356static __init_refok int __build_all_zonelists(void *data) 3474static int __build_all_zonelists(void *data)
3357{ 3475{
3358 int nid; 3476 int nid;
3359 int cpu; 3477 int cpu;
3478 pg_data_t *self = data;
3360 3479
3361#ifdef CONFIG_NUMA 3480#ifdef CONFIG_NUMA
3362 memset(node_load, 0, sizeof(node_load)); 3481 memset(node_load, 0, sizeof(node_load));
3363#endif 3482#endif
3483
3484 if (self && !node_online(self->node_id)) {
3485 build_zonelists(self);
3486 build_zonelist_cache(self);
3487 }
3488
3364 for_each_online_node(nid) { 3489 for_each_online_node(nid) {
3365 pg_data_t *pgdat = NODE_DATA(nid); 3490 pg_data_t *pgdat = NODE_DATA(nid);
3366 3491
@@ -3405,7 +3530,7 @@ static __init_refok int __build_all_zonelists(void *data)
3405 * Called with zonelists_mutex held always 3530 * Called with zonelists_mutex held always
3406 * unless system_state == SYSTEM_BOOTING. 3531 * unless system_state == SYSTEM_BOOTING.
3407 */ 3532 */
3408void __ref build_all_zonelists(void *data) 3533void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3409{ 3534{
3410 set_zonelist_order(); 3535 set_zonelist_order();
3411 3536
@@ -3417,10 +3542,10 @@ void __ref build_all_zonelists(void *data)
3417 /* we have to stop all cpus to guarantee there is no user 3542 /* we have to stop all cpus to guarantee there is no user
3418 of zonelist */ 3543 of zonelist */
3419#ifdef CONFIG_MEMORY_HOTPLUG 3544#ifdef CONFIG_MEMORY_HOTPLUG
3420 if (data) 3545 if (zone)
3421 setup_zone_pageset((struct zone *)data); 3546 setup_zone_pageset(zone);
3422#endif 3547#endif
3423 stop_machine(__build_all_zonelists, NULL, NULL); 3548 stop_machine(__build_all_zonelists, pgdat, NULL);
3424 /* cpuset refresh routine should be here */ 3549 /* cpuset refresh routine should be here */
3425 } 3550 }
3426 vm_total_pages = nr_free_pagecache_pages(); 3551 vm_total_pages = nr_free_pagecache_pages();
@@ -3690,7 +3815,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3690 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3815 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3691#endif 3816#endif
3692 3817
3693static int zone_batchsize(struct zone *zone) 3818static int __meminit zone_batchsize(struct zone *zone)
3694{ 3819{
3695#ifdef CONFIG_MMU 3820#ifdef CONFIG_MMU
3696 int batch; 3821 int batch;
@@ -3772,7 +3897,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3772 pcp->batch = PAGE_SHIFT * 8; 3897 pcp->batch = PAGE_SHIFT * 8;
3773} 3898}
3774 3899
3775static void setup_zone_pageset(struct zone *zone) 3900static void __meminit setup_zone_pageset(struct zone *zone)
3776{ 3901{
3777 int cpu; 3902 int cpu;
3778 3903
@@ -3845,32 +3970,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3845 return 0; 3970 return 0;
3846} 3971}
3847 3972
3848static int __zone_pcp_update(void *data)
3849{
3850 struct zone *zone = data;
3851 int cpu;
3852 unsigned long batch = zone_batchsize(zone), flags;
3853
3854 for_each_possible_cpu(cpu) {
3855 struct per_cpu_pageset *pset;
3856 struct per_cpu_pages *pcp;
3857
3858 pset = per_cpu_ptr(zone->pageset, cpu);
3859 pcp = &pset->pcp;
3860
3861 local_irq_save(flags);
3862 free_pcppages_bulk(zone, pcp->count, pcp);
3863 setup_pageset(pset, batch);
3864 local_irq_restore(flags);
3865 }
3866 return 0;
3867}
3868
3869void zone_pcp_update(struct zone *zone)
3870{
3871 stop_machine(__zone_pcp_update, zone, NULL);
3872}
3873
3874static __meminit void zone_pcp_init(struct zone *zone) 3973static __meminit void zone_pcp_init(struct zone *zone)
3875{ 3974{
3876 /* 3975 /*
@@ -3886,7 +3985,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3886 zone_batchsize(zone)); 3985 zone_batchsize(zone));
3887} 3986}
3888 3987
3889__meminit int init_currently_empty_zone(struct zone *zone, 3988int __meminit init_currently_empty_zone(struct zone *zone,
3890 unsigned long zone_start_pfn, 3989 unsigned long zone_start_pfn,
3891 unsigned long size, 3990 unsigned long size,
3892 enum memmap_context context) 3991 enum memmap_context context)
@@ -4244,25 +4343,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4244 4343
4245#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4344#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4246 4345
4247/* Return a sensible default order for the pageblock size. */
4248static inline int pageblock_default_order(void)
4249{
4250 if (HPAGE_SHIFT > PAGE_SHIFT)
4251 return HUGETLB_PAGE_ORDER;
4252
4253 return MAX_ORDER-1;
4254}
4255
4256/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4346/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4257static inline void __init set_pageblock_order(unsigned int order) 4347void __init set_pageblock_order(void)
4258{ 4348{
4349 unsigned int order;
4350
4259 /* Check that pageblock_nr_pages has not already been setup */ 4351 /* Check that pageblock_nr_pages has not already been setup */
4260 if (pageblock_order) 4352 if (pageblock_order)
4261 return; 4353 return;
4262 4354
4355 if (HPAGE_SHIFT > PAGE_SHIFT)
4356 order = HUGETLB_PAGE_ORDER;
4357 else
4358 order = MAX_ORDER - 1;
4359
4263 /* 4360 /*
4264 * Assume the largest contiguous order of interest is a huge page. 4361 * Assume the largest contiguous order of interest is a huge page.
4265 * This value may be variable depending on boot parameters on IA64 4362 * This value may be variable depending on boot parameters on IA64 and
4363 * powerpc.
4266 */ 4364 */
4267 pageblock_order = order; 4365 pageblock_order = order;
4268} 4366}
@@ -4270,15 +4368,13 @@ static inline void __init set_pageblock_order(unsigned int order)
4270 4368
4271/* 4369/*
4272 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4370 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4273 * and pageblock_default_order() are unused as pageblock_order is set 4371 * is unused as pageblock_order is set at compile-time. See
4274 * at compile-time. See include/linux/pageblock-flags.h for the values of 4372 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4275 * pageblock_order based on the kernel config 4373 * the kernel config
4276 */ 4374 */
4277static inline int pageblock_default_order(unsigned int order) 4375void __init set_pageblock_order(void)
4278{ 4376{
4279 return MAX_ORDER-1;
4280} 4377}
4281#define set_pageblock_order(x) do {} while (0)
4282 4378
4283#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4379#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4284 4380
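
With CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, set_pageblock_order() now folds the old pageblock_default_order() logic into itself: pick the huge-page order when huge pages are larger than base pages, otherwise the largest buddy order. A standalone sketch of that boot-time choice, with example shift values (4K base pages, 2MB huge pages):

#include <stdio.h>

#define PAGE_SHIFT          12
#define HPAGE_SHIFT         21			/* example: 2MB huge pages */
#define HUGETLB_PAGE_ORDER  (HPAGE_SHIFT - PAGE_SHIFT)
#define MAX_ORDER           11

/* Boot-time choice made by set_pageblock_order() when the huge page size is
 * only known at runtime: group pages at huge-page granularity if huge pages
 * are bigger than base pages, otherwise use the largest buddy order. */
static unsigned int choose_pageblock_order(void)
{
	if (HPAGE_SHIFT > PAGE_SHIFT)
		return HUGETLB_PAGE_ORDER;
	return MAX_ORDER - 1;
}

int main(void)
{
	printf("pageblock_order = %u\n", choose_pageblock_order());
	return 0;
}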
@@ -4287,6 +4383,8 @@ static inline int pageblock_default_order(unsigned int order)
4287 * - mark all pages reserved 4383 * - mark all pages reserved
4288 * - mark all memory queues empty 4384 * - mark all memory queues empty
4289 * - clear the memory bitmaps 4385 * - clear the memory bitmaps
4386 *
4387 * NOTE: pgdat should get zeroed by caller.
4290 */ 4388 */
4291static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4389static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4292 unsigned long *zones_size, unsigned long *zholes_size) 4390 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4297,15 +4395,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4297 int ret; 4395 int ret;
4298 4396
4299 pgdat_resize_init(pgdat); 4397 pgdat_resize_init(pgdat);
4300 pgdat->nr_zones = 0;
4301 init_waitqueue_head(&pgdat->kswapd_wait); 4398 init_waitqueue_head(&pgdat->kswapd_wait);
4302 pgdat->kswapd_max_order = 0; 4399 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4303 pgdat_page_cgroup_init(pgdat); 4400 pgdat_page_cgroup_init(pgdat);
4304 4401
4305 for (j = 0; j < MAX_NR_ZONES; j++) { 4402 for (j = 0; j < MAX_NR_ZONES; j++) {
4306 struct zone *zone = pgdat->node_zones + j; 4403 struct zone *zone = pgdat->node_zones + j;
4307 unsigned long size, realsize, memmap_pages; 4404 unsigned long size, realsize, memmap_pages;
4308 enum lru_list lru;
4309 4405
4310 size = zone_spanned_pages_in_node(nid, j, zones_size); 4406 size = zone_spanned_pages_in_node(nid, j, zones_size);
4311 realsize = size - zone_absent_pages_in_node(nid, j, 4407 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4342,6 +4438,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4342 4438
4343 zone->spanned_pages = size; 4439 zone->spanned_pages = size;
4344 zone->present_pages = realsize; 4440 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4345#ifdef CONFIG_NUMA 4446#ifdef CONFIG_NUMA
4346 zone->node = nid; 4447 zone->node = nid;
4347 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4355,18 +4456,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4355 zone->zone_pgdat = pgdat; 4456 zone->zone_pgdat = pgdat;
4356 4457
4357 zone_pcp_init(zone); 4458 zone_pcp_init(zone);
4358 for_each_lru(lru) 4459 lruvec_init(&zone->lruvec, zone);
4359 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4360 zone->reclaim_stat.recent_rotated[0] = 0;
4361 zone->reclaim_stat.recent_rotated[1] = 0;
4362 zone->reclaim_stat.recent_scanned[0] = 0;
4363 zone->reclaim_stat.recent_scanned[1] = 0;
4364 zap_zone_vm_stats(zone);
4365 zone->flags = 0;
4366 if (!size) 4460 if (!size)
4367 continue; 4461 continue;
4368 4462
4369 set_pageblock_order(pageblock_default_order()); 4463 set_pageblock_order();
4370 setup_usemap(pgdat, zone, size); 4464 setup_usemap(pgdat, zone, size);
4371 ret = init_currently_empty_zone(zone, zone_start_pfn, 4465 ret = init_currently_empty_zone(zone, zone_start_pfn,
4372 size, MEMMAP_EARLY); 4466 size, MEMMAP_EARLY);
@@ -4422,6 +4516,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4422{ 4516{
4423 pg_data_t *pgdat = NODE_DATA(nid); 4517 pg_data_t *pgdat = NODE_DATA(nid);
4424 4518
4519 /* pg_data_t should be reset to zero when it's allocated */
4520 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4521
4425 pgdat->node_id = nid; 4522 pgdat->node_id = nid;
4426 pgdat->node_start_pfn = node_start_pfn; 4523 pgdat->node_start_pfn = node_start_pfn;
4427 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4524 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4703,7 +4800,7 @@ out:
4703} 4800}
4704 4801
4705/* Any regular memory on that node ? */ 4802/* Any regular memory on that node ? */
4706static void check_for_regular_memory(pg_data_t *pgdat) 4803static void __init check_for_regular_memory(pg_data_t *pgdat)
4707{ 4804{
4708#ifdef CONFIG_HIGHMEM 4805#ifdef CONFIG_HIGHMEM
4709 enum zone_type zone_type; 4806 enum zone_type zone_type;
@@ -4759,31 +4856,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4759 find_zone_movable_pfns_for_nodes(); 4856 find_zone_movable_pfns_for_nodes();
4760 4857
4761 /* Print out the zone ranges */ 4858 /* Print out the zone ranges */
4762 printk("Zone PFN ranges:\n"); 4859 printk("Zone ranges:\n");
4763 for (i = 0; i < MAX_NR_ZONES; i++) { 4860 for (i = 0; i < MAX_NR_ZONES; i++) {
4764 if (i == ZONE_MOVABLE) 4861 if (i == ZONE_MOVABLE)
4765 continue; 4862 continue;
4766 printk(" %-8s ", zone_names[i]); 4863 printk(KERN_CONT " %-8s ", zone_names[i]);
4767 if (arch_zone_lowest_possible_pfn[i] == 4864 if (arch_zone_lowest_possible_pfn[i] ==
4768 arch_zone_highest_possible_pfn[i]) 4865 arch_zone_highest_possible_pfn[i])
4769 printk("empty\n"); 4866 printk(KERN_CONT "empty\n");
4770 else 4867 else
4771 printk("%0#10lx -> %0#10lx\n", 4868 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4772 arch_zone_lowest_possible_pfn[i], 4869 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4773 arch_zone_highest_possible_pfn[i]); 4870 (arch_zone_highest_possible_pfn[i]
4871 << PAGE_SHIFT) - 1);
4774 } 4872 }
4775 4873
4776 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4874 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4777 printk("Movable zone start PFN for each node\n"); 4875 printk("Movable zone start for each node\n");
4778 for (i = 0; i < MAX_NUMNODES; i++) { 4876 for (i = 0; i < MAX_NUMNODES; i++) {
4779 if (zone_movable_pfn[i]) 4877 if (zone_movable_pfn[i])
4780 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 4878 printk(" Node %d: %#010lx\n", i,
4879 zone_movable_pfn[i] << PAGE_SHIFT);
4781 } 4880 }
4782 4881
4783 /* Print out the early_node_map[] */ 4882 /* Print out the early_node_map[] */
4784 printk("Early memory PFN ranges\n"); 4883 printk("Early memory node ranges\n");
4785 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4786 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); 4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4886 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4787 4887
4788 /* Initialise every node */ 4888 /* Initialise every node */
4789 mminit_verify_pageflags_layout(); 4889 mminit_verify_pageflags_layout();
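
The boot messages above move from raw PFNs to inclusive byte ranges: each PFN is shifted up by PAGE_SHIFT and the exclusive end has one subtracted, matching the new "[mem 0x...-0x...]" style. A small sketch of that formatting with made-up ranges:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Print a zone boundary as an inclusive byte range instead of raw PFNs:
 * shift the PFN up and subtract one from the exclusive end. */
static void print_mem_range(const char *name,
			    unsigned long start_pfn, unsigned long end_pfn)
{
	printf(" %-8s [mem %#010lx-%#010lx]\n", name,
	       start_pfn << PAGE_SHIFT,
	       (end_pfn << PAGE_SHIFT) - 1);
}

int main(void)
{
	print_mem_range("DMA",    0x00001, 0x01000);
	print_mem_range("Normal", 0x01000, 0x40000);
	return 0;
}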
@@ -4976,14 +5076,7 @@ static void setup_per_zone_lowmem_reserve(void)
4976 calculate_totalreserve_pages(); 5076 calculate_totalreserve_pages();
4977} 5077}
4978 5078
4979/** 5079static void __setup_per_zone_wmarks(void)
4980 * setup_per_zone_wmarks - called when min_free_kbytes changes
4981 * or when memory is hot-{added|removed}
4982 *
4983 * Ensures that the watermark[min,low,high] values for each zone are set
4984 * correctly with respect to min_free_kbytes.
4985 */
4986void setup_per_zone_wmarks(void)
4987{ 5080{
4988 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5081 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4989 unsigned long lowmem_pages = 0; 5082 unsigned long lowmem_pages = 0;
@@ -5030,6 +5123,11 @@ void setup_per_zone_wmarks(void)
5030 5123
5031 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5124 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5032 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5125 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5126
5127 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5128 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5129 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5130
5033 setup_zone_migrate_reserve(zone); 5131 setup_zone_migrate_reserve(zone);
5034 spin_unlock_irqrestore(&zone->lock, flags); 5132 spin_unlock_irqrestore(&zone->lock, flags);
5035 } 5133 }
@@ -5038,6 +5136,20 @@ void setup_per_zone_wmarks(void)
5038 calculate_totalreserve_pages(); 5136 calculate_totalreserve_pages();
5039} 5137}
5040 5138
5139/**
5140 * setup_per_zone_wmarks - called when min_free_kbytes changes
5141 * or when memory is hot-{added|removed}
5142 *
5143 * Ensures that the watermark[min,low,high] values for each zone are set
5144 * correctly with respect to min_free_kbytes.
5145 */
5146void setup_per_zone_wmarks(void)
5147{
5148 mutex_lock(&zonelists_mutex);
5149 __setup_per_zone_wmarks();
5150 mutex_unlock(&zonelists_mutex);
5151}
5152
5041/* 5153/*
5042 * The inactive anon list should be small enough that the VM never has to 5154 * The inactive anon list should be small enough that the VM never has to
5043 * do too much work, but large enough that each inactive page has a chance 5155 * do too much work, but large enough that each inactive page has a chance
@@ -5203,7 +5315,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5203 int ret; 5315 int ret;
5204 5316
5205 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5317 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5206 if (!write || (ret == -EINVAL)) 5318 if (!write || (ret < 0))
5207 return ret; 5319 return ret;
5208 for_each_populated_zone(zone) { 5320 for_each_populated_zone(zone) {
5209 for_each_possible_cpu(cpu) { 5321 for_each_possible_cpu(cpu) {
@@ -5242,9 +5354,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5242 int flags, 5354 int flags,
5243 unsigned int *_hash_shift, 5355 unsigned int *_hash_shift,
5244 unsigned int *_hash_mask, 5356 unsigned int *_hash_mask,
5245 unsigned long limit) 5357 unsigned long low_limit,
5358 unsigned long high_limit)
5246{ 5359{
5247 unsigned long long max = limit; 5360 unsigned long long max = high_limit;
5248 unsigned long log2qty, size; 5361 unsigned long log2qty, size;
5249 void *table = NULL; 5362 void *table = NULL;
5250 5363
@@ -5282,6 +5395,8 @@ void *__init alloc_large_system_hash(const char *tablename,
5282 } 5395 }
5283 max = min(max, 0x80000000ULL); 5396 max = min(max, 0x80000000ULL);
5284 5397
5398 if (numentries < low_limit)
5399 numentries = low_limit;
5285 if (numentries > max) 5400 if (numentries > max)
5286 numentries = max; 5401 numentries = max;
5287 5402
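
The widened alloc_large_system_hash() signature above adds a low_limit alongside the old upper limit, so early-boot callers can enforce a floor as well as a ceiling on the table size. A minimal caller sketch, assuming a kernel context; the table name, scale and bounds are invented for illustration:

	static unsigned int demo_hash_shift __initdata;
	static unsigned int demo_hash_mask __initdata;

	static void __init demo_hash_init(void)
	{
		struct hlist_head *table;

		table = alloc_large_system_hash("demo-cache",
						sizeof(struct hlist_head),
						0,		/* size from total memory */
						14,		/* scale shift */
						0,		/* flags */
						&demo_hash_shift,
						&demo_hash_mask,
						256,		/* new: at least 256 buckets */
						64 * 1024);	/* at most 64K buckets */
		/* alloc_large_system_hash() panics on failure, so table is usable here. */
		(void)table;
	}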
@@ -5403,24 +5518,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5403} 5518}
5404 5519
5405/* 5520/*
5406 * This is designed as sub function...plz see page_isolation.c also. 5521 * This function checks whether pageblock includes unmovable pages or not.
5407 * set/clear page block's type to be ISOLATE. 5522 * If @count is not zero, it is okay to include less @count unmovable pages
5408 * page allocater never alloc memory from ISOLATE block. 5523 *
5524 * PageLRU check wihtout isolation or lru_lock could race so that
5525 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5526 * expect this function should be exact.
5409 */ 5527 */
5410 5528bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5411static int
5412__count_immobile_pages(struct zone *zone, struct page *page, int count)
5413{ 5529{
5414 unsigned long pfn, iter, found; 5530 unsigned long pfn, iter, found;
5531 int mt;
5532
5415 /* 5533 /*
5416 * For avoiding noise data, lru_add_drain_all() should be called 5534 * To avoid noisy data, lru_add_drain_all() should be called
5417 * If ZONE_MOVABLE, the zone never contains immobile pages 5535 * If ZONE_MOVABLE, the zone never contains unmovable pages
5418 */ 5536 */
5419 if (zone_idx(zone) == ZONE_MOVABLE) 5537 if (zone_idx(zone) == ZONE_MOVABLE)
5420 return true; 5538 return false;
5421 5539 mt = get_pageblock_migratetype(page);
5422 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) 5540 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5423 return true; 5541 return false;
5424 5542
5425 pfn = page_to_pfn(page); 5543 pfn = page_to_pfn(page);
5426 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5544 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5430,11 +5548,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5430 continue; 5548 continue;
5431 5549
5432 page = pfn_to_page(check); 5550 page = pfn_to_page(check);
5433 if (!page_count(page)) { 5551 /*
5552 * We can't use page_count without pinning the page
5553 * because another CPU can free the compound page.
5554 * This check already skips compound tails of THP
5555 * because their page->_count is zero at all times.
5556 */
5557 if (!atomic_read(&page->_count)) {
5434 if (PageBuddy(page)) 5558 if (PageBuddy(page))
5435 iter += (1 << page_order(page)) - 1; 5559 iter += (1 << page_order(page)) - 1;
5436 continue; 5560 continue;
5437 } 5561 }
5562
5438 if (!PageLRU(page)) 5563 if (!PageLRU(page))
5439 found++; 5564 found++;
5440 /* 5565 /*
@@ -5451,9 +5576,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5451 * page at boot. 5576 * page at boot.
5452 */ 5577 */
5453 if (found > count) 5578 if (found > count)
5454 return false; 5579 return true;
5455 } 5580 }
5456 return true; 5581 return false;
5457} 5582}
5458 5583
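
The helper above inverts the convention of the old __count_immobile_pages(): it now answers "does this pageblock contain unmovable pages?", so true means isolation should fail. A one-line caller sketch, assuming a kernel context where the zone and page are already known:

	/* Sketch: callers must negate the result, as is_pageblock_removable_nolock() now does. */
	static bool pageblock_is_isolatable(struct zone *zone, struct page *page)
	{
		return !has_unmovable_pages(zone, page, 0);
	}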
5459bool is_pageblock_removable_nolock(struct page *page) 5584bool is_pageblock_removable_nolock(struct page *page)
@@ -5477,80 +5602,304 @@ bool is_pageblock_removable_nolock(struct page *page)
5477 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5602 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5478 return false; 5603 return false;
5479 5604
5480 return __count_immobile_pages(zone, page, 0); 5605 return !has_unmovable_pages(zone, page, 0);
5606}
5607
5608#ifdef CONFIG_CMA
5609
5610static unsigned long pfn_max_align_down(unsigned long pfn)
5611{
5612 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5613 pageblock_nr_pages) - 1);
5481} 5614}
5482 5615
5483int set_migratetype_isolate(struct page *page) 5616static unsigned long pfn_max_align_up(unsigned long pfn)
5484{ 5617{
5485 struct zone *zone; 5618 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5486 unsigned long flags, pfn; 5619 pageblock_nr_pages));
5487 struct memory_isolate_notify arg; 5620}
5488 int notifier_ret;
5489 int ret = -EBUSY;
5490 5621
5491 zone = page_zone(page); 5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5492 5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5636{
5637 /* This function is based on compact_zone() from compaction.c. */
5638
5639 unsigned long pfn = start;
5640 unsigned int tries = 0;
5641 int ret = 0;
5642
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local();
5652
5653 while (pfn < end || !list_empty(&cc.migratepages)) {
5654 if (fatal_signal_pending(current)) {
5655 ret = -EINTR;
5656 break;
5657 }
5658
5659 if (list_empty(&cc.migratepages)) {
5660 cc.nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc,
5662 pfn, end);
5663 if (!pfn) {
5664 ret = -EINTR;
5665 break;
5666 }
5667 tries = 0;
5668 } else if (++tries == 5) {
5669 ret = ret < 0 ? ret : -EBUSY;
5670 break;
5671 }
5672
5673 ret = migrate_pages(&cc.migratepages,
5674 __alloc_contig_migrate_alloc,
5675 0, false, MIGRATE_SYNC);
5676 }
5677
5678 putback_lru_pages(&cc.migratepages);
5679 return ret > 0 ? 0 : ret;
5680}
5681
5682/*
5683 * Update zone's cma pages counter used for watermark level calculation.
5684 */
5685static inline void __update_cma_watermarks(struct zone *zone, int count)
5686{
5687 unsigned long flags;
5493 spin_lock_irqsave(&zone->lock, flags); 5688 spin_lock_irqsave(&zone->lock, flags);
5689 zone->min_cma_pages += count;
5690 spin_unlock_irqrestore(&zone->lock, flags);
5691 setup_per_zone_wmarks();
5692}
5494 5693
5495 pfn = page_to_pfn(page); 5694/*
5496 arg.start_pfn = pfn; 5695 * Trigger memory pressure bump to reclaim some pages in order to be able to
5497 arg.nr_pages = pageblock_nr_pages; 5696 * allocate 'count' pages in single page units. Does similar work to
5498 arg.pages_found = 0; 5697 * __alloc_pages_slowpath().
5698 */
5699static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5700{
5701 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5702 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5703 int did_some_progress = 0;
5704 int order = 1;
5499 5705
5500 /* 5706 /*
5501 * It may be possible to isolate a pageblock even if the 5707 * Increase the watermark levels to force kswapd to do its job
5502 * migratetype is not MIGRATE_MOVABLE. The memory isolation 5708 * and stabilise at the new watermark level.
5503 * notifier chain is used by balloon drivers to return the
5504 * number of pages in a range that are held by the balloon
5505 * driver to shrink memory. If all the pages are accounted for
5506 * by balloons, are free, or on the LRU, isolation can continue.
5507 * Later, for example, when memory hotplug notifier runs, these
5508 * pages reported as "can be isolated" should be isolated(freed)
5509 * by the balloon driver through the memory notifier chain.
5510 */ 5709 */
5511 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5710 __update_cma_watermarks(zone, count);
5512 notifier_ret = notifier_to_errno(notifier_ret); 5711
5513 if (notifier_ret) 5712 /* Obey watermarks as if the page was being allocated */
5514 goto out; 5713 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5714 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5715
5716 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5717 NULL);
5718 if (!did_some_progress) {
5719 /* Exhausted what can be done so it's blamo time */
5720 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5721 }
5722 }
5723
5724 /* Restore original watermark levels. */
5725 __update_cma_watermarks(zone, -count);
5726
5727 return count;
5728}
5729
5730/**
5731 * alloc_contig_range() -- tries to allocate given range of pages
5732 * @start: start PFN to allocate
5733 * @end: one-past-the-last PFN to allocate
5734 * @migratetype: migratetype of the underlying pageblocks (either
5735 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5736 * in range must have the same migratetype and it must
5737 * be either of the two.
5738 *
5739 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5740 * aligned, however it's the caller's responsibility to guarantee that
5741 * we are the only thread that changes migrate type of pageblocks the
5742 * pages fall in.
5743 *
5744 * The PFN range must belong to a single zone.
5745 *
5746 * Returns zero on success or negative error code. On success all
5747 * pages whose PFN is in [start, end) are allocated for the caller and
5748 * need to be freed with free_contig_range().
5749 */
5750int alloc_contig_range(unsigned long start, unsigned long end,
5751 unsigned migratetype)
5752{
5753 struct zone *zone = page_zone(pfn_to_page(start));
5754 unsigned long outer_start, outer_end;
5755 int ret = 0, order;
5756
5515 /* 5757 /*
5516 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 5758 * What we do here is we mark all pageblocks in range as
5517 * We just check MOVABLE pages. 5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5760 * have different sizes, and due to the way the page allocator
5761 * works, we align the range to the bigger of the two so
5762 * that the page allocator won't try to merge buddies from
5763 * different pageblocks and change MIGRATE_ISOLATE to some
5764 * other migration type.
5765 *
5766 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5767 * migrate the pages from an unaligned range (ie. pages that
5768 * we are interested in). This will put all the pages in
5769 * range back to page allocator as MIGRATE_ISOLATE.
5770 *
5771 * When this is done, we take the pages in range from page
5772 * allocator removing them from the buddy system. This way
5773 * page allocator will never consider using them.
5774 *
5775 * This lets us mark the pageblocks back as
5776 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5777 * aligned range but not in the unaligned, original range are
5778 * put back to page allocator so that buddy can use them.
5518 */ 5779 */
5519 if (__count_immobile_pages(zone, page, arg.pages_found)) 5780
5520 ret = 0; 5781 ret = start_isolate_page_range(pfn_max_align_down(start),
5782 pfn_max_align_up(end), migratetype);
5783 if (ret)
5784 goto done;
5785
5786 ret = __alloc_contig_migrate_range(start, end);
5787 if (ret)
5788 goto done;
5521 5789
5522 /* 5790 /*
5523 * immobile means "not-on-lru" paes. If immobile is larger than 5791 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5524 * removable-by-driver pages reported by notifier, we'll fail. 5792 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5793 * more, all pages in [start, end) are free in page allocator.
5794 * What we are going to do is to allocate all pages from
5795 * [start, end) (that is remove them from page allocator).
5796 *
5797 * The only problem is that pages at the beginning and at the
5798 * end of the interesting range may not be aligned with pages that
5799 * the page allocator holds, i.e. they can be part of higher-order
5800 * pages. Because of this, we reserve the bigger range and
5801 * once this is done free the pages we are not interested in.
5802 *
5803 * We don't have to hold zone->lock here because the pages are
5804 * isolated thus they won't get removed from buddy.
5525 */ 5805 */
5526 5806
5527out: 5807 lru_add_drain_all();
5528 if (!ret) { 5808 drain_all_pages();
5529 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5809
5530 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5810 order = 0;
5811 outer_start = start;
5812 while (!PageBuddy(pfn_to_page(outer_start))) {
5813 if (++order >= MAX_ORDER) {
5814 ret = -EBUSY;
5815 goto done;
5816 }
5817 outer_start &= ~0UL << order;
5531 } 5818 }
5532 5819
5533 spin_unlock_irqrestore(&zone->lock, flags); 5820 /* Make sure the range is really isolated. */
5534 if (!ret) 5821 if (test_pages_isolated(outer_start, end)) {
5535 drain_all_pages(); 5822 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5823 outer_start, end);
5824 ret = -EBUSY;
5825 goto done;
5826 }
5827
5828 /*
5829 * Reclaim enough pages to make sure that contiguous allocation
5830 * will not starve the system.
5831 */
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833
5834 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end);
5836 if (!outer_end) {
5837 ret = -EBUSY;
5838 goto done;
5839 }
5840
5841 /* Free head and tail (if any) */
5842 if (start != outer_start)
5843 free_contig_range(outer_start, start - outer_start);
5844 if (end != outer_end)
5845 free_contig_range(end, outer_end - end);
5846
5847done:
5848 undo_isolate_page_range(pfn_max_align_down(start),
5849 pfn_max_align_up(end), migratetype);
5536 return ret; 5850 return ret;
5537} 5851}
5538 5852
5539void unset_migratetype_isolate(struct page *page) 5853void free_contig_range(unsigned long pfn, unsigned nr_pages)
5540{ 5854{
5541 struct zone *zone; 5855 for (; nr_pages--; ++pfn)
5542 unsigned long flags; 5856 __free_page(pfn_to_page(pfn));
5543 zone = page_zone(page); 5857}
5544 spin_lock_irqsave(&zone->lock, flags); 5858#endif
5545 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 5859
5546 goto out; 5860#ifdef CONFIG_MEMORY_HOTPLUG
5547 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5861static int __meminit __zone_pcp_update(void *data)
5548 move_freepages_block(zone, page, MIGRATE_MOVABLE); 5862{
5549out: 5863 struct zone *zone = data;
5550 spin_unlock_irqrestore(&zone->lock, flags); 5864 int cpu;
5865 unsigned long batch = zone_batchsize(zone), flags;
5866
5867 for_each_possible_cpu(cpu) {
5868 struct per_cpu_pageset *pset;
5869 struct per_cpu_pages *pcp;
5870
5871 pset = per_cpu_ptr(zone->pageset, cpu);
5872 pcp = &pset->pcp;
5873
5874 local_irq_save(flags);
5875 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp);
5877 setup_pageset(pset, batch);
5878 local_irq_restore(flags);
5879 }
5880 return 0;
5551} 5881}
5552 5882
5883void __meminit zone_pcp_update(struct zone *zone)
5884{
5885 stop_machine(__zone_pcp_update, zone, NULL);
5886}
5887#endif
5888
5553#ifdef CONFIG_MEMORY_HOTREMOVE 5889#ifdef CONFIG_MEMORY_HOTREMOVE
5890void zone_pcp_reset(struct zone *zone)
5891{
5892 unsigned long flags;
5893
5894 /* avoid races with drain_pages() */
5895 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) {
5897 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset;
5899 }
5900 local_irq_restore(flags);
5901}
5902
5554/* 5903/*
5555 * All pages in the range must be isolated before calling this. 5904 * All pages in the range must be isolated before calling this.
5556 */ 5905 */
@@ -5618,7 +5967,7 @@ bool is_free_buddy_page(struct page *page)
5618} 5967}
5619#endif 5968#endif
5620 5969
5621static struct trace_print_flags pageflag_names[] = { 5970static const struct trace_print_flags pageflag_names[] = {
5622 {1UL << PG_locked, "locked" }, 5971 {1UL << PG_locked, "locked" },
5623 {1UL << PG_error, "error" }, 5972 {1UL << PG_error, "error" },
5624 {1UL << PG_referenced, "referenced" }, 5973 {1UL << PG_referenced, "referenced" },
@@ -5653,7 +6002,9 @@ static struct trace_print_flags pageflag_names[] = {
5653#ifdef CONFIG_MEMORY_FAILURE 6002#ifdef CONFIG_MEMORY_FAILURE
5654 {1UL << PG_hwpoison, "hwpoison" }, 6003 {1UL << PG_hwpoison, "hwpoison" },
5655#endif 6004#endif
5656 {-1UL, NULL }, 6005#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6006 {1UL << PG_compound_lock, "compound_lock" },
6007#endif
5657}; 6008};
5658 6009
5659static void dump_page_flags(unsigned long flags) 6010static void dump_page_flags(unsigned long flags)
@@ -5662,12 +6013,14 @@ static void dump_page_flags(unsigned long flags)
5662 unsigned long mask; 6013 unsigned long mask;
5663 int i; 6014 int i;
5664 6015
6016 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6017
5665 printk(KERN_ALERT "page flags: %#lx(", flags); 6018 printk(KERN_ALERT "page flags: %#lx(", flags);
5666 6019
5667 /* remove zone id */ 6020 /* remove zone id */
5668 flags &= (1UL << NR_PAGEFLAGS) - 1; 6021 flags &= (1UL << NR_PAGEFLAGS) - 1;
5669 6022
5670 for (i = 0; pageflag_names[i].name && flags; i++) { 6023 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
5671 6024
5672 mask = pageflag_names[i].mask; 6025 mask = pageflag_names[i].mask;
5673 if ((flags & mask) != mask) 6026 if ((flags & mask) != mask)
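
The alloc_contig_range()/free_contig_range() pair introduced in this hunk is the low-level interface that CMA is built on. A minimal usage sketch, assuming CONFIG_CMA and a caller that already owns a MIGRATE_CMA-marked PFN range inside a single zone; base_pfn and nr_pages are placeholders:

	/* Sketch: grab nr_pages physically contiguous pages from a CMA region. */
	static struct page *grab_contig_pages(unsigned long base_pfn,
					      unsigned long nr_pages)
	{
		/* Every pageblock in [base_pfn, base_pfn + nr_pages) must be MIGRATE_CMA. */
		if (alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA))
			return NULL;	/* -EINTR, -EBUSY or a migration failure */

		return pfn_to_page(base_pfn);
	}

	static void release_contig_pages(struct page *page, unsigned long nr_pages)
	{
		/* Pages obtained from alloc_contig_range() must be returned this way. */
		free_contig_range(page_to_pfn(page), nr_pages);
	}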
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
392 392
393/** 393/**
394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
395 * @end: swap entry to be cmpxchged 395 * @ent: swap entry to be cmpxchged
396 * @old: old id 396 * @old: old id
397 * @new: new id 397 * @new: new id
398 * 398 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
422/** 422/**
423 * swap_cgroup_record - record mem_cgroup for this swp_entry. 423 * swap_cgroup_record - record mem_cgroup for this swp_entry.
424 * @ent: swap entry to be recorded into 424 * @ent: swap entry to be recorded into
425 * @mem: mem_cgroup to be recorded 425 * @id: mem_cgroup to be recorded
426 * 426 *
427 * Returns old value at success, 0 at failure. 427 * Returns old value at success, 0 at failure.
428 * (Of course, old value can be 0.) 428 * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,7 +17,9 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h>
21#include <asm/pgtable.h> 23#include <asm/pgtable.h>
22 24
23static struct bio *get_swap_bio(gfp_t gfp_flags, 25static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
85 bio_put(bio); 87 bio_put(bio);
86} 88}
87 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
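
generic_swapfile_activate() only builds extents over runs of blocks that are both contiguous on disk and PAGE_SIZE-aligned; anything else bumps probe_block and tries again, and a hole aborts the whole scan. The standalone program below mirrors that rule in plain userspace C so the extent-building logic can be tried outside the kernel; the bmap[] array is a made-up stand-in for bmap():

#include <stdio.h>

#define PAGE_SHIFT	12
#define BLKBITS		10			/* assume 1KiB filesystem blocks */
#define BLOCKS_PER_PAGE	(1U << (PAGE_SHIFT - BLKBITS))

/* Hypothetical file-block -> disk-block map (0 would mean a hole). */
static const unsigned long bmap[] = { 64, 65, 66, 67, 72, 73, 74, 75, 90, 91, 92, 93 };

int main(void)
{
	unsigned long probe = 0, nblocks = sizeof(bmap) / sizeof(bmap[0]);

	while (probe + BLOCKS_PER_PAGE <= nblocks) {
		unsigned long first = bmap[probe], i;

		/* A hole or a misaligned start disqualifies this position. */
		if (first == 0 || (first & (BLOCKS_PER_PAGE - 1))) {
			probe++;
			continue;
		}
		/* Every remaining block of the page must follow contiguously. */
		for (i = 1; i < BLOCKS_PER_PAGE; i++)
			if (bmap[probe + i] != first + i)
				break;
		if (i == BLOCKS_PER_PAGE) {
			printf("file page %lu -> swap extent at disk page %lu\n",
			       probe / BLOCKS_PER_PAGE,
			       first >> (PAGE_SHIFT - BLKBITS));
			probe += BLOCKS_PER_PAGE;
		} else {
			probe++;
		}
	}
	return 0;
}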
88/* 182/*
89 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
90 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
93{ 187{
94 struct bio *bio; 188 struct bio *bio;
95 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
96 191
97 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
98 unlock_page(page); 193 unlock_page(page);
99 goto out; 194 goto out;
100 } 195 }
196 if (frontswap_store(page) == 0) {
197 set_page_writeback(page);
198 unlock_page(page);
199 end_page_writeback(page);
200 goto out;
201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 230 if (bio == NULL) {
103 set_page_dirty(page); 231 set_page_dirty(page);
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page)
119{ 247{
120 struct bio *bio; 248 struct bio *bio;
121 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
122 251
123 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
254 if (frontswap_load(page) == 0) {
255 SetPageUptodate(page);
256 unlock_page(page);
257 goto out;
258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 271 if (bio == NULL) {
127 unlock_page(page); 272 unlock_page(page);
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page)
133out: 278out:
134 return ret; 279 return ret;
135} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
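
The SWP_FILE branches in swap_writepage() and swap_readpage() above hand swap I/O to the owning filesystem's direct_IO and readpage methods instead of raw bios. A filesystem opts in through an address_space operation; the fragment below is only a sketch, and it assumes the swap_activate hook added elsewhere in this patch series (the hook name and its use here are assumptions, and myfs_* is hypothetical). A plain block-backed filesystem could simply reuse generic_swapfile_activate():

	static int myfs_swap_activate(struct swap_info_struct *sis,
				      struct file *file, sector_t *span)
	{
		/* Build the page-aligned extent map exactly as the generic helper does. */
		return generic_swapfile_activate(sis, file, span);
	}

	static const struct address_space_operations myfs_aops = {
		/* ... the usual readpage/direct_IO/etc. operations ... */
		.swap_activate	= myfs_swap_activate,	/* assumed hook from this series */
	};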
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4ae42bb40892..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
73 * immobile means "not-on-lru" pages. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
@@ -24,6 +117,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
24 * to be MIGRATE_ISOLATE. 117 * to be MIGRATE_ISOLATE.
25 * @start_pfn: The lower PFN of the range to be isolated. 118 * @start_pfn: The lower PFN of the range to be isolated.
26 * @end_pfn: The upper PFN of the range to be isolated. 119 * @end_pfn: The upper PFN of the range to be isolated.
120 * @migratetype: migrate type to set in error recovery.
27 * 121 *
28 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in 122 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
29 * the range will never be allocated. Any free pages and pages freed in the 123 * the range will never be allocated. Any free pages and pages freed in the
@@ -32,8 +126,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
32 * start_pfn/end_pfn must be aligned to pageblock_order. 126 * start_pfn/end_pfn must be aligned to pageblock_order.
33 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 127 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
34 */ 128 */
35int 129int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
36start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 130 unsigned migratetype)
37{ 131{
38 unsigned long pfn; 132 unsigned long pfn;
39 unsigned long undo_pfn; 133 unsigned long undo_pfn;
@@ -56,7 +150,7 @@ undo:
56 for (pfn = start_pfn; 150 for (pfn = start_pfn;
57 pfn < undo_pfn; 151 pfn < undo_pfn;
58 pfn += pageblock_nr_pages) 152 pfn += pageblock_nr_pages)
59 unset_migratetype_isolate(pfn_to_page(pfn)); 153 unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
60 154
61 return -EBUSY; 155 return -EBUSY;
62} 156}
@@ -64,8 +158,8 @@ undo:
64/* 158/*
65 * Make isolated pages available again. 159 * Make isolated pages available again.
66 */ 160 */
67int 161int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
68undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 162 unsigned migratetype)
69{ 163{
70 unsigned long pfn; 164 unsigned long pfn;
71 struct page *page; 165 struct page *page;
@@ -77,7 +171,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
77 page = __first_valid_page(pfn, pageblock_nr_pages); 171 page = __first_valid_page(pfn, pageblock_nr_pages);
78 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 172 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
79 continue; 173 continue;
80 unset_migratetype_isolate(page); 174 unset_migratetype_isolate(page, migratetype);
81 } 175 }
82 return 0; 176 return 0;
83} 177}
@@ -86,7 +180,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 180 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 181 * zone->lock must be held before call this.
88 * 182 *
89 * Returns 1 if all pages in the range is isolated. 183 * Returns 1 if all pages in the range are isolated.
90 */ 184 */
91static int 185static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 186__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
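
Both isolation entry points now take the migratetype to restore, so a caller that isolates pageblocks states up front what they should become again when the operation finishes or fails. A condensed sketch of the resulting pattern, assuming a kernel caller with a pageblock-aligned range; the symbols are placeholders:

	/* Sketch: the isolate -> work -> undo pattern used by CMA and memory hot-remove. */
	static int claim_range(unsigned long start_pfn, unsigned long end_pfn,
			       unsigned migratetype)	/* MIGRATE_MOVABLE or MIGRATE_CMA */
	{
		int ret;

		ret = start_isolate_page_range(start_pfn, end_pfn, migratetype);
		if (ret)
			return ret;	/* partial isolation is rolled back internally */

		/* ... migrate pages away, or take them off the free lists ... */

		/* Whatever happened above, hand the pageblocks back as migratetype. */
		undo_isolate_page_range(start_pfn, end_pfn, migratetype);
		return 0;
	}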
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
162 162
163/** 163/**
164 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
165 * @mm: memory map to walk
166 * @addr: starting address 165 * @addr: starting address
167 * @end: ending address 166 * @end: ending address
168 * @walk: set of callbacks to invoke for each level of the tree 167 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
360 * @chunk: chunk to depopulate 360 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 361 * @off: offset to the area to depopulate
362 * @size: size of the area to depopulate in bytes 362 * @size: size of the area to depopulate in bytes
363 * @flush: whether to flush cache and tlb or not
364 * 363 *
365 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 364 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
366 * from @chunk. If @flush is true, vcache is flushed before unmapping 365 * from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/percpu.c b/mm/percpu.c
index f47af9123af7..bb4be7435ce3 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1132,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
1132 for (alloc_end += gi->nr_units / upa; 1132 for (alloc_end += gi->nr_units / upa;
1133 alloc < alloc_end; alloc++) { 1133 alloc < alloc_end; alloc++) {
1134 if (!(alloc % apl)) { 1134 if (!(alloc % apl)) {
1135 printk("\n"); 1135 printk(KERN_CONT "\n");
1136 printk("%spcpu-alloc: ", lvl); 1136 printk("%spcpu-alloc: ", lvl);
1137 } 1137 }
1138 printk("[%0*d] ", group_width, group); 1138 printk(KERN_CONT "[%0*d] ", group_width, group);
1139 1139
1140 for (unit_end += upa; unit < unit_end; unit++) 1140 for (unit_end += upa; unit < unit_end; unit++)
1141 if (gi->cpu_map[unit] != NR_CPUS) 1141 if (gi->cpu_map[unit] != NR_CPUS)
1142 printk("%0*d ", cpu_width, 1142 printk(KERN_CONT "%0*d ", cpu_width,
1143 gi->cpu_map[unit]); 1143 gi->cpu_map[unit]);
1144 else 1144 else
1145 printk("%s ", empty_str); 1145 printk(KERN_CONT "%s ", empty_str);
1146 } 1146 }
1147 } 1147 }
1148 printk("\n"); 1148 printk(KERN_CONT "\n");
1149} 1149}
1150 1150
1151/** 1151/**
@@ -1650,6 +1650,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1650 areas[group] = ptr; 1650 areas[group] = ptr;
1651 1651
1652 base = min(ptr, base); 1652 base = min(ptr, base);
1653 }
1654
1655 /*
1656 * Copy data and free unused parts. This should happen after all
1657 * allocations are complete; otherwise, we may end up with
1658 * overlapping groups.
1659 */
1660 for (group = 0; group < ai->nr_groups; group++) {
1661 struct pcpu_group_info *gi = &ai->groups[group];
1662 void *ptr = areas[group];
1653 1663
1654 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1664 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1655 if (gi->cpu_map[i] == NR_CPUS) { 1665 if (gi->cpu_map[i] == NR_CPUS) {
@@ -1885,6 +1895,8 @@ void __init setup_per_cpu_areas(void)
1885 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1895 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1886 if (!ai || !fc) 1896 if (!ai || !fc)
1887 panic("Failed to allocate memory for percpu areas."); 1897 panic("Failed to allocate memory for percpu areas.");
1898 /* kmemleak tracks the percpu allocations separately */
1899 kmemleak_free(fc);
1888 1900
1889 ai->dyn_size = unit_size; 1901 ai->dyn_size = unit_size;
1890 ai->unit_size = unit_size; 1902 ai->unit_size = unit_size;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a74fea182f1..74c0ddaa6fa0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
109 109
110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH 110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
111#ifdef CONFIG_TRANSPARENT_HUGEPAGE 111#ifdef CONFIG_TRANSPARENT_HUGEPAGE
112pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, 112void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
113 pmd_t *pmdp) 113 pmd_t *pmdp)
114{ 114{
115 pmd_t pmd = pmd_mksplitting(*pmdp); 115 pmd_t pmd = pmd_mksplitting(*pmdp);
116 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 116 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c2..926b46649749 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid,
371 /* Check iovecs */ 371 /* Check iovecs */
372 if (vm_write) 372 if (vm_write)
373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, 373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
374 iovstack_l, &iov_l, 1); 374 iovstack_l, &iov_l);
375 else 375 else
376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, 376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
377 iovstack_l, &iov_l, 1); 377 iovstack_l, &iov_l);
378 if (rc <= 0) 378 if (rc <= 0)
379 goto free_iovecs; 379 goto free_iovecs;
380 380
381 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, 381 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
382 iovstack_r, &iov_r, 0); 382 iovstack_r, &iov_r);
383 if (rc <= 0) 383 if (rc <= 0)
384 goto free_iovecs; 384 goto free_iovecs;
385 385
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid,
438 if (vm_write) 438 if (vm_write)
439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
440 UIO_FASTIOV, iovstack_l, 440 UIO_FASTIOV, iovstack_l,
441 &iov_l, 1); 441 &iov_l);
442 else 442 else
443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, 443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
444 UIO_FASTIOV, iovstack_l, 444 UIO_FASTIOV, iovstack_l,
445 &iov_l, 1); 445 &iov_l);
446 if (rc <= 0) 446 if (rc <= 0)
447 goto free_iovecs; 447 goto free_iovecs;
448 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, 448 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
449 UIO_FASTIOV, iovstack_r, 449 UIO_FASTIOV, iovstack_r,
450 &iov_r, 0); 450 &iov_r);
451 if (rc <= 0) 451 if (rc <= 0)
452 goto free_iovecs; 452 goto free_iovecs;
453 453
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/syscalls.h>
21#include <linux/file.h>
20 22
21/* 23/*
22 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
562 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 564 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
563} 565}
564EXPORT_SYMBOL_GPL(page_cache_async_readahead); 566EXPORT_SYMBOL_GPL(page_cache_async_readahead);
567
568static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr)
571{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
573 return -EINVAL;
574
575 force_page_cache_readahead(mapping, filp, index, nr);
576 return 0;
577}
578
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{
581 ssize_t ret;
582 struct file *file;
583
584 ret = -EBADF;
585 file = fget(fd);
586 if (file) {
587 if (file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len);
593 }
594 fput(file);
595 }
596 return ret;
597}
598#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
599asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
600{
601 return SYSC_readahead((int) fd, offset, (size_t) count);
602}
603SYSCALL_ALIAS(sys_readahead, SyS_readahead);
604#endif
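
The hunk above places the readahead(2) system call implementation alongside the readahead code it drives; the userspace contract is unchanged. A small self-contained example using the glibc wrapper (Linux-specific, _GNU_SOURCE required; the path is a placeholder):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/var/log/syslog", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * Populate the page cache for the first 1 MiB. As in the kernel code
	 * above, this can fail with EBADF or EINVAL, but otherwise it returns
	 * 0 without waiting for the I/O to complete.
	 */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");
	close(fd);
	return 0;
}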
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
755 pte_unmap_unlock(pte, ptl); 755 pte_unmap_unlock(pte, ptl);
756 } 756 }
757 757
758 /* Pretend the page is referenced if the task has the
759 swap token and is in the middle of a page fault. */
760 if (mm != current->mm && has_swap_token(mm) &&
761 rwsem_is_locked(&mm->mmap_sem))
762 referenced++;
763
764 (*mapcount)--; 758 (*mapcount)--;
765 759
766 if (referenced) 760 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd6..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h> 54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h> 55#include <linux/percpu_counter.h>
56#include <linux/falloc.h>
56#include <linux/splice.h> 57#include <linux/splice.h>
57#include <linux/security.h> 58#include <linux/security.h>
58#include <linux/swapops.h> 59#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
83 char value[0]; 84 char value[0];
84}; 85};
85 86
87/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time):
90 * we would prefer not to enlarge the shmem inode just for that.
91 */
92struct shmem_falloc {
93 pgoff_t start; /* start of range currently being fallocated */
94 pgoff_t next; /* the next page offset to be fallocated */
95 pgoff_t nr_falloced; /* how many new pages have been fallocated */
96 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
97};
98
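
The shmem_falloc bookkeeping above supports fallocate() on tmpfs, both preallocation and FALLOC_FL_PUNCH_HOLE, which the shmem changes in this diff introduce. From userspace it behaves like any other fallocate-capable filesystem. A minimal example, assuming a kernel with these changes, a tmpfs mount at /dev/shm and a glibc that exposes the FALLOC_FL_* flags; the path is a placeholder:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 4 MiB of tmpfs pages up front. */
	if (fallocate(fd, 0, 0, 4 << 20) != 0)
		perror("fallocate");
	/* Punch a hole over the second megabyte; shmem frees those pages. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1 << 20, 1 << 20) != 0)
		perror("fallocate punch");
	close(fd);
	return 0;
}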
86/* Flag allocation requirements to shmem_getpage */ 99/* Flag allocation requirements to shmem_getpage */
87enum sgp_type { 100enum sgp_type {
88 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_READ, /* don't exceed i_size, don't allocate page */
89 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_CACHE, /* don't exceed i_size, may allocate page */
90 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
91 SGP_WRITE, /* may exceed i_size, may allocate page */ 104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
92}; 106};
93 107
94#ifdef CONFIG_TMPFS 108#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
103} 117}
104#endif 118#endif
105 119
120static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
121static int shmem_replace_page(struct page **pagep, gfp_t gfp,
122 struct shmem_inode_info *info, pgoff_t index);
106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 125
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
247} 264}
248 265
249/* 266/*
267 * Sometimes, before we decide whether to proceed or to fail, we must check
268 * that an entry was not already brought back from swap by a racing thread.
269 *
270 * Checking page is not enough: by the time a SwapCache page is locked, it
271 * might be reused, and again be SwapCache, using the same swap as before.
272 */
273static bool shmem_confirm_swap(struct address_space *mapping,
274 pgoff_t index, swp_entry_t swap)
275{
276 void *item;
277
278 rcu_read_lock();
279 item = radix_tree_lookup(&mapping->page_tree, index);
280 rcu_read_unlock();
281 return item == swp_to_radix_entry(swap);
282}
283
284/*
250 * Like add_to_page_cache_locked, but error if expected item has gone. 285 * Like add_to_page_cache_locked, but error if expected item has gone.
251 */ 286 */
252static int shmem_add_to_page_cache(struct page *page, 287static int shmem_add_to_page_cache(struct page *page,
253 struct address_space *mapping, 288 struct address_space *mapping,
254 pgoff_t index, gfp_t gfp, void *expected) 289 pgoff_t index, gfp_t gfp, void *expected)
255{ 290{
256 int error = 0; 291 int error;
257 292
258 VM_BUG_ON(!PageLocked(page)); 293 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page)); 294 VM_BUG_ON(!PageSwapBacked(page));
260 295
296 page_cache_get(page);
297 page->mapping = mapping;
298 page->index = index;
299
300 spin_lock_irq(&mapping->tree_lock);
261 if (!expected) 301 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 302 error = radix_tree_insert(&mapping->page_tree, index, page);
303 else
304 error = shmem_radix_tree_replace(mapping, index, expected,
305 page);
263 if (!error) { 306 if (!error) {
264 page_cache_get(page); 307 mapping->nrpages++;
265 page->mapping = mapping; 308 __inc_zone_page_state(page, NR_FILE_PAGES);
266 page->index = index; 309 __inc_zone_page_state(page, NR_SHMEM);
267 310 spin_unlock_irq(&mapping->tree_lock);
268 spin_lock_irq(&mapping->tree_lock); 311 } else {
269 if (!expected) 312 page->mapping = NULL;
270 error = radix_tree_insert(&mapping->page_tree, 313 spin_unlock_irq(&mapping->tree_lock);
271 index, page); 314 page_cache_release(page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
284 }
285 if (!expected)
286 radix_tree_preload_end();
287 } 315 }
288 if (error)
289 mem_cgroup_uncharge_cache_page(page);
290 return error; 316 return error;
291} 317}
292 318
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
423 449
424/* 450/*
425 * Remove range of pages and swap entries from radix tree, and free them. 451 * Remove range of pages and swap entries from radix tree, and free them.
452 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
426 */ 453 */
427void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 454static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
455 bool unfalloc)
428{ 456{
429 struct address_space *mapping = inode->i_mapping; 457 struct address_space *mapping = inode->i_mapping;
430 struct shmem_inode_info *info = SHMEM_I(inode); 458 struct shmem_inode_info *info = SHMEM_I(inode);
431 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 459 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
432 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 460 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
433 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 461 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
462 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
434 struct pagevec pvec; 463 struct pagevec pvec;
435 pgoff_t indices[PAGEVEC_SIZE]; 464 pgoff_t indices[PAGEVEC_SIZE];
436 long nr_swaps_freed = 0; 465 long nr_swaps_freed = 0;
437 pgoff_t index; 466 pgoff_t index;
438 int i; 467 int i;
439 468
440 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 469 if (lend == -1)
470 end = -1; /* unsigned, so actually very big */
441 471
442 pagevec_init(&pvec, 0); 472 pagevec_init(&pvec, 0);
443 index = start; 473 index = start;
444 while (index <= end) { 474 while (index < end) {
445 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 475 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
446 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 476 min(end - index, (pgoff_t)PAGEVEC_SIZE),
447 pvec.pages, indices); 477 pvec.pages, indices);
448 if (!pvec.nr) 478 if (!pvec.nr)
449 break; 479 break;
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
452 struct page *page = pvec.pages[i]; 482 struct page *page = pvec.pages[i];
453 483
454 index = indices[i]; 484 index = indices[i];
455 if (index > end) 485 if (index >= end)
456 break; 486 break;
457 487
458 if (radix_tree_exceptional_entry(page)) { 488 if (radix_tree_exceptional_entry(page)) {
489 if (unfalloc)
490 continue;
459 nr_swaps_freed += !shmem_free_swap(mapping, 491 nr_swaps_freed += !shmem_free_swap(mapping,
460 index, page); 492 index, page);
461 continue; 493 continue;
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463 495
464 if (!trylock_page(page)) 496 if (!trylock_page(page))
465 continue; 497 continue;
466 if (page->mapping == mapping) { 498 if (!unfalloc || !PageUptodate(page)) {
467 VM_BUG_ON(PageWriteback(page)); 499 if (page->mapping == mapping) {
468 truncate_inode_page(mapping, page); 500 VM_BUG_ON(PageWriteback(page));
501 truncate_inode_page(mapping, page);
502 }
469 } 503 }
470 unlock_page(page); 504 unlock_page(page);
471 } 505 }
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
476 index++; 510 index++;
477 } 511 }
478 512
479 if (partial) { 513 if (partial_start) {
480 struct page *page = NULL; 514 struct page *page = NULL;
481 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 515 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
482 if (page) { 516 if (page) {
483 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 517 unsigned int top = PAGE_CACHE_SIZE;
518 if (start > end) {
519 top = partial_end;
520 partial_end = 0;
521 }
522 zero_user_segment(page, partial_start, top);
484 set_page_dirty(page); 523 set_page_dirty(page);
485 unlock_page(page); 524 unlock_page(page);
486 page_cache_release(page); 525 page_cache_release(page);
487 } 526 }
488 } 527 }
528 if (partial_end) {
529 struct page *page = NULL;
530 shmem_getpage(inode, end, &page, SGP_READ, NULL);
531 if (page) {
532 zero_user_segment(page, 0, partial_end);
533 set_page_dirty(page);
534 unlock_page(page);
535 page_cache_release(page);
536 }
537 }
538 if (start >= end)
539 return;
489 540
490 index = start; 541 index = start;
491 for ( ; ; ) { 542 for ( ; ; ) {
492 cond_resched(); 543 cond_resched();
493 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 544 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 545 min(end - index, (pgoff_t)PAGEVEC_SIZE),
495 pvec.pages, indices); 546 pvec.pages, indices);
496 if (!pvec.nr) { 547 if (!pvec.nr) {
497 if (index == start) 548 if (index == start || unfalloc)
498 break; 549 break;
499 index = start; 550 index = start;
500 continue; 551 continue;
501 } 552 }
502 if (index == start && indices[0] > end) { 553 if ((index == start || unfalloc) && indices[0] >= end) {
503 shmem_deswap_pagevec(&pvec); 554 shmem_deswap_pagevec(&pvec);
504 pagevec_release(&pvec); 555 pagevec_release(&pvec);
505 break; 556 break;
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
509 struct page *page = pvec.pages[i]; 560 struct page *page = pvec.pages[i];
510 561
511 index = indices[i]; 562 index = indices[i];
512 if (index > end) 563 if (index >= end)
513 break; 564 break;
514 565
515 if (radix_tree_exceptional_entry(page)) { 566 if (radix_tree_exceptional_entry(page)) {
567 if (unfalloc)
568 continue;
516 nr_swaps_freed += !shmem_free_swap(mapping, 569 nr_swaps_freed += !shmem_free_swap(mapping,
517 index, page); 570 index, page);
518 continue; 571 continue;
519 } 572 }
520 573
521 lock_page(page); 574 lock_page(page);
522 if (page->mapping == mapping) { 575 if (!unfalloc || !PageUptodate(page)) {
523 VM_BUG_ON(PageWriteback(page)); 576 if (page->mapping == mapping) {
524 truncate_inode_page(mapping, page); 577 VM_BUG_ON(PageWriteback(page));
578 truncate_inode_page(mapping, page);
579 }
525 } 580 }
526 unlock_page(page); 581 unlock_page(page);
527 } 582 }
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
535 info->swapped -= nr_swaps_freed; 590 info->swapped -= nr_swaps_freed;
536 shmem_recalc_inode(inode); 591 shmem_recalc_inode(inode);
537 spin_unlock(&info->lock); 592 spin_unlock(&info->lock);
593}
538 594
595void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
596{
597 shmem_undo_range(inode, lstart, lend, false);
539 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 598 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
540} 599}
541EXPORT_SYMBOL_GPL(shmem_truncate_range); 600EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode)
597 } 656 }
598 BUG_ON(inode->i_blocks); 657 BUG_ON(inode->i_blocks);
599 shmem_free_inode(inode->i_sb); 658 shmem_free_inode(inode->i_sb);
600 end_writeback(inode); 659 clear_inode(inode);
601} 660}
602 661
603/* 662/*
604 * If swap found in inode, free it and move page from swapcache to filecache. 663 * If swap found in inode, free it and move page from swapcache to filecache.
605 */ 664 */
606static int shmem_unuse_inode(struct shmem_inode_info *info, 665static int shmem_unuse_inode(struct shmem_inode_info *info,
607 swp_entry_t swap, struct page *page) 666 swp_entry_t swap, struct page **pagep)
608{ 667{
609 struct address_space *mapping = info->vfs_inode.i_mapping; 668 struct address_space *mapping = info->vfs_inode.i_mapping;
610 void *radswap; 669 void *radswap;
611 pgoff_t index; 670 pgoff_t index;
612 int error; 671 gfp_t gfp;
672 int error = 0;
613 673
614 radswap = swp_to_radix_entry(swap); 674 radswap = swp_to_radix_entry(swap);
615 index = radix_tree_locate_item(&mapping->page_tree, radswap); 675 index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
625 if (shmem_swaplist.next != &info->swaplist) 685 if (shmem_swaplist.next != &info->swaplist)
626 list_move_tail(&shmem_swaplist, &info->swaplist); 686 list_move_tail(&shmem_swaplist, &info->swaplist);
627 687
688 gfp = mapping_gfp_mask(mapping);
689 if (shmem_should_replace_page(*pagep, gfp)) {
690 mutex_unlock(&shmem_swaplist_mutex);
691 error = shmem_replace_page(pagep, gfp, info, index);
692 mutex_lock(&shmem_swaplist_mutex);
693 /*
694 * We needed to drop mutex to make that restrictive page
695 * allocation, but the inode might have been freed while we
696 * dropped it: although a racing shmem_evict_inode() cannot
697 * complete without emptying the radix_tree, our page lock
698 * on this swapcache page is not enough to prevent that -
699 * free_swap_and_cache() of our swap entry will only
700 * trylock_page(), removing swap from radix_tree whatever.
701 *
702 * We must not proceed to shmem_add_to_page_cache() if the
703 * inode has been freed, but of course we cannot rely on
704 * inode or mapping or info to check that. However, we can
705 * safely check if our swap entry is still in use (and here
706 * it can't have got reused for another page): if it's still
707 * in use, then the inode cannot have been freed yet, and we
708 * can safely proceed (if it's no longer in use, that tells
709 * nothing about the inode, but we don't need to unuse swap).
710 */
711 if (!page_swapcount(*pagep))
712 error = -ENOENT;
713 }
714
628 /* 715 /*
629 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 716 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 * but also to hold up shmem_evict_inode(): so inode cannot be freed 717 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 * beneath us (pagelock doesn't help until the page is in pagecache). 718 * beneath us (pagelock doesn't help until the page is in pagecache).
632 */ 719 */
633 error = shmem_add_to_page_cache(page, mapping, index, 720 if (!error)
721 error = shmem_add_to_page_cache(*pagep, mapping, index,
634 GFP_NOWAIT, radswap); 722 GFP_NOWAIT, radswap);
635 /* which does mem_cgroup_uncharge_cache_page on error */
636
637 if (error != -ENOMEM) { 723 if (error != -ENOMEM) {
638 /* 724 /*
639 * Truncation and eviction use free_swap_and_cache(), which 725 * Truncation and eviction use free_swap_and_cache(), which
640 * only does trylock page: if we raced, best clean up here. 726 * only does trylock page: if we raced, best clean up here.
641 */ 727 */
642 delete_from_swap_cache(page); 728 delete_from_swap_cache(*pagep);
643 set_page_dirty(page); 729 set_page_dirty(*pagep);
644 if (!error) { 730 if (!error) {
645 spin_lock(&info->lock); 731 spin_lock(&info->lock);
646 info->swapped--; 732 info->swapped--;
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
660 struct list_head *this, *next; 746 struct list_head *this, *next;
661 struct shmem_inode_info *info; 747 struct shmem_inode_info *info;
662 int found = 0; 748 int found = 0;
663 int error; 749 int error = 0;
750
751 /*
752 * There's a faint possibility that swap page was replaced before
753 * caller locked it: caller will come back later with the right page.
754 */
755 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
756 goto out;
664 757
665 /* 758 /*
666 * Charge page using GFP_KERNEL while we can wait, before taking 759 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
676 list_for_each_safe(this, next, &shmem_swaplist) { 769 list_for_each_safe(this, next, &shmem_swaplist) {
677 info = list_entry(this, struct shmem_inode_info, swaplist); 770 info = list_entry(this, struct shmem_inode_info, swaplist);
678 if (info->swapped) 771 if (info->swapped)
679 found = shmem_unuse_inode(info, swap, page); 772 found = shmem_unuse_inode(info, swap, &page);
680 else 773 else
681 list_del_init(&info->swaplist); 774 list_del_init(&info->swaplist);
682 cond_resched(); 775 cond_resched();
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
685 } 778 }
686 mutex_unlock(&shmem_swaplist_mutex); 779 mutex_unlock(&shmem_swaplist_mutex);
687 780
688 if (!found)
689 mem_cgroup_uncharge_cache_page(page);
690 if (found < 0) 781 if (found < 0)
691 error = found; 782 error = found;
692out: 783out:
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
727 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 818 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
728 goto redirty; 819 goto redirty;
729 } 820 }
821
822 /*
823 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
824 * value into swapfile.c, the only way we can correctly account for a
825 * fallocated page arriving here is now to initialize it and write it.
826 *
827 * That's okay for a page already fallocated earlier, but if we have
828 * not yet completed the fallocation, then (a) we want to keep track
829 * of this page in case we have to undo it, and (b) it may not be a
830 * good idea to continue anyway, once we're pushing into swap. So
831 * reactivate the page, and let shmem_fallocate() quit when too many.
832 */
833 if (!PageUptodate(page)) {
834 if (inode->i_private) {
835 struct shmem_falloc *shmem_falloc;
836 spin_lock(&inode->i_lock);
837 shmem_falloc = inode->i_private;
838 if (shmem_falloc &&
839 index >= shmem_falloc->start &&
840 index < shmem_falloc->next)
841 shmem_falloc->nr_unswapped++;
842 else
843 shmem_falloc = NULL;
844 spin_unlock(&inode->i_lock);
845 if (shmem_falloc)
846 goto redirty;
847 }
848 clear_highpage(page);
849 flush_dcache_page(page);
850 SetPageUptodate(page);
851 }
852
730 swap = get_swap_page(); 853 swap = get_swap_page();
731 if (!swap.val) 854 if (!swap.val)
732 goto redirty; 855 goto redirty;
@@ -806,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
806 929
807 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
808 pvma.vm_start = 0; 931 pvma.vm_start = 0;
809 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
810 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
811 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
812 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -819,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
819 943
820 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
821 pvma.vm_start = 0; 945 pvma.vm_start = 0;
822 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
823 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
824 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
825 950
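The two hunks above bias NUMA interleave by the inode number so that page 0 of every small tmpfs file does not always land on the same node. A toy userspace model of that idea, assuming a simplified round-robin interleave of (pgoff + bias) modulo the node count (the real mempolicy code is more involved; names here are invented for the demo):

#include <stdio.h>

/* Toy model: interleave picks node = (pgoff + bias) % nr_nodes. */
static unsigned int pick_node(unsigned long pgoff, unsigned long bias,
                              unsigned int nr_nodes)
{
        return (pgoff + bias) % nr_nodes;
}

int main(void)
{
        unsigned int nr_nodes = 4;
        unsigned long ino;

        /* Without the bias, page 0 of every file maps to node 0. */
        for (ino = 100; ino < 104; ino++)
                printf("ino %lu: unbiased node %u, biased node %u\n",
                       ino, pick_node(0, 0, nr_nodes),
                       pick_node(0, ino, nr_nodes));
        return 0;
}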
@@ -856,6 +981,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
856#endif 981#endif
857 982
858/* 983/*
984 * When a page is moved from swapcache to shmem filecache (either by the
985 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
986 * shmem_unuse_inode()), it may have been read in earlier from swap, in
987 * ignorance of the mapping it belongs to. If that mapping has special
988 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
989 * we may need to copy to a suitable page before moving to filecache.
990 *
991 * In a future release, this may well be extended to respect cpuset and
992 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
993 * but for now it is a simple matter of zone.
994 */
995static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
996{
997 return page_zonenum(page) > gfp_zone(gfp);
998}
999
1000static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1001 struct shmem_inode_info *info, pgoff_t index)
1002{
1003 struct page *oldpage, *newpage;
1004 struct address_space *swap_mapping;
1005 pgoff_t swap_index;
1006 int error;
1007
1008 oldpage = *pagep;
1009 swap_index = page_private(oldpage);
1010 swap_mapping = page_mapping(oldpage);
1011
1012 /*
1013 * We have arrived here because our zones are constrained, so don't
1014 * limit chance of success by further cpuset and node constraints.
1015 */
1016 gfp &= ~GFP_CONSTRAINT_MASK;
1017 newpage = shmem_alloc_page(gfp, info, index);
1018 if (!newpage)
1019 return -ENOMEM;
1020
1021 page_cache_get(newpage);
1022 copy_highpage(newpage, oldpage);
1023 flush_dcache_page(newpage);
1024
1025 __set_page_locked(newpage);
1026 SetPageUptodate(newpage);
1027 SetPageSwapBacked(newpage);
1028 set_page_private(newpage, swap_index);
1029 SetPageSwapCache(newpage);
1030
1031 /*
1032 * Our caller will very soon move newpage out of swapcache, but it's
1033 * a nice clean interface for us to replace oldpage by newpage there.
1034 */
1035 spin_lock_irq(&swap_mapping->tree_lock);
1036 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1037 newpage);
1038 if (!error) {
1039 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1040 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1041 }
1042 spin_unlock_irq(&swap_mapping->tree_lock);
1043
1044 if (unlikely(error)) {
1045 /*
1046 * Is this possible? I think not, now that our callers check
1047 * both PageSwapCache and page_private after getting page lock;
1048 * but be defensive. Reverse old to newpage for clear and free.
1049 */
1050 oldpage = newpage;
1051 } else {
1052 mem_cgroup_replace_page_cache(oldpage, newpage);
1053 lru_cache_add_anon(newpage);
1054 *pagep = newpage;
1055 }
1056
1057 ClearPageSwapCache(oldpage);
1058 set_page_private(oldpage, 0);
1059
1060 unlock_page(oldpage);
1061 page_cache_release(oldpage);
1062 page_cache_release(oldpage);
1063 return error;
1064}
1065
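shmem_should_replace_page() reduces to an ordering comparison between zone indices: if the page sits in a higher zone than the allocation mask permits (say a HIGHMEM page for a mapping restricted to RAM below 4GB), a copy into a suitable page is needed. A toy userspace model of that comparison, with made-up zone indices standing in for page_zonenum()/gfp_zone():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical zone ordering, lowest to highest, echoing the kernel's. */
enum toy_zone { TOY_DMA, TOY_DMA32, TOY_NORMAL, TOY_HIGHMEM };

/* Replacement is needed when the page's zone exceeds the highest allowed. */
static bool should_replace(enum toy_zone page_zone, enum toy_zone highest_allowed)
{
        return page_zone > highest_allowed;
}

int main(void)
{
        printf("HIGHMEM page, DMA32 limit -> replace: %d\n",
               should_replace(TOY_HIGHMEM, TOY_DMA32));
        printf("NORMAL page, NORMAL limit -> replace: %d\n",
               should_replace(TOY_NORMAL, TOY_NORMAL));
        return 0;
}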
1066/*
859 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1067 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 * 1068 *
861 * If we allocate a new one we do not mark it dirty. That's up to the 1069 * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1080,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
872 swp_entry_t swap; 1080 swp_entry_t swap;
873 int error; 1081 int error;
874 int once = 0; 1082 int once = 0;
1083 int alloced = 0;
875 1084
876 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1085 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
877 return -EFBIG; 1086 return -EFBIG;
@@ -883,19 +1092,21 @@ repeat:
883 page = NULL; 1092 page = NULL;
884 } 1093 }
885 1094
886 if (sgp != SGP_WRITE && 1095 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
887 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1096 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
888 error = -EINVAL; 1097 error = -EINVAL;
889 goto failed; 1098 goto failed;
890 } 1099 }
891 1100
1101 /* fallocated page? */
1102 if (page && !PageUptodate(page)) {
1103 if (sgp != SGP_READ)
1104 goto clear;
1105 unlock_page(page);
1106 page_cache_release(page);
1107 page = NULL;
1108 }
892 if (page || (sgp == SGP_READ && !swap.val)) { 1109 if (page || (sgp == SGP_READ && !swap.val)) {
893 /*
894 * Once we can get the page lock, it must be uptodate:
895 * if there were an error in reading back from swap,
896 * the page would not be inserted into the filecache.
897 */
898 BUG_ON(page && !PageUptodate(page));
899 *pagep = page; 1110 *pagep = page;
900 return 0; 1111 return 0;
901 } 1112 }
@@ -923,26 +1134,31 @@ repeat:
923 1134
924 /* We have to do this with page locked to prevent races */ 1135 /* We have to do this with page locked to prevent races */
925 lock_page(page); 1136 lock_page(page);
1137 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1138 !shmem_confirm_swap(mapping, index, swap)) {
1139 error = -EEXIST; /* try again */
1140 goto unlock;
1141 }
926 if (!PageUptodate(page)) { 1142 if (!PageUptodate(page)) {
927 error = -EIO; 1143 error = -EIO;
928 goto failed; 1144 goto failed;
929 } 1145 }
930 wait_on_page_writeback(page); 1146 wait_on_page_writeback(page);
931 1147
932 /* Someone may have already done it for us */ 1148 if (shmem_should_replace_page(page, gfp)) {
933 if (page->mapping) { 1149 error = shmem_replace_page(&page, gfp, info, index);
934 if (page->mapping == mapping && 1150 if (error)
935 page->index == index) 1151 goto failed;
936 goto done;
937 error = -EEXIST;
938 goto failed;
939 } 1152 }
940 1153
941 error = mem_cgroup_cache_charge(page, current->mm, 1154 error = mem_cgroup_cache_charge(page, current->mm,
942 gfp & GFP_RECLAIM_MASK); 1155 gfp & GFP_RECLAIM_MASK);
943 if (!error) 1156 if (!error) {
944 error = shmem_add_to_page_cache(page, mapping, index, 1157 error = shmem_add_to_page_cache(page, mapping, index,
945 gfp, swp_to_radix_entry(swap)); 1158 gfp, swp_to_radix_entry(swap));
1159 /* We already confirmed swap, and make no allocation */
1160 VM_BUG_ON(error);
1161 }
946 if (error) 1162 if (error)
947 goto failed; 1163 goto failed;
948 1164
@@ -979,11 +1195,18 @@ repeat:
979 __set_page_locked(page); 1195 __set_page_locked(page);
980 error = mem_cgroup_cache_charge(page, current->mm, 1196 error = mem_cgroup_cache_charge(page, current->mm,
981 gfp & GFP_RECLAIM_MASK); 1197 gfp & GFP_RECLAIM_MASK);
982 if (!error)
983 error = shmem_add_to_page_cache(page, mapping, index,
984 gfp, NULL);
985 if (error) 1198 if (error)
986 goto decused; 1199 goto decused;
1200 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1201 if (!error) {
1202 error = shmem_add_to_page_cache(page, mapping, index,
1203 gfp, NULL);
1204 radix_tree_preload_end();
1205 }
1206 if (error) {
1207 mem_cgroup_uncharge_cache_page(page);
1208 goto decused;
1209 }
987 lru_cache_add_anon(page); 1210 lru_cache_add_anon(page);
988 1211
989 spin_lock(&info->lock); 1212 spin_lock(&info->lock);
@@ -991,19 +1214,36 @@ repeat:
991 inode->i_blocks += BLOCKS_PER_PAGE; 1214 inode->i_blocks += BLOCKS_PER_PAGE;
992 shmem_recalc_inode(inode); 1215 shmem_recalc_inode(inode);
993 spin_unlock(&info->lock); 1216 spin_unlock(&info->lock);
1217 alloced = true;
994 1218
995 clear_highpage(page); 1219 /*
996 flush_dcache_page(page); 1220 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
997 SetPageUptodate(page); 1221 */
1222 if (sgp == SGP_FALLOC)
1223 sgp = SGP_WRITE;
1224clear:
1225 /*
1226 * Let SGP_WRITE caller clear ends if write does not fill page;
1227 * but SGP_FALLOC on a page fallocated earlier must initialize
1228 * it now, lest undo on failure cancel our earlier guarantee.
1229 */
1230 if (sgp != SGP_WRITE) {
1231 clear_highpage(page);
1232 flush_dcache_page(page);
1233 SetPageUptodate(page);
1234 }
998 if (sgp == SGP_DIRTY) 1235 if (sgp == SGP_DIRTY)
999 set_page_dirty(page); 1236 set_page_dirty(page);
1000 } 1237 }
1001done: 1238
1002 /* Perhaps the file has been truncated since we checked */ 1239 /* Perhaps the file has been truncated since we checked */
1003 if (sgp != SGP_WRITE && 1240 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1004 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1241 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1005 error = -EINVAL; 1242 error = -EINVAL;
1006 goto trunc; 1243 if (alloced)
1244 goto trunc;
1245 else
1246 goto failed;
1007 } 1247 }
1008 *pagep = page; 1248 *pagep = page;
1009 return 0; 1249 return 0;
@@ -1012,6 +1252,7 @@ done:
1012 * Error recovery. 1252 * Error recovery.
1013 */ 1253 */
1014trunc: 1254trunc:
1255 info = SHMEM_I(inode);
1015 ClearPageDirty(page); 1256 ClearPageDirty(page);
1016 delete_from_page_cache(page); 1257 delete_from_page_cache(page);
1017 spin_lock(&info->lock); 1258 spin_lock(&info->lock);
@@ -1019,19 +1260,16 @@ trunc:
1019 inode->i_blocks -= BLOCKS_PER_PAGE; 1260 inode->i_blocks -= BLOCKS_PER_PAGE;
1020 spin_unlock(&info->lock); 1261 spin_unlock(&info->lock);
1021decused: 1262decused:
1263 sbinfo = SHMEM_SB(inode->i_sb);
1022 if (sbinfo->max_blocks) 1264 if (sbinfo->max_blocks)
1023 percpu_counter_add(&sbinfo->used_blocks, -1); 1265 percpu_counter_add(&sbinfo->used_blocks, -1);
1024unacct: 1266unacct:
1025 shmem_unacct_blocks(info->flags, 1); 1267 shmem_unacct_blocks(info->flags, 1);
1026failed: 1268failed:
1027 if (swap.val && error != -EINVAL) { 1269 if (swap.val && error != -EINVAL &&
1028 struct page *test = find_get_page(mapping, index); 1270 !shmem_confirm_swap(mapping, index, swap))
1029 if (test && !radix_tree_exceptional_entry(test)) 1271 error = -EEXIST;
1030 page_cache_release(test); 1272unlock:
1031 /* Have another try if the entry has changed */
1032 if (test != swp_to_radix_entry(swap))
1033 error = -EEXIST;
1034 }
1035 if (page) { 1273 if (page) {
1036 unlock_page(page); 1274 unlock_page(page);
1037 page_cache_release(page); 1275 page_cache_release(page);
@@ -1043,7 +1281,7 @@ failed:
1043 spin_unlock(&info->lock); 1281 spin_unlock(&info->lock);
1044 goto repeat; 1282 goto repeat;
1045 } 1283 }
1046 if (error == -EEXIST) 1284 if (error == -EEXIST) /* from above or from radix_tree_insert */
1047 goto repeat; 1285 goto repeat;
1048 return error; 1286 return error;
1049} 1287}
@@ -1204,6 +1442,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1204 if (pos + copied > inode->i_size) 1442 if (pos + copied > inode->i_size)
1205 i_size_write(inode, pos + copied); 1443 i_size_write(inode, pos + copied);
1206 1444
1445 if (!PageUptodate(page)) {
1446 if (copied < PAGE_CACHE_SIZE) {
1447 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1448 zero_user_segments(page, 0, from,
1449 from + copied, PAGE_CACHE_SIZE);
1450 }
1451 SetPageUptodate(page);
1452 }
1207 set_page_dirty(page); 1453 set_page_dirty(page);
1208 unlock_page(page); 1454 unlock_page(page);
1209 page_cache_release(page); 1455 page_cache_release(page);
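The shmem_write_end() change zeroes whatever the write did not cover before marking a freshly fallocated page uptodate, so stale contents are never exposed. A standalone sketch of the same head-and-tail zeroing that zero_user_segments() performs, done here on an ordinary buffer with the page size hard-coded for illustration:

#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

/* Zero [0, from) and [from + copied, TOY_PAGE_SIZE) of a page-sized buffer. */
static void zero_outside_copy(unsigned char *page, size_t from, size_t copied)
{
        memset(page, 0, from);
        memset(page + from + copied, 0, TOY_PAGE_SIZE - (from + copied));
}

int main(void)
{
        static unsigned char page[TOY_PAGE_SIZE];

        memset(page, 0xAA, sizeof(page));   /* pretend: uninitialized garbage */
        zero_outside_copy(page, 100, 50);   /* a 50-byte write at offset 100 */
        printf("page[99]=%d page[100]=%d page[149]=%d page[150]=%d\n",
               page[99], page[100], page[149], page[150]);
        return 0;
}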
@@ -1365,6 +1611,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1365 struct splice_pipe_desc spd = { 1611 struct splice_pipe_desc spd = {
1366 .pages = pages, 1612 .pages = pages,
1367 .partial = partial, 1613 .partial = partial,
1614 .nr_pages_max = PIPE_DEF_BUFFERS,
1368 .flags = flags, 1615 .flags = flags,
1369 .ops = &page_cache_pipe_buf_ops, 1616 .ops = &page_cache_pipe_buf_ops,
1370 .spd_release = spd_release_page, 1617 .spd_release = spd_release_page,
@@ -1453,7 +1700,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1453 if (spd.nr_pages) 1700 if (spd.nr_pages)
1454 error = splice_to_pipe(pipe, &spd); 1701 error = splice_to_pipe(pipe, &spd);
1455 1702
1456 splice_shrink_spd(pipe, &spd); 1703 splice_shrink_spd(&spd);
1457 1704
1458 if (error > 0) { 1705 if (error > 0) {
1459 *ppos += error; 1706 *ppos += error;
@@ -1462,6 +1709,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1462 return error; 1709 return error;
1463} 1710}
1464 1711
1712static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1713 loff_t len)
1714{
1715 struct inode *inode = file->f_path.dentry->d_inode;
1716 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1717 struct shmem_falloc shmem_falloc;
1718 pgoff_t start, index, end;
1719 int error;
1720
1721 mutex_lock(&inode->i_mutex);
1722
1723 if (mode & FALLOC_FL_PUNCH_HOLE) {
1724 struct address_space *mapping = file->f_mapping;
1725 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1726 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1727
1728 if ((u64)unmap_end > (u64)unmap_start)
1729 unmap_mapping_range(mapping, unmap_start,
1730 1 + unmap_end - unmap_start, 0);
1731 shmem_truncate_range(inode, offset, offset + len - 1);
1732 /* No need to unmap again: hole-punching leaves COWed pages */
1733 error = 0;
1734 goto out;
1735 }
1736
1737 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1738 error = inode_newsize_ok(inode, offset + len);
1739 if (error)
1740 goto out;
1741
1742 start = offset >> PAGE_CACHE_SHIFT;
1743 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1744 /* Try to avoid a swapstorm if len is impossible to satisfy */
1745 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1746 error = -ENOSPC;
1747 goto out;
1748 }
1749
1750 shmem_falloc.start = start;
1751 shmem_falloc.next = start;
1752 shmem_falloc.nr_falloced = 0;
1753 shmem_falloc.nr_unswapped = 0;
1754 spin_lock(&inode->i_lock);
1755 inode->i_private = &shmem_falloc;
1756 spin_unlock(&inode->i_lock);
1757
1758 for (index = start; index < end; index++) {
1759 struct page *page;
1760
1761 /*
1762 * Good, the fallocate(2) manpage permits EINTR: we may have
1763 * been interrupted because we are using up too much memory.
1764 */
1765 if (signal_pending(current))
1766 error = -EINTR;
1767 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1768 error = -ENOMEM;
1769 else
1770 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1771 NULL);
1772 if (error) {
1773 /* Remove the !PageUptodate pages we added */
1774 shmem_undo_range(inode,
1775 (loff_t)start << PAGE_CACHE_SHIFT,
1776 (loff_t)index << PAGE_CACHE_SHIFT, true);
1777 goto undone;
1778 }
1779
1780 /*
1781 * Inform shmem_writepage() how far we have reached.
1782 * No need for lock or barrier: we have the page lock.
1783 */
1784 shmem_falloc.next++;
1785 if (!PageUptodate(page))
1786 shmem_falloc.nr_falloced++;
1787
1788 /*
1789 * If !PageUptodate, leave it that way so that freeable pages
1790 * can be recognized if we need to rollback on error later.
1791 * But set_page_dirty so that memory pressure will swap rather
1792 * than free the pages we are allocating (and SGP_CACHE pages
1793 * might still be clean: we now need to mark those dirty too).
1794 */
1795 set_page_dirty(page);
1796 unlock_page(page);
1797 page_cache_release(page);
1798 cond_resched();
1799 }
1800
1801 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1802 i_size_write(inode, offset + len);
1803 inode->i_ctime = CURRENT_TIME;
1804undone:
1805 spin_lock(&inode->i_lock);
1806 inode->i_private = NULL;
1807 spin_unlock(&inode->i_lock);
1808out:
1809 mutex_unlock(&inode->i_mutex);
1810 return error;
1811}
1812
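With .fallocate wired up in the file_operations later in this patch, tmpfs files accept both preallocation and hole punching. A short userspace example, assuming a tmpfs mount at /dev/shm and a kernel that supports FALLOC_FL_PUNCH_HOLE (which must be combined with FALLOC_FL_KEEP_SIZE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Preallocate 1 MiB without changing i_size. */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
                perror("fallocate(KEEP_SIZE)");
        /* Punch a hole over the second 64 KiB of the preallocated range. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      64 << 10, 64 << 10))
                perror("fallocate(PUNCH_HOLE)");
        close(fd);
        unlink("/dev/shm/falloc-demo");
        return 0;
}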
1465static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1813static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1466{ 1814{
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1815 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1531,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1531} 1879}
1532 1880
1533static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1881static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1534 struct nameidata *nd) 1882 bool excl)
1535{ 1883{
1536 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1884 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1537} 1885}
@@ -1665,6 +2013,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1665 kaddr = kmap_atomic(page); 2013 kaddr = kmap_atomic(page);
1666 memcpy(kaddr, symname, len); 2014 memcpy(kaddr, symname, len);
1667 kunmap_atomic(kaddr); 2015 kunmap_atomic(kaddr);
2016 SetPageUptodate(page);
1668 set_page_dirty(page); 2017 set_page_dirty(page);
1669 unlock_page(page); 2018 unlock_page(page);
1670 page_cache_release(page); 2019 page_cache_release(page);
@@ -2033,11 +2382,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2033 return dentry; 2382 return dentry;
2034} 2383}
2035 2384
2036static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2385static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2037 int connectable) 2386 struct inode *parent)
2038{ 2387{
2039 struct inode *inode = dentry->d_inode;
2040
2041 if (*len < 3) { 2388 if (*len < 3) {
2042 *len = 3; 2389 *len = 3;
2043 return 255; 2390 return 255;
@@ -2075,6 +2422,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2075 bool remount) 2422 bool remount)
2076{ 2423{
2077 char *this_char, *value, *rest; 2424 char *this_char, *value, *rest;
2425 uid_t uid;
2426 gid_t gid;
2078 2427
2079 while (options != NULL) { 2428 while (options != NULL) {
2080 this_char = options; 2429 this_char = options;
@@ -2134,15 +2483,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2134 } else if (!strcmp(this_char,"uid")) { 2483 } else if (!strcmp(this_char,"uid")) {
2135 if (remount) 2484 if (remount)
2136 continue; 2485 continue;
2137 sbinfo->uid = simple_strtoul(value, &rest, 0); 2486 uid = simple_strtoul(value, &rest, 0);
2138 if (*rest) 2487 if (*rest)
2139 goto bad_val; 2488 goto bad_val;
2489 sbinfo->uid = make_kuid(current_user_ns(), uid);
2490 if (!uid_valid(sbinfo->uid))
2491 goto bad_val;
2140 } else if (!strcmp(this_char,"gid")) { 2492 } else if (!strcmp(this_char,"gid")) {
2141 if (remount) 2493 if (remount)
2142 continue; 2494 continue;
2143 sbinfo->gid = simple_strtoul(value, &rest, 0); 2495 gid = simple_strtoul(value, &rest, 0);
2144 if (*rest) 2496 if (*rest)
2145 goto bad_val; 2497 goto bad_val;
2498 sbinfo->gid = make_kgid(current_user_ns(), gid);
2499 if (!gid_valid(sbinfo->gid))
2500 goto bad_val;
2146 } else if (!strcmp(this_char,"mpol")) { 2501 } else if (!strcmp(this_char,"mpol")) {
2147 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2502 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2148 goto bad_val; 2503 goto bad_val;
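After the switch to kuid_t/kgid_t, the uid= and gid= mount options are validated through make_kuid()/make_kgid() in the mounter's user namespace. A minimal example of passing them via mount(2), assuming root privileges (CAP_SYS_ADMIN) and an existing /mnt/tmp mount point:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Option string handed to shmem_parse_options() by the mount. */
        const char *opts = "size=64m,mode=700,uid=1000,gid=1000";

        if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, opts)) {
                perror("mount");
                return 1;
        }
        printf("tmpfs mounted on /mnt/tmp with %s\n", opts);
        return 0;
}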
@@ -2210,10 +2565,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2210 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2565 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2211 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2566 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2212 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2567 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2213 if (sbinfo->uid != 0) 2568 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2214 seq_printf(seq, ",uid=%u", sbinfo->uid); 2569 seq_printf(seq, ",uid=%u",
2215 if (sbinfo->gid != 0) 2570 from_kuid_munged(&init_user_ns, sbinfo->uid));
2216 seq_printf(seq, ",gid=%u", sbinfo->gid); 2571 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2572 seq_printf(seq, ",gid=%u",
2573 from_kgid_munged(&init_user_ns, sbinfo->gid));
2217 shmem_show_mpol(seq, sbinfo->mpol); 2574 shmem_show_mpol(seq, sbinfo->mpol);
2218 return 0; 2575 return 0;
2219} 2576}
@@ -2260,6 +2617,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2260 } 2617 }
2261 } 2618 }
2262 sb->s_export_op = &shmem_export_ops; 2619 sb->s_export_op = &shmem_export_ops;
2620 sb->s_flags |= MS_NOSEC;
2263#else 2621#else
2264 sb->s_flags |= MS_NOUSER; 2622 sb->s_flags |= MS_NOUSER;
2265#endif 2623#endif
@@ -2362,12 +2720,12 @@ static const struct file_operations shmem_file_operations = {
2362 .fsync = noop_fsync, 2720 .fsync = noop_fsync,
2363 .splice_read = shmem_file_splice_read, 2721 .splice_read = shmem_file_splice_read,
2364 .splice_write = generic_file_splice_write, 2722 .splice_write = generic_file_splice_write,
2723 .fallocate = shmem_fallocate,
2365#endif 2724#endif
2366}; 2725};
2367 2726
2368static const struct inode_operations shmem_inode_operations = { 2727static const struct inode_operations shmem_inode_operations = {
2369 .setattr = shmem_setattr, 2728 .setattr = shmem_setattr,
2370 .truncate_range = shmem_truncate_range,
2371#ifdef CONFIG_TMPFS_XATTR 2729#ifdef CONFIG_TMPFS_XATTR
2372 .setxattr = shmem_setxattr, 2730 .setxattr = shmem_setxattr,
2373 .getxattr = shmem_getxattr, 2731 .getxattr = shmem_getxattr,
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e2520..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
90#include <linux/mm.h> 91#include <linux/mm.h>
91#include <linux/poison.h> 92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
@@ -117,12 +118,16 @@
117#include <linux/memory.h> 118#include <linux/memory.h>
118#include <linux/prefetch.h> 119#include <linux/prefetch.h>
119 120
121#include <net/sock.h>
122
120#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
121#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
122#include <asm/page.h> 125#include <asm/page.h>
123 126
124#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
125 128
129#include "internal.h"
130
126/* 131/*
127 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
128 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
151#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
152#endif 157#endif
153 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
154/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
155#if DEBUG 166#if DEBUG
156# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
256 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
257 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
258 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
273 * have the lower bits set SLAB_OBJ_PFMEMALLOC
259 */ 274 */
260}; 275};
261 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
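SLAB_OBJ_PFMEMALLOC reuses the least significant pointer bit as a flag, which is safe because slab objects are at least word-aligned, so that bit is always zero in a real object address. A self-contained illustration of the same tagging trick on an ordinary aligned allocation (the names are made up for the demo):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_TAG 1UL    /* stands in for SLAB_OBJ_PFMEMALLOC */

static void *tag_ptr(void *p)   { return (void *)((uintptr_t)p | DEMO_TAG); }
static bool  is_tagged(void *p) { return (uintptr_t)p & DEMO_TAG; }
static void *untag_ptr(void *p) { return (void *)((uintptr_t)p & ~DEMO_TAG); }

int main(void)
{
        int *obj = malloc(sizeof(*obj));  /* malloc returns suitably aligned memory */
        void *entry = tag_ptr(obj);       /* store the flag in the low bit */

        *obj = 42;
        printf("tagged=%d value=%d\n", is_tagged(entry),
               *(int *)untag_ptr(entry));
        free(obj);
        return 0;
}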
262/* 294/*
263 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
264 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
424 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 456 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
425 * redzone word. 457 * redzone word.
426 * cachep->obj_offset: The real object. 458 * cachep->obj_offset: The real object.
427 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 459 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
428 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 460 * cachep->size - 1* BYTES_PER_WORD: last caller address
429 * [BYTES_PER_WORD long] 461 * [BYTES_PER_WORD long]
430 */ 462 */
431static int obj_offset(struct kmem_cache *cachep) 463static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
433 return cachep->obj_offset; 465 return cachep->obj_offset;
434} 466}
435 467
436static int obj_size(struct kmem_cache *cachep)
437{
438 return cachep->obj_size;
439}
440
441static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 468static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
442{ 469{
443 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 470 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
449{ 476{
450 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 477 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
451 if (cachep->flags & SLAB_STORE_USER) 478 if (cachep->flags & SLAB_STORE_USER)
452 return (unsigned long long *)(objp + cachep->buffer_size - 479 return (unsigned long long *)(objp + cachep->size -
453 sizeof(unsigned long long) - 480 sizeof(unsigned long long) -
454 REDZONE_ALIGN); 481 REDZONE_ALIGN);
455 return (unsigned long long *) (objp + cachep->buffer_size - 482 return (unsigned long long *) (objp + cachep->size -
456 sizeof(unsigned long long)); 483 sizeof(unsigned long long));
457} 484}
458 485
459static void **dbg_userword(struct kmem_cache *cachep, void *objp) 486static void **dbg_userword(struct kmem_cache *cachep, void *objp)
460{ 487{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 488 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 489 return (void **)(objp + cachep->size - BYTES_PER_WORD);
463} 490}
464 491
465#else 492#else
466 493
467#define obj_offset(x) 0 494#define obj_offset(x) 0
468#define obj_size(cachep) (cachep->buffer_size)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 495#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 496#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 497#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
475#ifdef CONFIG_TRACING 501#ifdef CONFIG_TRACING
476size_t slab_buffer_size(struct kmem_cache *cachep) 502size_t slab_buffer_size(struct kmem_cache *cachep)
477{ 503{
478 return cachep->buffer_size; 504 return cachep->size;
479} 505}
480EXPORT_SYMBOL(slab_buffer_size); 506EXPORT_SYMBOL(slab_buffer_size);
481#endif 507#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
489static int slab_max_order = SLAB_MAX_ORDER_LO; 515static int slab_max_order = SLAB_MAX_ORDER_LO;
490static bool slab_max_order_set __initdata; 516static bool slab_max_order_set __initdata;
491 517
492/*
493 * Functions for storing/retrieving the cachep and or slab from the page
494 * allocator. These are used to find the slab an obj belongs to. With kfree(),
495 * these are used to find the cache which an obj belongs to.
496 */
497static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
498{
499 page->lru.next = (struct list_head *)cache;
500}
501
502static inline struct kmem_cache *page_get_cache(struct page *page) 518static inline struct kmem_cache *page_get_cache(struct page *page)
503{ 519{
504 page = compound_head(page); 520 page = compound_head(page);
505 BUG_ON(!PageSlab(page)); 521 BUG_ON(!PageSlab(page));
506 return (struct kmem_cache *)page->lru.next; 522 return page->slab_cache;
507}
508
509static inline void page_set_slab(struct page *page, struct slab *slab)
510{
511 page->lru.prev = (struct list_head *)slab;
512}
513
514static inline struct slab *page_get_slab(struct page *page)
515{
516 BUG_ON(!PageSlab(page));
517 return (struct slab *)page->lru.prev;
518} 523}
519 524
520static inline struct kmem_cache *virt_to_cache(const void *obj) 525static inline struct kmem_cache *virt_to_cache(const void *obj)
521{ 526{
522 struct page *page = virt_to_head_page(obj); 527 struct page *page = virt_to_head_page(obj);
523 return page_get_cache(page); 528 return page->slab_cache;
524} 529}
525 530
526static inline struct slab *virt_to_slab(const void *obj) 531static inline struct slab *virt_to_slab(const void *obj)
527{ 532{
528 struct page *page = virt_to_head_page(obj); 533 struct page *page = virt_to_head_page(obj);
529 return page_get_slab(page); 534
535 VM_BUG_ON(!PageSlab(page));
536 return page->slab_page;
530} 537}
531 538
532static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 539static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
533 unsigned int idx) 540 unsigned int idx)
534{ 541{
535 return slab->s_mem + cache->buffer_size * idx; 542 return slab->s_mem + cache->size * idx;
536} 543}
537 544
538/* 545/*
539 * We want to avoid an expensive divide : (offset / cache->buffer_size) 546 * We want to avoid an expensive divide : (offset / cache->size)
540 * Using the fact that buffer_size is a constant for a particular cache, 547 * Using the fact that size is a constant for a particular cache,
541 * we can replace (offset / cache->buffer_size) by 548 * we can replace (offset / cache->size) by
542 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 549 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
543 */ 550 */
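As the comment above explains, obj_to_index() avoids a runtime division by multiplying with a precomputed reciprocal. A userspace sketch modelled on the old reciprocal_value()/reciprocal_divide() helpers, with a brute-force check over a slab-sized offset range to show the result matches plain division there (that bounded range is what makes the approximation safe):

#include <stdint.h>
#include <stdio.h>

/* R = ceil(2^32 / size), as the old reciprocal_value() computed it. */
static uint32_t toy_reciprocal_value(uint32_t size)
{
        return (uint32_t)(((1ULL << 32) + size - 1) / size);
}

/* offset / size approximated as (offset * R) >> 32. */
static uint32_t toy_reciprocal_divide(uint32_t offset, uint32_t R)
{
        return (uint32_t)(((uint64_t)offset * R) >> 32);
}

int main(void)
{
        uint32_t size = 192;                    /* a typical object size */
        uint32_t R = toy_reciprocal_value(size);
        uint32_t off;

        for (off = 0; off < 1 << 16; off++)     /* offsets within a slab */
                if (toy_reciprocal_divide(off, R) != off / size) {
                        printf("mismatch at %u\n", off);
                        return 1;
                }
        printf("reciprocal divide matches /%u for all tested offsets\n", size);
        return 0;
}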
544static inline unsigned int obj_to_index(const struct kmem_cache *cache, 551static inline unsigned int obj_to_index(const struct kmem_cache *cache,
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
584 .batchcount = 1, 591 .batchcount = 1,
585 .limit = BOOT_CPUCACHE_ENTRIES, 592 .limit = BOOT_CPUCACHE_ENTRIES,
586 .shared = 1, 593 .shared = 1,
587 .buffer_size = sizeof(struct kmem_cache), 594 .size = sizeof(struct kmem_cache),
588 .name = "kmem_cache", 595 .name = "kmem_cache",
589}; 596};
590 597
591#define BAD_ALIEN_MAGIC 0x01020304ul 598#define BAD_ALIEN_MAGIC 0x01020304ul
592 599
593/*
594 * chicken and egg problem: delay the per-cpu array allocation
595 * until the general caches are up.
596 */
597static enum {
598 NONE,
599 PARTIAL_AC,
600 PARTIAL_L3,
601 EARLY,
602 LATE,
603 FULL
604} g_cpucache_up;
605
606/*
607 * used by boot code to determine if it can use slab based allocator
608 */
609int slab_is_available(void)
610{
611 return g_cpucache_up >= EARLY;
612}
613
614#ifdef CONFIG_LOCKDEP 600#ifdef CONFIG_LOCKDEP
615 601
616/* 602/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
676{ 662{
677 struct cache_sizes *s = malloc_sizes; 663 struct cache_sizes *s = malloc_sizes;
678 664
679 if (g_cpucache_up < LATE) 665 if (slab_state < UP)
680 return; 666 return;
681 667
682 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 668 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
716} 702}
717#endif 703#endif
718 704
719/*
720 * Guard access to the cache-chain.
721 */
722static DEFINE_MUTEX(cache_chain_mutex);
723static struct list_head cache_chain;
724
725static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 705static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
726 706
727static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 707static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
951 return nc; 931 return nc;
952} 932}
953 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
941/* Clears pfmemalloc_active if no slabs have pfmalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
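The new ac_get_obj()/ac_put_obj() pair screens the per-CPU array so that objects backed by emergency (pfmemalloc) pages only go to callers entitled to the reserves; otherwise a clean object is swapped into the hand-out slot. A simplified userspace model of that selection logic, with the pointer tag reduced to a boolean per entry (purely illustrative, not the kernel data structure):

#include <stdbool.h>
#include <stdio.h>

struct toy_entry { int obj; bool pfmemalloc; };

struct toy_cache {
        int avail;
        struct toy_entry entry[8];
};

/* Pop an object; callers without reserve rights skip pfmemalloc entries. */
static int toy_get_obj(struct toy_cache *ac, bool may_use_reserves, int *objp)
{
        struct toy_entry top = ac->entry[--ac->avail];
        int i;

        if (!top.pfmemalloc || may_use_reserves) {
                *objp = top.obj;
                return 0;
        }
        /* Look for a clean entry and leave the reserved one in the cache. */
        for (i = 0; i < ac->avail; i++) {
                if (!ac->entry[i].pfmemalloc) {
                        *objp = ac->entry[i].obj;
                        ac->entry[i] = top;
                        return 0;
                }
        }
        ac->avail++;            /* put the reserved object back: nothing usable */
        return -1;
}

int main(void)
{
        struct toy_cache ac = { .avail = 2,
                .entry = { { .obj = 1, .pfmemalloc = false },
                           { .obj = 2, .pfmemalloc = true } } };
        int obj;

        if (!toy_get_obj(&ac, false, &obj))
                printf("non-reserve caller got obj %d\n", obj);  /* prints 1 */
        return 0;
}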
954/* 1052/*
955 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
956 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1127 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1128 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1129 } 1227 }
1130 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1131 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1132 } else { 1230 } else {
1133 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1145 * When hotplugging memory or a cpu, existing nodelists are not replaced if 1243 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1146 * already in use. 1244 * already in use.
1147 * 1245 *
1148 * Must hold cache_chain_mutex. 1246 * Must hold slab_mutex.
1149 */ 1247 */
1150static int init_cache_nodelists_node(int node) 1248static int init_cache_nodelists_node(int node)
1151{ 1249{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
1153 struct kmem_list3 *l3; 1251 struct kmem_list3 *l3;
1154 const int memsize = sizeof(struct kmem_list3); 1252 const int memsize = sizeof(struct kmem_list3);
1155 1253
1156 list_for_each_entry(cachep, &cache_chain, next) { 1254 list_for_each_entry(cachep, &slab_caches, list) {
1157 /* 1255 /*
1158 * Set up the size64 kmemlist for cpu before we can 1256 * Set up the size64 kmemlist for cpu before we can
1159 * begin anything. Make sure some other cpu on this 1257 * begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
1169 1267
1170 /* 1268 /*
1171 * The l3s don't come and go as CPUs come and 1269 * The l3s don't come and go as CPUs come and
1172 * go. cache_chain_mutex is sufficient 1270 * go. slab_mutex is sufficient
1173 * protection here. 1271 * protection here.
1174 */ 1272 */
1175 cachep->nodelists[node] = l3; 1273 cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1191 int node = cpu_to_mem(cpu); 1289 int node = cpu_to_mem(cpu);
1192 const struct cpumask *mask = cpumask_of_node(node); 1290 const struct cpumask *mask = cpumask_of_node(node);
1193 1291
1194 list_for_each_entry(cachep, &cache_chain, next) { 1292 list_for_each_entry(cachep, &slab_caches, list) {
1195 struct array_cache *nc; 1293 struct array_cache *nc;
1196 struct array_cache *shared; 1294 struct array_cache *shared;
1197 struct array_cache **alien; 1295 struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
1241 * the respective cache's slabs, now we can go ahead and 1339 * the respective cache's slabs, now we can go ahead and
1242 * shrink each nodelist to its limit. 1340 * shrink each nodelist to its limit.
1243 */ 1341 */
1244 list_for_each_entry(cachep, &cache_chain, next) { 1342 list_for_each_entry(cachep, &slab_caches, list) {
1245 l3 = cachep->nodelists[node]; 1343 l3 = cachep->nodelists[node];
1246 if (!l3) 1344 if (!l3)
1247 continue; 1345 continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1270 * Now we can go ahead with allocating the shared arrays and 1368 * Now we can go ahead with allocating the shared arrays and
1271 * array caches 1369 * array caches
1272 */ 1370 */
1273 list_for_each_entry(cachep, &cache_chain, next) { 1371 list_for_each_entry(cachep, &slab_caches, list) {
1274 struct array_cache *nc; 1372 struct array_cache *nc;
1275 struct array_cache *shared = NULL; 1373 struct array_cache *shared = NULL;
1276 struct array_cache **alien = NULL; 1374 struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1338 switch (action) { 1436 switch (action) {
1339 case CPU_UP_PREPARE: 1437 case CPU_UP_PREPARE:
1340 case CPU_UP_PREPARE_FROZEN: 1438 case CPU_UP_PREPARE_FROZEN:
1341 mutex_lock(&cache_chain_mutex); 1439 mutex_lock(&slab_mutex);
1342 err = cpuup_prepare(cpu); 1440 err = cpuup_prepare(cpu);
1343 mutex_unlock(&cache_chain_mutex); 1441 mutex_unlock(&slab_mutex);
1344 break; 1442 break;
1345 case CPU_ONLINE: 1443 case CPU_ONLINE:
1346 case CPU_ONLINE_FROZEN: 1444 case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1350 case CPU_DOWN_PREPARE: 1448 case CPU_DOWN_PREPARE:
1351 case CPU_DOWN_PREPARE_FROZEN: 1449 case CPU_DOWN_PREPARE_FROZEN:
1352 /* 1450 /*
1353 * Shutdown cache reaper. Note that the cache_chain_mutex is 1451 * Shutdown cache reaper. Note that the slab_mutex is
1354 * held so that if cache_reap() is invoked it cannot do 1452 * held so that if cache_reap() is invoked it cannot do
1355 * anything expensive but will only modify reap_work 1453 * anything expensive but will only modify reap_work
1356 * and reschedule the timer. 1454 * and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1377#endif 1475#endif
1378 case CPU_UP_CANCELED: 1476 case CPU_UP_CANCELED:
1379 case CPU_UP_CANCELED_FROZEN: 1477 case CPU_UP_CANCELED_FROZEN:
1380 mutex_lock(&cache_chain_mutex); 1478 mutex_lock(&slab_mutex);
1381 cpuup_canceled(cpu); 1479 cpuup_canceled(cpu);
1382 mutex_unlock(&cache_chain_mutex); 1480 mutex_unlock(&slab_mutex);
1383 break; 1481 break;
1384 } 1482 }
1385 return notifier_from_errno(err); 1483 return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1395 * Returns -EBUSY if all objects cannot be drained so that the node is not 1493 * Returns -EBUSY if all objects cannot be drained so that the node is not
1396 * removed. 1494 * removed.
1397 * 1495 *
1398 * Must hold cache_chain_mutex. 1496 * Must hold slab_mutex.
1399 */ 1497 */
1400static int __meminit drain_cache_nodelists_node(int node) 1498static int __meminit drain_cache_nodelists_node(int node)
1401{ 1499{
1402 struct kmem_cache *cachep; 1500 struct kmem_cache *cachep;
1403 int ret = 0; 1501 int ret = 0;
1404 1502
1405 list_for_each_entry(cachep, &cache_chain, next) { 1503 list_for_each_entry(cachep, &slab_caches, list) {
1406 struct kmem_list3 *l3; 1504 struct kmem_list3 *l3;
1407 1505
1408 l3 = cachep->nodelists[node]; 1506 l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1433 1531
1434 switch (action) { 1532 switch (action) {
1435 case MEM_GOING_ONLINE: 1533 case MEM_GOING_ONLINE:
1436 mutex_lock(&cache_chain_mutex); 1534 mutex_lock(&slab_mutex);
1437 ret = init_cache_nodelists_node(nid); 1535 ret = init_cache_nodelists_node(nid);
1438 mutex_unlock(&cache_chain_mutex); 1536 mutex_unlock(&slab_mutex);
1439 break; 1537 break;
1440 case MEM_GOING_OFFLINE: 1538 case MEM_GOING_OFFLINE:
1441 mutex_lock(&cache_chain_mutex); 1539 mutex_lock(&slab_mutex);
1442 ret = drain_cache_nodelists_node(nid); 1540 ret = drain_cache_nodelists_node(nid);
1443 mutex_unlock(&cache_chain_mutex); 1541 mutex_unlock(&slab_mutex);
1444 break; 1542 break;
1445 case MEM_ONLINE: 1543 case MEM_ONLINE:
1446 case MEM_OFFLINE: 1544 case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
1544 node = numa_mem_id(); 1642 node = numa_mem_id();
1545 1643
1546 /* 1) create the cache_cache */ 1644 /* 1) create the cache_cache */
1547 INIT_LIST_HEAD(&cache_chain); 1645 INIT_LIST_HEAD(&slab_caches);
1548 list_add(&cache_cache.next, &cache_chain); 1646 list_add(&cache_cache.list, &slab_caches);
1549 cache_cache.colour_off = cache_line_size(); 1647 cache_cache.colour_off = cache_line_size();
1550 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1551 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
1553 /* 1651 /*
1554 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1555 */ 1653 */
1556 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1557 nr_node_ids * sizeof(struct kmem_list3 *); 1655 nr_node_ids * sizeof(struct kmem_list3 *);
1558#if DEBUG 1656 cache_cache.object_size = cache_cache.size;
1559 cache_cache.obj_size = cache_cache.buffer_size; 1657 cache_cache.size = ALIGN(cache_cache.size,
1560#endif
1561 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1562 cache_line_size()); 1658 cache_line_size());
1563 cache_cache.reciprocal_buffer_size = 1659 cache_cache.reciprocal_buffer_size =
1564 reciprocal_value(cache_cache.buffer_size); 1660 reciprocal_value(cache_cache.size);
1565 1661
1566 for (order = 0; order < MAX_ORDER; order++) { 1662 for (order = 0; order < MAX_ORDER; order++) {
1567 cache_estimate(order, cache_cache.buffer_size, 1663 cache_estimate(order, cache_cache.size,
1568 cache_line_size(), 0, &left_over, &cache_cache.num); 1664 cache_line_size(), 0, &left_over, &cache_cache.num);
1569 if (cache_cache.num) 1665 if (cache_cache.num)
1570 break; 1666 break;
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
1585 * bug. 1681 * bug.
1586 */ 1682 */
1587 1683
1588 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
1589 sizes[INDEX_AC].cs_size, 1685 sizes[INDEX_AC].cs_size,
1590 ARCH_KMALLOC_MINALIGN, 1686 ARCH_KMALLOC_MINALIGN,
1591 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
1593 1689
1594 if (INDEX_AC != INDEX_L3) { 1690 if (INDEX_AC != INDEX_L3) {
1595 sizes[INDEX_L3].cs_cachep = 1691 sizes[INDEX_L3].cs_cachep =
1596 kmem_cache_create(names[INDEX_L3].name, 1692 __kmem_cache_create(names[INDEX_L3].name,
1597 sizes[INDEX_L3].cs_size, 1693 sizes[INDEX_L3].cs_size,
1598 ARCH_KMALLOC_MINALIGN, 1694 ARCH_KMALLOC_MINALIGN,
1599 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
1611 * allow tighter packing of the smaller caches. 1707 * allow tighter packing of the smaller caches.
1612 */ 1708 */
1613 if (!sizes->cs_cachep) { 1709 if (!sizes->cs_cachep) {
1614 sizes->cs_cachep = kmem_cache_create(names->name, 1710 sizes->cs_cachep = __kmem_cache_create(names->name,
1615 sizes->cs_size, 1711 sizes->cs_size,
1616 ARCH_KMALLOC_MINALIGN, 1712 ARCH_KMALLOC_MINALIGN,
1617 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1618 NULL); 1714 NULL);
1619 } 1715 }
1620#ifdef CONFIG_ZONE_DMA 1716#ifdef CONFIG_ZONE_DMA
1621 sizes->cs_dmacachep = kmem_cache_create( 1717 sizes->cs_dmacachep = __kmem_cache_create(
1622 names->name_dma, 1718 names->name_dma,
1623 sizes->cs_size, 1719 sizes->cs_size,
1624 ARCH_KMALLOC_MINALIGN, 1720 ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
1676 } 1772 }
1677 } 1773 }
1678 1774
1679 g_cpucache_up = EARLY; 1775 slab_state = UP;
1680} 1776}
1681 1777
1682void __init kmem_cache_init_late(void) 1778void __init kmem_cache_init_late(void)
1683{ 1779{
1684 struct kmem_cache *cachep; 1780 struct kmem_cache *cachep;
1685 1781
1686 g_cpucache_up = LATE; 1782 slab_state = UP;
1687 1783
1688 /* Annotate slab for lockdep -- annotate the malloc caches */ 1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1689 init_lock_keys(); 1785 init_lock_keys();
1690 1786
1691 /* 6) resize the head arrays to their final sizes */ 1787 /* 6) resize the head arrays to their final sizes */
1692 mutex_lock(&cache_chain_mutex); 1788 mutex_lock(&slab_mutex);
1693 list_for_each_entry(cachep, &cache_chain, next) 1789 list_for_each_entry(cachep, &slab_caches, list)
1694 if (enable_cpucache(cachep, GFP_NOWAIT)) 1790 if (enable_cpucache(cachep, GFP_NOWAIT))
1695 BUG(); 1791 BUG();
1696 mutex_unlock(&cache_chain_mutex); 1792 mutex_unlock(&slab_mutex);
1697 1793
1698 /* Done! */ 1794 /* Done! */
1699 g_cpucache_up = FULL; 1795 slab_state = FULL;
1700 1796
1701 /* 1797 /*
1702 * Register a cpu startup notifier callback that initializes 1798 * Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
1727 */ 1823 */
1728 for_each_online_cpu(cpu) 1824 for_each_online_cpu(cpu)
1729 start_cpu_timer(cpu); 1825 start_cpu_timer(cpu);
1826
1827 /* Done! */
1828 slab_state = FULL;
1730 return 0; 1829 return 0;
1731} 1830}
1732__initcall(cpucache_init); 1831__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1842 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags); 1843 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1844 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder); 1845 cachep->name, cachep->size, cachep->gfporder);
1747 1846
1748 for_each_online_node(node) { 1847 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1848 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1798 flags |= __GFP_COMP; 1897 flags |= __GFP_COMP;
1799#endif 1898#endif
1800 1899
1801 flags |= cachep->gfpflags; 1900 flags |= cachep->allocflags;
1802 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1901 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1803 flags |= __GFP_RECLAIMABLE; 1902 flags |= __GFP_RECLAIMABLE;
1804 1903
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1809 return NULL; 1908 return NULL;
1810 } 1909 }
1811 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1812 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1814 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1816 else 1919 else
1817 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1818 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1819 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1820 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1821 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1822 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1823 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1824 1931
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1850 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1851 while (i--) { 1958 while (i--) {
1852 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1853 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1854 page++; 1962 page++;
1855 } 1963 }
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1874static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1982static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1875 unsigned long caller) 1983 unsigned long caller)
1876{ 1984{
1877 int size = obj_size(cachep); 1985 int size = cachep->object_size;
1878 1986
1879 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1987 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1880 1988
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1906 2014
1907static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2015static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1908{ 2016{
1909 int size = obj_size(cachep); 2017 int size = cachep->object_size;
1910 addr = &((char *)addr)[obj_offset(cachep)]; 2018 addr = &((char *)addr)[obj_offset(cachep)];
1911 2019
1912 memset(addr, val, size); 2020 memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1966 printk("\n"); 2074 printk("\n");
1967 } 2075 }
1968 realobj = (char *)objp + obj_offset(cachep); 2076 realobj = (char *)objp + obj_offset(cachep);
1969 size = obj_size(cachep); 2077 size = cachep->object_size;
1970 for (i = 0; i < size && lines; i += 16, lines--) { 2078 for (i = 0; i < size && lines; i += 16, lines--) {
1971 int limit; 2079 int limit;
1972 limit = 16; 2080 limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1983 int lines = 0; 2091 int lines = 0;
1984 2092
1985 realobj = (char *)objp + obj_offset(cachep); 2093 realobj = (char *)objp + obj_offset(cachep);
1986 size = obj_size(cachep); 2094 size = cachep->object_size;
1987 2095
1988 for (i = 0; i < size; i++) { 2096 for (i = 0; i < size; i++) {
1989 char exp = POISON_FREE; 2097 char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2047 2155
2048 if (cachep->flags & SLAB_POISON) { 2156 if (cachep->flags & SLAB_POISON) {
2049#ifdef CONFIG_DEBUG_PAGEALLOC 2157#ifdef CONFIG_DEBUG_PAGEALLOC
2050 if (cachep->buffer_size % PAGE_SIZE == 0 && 2158 if (cachep->size % PAGE_SIZE == 0 &&
2051 OFF_SLAB(cachep)) 2159 OFF_SLAB(cachep))
2052 kernel_map_pages(virt_to_page(objp), 2160 kernel_map_pages(virt_to_page(objp),
2053 cachep->buffer_size / PAGE_SIZE, 1); 2161 cachep->size / PAGE_SIZE, 1);
2054 else 2162 else
2055 check_poison_obj(cachep, objp); 2163 check_poison_obj(cachep, objp);
2056#else 2164#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2194 2302
2195static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2303static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2196{ 2304{
2197 if (g_cpucache_up == FULL) 2305 if (slab_state >= FULL)
2198 return enable_cpucache(cachep, gfp); 2306 return enable_cpucache(cachep, gfp);
2199 2307
2200 if (g_cpucache_up == NONE) { 2308 if (slab_state == DOWN) {
2201 /* 2309 /*
2202 * Note: the first kmem_cache_create must create the cache 2310 * Note: the first kmem_cache_create must create the cache
2203 * that's used by kmalloc(24), otherwise the creation of 2311 * that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2212 */ 2320 */
2213 set_up_list3s(cachep, SIZE_AC); 2321 set_up_list3s(cachep, SIZE_AC);
2214 if (INDEX_AC == INDEX_L3) 2322 if (INDEX_AC == INDEX_L3)
2215 g_cpucache_up = PARTIAL_L3; 2323 slab_state = PARTIAL_L3;
2216 else 2324 else
2217 g_cpucache_up = PARTIAL_AC; 2325 slab_state = PARTIAL_ARRAYCACHE;
2218 } else { 2326 } else {
2219 cachep->array[smp_processor_id()] = 2327 cachep->array[smp_processor_id()] =
2220 kmalloc(sizeof(struct arraycache_init), gfp); 2328 kmalloc(sizeof(struct arraycache_init), gfp);
2221 2329
2222 if (g_cpucache_up == PARTIAL_AC) { 2330 if (slab_state == PARTIAL_ARRAYCACHE) {
2223 set_up_list3s(cachep, SIZE_L3); 2331 set_up_list3s(cachep, SIZE_L3);
2224 g_cpucache_up = PARTIAL_L3; 2332 slab_state = PARTIAL_L3;
2225 } else { 2333 } else {
2226 int node; 2334 int node;
2227 for_each_online_node(node) { 2335 for_each_online_node(node) {
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2247} 2355}
2248 2356
2249/** 2357/**
2250 * kmem_cache_create - Create a cache. 2358 * __kmem_cache_create - Create a cache.
2251 * @name: A string which is used in /proc/slabinfo to identify this cache. 2359 * @name: A string which is used in /proc/slabinfo to identify this cache.
2252 * @size: The size of objects to be created in this cache. 2360 * @size: The size of objects to be created in this cache.
2253 * @align: The required alignment for the objects. 2361 * @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2274 * as davem. 2382 * as davem.
2275 */ 2383 */
2276struct kmem_cache * 2384struct kmem_cache *
2277kmem_cache_create (const char *name, size_t size, size_t align, 2385__kmem_cache_create (const char *name, size_t size, size_t align,
2278 unsigned long flags, void (*ctor)(void *)) 2386 unsigned long flags, void (*ctor)(void *))
2279{ 2387{
2280 size_t left_over, slab_size, ralign; 2388 size_t left_over, slab_size, ralign;
2281 struct kmem_cache *cachep = NULL, *pc; 2389 struct kmem_cache *cachep = NULL;
2282 gfp_t gfp; 2390 gfp_t gfp;
2283 2391
2284 /*
2285 * Sanity checks... these are all serious usage bugs.
2286 */
2287 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2288 size > KMALLOC_MAX_SIZE) {
2289 printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2290 name);
2291 BUG();
2292 }
2293
2294 /*
2295 * We use cache_chain_mutex to ensure a consistent view of
2296 * cpu_online_mask as well. Please see cpuup_callback
2297 */
2298 if (slab_is_available()) {
2299 get_online_cpus();
2300 mutex_lock(&cache_chain_mutex);
2301 }
2302
2303 list_for_each_entry(pc, &cache_chain, next) {
2304 char tmp;
2305 int res;
2306
2307 /*
2308 * This happens when the module gets unloaded and doesn't
2309 * destroy its slab cache and no-one else reuses the vmalloc
2310 * area of the module. Print a warning.
2311 */
2312 res = probe_kernel_address(pc->name, tmp);
2313 if (res) {
2314 printk(KERN_ERR
2315 "SLAB: cache with size %d has lost its name\n",
2316 pc->buffer_size);
2317 continue;
2318 }
2319
2320 if (!strcmp(pc->name, name)) {
2321 printk(KERN_ERR
2322 "kmem_cache_create: duplicate cache %s\n", name);
2323 dump_stack();
2324 goto oops;
2325 }
2326 }
2327
2328#if DEBUG 2392#if DEBUG
2329 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2330#if FORCED_DEBUG 2393#if FORCED_DEBUG
2331 /* 2394 /*
2332 * Enable redzoning and last user accounting, except for caches with 2395 * Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2415 /* Get cache's description obj. */ 2478 /* Get cache's description obj. */
2416 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2417 if (!cachep) 2480 if (!cachep)
2418 goto oops; 2481 return NULL;
2419 2482
2420 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2421#if DEBUG 2486#if DEBUG
2422 cachep->obj_size = size;
2423 2487
2424 /* 2488 /*
2425 * Both debugging options require word-alignment which is calculated 2489 * Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2442 } 2506 }
2443#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2444 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2445 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2446 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2447 size = PAGE_SIZE; 2511 size = PAGE_SIZE;
2448 } 2512 }
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2471 printk(KERN_ERR 2535 printk(KERN_ERR
2472 "kmem_cache_create: couldn't create cache %s.\n", name); 2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2473 kmem_cache_free(&cache_cache, cachep); 2537 kmem_cache_free(&cache_cache, cachep);
2474 cachep = NULL; 2538 return NULL;
2475 goto oops;
2476 } 2539 }
2477 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2478 + sizeof(struct slab), align); 2541 + sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2508 cachep->colour = left_over / cachep->colour_off; 2571 cachep->colour = left_over / cachep->colour_off;
2509 cachep->slab_size = slab_size; 2572 cachep->slab_size = slab_size;
2510 cachep->flags = flags; 2573 cachep->flags = flags;
2511 cachep->gfpflags = 0; 2574 cachep->allocflags = 0;
2512 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2575 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2513 cachep->gfpflags |= GFP_DMA; 2576 cachep->allocflags |= GFP_DMA;
2514 cachep->buffer_size = size; 2577 cachep->size = size;
2515 cachep->reciprocal_buffer_size = reciprocal_value(size); 2578 cachep->reciprocal_buffer_size = reciprocal_value(size);
2516 2579
2517 if (flags & CFLGS_OFF_SLAB) { 2580 if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2530 2593
2531 if (setup_cpu_cache(cachep, gfp)) { 2594 if (setup_cpu_cache(cachep, gfp)) {
2532 __kmem_cache_destroy(cachep); 2595 __kmem_cache_destroy(cachep);
2533 cachep = NULL; 2596 return NULL;
2534 goto oops;
2535 } 2597 }
2536 2598
2537 if (flags & SLAB_DEBUG_OBJECTS) { 2599 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2545 } 2607 }
2546 2608
2547 /* cache setup completed, link it into the list */ 2609 /* cache setup completed, link it into the list */
2548 list_add(&cachep->next, &cache_chain); 2610 list_add(&cachep->list, &slab_caches);
2549oops:
2550 if (!cachep && (flags & SLAB_PANIC))
2551 panic("kmem_cache_create(): failed to create slab `%s'\n",
2552 name);
2553 if (slab_is_available()) {
2554 mutex_unlock(&cache_chain_mutex);
2555 put_online_cpus();
2556 }
2557 return cachep; 2611 return cachep;
2558} 2612}
2559EXPORT_SYMBOL(kmem_cache_create);
2560 2613
2561#if DEBUG 2614#if DEBUG
2562static void check_irq_off(void) 2615static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
2671 return nr_freed; 2724 return nr_freed;
2672} 2725}
2673 2726
2674/* Called with cache_chain_mutex held to protect against cpu hotplug */ 2727/* Called with slab_mutex held to protect against cpu hotplug */
2675static int __cache_shrink(struct kmem_cache *cachep) 2728static int __cache_shrink(struct kmem_cache *cachep)
2676{ 2729{
2677 int ret = 0, i = 0; 2730 int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2706 BUG_ON(!cachep || in_interrupt()); 2759 BUG_ON(!cachep || in_interrupt());
2707 2760
2708 get_online_cpus(); 2761 get_online_cpus();
2709 mutex_lock(&cache_chain_mutex); 2762 mutex_lock(&slab_mutex);
2710 ret = __cache_shrink(cachep); 2763 ret = __cache_shrink(cachep);
2711 mutex_unlock(&cache_chain_mutex); 2764 mutex_unlock(&slab_mutex);
2712 put_online_cpus(); 2765 put_online_cpus();
2713 return ret; 2766 return ret;
2714} 2767}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2736 2789
2737 /* Find the cache in the chain of caches. */ 2790 /* Find the cache in the chain of caches. */
2738 get_online_cpus(); 2791 get_online_cpus();
2739 mutex_lock(&cache_chain_mutex); 2792 mutex_lock(&slab_mutex);
2740 /* 2793 /*
2741 * the chain is never empty, cache_cache is never destroyed 2794 * the chain is never empty, cache_cache is never destroyed
2742 */ 2795 */
2743 list_del(&cachep->next); 2796 list_del(&cachep->list);
2744 if (__cache_shrink(cachep)) { 2797 if (__cache_shrink(cachep)) {
2745 slab_error(cachep, "Can't free all objects"); 2798 slab_error(cachep, "Can't free all objects");
2746 list_add(&cachep->next, &cache_chain); 2799 list_add(&cachep->list, &slab_caches);
2747 mutex_unlock(&cache_chain_mutex); 2800 mutex_unlock(&slab_mutex);
2748 put_online_cpus(); 2801 put_online_cpus();
2749 return; 2802 return;
2750 } 2803 }
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2753 rcu_barrier(); 2806 rcu_barrier();
2754 2807
2755 __kmem_cache_destroy(cachep); 2808 __kmem_cache_destroy(cachep);
2756 mutex_unlock(&cache_chain_mutex); 2809 mutex_unlock(&slab_mutex);
2757 put_online_cpus(); 2810 put_online_cpus();
2758} 2811}
2759EXPORT_SYMBOL(kmem_cache_destroy); 2812EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
2840 slab_error(cachep, "constructor overwrote the" 2893 slab_error(cachep, "constructor overwrote the"
2841 " start of an object"); 2894 " start of an object");
2842 } 2895 }
2843 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2896 if ((cachep->size % PAGE_SIZE) == 0 &&
2844 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2897 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2845 kernel_map_pages(virt_to_page(objp), 2898 kernel_map_pages(virt_to_page(objp),
2846 cachep->buffer_size / PAGE_SIZE, 0); 2899 cachep->size / PAGE_SIZE, 0);
2847#else 2900#else
2848 if (cachep->ctor) 2901 if (cachep->ctor)
2849 cachep->ctor(objp); 2902 cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2857{ 2910{
2858 if (CONFIG_ZONE_DMA_FLAG) { 2911 if (CONFIG_ZONE_DMA_FLAG) {
2859 if (flags & GFP_DMA) 2912 if (flags & GFP_DMA)
2860 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2913 BUG_ON(!(cachep->allocflags & GFP_DMA));
2861 else 2914 else
2862 BUG_ON(cachep->gfpflags & GFP_DMA); 2915 BUG_ON(cachep->allocflags & GFP_DMA);
2863 } 2916 }
2864} 2917}
2865 2918
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2918 nr_pages <<= cache->gfporder; 2971 nr_pages <<= cache->gfporder;
2919 2972
2920 do { 2973 do {
2921 page_set_cache(page, cache); 2974 page->slab_cache = cache;
2922 page_set_slab(page, slab); 2975 page->slab_page = slab;
2923 page++; 2976 page++;
2924 } while (--nr_pages); 2977 } while (--nr_pages);
2925} 2978}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3057 kfree_debugcheck(objp); 3110 kfree_debugcheck(objp);
3058 page = virt_to_head_page(objp); 3111 page = virt_to_head_page(objp);
3059 3112
3060 slabp = page_get_slab(page); 3113 slabp = page->slab_page;
3061 3114
3062 if (cachep->flags & SLAB_RED_ZONE) { 3115 if (cachep->flags & SLAB_RED_ZONE) {
3063 verify_redzone_free(cachep, objp); 3116 verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3077#endif 3130#endif
3078 if (cachep->flags & SLAB_POISON) { 3131 if (cachep->flags & SLAB_POISON) {
3079#ifdef CONFIG_DEBUG_PAGEALLOC 3132#ifdef CONFIG_DEBUG_PAGEALLOC
3080 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3081 store_stackinfo(cachep, objp, (unsigned long)caller); 3134 store_stackinfo(cachep, objp, (unsigned long)caller);
3082 kernel_map_pages(virt_to_page(objp), 3135 kernel_map_pages(virt_to_page(objp),
3083 cachep->buffer_size / PAGE_SIZE, 0); 3136 cachep->size / PAGE_SIZE, 0);
3084 } else { 3137 } else {
3085 poison_obj(cachep, objp, POISON_FREE); 3138 poison_obj(cachep, objp, POISON_FREE);
3086 } 3139 }
@@ -3120,16 +3173,19 @@ bad:
3120#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3121#endif 3174#endif
3122 3175
3123static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3124{ 3178{
3125 int batchcount; 3179 int batchcount;
3126 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3127 struct array_cache *ac; 3181 struct array_cache *ac;
3128 int node; 3182 int node;
3129 3183
3130retry:
3131 check_irq_off(); 3184 check_irq_off();
3132 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3133 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3134 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3135 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
3179 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3180 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3181 3237
3182 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3183 node); 3239 node));
3184 } 3240 }
3185 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3186 3242
@@ -3199,18 +3255,22 @@ alloc_done:
3199 3255
3200 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3201 int x; 3257 int x;
3258force_grow:
3202 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3203 3260
3204 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3205 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3206 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3207 return NULL; 3266 return NULL;
3208 3267
3209 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3210 goto retry; 3269 goto retry;
3211 } 3270 }
3212 ac->touched = 1; 3271 ac->touched = 1;
3213 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3214} 3274}
3215 3275
3216static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3230 return objp; 3290 return objp;
3231 if (cachep->flags & SLAB_POISON) { 3291 if (cachep->flags & SLAB_POISON) {
3232#ifdef CONFIG_DEBUG_PAGEALLOC 3292#ifdef CONFIG_DEBUG_PAGEALLOC
3233 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3293 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3234 kernel_map_pages(virt_to_page(objp), 3294 kernel_map_pages(virt_to_page(objp),
3235 cachep->buffer_size / PAGE_SIZE, 1); 3295 cachep->size / PAGE_SIZE, 1);
3236 else 3296 else
3237 check_poison_obj(cachep, objp); 3297 check_poison_obj(cachep, objp);
3238#else 3298#else
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3261 struct slab *slabp; 3321 struct slab *slabp;
3262 unsigned objnr; 3322 unsigned objnr;
3263 3323
3264 slabp = page_get_slab(virt_to_head_page(objp)); 3324 slabp = virt_to_head_page(objp)->slab_page;
3265 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3325 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3266 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3326 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3267 } 3327 }
3268#endif 3328#endif
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 if (cachep == &cache_cache) 3345 if (cachep == &cache_cache)
3286 return false; 3346 return false;
3287 3347
3288 return should_failslab(obj_size(cachep), flags, cachep->flags); 3348 return should_failslab(cachep->object_size, flags, cachep->flags);
3289} 3349}
3290 3350
3291static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3351static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3292{ 3352{
3293 void *objp; 3353 void *objp;
3294 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3295 3356
3296 check_irq_off(); 3357 check_irq_off();
3297 3358
3298 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3299 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3300 STATS_INC_ALLOCHIT(cachep);
3301 ac->touched = 1; 3361 ac->touched = 1;
3302 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3303 } else { 3363
3304 STATS_INC_ALLOCMISS(cachep);
3305 objp = cache_alloc_refill(cachep, flags);
3306 /* 3364 /*
3307 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3308 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3309 */ 3367 */
3310 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3311 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3312 /* 3384 /*
3313 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3314 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3408 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3337 nid_alloc = cpuset_slab_spread_node(); 3409 nid_alloc = cpuset_slab_spread_node();
3338 else if (current->mempolicy) 3410 else if (current->mempolicy)
3339 nid_alloc = slab_node(current->mempolicy); 3411 nid_alloc = slab_node();
3340 if (nid_alloc != nid_here) 3412 if (nid_alloc != nid_here)
3341 return ____cache_alloc_node(cachep, flags, nid_alloc); 3413 return ____cache_alloc_node(cachep, flags, nid_alloc);
3342 return NULL; 3414 return NULL;
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3368 3440
3369retry_cpuset: 3441retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed(); 3442 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3443 zonelist = node_zonelist(slab_node(), flags);
3372 3444
3373retry: 3445retry:
3374 /* 3446 /*
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3545 out: 3617 out:
3546 local_irq_restore(save_flags); 3618 local_irq_restore(save_flags);
3547 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3619 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3548 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3620 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3549 flags); 3621 flags);
3550 3622
3551 if (likely(ptr)) 3623 if (likely(ptr))
3552 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3624 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3553 3625
3554 if (unlikely((flags & __GFP_ZERO) && ptr)) 3626 if (unlikely((flags & __GFP_ZERO) && ptr))
3555 memset(ptr, 0, obj_size(cachep)); 3627 memset(ptr, 0, cachep->object_size);
3556 3628
3557 return ptr; 3629 return ptr;
3558} 3630}
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3607 objp = __do_cache_alloc(cachep, flags); 3679 objp = __do_cache_alloc(cachep, flags);
3608 local_irq_restore(save_flags); 3680 local_irq_restore(save_flags);
3609 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3681 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3610 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3682 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3611 flags); 3683 flags);
3612 prefetchw(objp); 3684 prefetchw(objp);
3613 3685
3614 if (likely(objp)) 3686 if (likely(objp))
3615 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3687 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3616 3688
3617 if (unlikely((flags & __GFP_ZERO) && objp)) 3689 if (unlikely((flags & __GFP_ZERO) && objp))
3618 memset(objp, 0, obj_size(cachep)); 3690 memset(objp, 0, cachep->object_size);
3619 3691
3620 return objp; 3692 return objp;
3621} 3693}
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3630 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3631 3703
3632 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3633 void *objp = objpp[i]; 3705 void *objp;
3634 struct slab *slabp; 3706 struct slab *slabp;
3635 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3636 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3637 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3638 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3731 kmemleak_free_recursive(objp, cachep->flags); 3806 kmemleak_free_recursive(objp, cachep->flags);
3732 objp = cache_free_debugcheck(cachep, objp, caller); 3807 objp = cache_free_debugcheck(cachep, objp, caller);
3733 3808
3734 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3809 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3735 3810
3736 /* 3811 /*
3737 * Skip calling cache_free_alien() when the platform is not numa. 3812 * Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3750 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3751 } 3826 }
3752 3827
3753 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3754} 3829}
3755 3830
3756/** 3831/**
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3766 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3841 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3767 3842
3768 trace_kmem_cache_alloc(_RET_IP_, ret, 3843 trace_kmem_cache_alloc(_RET_IP_, ret,
3769 obj_size(cachep), cachep->buffer_size, flags); 3844 cachep->object_size, cachep->size, flags);
3770 3845
3771 return ret; 3846 return ret;
3772} 3847}
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3794 __builtin_return_address(0)); 3869 __builtin_return_address(0));
3795 3870
3796 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3871 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3797 obj_size(cachep), cachep->buffer_size, 3872 cachep->object_size, cachep->size,
3798 flags, nodeid); 3873 flags, nodeid);
3799 3874
3800 return ret; 3875 return ret;
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3876 ret = __cache_alloc(cachep, flags, caller); 3951 ret = __cache_alloc(cachep, flags, caller);
3877 3952
3878 trace_kmalloc((unsigned long) caller, ret, 3953 trace_kmalloc((unsigned long) caller, ret,
3879 size, cachep->buffer_size, flags); 3954 size, cachep->size, flags);
3880 3955
3881 return ret; 3956 return ret;
3882} 3957}
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3916 unsigned long flags; 3991 unsigned long flags;
3917 3992
3918 local_irq_save(flags); 3993 local_irq_save(flags);
3919 debug_check_no_locks_freed(objp, obj_size(cachep)); 3994 debug_check_no_locks_freed(objp, cachep->object_size);
3920 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3995 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3921 debug_check_no_obj_freed(objp, obj_size(cachep)); 3996 debug_check_no_obj_freed(objp, cachep->object_size);
3922 __cache_free(cachep, objp, __builtin_return_address(0)); 3997 __cache_free(cachep, objp, __builtin_return_address(0));
3923 local_irq_restore(flags); 3998 local_irq_restore(flags);
3924 3999
@@ -3947,8 +4022,9 @@ void kfree(const void *objp)
3947 local_irq_save(flags); 4022 local_irq_save(flags);
3948 kfree_debugcheck(objp); 4023 kfree_debugcheck(objp);
3949 c = virt_to_cache(objp); 4024 c = virt_to_cache(objp);
3950 debug_check_no_locks_freed(objp, obj_size(c)); 4025 debug_check_no_locks_freed(objp, c->object_size);
3951 debug_check_no_obj_freed(objp, obj_size(c)); 4026
4027 debug_check_no_obj_freed(objp, c->object_size);
3952 __cache_free(c, (void *)objp, __builtin_return_address(0)); 4028 __cache_free(c, (void *)objp, __builtin_return_address(0));
3953 local_irq_restore(flags); 4029 local_irq_restore(flags);
3954} 4030}
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree);
3956 4032
3957unsigned int kmem_cache_size(struct kmem_cache *cachep) 4033unsigned int kmem_cache_size(struct kmem_cache *cachep)
3958{ 4034{
3959 return obj_size(cachep); 4035 return cachep->object_size;
3960} 4036}
3961EXPORT_SYMBOL(kmem_cache_size); 4037EXPORT_SYMBOL(kmem_cache_size);
3962 4038
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
4030 return 0; 4106 return 0;
4031 4107
4032fail: 4108fail:
4033 if (!cachep->next.next) { 4109 if (!cachep->list.next) {
4034 /* Cache is not active yet. Roll back what we did */ 4110 /* Cache is not active yet. Roll back what we did */
4035 node--; 4111 node--;
4036 while (node >= 0) { 4112 while (node >= 0) {
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info)
4065 new->new[smp_processor_id()] = old; 4141 new->new[smp_processor_id()] = old;
4066} 4142}
4067 4143
4068/* Always called with the cache_chain_mutex held */ 4144/* Always called with the slab_mutex held */
4069static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4145static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4070 int batchcount, int shared, gfp_t gfp) 4146 int batchcount, int shared, gfp_t gfp)
4071{ 4147{
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4109 return alloc_kmemlist(cachep, gfp); 4185 return alloc_kmemlist(cachep, gfp);
4110} 4186}
4111 4187
4112/* Called with cache_chain_mutex held always */ 4188/* Called with slab_mutex held always */
4113static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4189static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4114{ 4190{
4115 int err; 4191 int err;
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4124 * The numbers are guessed, we should auto-tune as described by 4200 * The numbers are guessed, we should auto-tune as described by
4125 * Bonwick. 4201 * Bonwick.
4126 */ 4202 */
4127 if (cachep->buffer_size > 131072) 4203 if (cachep->size > 131072)
4128 limit = 1; 4204 limit = 1;
4129 else if (cachep->buffer_size > PAGE_SIZE) 4205 else if (cachep->size > PAGE_SIZE)
4130 limit = 8; 4206 limit = 8;
4131 else if (cachep->buffer_size > 1024) 4207 else if (cachep->size > 1024)
4132 limit = 24; 4208 limit = 24;
4133 else if (cachep->buffer_size > 256) 4209 else if (cachep->size > 256)
4134 limit = 54; 4210 limit = 54;
4135 else 4211 else
4136 limit = 120; 4212 limit = 120;
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4145 * to a larger limit. Thus disabled by default. 4221 * to a larger limit. Thus disabled by default.
4146 */ 4222 */
4147 shared = 0; 4223 shared = 0;
4148 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 4224 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4149 shared = 8; 4225 shared = 8;
4150 4226
4151#if DEBUG 4227#if DEBUG
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w)
4211 int node = numa_mem_id(); 4287 int node = numa_mem_id();
4212 struct delayed_work *work = to_delayed_work(w); 4288 struct delayed_work *work = to_delayed_work(w);
4213 4289
4214 if (!mutex_trylock(&cache_chain_mutex)) 4290 if (!mutex_trylock(&slab_mutex))
4215 /* Give up. Setup the next iteration. */ 4291 /* Give up. Setup the next iteration. */
4216 goto out; 4292 goto out;
4217 4293
4218 list_for_each_entry(searchp, &cache_chain, next) { 4294 list_for_each_entry(searchp, &slab_caches, list) {
4219 check_irq_on(); 4295 check_irq_on();
4220 4296
4221 /* 4297 /*
@@ -4253,7 +4329,7 @@ next:
4253 cond_resched(); 4329 cond_resched();
4254 } 4330 }
4255 check_irq_on(); 4331 check_irq_on();
4256 mutex_unlock(&cache_chain_mutex); 4332 mutex_unlock(&slab_mutex);
4257 next_reap_node(); 4333 next_reap_node();
4258out: 4334out:
4259 /* Set up the next iteration */ 4335 /* Set up the next iteration */
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
4289{ 4365{
4290 loff_t n = *pos; 4366 loff_t n = *pos;
4291 4367
4292 mutex_lock(&cache_chain_mutex); 4368 mutex_lock(&slab_mutex);
4293 if (!n) 4369 if (!n)
4294 print_slabinfo_header(m); 4370 print_slabinfo_header(m);
4295 4371
4296 return seq_list_start(&cache_chain, *pos); 4372 return seq_list_start(&slab_caches, *pos);
4297} 4373}
4298 4374
4299static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4375static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4300{ 4376{
4301 return seq_list_next(p, &cache_chain, pos); 4377 return seq_list_next(p, &slab_caches, pos);
4302} 4378}
4303 4379
4304static void s_stop(struct seq_file *m, void *p) 4380static void s_stop(struct seq_file *m, void *p)
4305{ 4381{
4306 mutex_unlock(&cache_chain_mutex); 4382 mutex_unlock(&slab_mutex);
4307} 4383}
4308 4384
4309static int s_show(struct seq_file *m, void *p) 4385static int s_show(struct seq_file *m, void *p)
4310{ 4386{
4311 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4387 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4312 struct slab *slabp; 4388 struct slab *slabp;
4313 unsigned long active_objs; 4389 unsigned long active_objs;
4314 unsigned long num_objs; 4390 unsigned long num_objs;
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p)
4364 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4440 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4365 4441
4366 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4442 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4367 name, active_objs, num_objs, cachep->buffer_size, 4443 name, active_objs, num_objs, cachep->size,
4368 cachep->num, (1 << cachep->gfporder)); 4444 cachep->num, (1 << cachep->gfporder));
4369 seq_printf(m, " : tunables %4u %4u %4u", 4445 seq_printf(m, " : tunables %4u %4u %4u",
4370 cachep->limit, cachep->batchcount, cachep->shared); 4446 cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4454 return -EINVAL; 4530 return -EINVAL;
4455 4531
4456 /* Find the cache in the chain of caches. */ 4532 /* Find the cache in the chain of caches. */
4457 mutex_lock(&cache_chain_mutex); 4533 mutex_lock(&slab_mutex);
4458 res = -EINVAL; 4534 res = -EINVAL;
4459 list_for_each_entry(cachep, &cache_chain, next) { 4535 list_for_each_entry(cachep, &slab_caches, list) {
4460 if (!strcmp(cachep->name, kbuf)) { 4536 if (!strcmp(cachep->name, kbuf)) {
4461 if (limit < 1 || batchcount < 1 || 4537 if (limit < 1 || batchcount < 1 ||
4462 batchcount > limit || shared < 0) { 4538 batchcount > limit || shared < 0) {
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4469 break; 4545 break;
4470 } 4546 }
4471 } 4547 }
4472 mutex_unlock(&cache_chain_mutex); 4548 mutex_unlock(&slab_mutex);
4473 if (res >= 0) 4549 if (res >= 0)
4474 res = count; 4550 res = count;
4475 return res; 4551 return res;
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = {
4492 4568
4493static void *leaks_start(struct seq_file *m, loff_t *pos) 4569static void *leaks_start(struct seq_file *m, loff_t *pos)
4494{ 4570{
4495 mutex_lock(&cache_chain_mutex); 4571 mutex_lock(&slab_mutex);
4496 return seq_list_start(&cache_chain, *pos); 4572 return seq_list_start(&slab_caches, *pos);
4497} 4573}
4498 4574
4499static inline int add_caller(unsigned long *n, unsigned long v) 4575static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4532 int i; 4608 int i;
4533 if (n[0] == n[1]) 4609 if (n[0] == n[1])
4534 return; 4610 return;
4535 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4611 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4536 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4612 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4537 continue; 4613 continue;
4538 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4614 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4558 4634
4559static int leaks_show(struct seq_file *m, void *p) 4635static int leaks_show(struct seq_file *m, void *p)
4560{ 4636{
4561 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4637 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4562 struct slab *slabp; 4638 struct slab *slabp;
4563 struct kmem_list3 *l3; 4639 struct kmem_list3 *l3;
4564 const char *name; 4640 const char *name;
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p)
4592 name = cachep->name; 4668 name = cachep->name;
4593 if (n[0] == n[1]) { 4669 if (n[0] == n[1]) {
4594 /* Increase the buffer size */ 4670 /* Increase the buffer size */
4595 mutex_unlock(&cache_chain_mutex); 4671 mutex_unlock(&slab_mutex);
4596 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4672 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4597 if (!m->private) { 4673 if (!m->private) {
4598 /* Too bad, we are really out */ 4674 /* Too bad, we are really out */
4599 m->private = n; 4675 m->private = n;
4600 mutex_lock(&cache_chain_mutex); 4676 mutex_lock(&slab_mutex);
4601 return -ENOMEM; 4677 return -ENOMEM;
4602 } 4678 }
4603 *(unsigned long *)m->private = n[0] * 2; 4679 *(unsigned long *)m->private = n[0] * 2;
4604 kfree(n); 4680 kfree(n);
4605 mutex_lock(&cache_chain_mutex); 4681 mutex_lock(&slab_mutex);
4606 /* Now make sure this entry will be retried */ 4682 /* Now make sure this entry will be retried */
4607 m->count = m->size; 4683 m->count = m->size;
4608 return 0; 4684 return 0;
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp)
4677 if (unlikely(objp == ZERO_SIZE_PTR)) 4753 if (unlikely(objp == ZERO_SIZE_PTR))
4678 return 0; 4754 return 0;
4679 4755
4680 return obj_size(virt_to_cache(objp)); 4756 return virt_to_cache(objp)->object_size;
4681} 4757}
4682EXPORT_SYMBOL(ksize); 4758EXPORT_SYMBOL(ksize);
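For reference: the slab.c hunks above begin calling ac_get_obj(), ac_put_obj() and clear_obj_pfmemalloc() without their definitions appearing in this excerpt. The intent is to tag array-cache entries that came from pfmemalloc (memory-reserve) pages so they are only handed to callers entitled to those reserves. A minimal sketch of such helpers, assuming the tag is carried in bit 0 of the stored pointer (the macro name and bit choice are assumptions, not taken from this diff):

/* Sketch only: mark objects that came from pfmemalloc pages by setting
 * bit 0 of the pointer kept in the per-CPU array cache. */
#define SLAB_OBJ_PFMEMALLOC	1UL	/* assumed tag value */

static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

On this reading, ac_put_obj() would apply set_obj_pfmemalloc() when the slab page is marked SlabPfmemalloc, and ac_get_obj() would skip or untag such entries depending on whether the current allocation may dip into the reserves.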
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 000000000000..db7848caaa25
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
1#ifndef MM_SLAB_H
2#define MM_SLAB_H
3/*
4 * Internal slab definitions
5 */
6
7/*
8 * State of the slab allocator.
9 *
10 * This is used to describe the states of the allocator during bootup.
11 * Allocators use this to gradually bootstrap themselves. Most allocators
12 * have the problem that the structures used for managing slab caches are
13 * allocated from slab caches themselves.
14 */
15enum slab_state {
16 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
20 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */
22};
23
24extern enum slab_state slab_state;
25
26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex;
28extern struct list_head slab_caches;
29
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *));
32
33#endif
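The shared slab_state above is what the new slab_is_available() in slab_common.c tests (slab_state >= UP). For orientation, a hedged sketch of how allocator-internal code typically consults it while the allocator is still bootstrapping; the helper name is invented for illustration:

#include <linux/gfp.h>
#include "slab.h"

/* Illustrative helper, not part of the patch: pick a GFP mask that is
 * safe before the slab allocator is fully up (must not sleep). */
static gfp_t bootstrap_gfp_flags(void)
{
	return slab_is_available() ? GFP_KERNEL : GFP_NOWAIT;
}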
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 000000000000..aa3ca5bb01b5
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
1/*
2 * Slab allocator functions that are independent of the allocator strategy
3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */
6#include <linux/slab.h>
7
8#include <linux/mm.h>
9#include <linux/poison.h>
10#include <linux/interrupt.h>
11#include <linux/memory.h>
12#include <linux/compiler.h>
13#include <linux/module.h>
14#include <linux/cpu.h>
15#include <linux/uaccess.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/page.h>
19
20#include "slab.h"
21
22enum slab_state slab_state;
23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex);
25
26/*
27 * kmem_cache_create - Create a cache.
28 * @name: A string which is used in /proc/slabinfo to identify this cache.
29 * @size: The size of objects to be created in this cache.
30 * @align: The required alignment for the objects.
31 * @flags: SLAB flags
32 * @ctor: A constructor for the objects.
33 *
34 * Returns a ptr to the cache on success, NULL on failure.
 35 * Cannot be called within an interrupt, but can be interrupted.
36 * The @ctor is run when new pages are allocated by the cache.
37 *
38 * The flags are
39 *
40 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
41 * to catch references to uninitialised memory.
42 *
43 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
44 * for buffer overruns.
45 *
46 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
47 * cacheline. This can be beneficial if you're counting cycles as closely
48 * as davem.
49 */
50
51struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
52 unsigned long flags, void (*ctor)(void *))
53{
54 struct kmem_cache *s = NULL;
55
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64
65 get_online_cpus();
66 mutex_lock(&slab_mutex);
67
68#ifdef CONFIG_DEBUG_VM
69 list_for_each_entry(s, &slab_caches, list) {
70 char tmp;
71 int res;
72
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85
86 if (!strcmp(s->name, name)) {
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name"
88 " already exists.\n",
89 name);
90 dump_stack();
91 s = NULL;
92 goto oops;
93 }
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97#endif
98
99 s = __kmem_cache_create(name, size, align, flags, ctor);
100
101#ifdef CONFIG_DEBUG_VM
102oops:
103#endif
104 mutex_unlock(&slab_mutex);
105 put_online_cpus();
106
107#ifdef CONFIG_DEBUG_VM
108out:
109#endif
110 if (!s && (flags & SLAB_PANIC))
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name);
112
113 return s;
114}
115EXPORT_SYMBOL(kmem_cache_create);
116
117int slab_is_available(void)
118{
119 return slab_state >= UP;
120}
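A hypothetical caller of the consolidated kmem_cache_create(), shown only to illustrate the API documented in the comment block above; the 'foo' names are invented:

#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Hypothetical example cache, not from this patch. */
struct foo {
	int id;
	struct list_head node;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* The duplicate-name and sanity checks, plus slab_mutex handling,
	 * now happen in the common code above before the allocator-specific
	 * __kmem_cache_create() is invoked. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}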
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad1..45d4ca79933a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
59 59
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include "slab.h"
63
62#include <linux/mm.h> 64#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */ 65#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 66#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
92typedef struct slob_block slob_t; 94typedef struct slob_block slob_t;
93 95
94/* 96/*
95 * We use struct page fields to manage some slob allocation aspects,
96 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
97 * just define our own struct page type variant here.
98 */
99struct slob_page {
100 union {
101 struct {
102 unsigned long flags; /* mandatory */
103 atomic_t _count; /* mandatory */
104 slobidx_t units; /* free units left in page */
105 unsigned long pad[2];
106 slob_t *free; /* first free slob_t in page */
107 struct list_head list; /* linked list of free pages */
108 };
109 struct page page;
110 };
111};
112static inline void struct_slob_page_wrong_size(void)
113{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
114
115/*
116 * free_slob_page: call before a slob_page is returned to the page allocator.
117 */
118static inline void free_slob_page(struct slob_page *sp)
119{
120 reset_page_mapcount(&sp->page);
121 sp->page.mapping = NULL;
122}
123
124/*
125 * All partially free slob pages go on these lists. 97 * All partially free slob pages go on these lists.
126 */ 98 */
127#define SLOB_BREAK1 256 99#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
131static LIST_HEAD(free_slob_large); 103static LIST_HEAD(free_slob_large);
132 104
133/* 105/*
134 * is_slob_page: True for all slob pages (false for bigblock pages)
135 */
136static inline int is_slob_page(struct slob_page *sp)
137{
138 return PageSlab((struct page *)sp);
139}
140
141static inline void set_slob_page(struct slob_page *sp)
142{
143 __SetPageSlab((struct page *)sp);
144}
145
146static inline void clear_slob_page(struct slob_page *sp)
147{
148 __ClearPageSlab((struct page *)sp);
149}
150
151static inline struct slob_page *slob_page(const void *addr)
152{
153 return (struct slob_page *)virt_to_page(addr);
154}
155
156/*
157 * slob_page_free: true for pages on free_slob_pages list. 106 * slob_page_free: true for pages on free_slob_pages list.
158 */ 107 */
159static inline int slob_page_free(struct slob_page *sp) 108static inline int slob_page_free(struct page *sp)
160{ 109{
161 return PageSlobFree((struct page *)sp); 110 return PageSlobFree(sp);
162} 111}
163 112
164static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 113static void set_slob_page_free(struct page *sp, struct list_head *list)
165{ 114{
166 list_add(&sp->list, list); 115 list_add(&sp->list, list);
167 __SetPageSlobFree((struct page *)sp); 116 __SetPageSlobFree(sp);
168} 117}
169 118
170static inline void clear_slob_page_free(struct slob_page *sp) 119static inline void clear_slob_page_free(struct page *sp)
171{ 120{
172 list_del(&sp->list); 121 list_del(&sp->list);
173 __ClearPageSlobFree((struct page *)sp); 122 __ClearPageSlobFree(sp);
174} 123}
175 124
176#define SLOB_UNIT sizeof(slob_t) 125#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
267/* 216/*
268 * Allocate a slob block within a given slob_page sp. 217 * Allocate a slob block within a given slob_page sp.
269 */ 218 */
270static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 219static void *slob_page_alloc(struct page *sp, size_t size, int align)
271{ 220{
272 slob_t *prev, *cur, *aligned = NULL; 221 slob_t *prev, *cur, *aligned = NULL;
273 int delta = 0, units = SLOB_UNITS(size); 222 int delta = 0, units = SLOB_UNITS(size);
274 223
275 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 224 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
276 slobidx_t avail = slob_units(cur); 225 slobidx_t avail = slob_units(cur);
277 226
278 if (align) { 227 if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
296 if (prev) 245 if (prev)
297 set_slob(prev, slob_units(prev), next); 246 set_slob(prev, slob_units(prev), next);
298 else 247 else
299 sp->free = next; 248 sp->freelist = next;
300 } else { /* fragment */ 249 } else { /* fragment */
301 if (prev) 250 if (prev)
302 set_slob(prev, slob_units(prev), cur + units); 251 set_slob(prev, slob_units(prev), cur + units);
303 else 252 else
304 sp->free = cur + units; 253 sp->freelist = cur + units;
305 set_slob(cur + units, avail - units, next); 254 set_slob(cur + units, avail - units, next);
306 } 255 }
307 256
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
320 */ 269 */
321static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 270static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
322{ 271{
323 struct slob_page *sp; 272 struct page *sp;
324 struct list_head *prev; 273 struct list_head *prev;
325 struct list_head *slob_list; 274 struct list_head *slob_list;
326 slob_t *b = NULL; 275 slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
342 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
343 */ 292 */
344 if (node != -1 && page_to_nid(&sp->page) != node) 293 if (node != -1 && page_to_nid(sp) != node)
345 continue; 294 continue;
346#endif 295#endif
347 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
369 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); 318 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
370 if (!b) 319 if (!b)
371 return NULL; 320 return NULL;
372 sp = slob_page(b); 321 sp = virt_to_page(b);
373 set_slob_page(sp); 322 __SetPageSlab(sp);
374 323
375 spin_lock_irqsave(&slob_lock, flags); 324 spin_lock_irqsave(&slob_lock, flags);
376 sp->units = SLOB_UNITS(PAGE_SIZE); 325 sp->units = SLOB_UNITS(PAGE_SIZE);
377 sp->free = b; 326 sp->freelist = b;
378 INIT_LIST_HEAD(&sp->list); 327 INIT_LIST_HEAD(&sp->list);
379 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 328 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
380 set_slob_page_free(sp, slob_list); 329 set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
392 */ 341 */
393static void slob_free(void *block, int size) 342static void slob_free(void *block, int size)
394{ 343{
395 struct slob_page *sp; 344 struct page *sp;
396 slob_t *prev, *next, *b = (slob_t *)block; 345 slob_t *prev, *next, *b = (slob_t *)block;
397 slobidx_t units; 346 slobidx_t units;
398 unsigned long flags; 347 unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
402 return; 351 return;
403 BUG_ON(!size); 352 BUG_ON(!size);
404 353
405 sp = slob_page(block); 354 sp = virt_to_page(block);
406 units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
407 356
408 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
412 if (slob_page_free(sp)) 361 if (slob_page_free(sp))
413 clear_slob_page_free(sp); 362 clear_slob_page_free(sp);
414 spin_unlock_irqrestore(&slob_lock, flags); 363 spin_unlock_irqrestore(&slob_lock, flags);
415 clear_slob_page(sp); 364 __ClearPageSlab(sp);
416 free_slob_page(sp); 365 reset_page_mapcount(sp);
417 slob_free_pages(b, 0); 366 slob_free_pages(b, 0);
418 return; 367 return;
419 } 368 }
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
421 if (!slob_page_free(sp)) { 370 if (!slob_page_free(sp)) {
422 /* This slob page is about to become partially free. Easy! */ 371 /* This slob page is about to become partially free. Easy! */
423 sp->units = units; 372 sp->units = units;
424 sp->free = b; 373 sp->freelist = b;
425 set_slob(b, units, 374 set_slob(b, units,
426 (void *)((unsigned long)(b + 375 (void *)((unsigned long)(b +
427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 376 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
441 */ 390 */
442 sp->units += units; 391 sp->units += units;
443 392
444 if (b < sp->free) { 393 if (b < (slob_t *)sp->freelist) {
445 if (b + units == sp->free) { 394 if (b + units == sp->freelist) {
446 units += slob_units(sp->free); 395 units += slob_units(sp->freelist);
447 sp->free = slob_next(sp->free); 396 sp->freelist = slob_next(sp->freelist);
448 } 397 }
449 set_slob(b, units, sp->free); 398 set_slob(b, units, sp->freelist);
450 sp->free = b; 399 sp->freelist = b;
451 } else { 400 } else {
452 prev = sp->free; 401 prev = sp->freelist;
453 next = slob_next(prev); 402 next = slob_next(prev);
454 while (b > next) { 403 while (b > next) {
455 prev = next; 404 prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
522 471
523void kfree(const void *block) 472void kfree(const void *block)
524{ 473{
525 struct slob_page *sp; 474 struct page *sp;
526 475
527 trace_kfree(_RET_IP_, block); 476 trace_kfree(_RET_IP_, block);
528 477
@@ -530,43 +479,36 @@ void kfree(const void *block)
530 return; 479 return;
531 kmemleak_free(block); 480 kmemleak_free(block);
532 481
533 sp = slob_page(block); 482 sp = virt_to_page(block);
534 if (is_slob_page(sp)) { 483 if (PageSlab(sp)) {
535 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
536 unsigned int *m = (unsigned int *)(block - align); 485 unsigned int *m = (unsigned int *)(block - align);
537 slob_free(m, *m + align); 486 slob_free(m, *m + align);
538 } else 487 } else
539 put_page(&sp->page); 488 put_page(sp);
540} 489}
541EXPORT_SYMBOL(kfree); 490EXPORT_SYMBOL(kfree);
542 491
543/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ 492/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
544size_t ksize(const void *block) 493size_t ksize(const void *block)
545{ 494{
546 struct slob_page *sp; 495 struct page *sp;
547 496
548 BUG_ON(!block); 497 BUG_ON(!block);
549 if (unlikely(block == ZERO_SIZE_PTR)) 498 if (unlikely(block == ZERO_SIZE_PTR))
550 return 0; 499 return 0;
551 500
552 sp = slob_page(block); 501 sp = virt_to_page(block);
553 if (is_slob_page(sp)) { 502 if (PageSlab(sp)) {
554 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
555 unsigned int *m = (unsigned int *)(block - align); 504 unsigned int *m = (unsigned int *)(block - align);
556 return SLOB_UNITS(*m) * SLOB_UNIT; 505 return SLOB_UNITS(*m) * SLOB_UNIT;
557 } else 506 } else
558 return sp->page.private; 507 return sp->private;
559} 508}
560EXPORT_SYMBOL(ksize); 509EXPORT_SYMBOL(ksize);
561 510
562struct kmem_cache { 511struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
563 unsigned int size, align;
564 unsigned long flags;
565 const char *name;
566 void (*ctor)(void *);
567};
568
569struct kmem_cache *kmem_cache_create(const char *name, size_t size,
570 size_t align, unsigned long flags, void (*ctor)(void *)) 512 size_t align, unsigned long flags, void (*ctor)(void *))
571{ 513{
572 struct kmem_cache *c; 514 struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
589 c->align = ARCH_SLAB_MINALIGN; 531 c->align = ARCH_SLAB_MINALIGN;
590 if (c->align < align) 532 if (c->align < align)
591 c->align = align; 533 c->align = align;
592 } else if (flags & SLAB_PANIC)
593 panic("Cannot create slab cache %s\n", name);
594 534
595 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); 535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 }
596 return c; 538 return c;
597} 539}
598EXPORT_SYMBOL(kmem_cache_create);
599 540
600void kmem_cache_destroy(struct kmem_cache *c) 541void kmem_cache_destroy(struct kmem_cache *c)
601{ 542{
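The slob hunks above drop the private struct slob_page in favour of the generic struct page: kfree() and ksize() now dispatch on PageSlab(), reading the size prefix stored just before a small block and falling back to the page descriptor for whole-page allocations. As a rough user-space illustration of that dispatch (a sketch only; toy_page, toy_ksize and the other names are made up, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MINALIGN sizeof(unsigned long)   /* stand-in for ARCH_KMALLOC_MINALIGN */

/* Toy page descriptor: 'slab' mimics PageSlab(), 'private' mimics page->private. */
struct toy_page {
        int slab;
        size_t private;
};

/* Small blocks keep their size in a prefix word just before the returned pointer. */
static void *toy_small_alloc(struct toy_page *pg, size_t size)
{
        unsigned char *m = malloc(size + MINALIGN);
        if (!m)
                return NULL;
        pg->slab = 1;
        memcpy(m, &size, sizeof(size));
        return m + MINALIGN;
}

/* "Large" blocks are whole-page allocations; their size lives in the descriptor. */
static void *toy_large_alloc(struct toy_page *pg, size_t size)
{
        pg->slab = 0;
        pg->private = size;
        return malloc(size);
}

/* ksize()-style dispatch: prefix word for slab-managed blocks, descriptor otherwise. */
static size_t toy_ksize(const struct toy_page *pg, const void *block)
{
        if (pg->slab) {
                size_t size;
                memcpy(&size, (const unsigned char *)block - MINALIGN, sizeof(size));
                return size;
        }
        return pg->private;
}

int main(void)
{
        struct toy_page p1, p2;
        void *a = toy_small_alloc(&p1, 24);
        void *b = toy_large_alloc(&p2, 8192);

        printf("small: %zu, large: %zu\n", toy_ksize(&p1, a), toy_ksize(&p2, b));
        free((unsigned char *)a - MINALIGN);
        free(b);
        return 0;
}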
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 619}
679EXPORT_SYMBOL(kmem_cache_shrink); 620EXPORT_SYMBOL(kmem_cache_shrink);
680 621
681static unsigned int slob_ready __read_mostly;
682
683int slab_is_available(void)
684{
685 return slob_ready;
686}
687
688void __init kmem_cache_init(void) 622void __init kmem_cache_init(void)
689{ 623{
690 slob_ready = 1; 624 slab_state = UP;
691} 625}
692 626
693void __init kmem_cache_init_late(void) 627void __init kmem_cache_init_late(void)
694{ 628{
695 /* Nothing to do */ 629 slab_state = FULL;
696} 630}
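With the per-allocator slob_ready flag gone, SLOB reports readiness through the same slab_state progression the other allocators use: kmem_cache_init() raises it to UP and kmem_cache_init_late() to FULL. A minimal model of that bring-up sequence follows; the state names mirror the patch, but the exact member list lives in the shared mm/slab.h and may contain more states than shown here.

#include <stdio.h>

/* Simplified bring-up states shared by the allocators in this series. */
enum slab_state { DOWN, PARTIAL, UP, FULL };

static enum slab_state slab_state = DOWN;

/* slab_is_available()-style check: callers may use the allocator once we reach UP. */
static int slab_is_available(void)
{
        return slab_state >= UP;
}

static void kmem_cache_init(void)      { slab_state = UP; }
static void kmem_cache_init_late(void) { slab_state = FULL; }

int main(void)
{
        printf("before init: available=%d\n", slab_is_available());
        kmem_cache_init();
        printf("after init:  available=%d\n", slab_is_available());
        kmem_cache_init_late();
        printf("late:        available=%d state=%d\n", slab_is_available(), slab_state);
        return 0;
}

Keeping a single ordered state variable lets early boot code ask one question ("is the allocator usable yet?") without caring which slab implementation was compiled in.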
diff --git a/mm/slub.c b/mm/slub.c
index ffe13fdf8144..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
19#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/kmemcheck.h> 22#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
33 34
34#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
35 36
37#include "internal.h"
38
36/* 39/*
37 * Lock order: 40 * Lock order:
38 * 1. slub_lock (Global Semaphore) 41 * 1. slab_mutex (Global Mutex)
39 * 2. node->list_lock 42 * 2. node->list_lock
40 * 3. slab_lock(page) (Only on some arches and for debugging) 43 * 3. slab_lock(page) (Only on some arches and for debugging)
41 * 44 *
42 * slub_lock 45 * slab_mutex
43 * 46 *
44 * The role of the slub_lock is to protect the list of all the slabs 47 * The role of the slab_mutex is to protect the list of all the slabs
45 * and to synchronize major metadata changes to slab cache structures. 48 * and to synchronize major metadata changes to slab cache structures.
46 * 49 *
47 * The slab_lock is only used for debugging and on arches that do not 50 * The slab_lock is only used for debugging and on arches that do not
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
182static struct notifier_block slab_notifier; 185static struct notifier_block slab_notifier;
183#endif 186#endif
184 187
185static enum {
186 DOWN, /* No slab functionality available */
187 PARTIAL, /* Kmem_cache_node works */
188 UP, /* Everything works but does not show up in sysfs */
189 SYSFS /* Sysfs up */
190} slab_state = DOWN;
191
192/* A list of all slab caches on the system */
193static DECLARE_RWSEM(slub_lock);
194static LIST_HEAD(slab_caches);
195
196/* 188/*
197 * Tracking user of a slab. 189 * Tracking user of a slab.
198 */ 190 */
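The definitions removed here (the slab_state enum, the global slub_lock and the slab_caches list) move into the common mm/slab.h so that slab, slob and slub all share one slab_mutex and one registry of caches. Stripped of the kernel details, the pattern is simply a mutex-protected list; a small pthread analogue, with illustrative names only:

#include <pthread.h>
#include <stdio.h>

/* One global mutex guards the registry of caches, like slab_mutex does. */
static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;

struct toy_cache {
        char name[32];
        struct toy_cache *next;
};

static struct toy_cache *slab_caches;   /* list head, like the kernel's slab_caches */

static void register_cache(struct toy_cache *c, const char *name)
{
        snprintf(c->name, sizeof(c->name), "%s", name);
        pthread_mutex_lock(&slab_mutex);
        c->next = slab_caches;           /* list mutations happen only under the mutex */
        slab_caches = c;
        pthread_mutex_unlock(&slab_mutex);
}

static void for_each_cache(void (*fn)(struct toy_cache *))
{
        pthread_mutex_lock(&slab_mutex); /* walkers take the same mutex */
        for (struct toy_cache *c = slab_caches; c; c = c->next)
                fn(c);
        pthread_mutex_unlock(&slab_mutex);
}

static void show(struct toy_cache *c) { printf("cache %s\n", c->name); }

int main(void)
{
        struct toy_cache a, b;
        register_cache(&a, "kmalloc-64");
        register_cache(&b, "dentry");
        for_each_cache(show);
        return 0;
}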
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
237 * Core slab cache functions 229 * Core slab cache functions
238 *******************************************************************/ 230 *******************************************************************/
239 231
240int slab_is_available(void)
241{
242 return slab_state >= UP;
243}
244
245static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 232static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246{ 233{
247 return s->node[node]; 234 return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
311 * and whatever may come after it. 298 * and whatever may come after it.
312 */ 299 */
313 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 300 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
314 return s->objsize; 301 return s->object_size;
315 302
316#endif 303#endif
317 /* 304 /*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
609 if (p > addr + 16) 596 if (p > addr + 16)
610 print_section("Bytes b4 ", p - 16, 16); 597 print_section("Bytes b4 ", p - 16, 16);
611 598
612 print_section("Object ", p, min_t(unsigned long, s->objsize, 599 print_section("Object ", p, min_t(unsigned long, s->object_size,
613 PAGE_SIZE)); 600 PAGE_SIZE));
614 if (s->flags & SLAB_RED_ZONE) 601 if (s->flags & SLAB_RED_ZONE)
615 print_section("Redzone ", p + s->objsize, 602 print_section("Redzone ", p + s->object_size,
616 s->inuse - s->objsize); 603 s->inuse - s->object_size);
617 604
618 if (s->offset) 605 if (s->offset)
619 off = s->offset + sizeof(void *); 606 off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
655 u8 *p = object; 642 u8 *p = object;
656 643
657 if (s->flags & __OBJECT_POISON) { 644 if (s->flags & __OBJECT_POISON) {
658 memset(p, POISON_FREE, s->objsize - 1); 645 memset(p, POISON_FREE, s->object_size - 1);
659 p[s->objsize - 1] = POISON_END; 646 p[s->object_size - 1] = POISON_END;
660 } 647 }
661 648
662 if (s->flags & SLAB_RED_ZONE) 649 if (s->flags & SLAB_RED_ZONE)
663 memset(p + s->objsize, val, s->inuse - s->objsize); 650 memset(p + s->object_size, val, s->inuse - s->object_size);
664} 651}
665 652
666static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 653static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
705 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 692 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
706 * 0xa5 (POISON_END) 693 * 0xa5 (POISON_END)
707 * 694 *
708 * object + s->objsize 695 * object + s->object_size
709 * Padding to reach word boundary. This is also used for Redzoning. 696 * Padding to reach word boundary. This is also used for Redzoning.
710 * Padding is extended by another word if Redzoning is enabled and 697 * Padding is extended by another word if Redzoning is enabled and
711 * objsize == inuse. 698 * object_size == inuse.
712 * 699 *
713 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 700 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
714 * 0xcc (RED_ACTIVE) for objects in use. 701 * 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
727 * object + s->size 714 * object + s->size
728 * Nothing is used beyond s->size. 715 * Nothing is used beyond s->size.
729 * 716 *
730 * If slabcaches are merged then the objsize and inuse boundaries are mostly 717 * If slabcaches are merged then the object_size and inuse boundaries are mostly
731 * ignored. And therefore no slab options that rely on these boundaries 718 * ignored. And therefore no slab options that rely on these boundaries
732 * may be used with merged slabcaches. 719 * may be used with merged slabcaches.
733 */ 720 */
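The comment block above describes SLUB's per-object debug layout: the payload (object_size), optional poison bytes, a red zone, the free pointer, tracking records and padding up to s->size. The sketch below only approximates how those offsets relate, very loosely in the spirit of calculate_sizes(); it ignores several flags and the real size of struct track, so treat the arithmetic as illustrative.

#include <stdio.h>

#define WORD  sizeof(void *)
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

/* Simplified view of a debug layout; the kernel's calculate_sizes() handles more cases. */
struct layout {
        unsigned long object_size;  /* payload the caller asked for */
        unsigned long inuse;        /* payload plus right red zone, word aligned */
        unsigned long size;         /* full per-object footprint in the slab */
};

static struct layout debug_layout(unsigned long object_size, int red_zone, int store_user)
{
        struct layout l;

        l.object_size = object_size;
        l.inuse = ALIGN_UP(object_size, WORD);
        if (red_zone && l.inuse == object_size)
                l.inuse += WORD;                 /* extra word so the red zone has room */

        l.size = l.inuse + WORD;                 /* free pointer stored after the payload */
        if (store_user)
                l.size += 2 * 2 * WORD;          /* rough stand-in for two track records */
        l.size = ALIGN_UP(l.size, WORD);
        return l;
}

int main(void)
{
        struct layout l = debug_layout(52, 1, 1);
        printf("object_size=%lu inuse=%lu size=%lu\n", l.object_size, l.inuse, l.size);
        return 0;
}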
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
787 void *object, u8 val) 774 void *object, u8 val)
788{ 775{
789 u8 *p = object; 776 u8 *p = object;
790 u8 *endobject = object + s->objsize; 777 u8 *endobject = object + s->object_size;
791 778
792 if (s->flags & SLAB_RED_ZONE) { 779 if (s->flags & SLAB_RED_ZONE) {
793 if (!check_bytes_and_report(s, page, object, "Redzone", 780 if (!check_bytes_and_report(s, page, object, "Redzone",
794 endobject, val, s->inuse - s->objsize)) 781 endobject, val, s->inuse - s->object_size))
795 return 0; 782 return 0;
796 } else { 783 } else {
797 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 784 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
798 check_bytes_and_report(s, page, p, "Alignment padding", 785 check_bytes_and_report(s, page, p, "Alignment padding",
799 endobject, POISON_INUSE, s->inuse - s->objsize); 786 endobject, POISON_INUSE, s->inuse - s->object_size);
800 } 787 }
801 } 788 }
802 789
803 if (s->flags & SLAB_POISON) { 790 if (s->flags & SLAB_POISON) {
804 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 791 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
805 (!check_bytes_and_report(s, page, p, "Poison", p, 792 (!check_bytes_and_report(s, page, p, "Poison", p,
806 POISON_FREE, s->objsize - 1) || 793 POISON_FREE, s->object_size - 1) ||
807 !check_bytes_and_report(s, page, p, "Poison", 794 !check_bytes_and_report(s, page, p, "Poison",
808 p + s->objsize - 1, POISON_END, 1))) 795 p + s->object_size - 1, POISON_END, 1)))
809 return 0; 796 return 0;
810 /* 797 /*
811 * check_pad_bytes cleans up on its own. 798 * check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
926 page->freelist); 913 page->freelist);
927 914
928 if (!alloc) 915 if (!alloc)
929 print_section("Object ", (void *)object, s->objsize); 916 print_section("Object ", (void *)object, s->object_size);
930 917
931 dump_stack(); 918 dump_stack();
932 } 919 }
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
942 lockdep_trace_alloc(flags); 929 lockdep_trace_alloc(flags);
943 might_sleep_if(flags & __GFP_WAIT); 930 might_sleep_if(flags & __GFP_WAIT);
944 931
945 return should_failslab(s->objsize, flags, s->flags); 932 return should_failslab(s->object_size, flags, s->flags);
946} 933}
947 934
948static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 935static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
949{ 936{
950 flags &= gfp_allowed_mask; 937 flags &= gfp_allowed_mask;
951 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 938 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
952 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 939 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
953} 940}
954 941
955static inline void slab_free_hook(struct kmem_cache *s, void *x) 942static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
966 unsigned long flags; 953 unsigned long flags;
967 954
968 local_irq_save(flags); 955 local_irq_save(flags);
969 kmemcheck_slab_free(s, x, s->objsize); 956 kmemcheck_slab_free(s, x, s->object_size);
970 debug_check_no_locks_freed(x, s->objsize); 957 debug_check_no_locks_freed(x, s->object_size);
971 local_irq_restore(flags); 958 local_irq_restore(flags);
972 } 959 }
973#endif 960#endif
974 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 961 if (!(s->flags & SLAB_DEBUG_OBJECTS))
975 debug_check_no_obj_freed(x, s->objsize); 962 debug_check_no_obj_freed(x, s->object_size);
976} 963}
977 964
978/* 965/*
@@ -1207,7 +1194,7 @@ out:
1207 1194
1208__setup("slub_debug", setup_slub_debug); 1195__setup("slub_debug", setup_slub_debug);
1209 1196
1210static unsigned long kmem_cache_flags(unsigned long objsize, 1197static unsigned long kmem_cache_flags(unsigned long object_size,
1211 unsigned long flags, const char *name, 1198 unsigned long flags, const char *name,
1212 void (*ctor)(void *)) 1199 void (*ctor)(void *))
1213{ 1200{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1237static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1238 struct page *page) {} 1225 struct page *page) {}
1239static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1226static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1240static inline unsigned long kmem_cache_flags(unsigned long objsize, 1227static inline unsigned long kmem_cache_flags(unsigned long object_size,
1241 unsigned long flags, const char *name, 1228 unsigned long flags, const char *name,
1242 void (*ctor)(void *)) 1229 void (*ctor)(void *))
1243{ 1230{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1314 stat(s, ORDER_FALLBACK); 1301 stat(s, ORDER_FALLBACK);
1315 } 1302 }
1316 1303
1317 if (flags & __GFP_WAIT) 1304 if (kmemcheck_enabled && page
1318 local_irq_disable();
1319
1320 if (!page)
1321 return NULL;
1322
1323 if (kmemcheck_enabled
1324 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1325 int pages = 1 << oo_order(oo); 1306 int pages = 1 << oo_order(oo);
1326 1307
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 kmemcheck_mark_unallocated_pages(page, pages); 1317 kmemcheck_mark_unallocated_pages(page, pages);
1337 } 1318 }
1338 1319
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1339 page->objects = oo_objects(oo); 1325 page->objects = oo_objects(oo);
1340 mod_zone_page_state(page_zone(page), 1326 mod_zone_page_state(page_zone(page),
1341 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1369,7 +1355,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1369 1355
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1357 page->slab = s;
1372 page->flags |= 1 << PG_slab; 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1373 1361
1374 start = page_address(page); 1362 start = page_address(page);
1375 1363
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1413 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1414 -pages); 1402 -pages);
1415 1403
1404 __ClearPageSlabPfmemalloc(page);
1416 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1417 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1418 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
1490} 1479}
1491 1480
1492/* 1481/*
1493 * Lock slab, remove from the partial list and put the object into the 1482 * Remove slab from the partial list, freeze it and
1494 * per cpu freelist. 1483 * return the pointer to the freelist.
1495 * 1484 *
1496 * Returns a list of objects or NULL if it fails. 1485 * Returns a list of objects or NULL if it fails.
1497 * 1486 *
1498 * Must hold list_lock. 1487 * Must hold list_lock since we modify the partial list.
1499 */ 1488 */
1500static inline void *acquire_slab(struct kmem_cache *s, 1489static inline void *acquire_slab(struct kmem_cache *s,
1501 struct kmem_cache_node *n, struct page *page, 1490 struct kmem_cache_node *n, struct page *page,
@@ -1510,22 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
1510 * The old freelist is the list of objects for the 1499 * The old freelist is the list of objects for the
1511 * per cpu allocation list. 1500 * per cpu allocation list.
1512 */ 1501 */
1513 do { 1502 freelist = page->freelist;
1514 freelist = page->freelist; 1503 counters = page->counters;
1515 counters = page->counters; 1504 new.counters = counters;
1516 new.counters = counters; 1505 if (mode) {
1517 if (mode) 1506 new.inuse = page->objects;
1518 new.inuse = page->objects; 1507 new.freelist = NULL;
1508 } else {
1509 new.freelist = freelist;
1510 }
1519 1511
1520 VM_BUG_ON(new.frozen); 1512 VM_BUG_ON(new.frozen);
1521 new.frozen = 1; 1513 new.frozen = 1;
1522 1514
1523 } while (!__cmpxchg_double_slab(s, page, 1515 if (!__cmpxchg_double_slab(s, page,
1524 freelist, counters, 1516 freelist, counters,
1525 NULL, new.counters, 1517 new.freelist, new.counters,
1526 "lock and freeze")); 1518 "acquire_slab"))
1519 return NULL;
1527 1520
1528 remove_partial(n, page); 1521 remove_partial(n, page);
1522 WARN_ON(!freelist);
1529 return freelist; 1523 return freelist;
1530} 1524}
1531 1525
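In the hunk above, acquire_slab() no longer loops on cmpxchg_double_slab(): it makes a single attempt to swap the page's freelist and counters, and returns NULL if it loses the race so the caller can simply move on to another partial slab. The sketch below shows that "try once, give up on contention" acquisition with a single atomic word; the real code swaps a paired {freelist, counters} value, which this model does not attempt to reproduce.

#include <stdatomic.h>
#include <stdio.h>

struct object { struct object *next; };

/* One shared freelist head standing in for page->freelist. */
static _Atomic(struct object *) freelist;

/*
 * Try once to take the whole freelist for a local, per-cpu style consumer.
 * Returns the old list on success, NULL if someone else changed it first.
 */
static struct object *acquire_freelist(void)
{
        struct object *old = atomic_load(&freelist);

        if (!old)
                return NULL;
        /* Single attempt: on failure we do not retry, mirroring the patch. */
        if (!atomic_compare_exchange_strong(&freelist, &old, NULL))
                return NULL;
        return old;
}

int main(void)
{
        struct object a = { 0 }, b = { &a };

        atomic_store(&freelist, &b);
        struct object *list = acquire_freelist();
        printf("first try:  %s\n", list ? "got list" : "lost race");
        printf("second try: %s\n", acquire_freelist() ? "got list" : "nothing left");
        return 0;
}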
@@ -1559,12 +1553,10 @@ static void *get_partial_node(struct kmem_cache *s,
1559 1553
1560 if (!object) { 1554 if (!object) {
1561 c->page = page; 1555 c->page = page;
1562 c->node = page_to_nid(page);
1563 stat(s, ALLOC_FROM_PARTIAL); 1556 stat(s, ALLOC_FROM_PARTIAL);
1564 object = t; 1557 object = t;
1565 available = page->objects - page->inuse; 1558 available = page->objects - page->inuse;
1566 } else { 1559 } else {
1567 page->freelist = t;
1568 available = put_cpu_partial(s, page, 0); 1560 available = put_cpu_partial(s, page, 0);
1569 stat(s, CPU_PARTIAL_NODE); 1561 stat(s, CPU_PARTIAL_NODE);
1570 } 1562 }
@@ -1579,7 +1571,7 @@ static void *get_partial_node(struct kmem_cache *s,
1579/* 1571/*
1580 * Get a page from somewhere. Search in increasing NUMA distances. 1572 * Get a page from somewhere. Search in increasing NUMA distances.
1581 */ 1573 */
1582static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, 1574static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1583 struct kmem_cache_cpu *c) 1575 struct kmem_cache_cpu *c)
1584{ 1576{
1585#ifdef CONFIG_NUMA 1577#ifdef CONFIG_NUMA
@@ -1614,7 +1606,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1614 1606
1615 do { 1607 do {
1616 cpuset_mems_cookie = get_mems_allowed(); 1608 cpuset_mems_cookie = get_mems_allowed();
1617 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 zonelist = node_zonelist(slab_node(), flags);
1618 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1619 struct kmem_cache_node *n; 1611 struct kmem_cache_node *n;
1620 1612
@@ -1728,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1728/* 1720/*
1729 * Remove the cpu slab 1721 * Remove the cpu slab
1730 */ 1722 */
1731static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1723static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1732{ 1724{
1733 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1725 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1734 struct page *page = c->page;
1735 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1726 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1736 int lock = 0; 1727 int lock = 0;
1737 enum slab_modes l = M_NONE, m = M_NONE; 1728 enum slab_modes l = M_NONE, m = M_NONE;
1738 void *freelist;
1739 void *nextfree; 1729 void *nextfree;
1740 int tail = DEACTIVATE_TO_HEAD; 1730 int tail = DEACTIVATE_TO_HEAD;
1741 struct page new; 1731 struct page new;
@@ -1746,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1746 tail = DEACTIVATE_TO_TAIL; 1736 tail = DEACTIVATE_TO_TAIL;
1747 } 1737 }
1748 1738
1749 c->tid = next_tid(c->tid);
1750 c->page = NULL;
1751 freelist = c->freelist;
1752 c->freelist = NULL;
1753
1754 /* 1739 /*
1755 * Stage one: Free all available per cpu objects back 1740 * Stage one: Free all available per cpu objects back
1756 * to the page freelist while it is still frozen. Leave the 1741 * to the page freelist while it is still frozen. Leave the
@@ -1876,21 +1861,31 @@ redo:
1876 } 1861 }
1877} 1862}
1878 1863
1879/* Unfreeze all the cpu partial slabs */ 1864/*
1865 * Unfreeze all the cpu partial slabs.
1866 *
1867 * This function must be called with interrupt disabled.
1868 */
1880static void unfreeze_partials(struct kmem_cache *s) 1869static void unfreeze_partials(struct kmem_cache *s)
1881{ 1870{
1882 struct kmem_cache_node *n = NULL; 1871 struct kmem_cache_node *n = NULL, *n2 = NULL;
1883 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1872 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1884 struct page *page, *discard_page = NULL; 1873 struct page *page, *discard_page = NULL;
1885 1874
1886 while ((page = c->partial)) { 1875 while ((page = c->partial)) {
1887 enum slab_modes { M_PARTIAL, M_FREE };
1888 enum slab_modes l, m;
1889 struct page new; 1876 struct page new;
1890 struct page old; 1877 struct page old;
1891 1878
1892 c->partial = page->next; 1879 c->partial = page->next;
1893 l = M_FREE; 1880
1881 n2 = get_node(s, page_to_nid(page));
1882 if (n != n2) {
1883 if (n)
1884 spin_unlock(&n->list_lock);
1885
1886 n = n2;
1887 spin_lock(&n->list_lock);
1888 }
1894 1889
1895 do { 1890 do {
1896 1891
@@ -1903,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
1903 1898
1904 new.frozen = 0; 1899 new.frozen = 0;
1905 1900
1906 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1901 } while (!__cmpxchg_double_slab(s, page,
1907 m = M_FREE;
1908 else {
1909 struct kmem_cache_node *n2 = get_node(s,
1910 page_to_nid(page));
1911
1912 m = M_PARTIAL;
1913 if (n != n2) {
1914 if (n)
1915 spin_unlock(&n->list_lock);
1916
1917 n = n2;
1918 spin_lock(&n->list_lock);
1919 }
1920 }
1921
1922 if (l != m) {
1923 if (l == M_PARTIAL) {
1924 remove_partial(n, page);
1925 stat(s, FREE_REMOVE_PARTIAL);
1926 } else {
1927 add_partial(n, page,
1928 DEACTIVATE_TO_TAIL);
1929 stat(s, FREE_ADD_PARTIAL);
1930 }
1931
1932 l = m;
1933 }
1934
1935 } while (!cmpxchg_double_slab(s, page,
1936 old.freelist, old.counters, 1902 old.freelist, old.counters,
1937 new.freelist, new.counters, 1903 new.freelist, new.counters,
1938 "unfreezing slab")); 1904 "unfreezing slab"));
1939 1905
1940 if (m == M_FREE) { 1906 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1941 page->next = discard_page; 1907 page->next = discard_page;
1942 discard_page = page; 1908 discard_page = page;
1909 } else {
1910 add_partial(n, page, DEACTIVATE_TO_TAIL);
1911 stat(s, FREE_ADD_PARTIAL);
1943 } 1912 }
1944 } 1913 }
1945 1914
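The rewritten unfreeze_partials() drops the M_PARTIAL/M_FREE mode juggling: it takes the node's list_lock up front, switches locks only when the next page on the per-cpu partial list belongs to a different node, and decides free-versus-readd after the cmpxchg. The lock-batching idea on its own looks like the pthread sketch below (node_lock, drain and struct item are illustrative names, not kernel code):

#include <pthread.h>
#include <stdio.h>

#define NODES 2

static pthread_mutex_t node_lock[NODES] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct item { int node; struct item *next; };

/* Walk a mixed-node list, holding each node's lock only while handling its items. */
static void drain(struct item *head)
{
        int locked = -1;                         /* no lock held yet */

        for (struct item *it = head; it; it = it->next) {
                if (it->node != locked) {
                        if (locked >= 0)
                                pthread_mutex_unlock(&node_lock[locked]);
                        locked = it->node;
                        pthread_mutex_lock(&node_lock[locked]);
                }
                printf("handling item on node %d under its lock\n", it->node);
        }
        if (locked >= 0)
                pthread_mutex_unlock(&node_lock[locked]);
}

int main(void)
{
        struct item c = { 1, NULL }, b = { 1, &c }, a = { 0, &b };
        drain(&a);
        return 0;
}

Batching this way keeps the common case (consecutive pages from the same node) down to one lock acquisition instead of one per page.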
@@ -2008,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2008static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2009{ 1978{
2010 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
2011 deactivate_slab(s, c); 1980 deactivate_slab(s, c->page, c->freelist);
1981
1982 c->tid = next_tid(c->tid);
1983 c->page = NULL;
1984 c->freelist = NULL;
2012} 1985}
2013 1986
2014/* 1987/*
@@ -2040,7 +2013,7 @@ static bool has_cpu_slab(int cpu, void *info)
2040 struct kmem_cache *s = info; 2013 struct kmem_cache *s = info;
2041 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2014 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2042 2015
2043 return !!(c->page); 2016 return c->page || c->partial;
2044} 2017}
2045 2018
2046static void flush_all(struct kmem_cache *s) 2019static void flush_all(struct kmem_cache *s)
@@ -2052,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
2052 * Check if the objects in a per cpu structure fit numa 2025 * Check if the objects in a per cpu structure fit numa
2053 * locality expectations. 2026 * locality expectations.
2054 */ 2027 */
2055static inline int node_match(struct kmem_cache_cpu *c, int node) 2028static inline int node_match(struct page *page, int node)
2056{ 2029{
2057#ifdef CONFIG_NUMA 2030#ifdef CONFIG_NUMA
2058 if (node != NUMA_NO_NODE && c->node != node) 2031 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2059 return 0; 2032 return 0;
2060#endif 2033#endif
2061 return 1; 2034 return 1;
@@ -2098,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2098 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2071 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2099 nid, gfpflags); 2072 nid, gfpflags);
2100 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2073 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2101 "default order: %d, min order: %d\n", s->name, s->objsize, 2074 "default order: %d, min order: %d\n", s->name, s->object_size,
2102 s->size, oo_order(s->oo), oo_order(s->min)); 2075 s->size, oo_order(s->oo), oo_order(s->min));
2103 2076
2104 if (oo_order(s->min) > get_order(s->objsize)) 2077 if (oo_order(s->min) > get_order(s->object_size))
2105 printk(KERN_WARNING " %s debugging increased min order, use " 2078 printk(KERN_WARNING " %s debugging increased min order, use "
2106 "slub_debug=O to disable.\n", s->name); 2079 "slub_debug=O to disable.\n", s->name);
2107 2080
@@ -2127,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2127static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2100static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2128 int node, struct kmem_cache_cpu **pc) 2101 int node, struct kmem_cache_cpu **pc)
2129{ 2102{
2130 void *object; 2103 void *freelist;
2131 struct kmem_cache_cpu *c; 2104 struct kmem_cache_cpu *c = *pc;
2132 struct page *page = new_slab(s, flags, node); 2105 struct page *page;
2106
2107 freelist = get_partial(s, flags, node, c);
2133 2108
2109 if (freelist)
2110 return freelist;
2111
2112 page = new_slab(s, flags, node);
2134 if (page) { 2113 if (page) {
2135 c = __this_cpu_ptr(s->cpu_slab); 2114 c = __this_cpu_ptr(s->cpu_slab);
2136 if (c->page) 2115 if (c->page)
@@ -2140,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2140 * No other reference to the page yet so we can 2119 * No other reference to the page yet so we can
2141 * muck around with it freely without cmpxchg 2120 * muck around with it freely without cmpxchg
2142 */ 2121 */
2143 object = page->freelist; 2122 freelist = page->freelist;
2144 page->freelist = NULL; 2123 page->freelist = NULL;
2145 2124
2146 stat(s, ALLOC_SLAB); 2125 stat(s, ALLOC_SLAB);
2147 c->node = page_to_nid(page);
2148 c->page = page; 2126 c->page = page;
2149 *pc = c; 2127 *pc = c;
2150 } else 2128 } else
2151 object = NULL; 2129 freelist = NULL;
2152 2130
2153 return object; 2131 return freelist;
2132}
2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2154} 2140}
2155 2141
2156/* 2142/*
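pfmemalloc_match(), added above, is the gatekeeper for swap-over-network support: slabs that were built from the emergency (pfmemalloc) reserves may only feed allocations that are themselves entitled to those reserves. A reduced model of that check follows; the flag bit and helper names are illustrative and do not reproduce exact kernel gfp semantics.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
        bool pfmemalloc;        /* slab was built from emergency reserves */
};

#define GFP_MEMALLOC  0x1       /* caller may dip into reserves (illustrative bit) */

/* Ordinary slabs serve everyone; reserve-backed slabs serve only entitled callers. */
static bool pfmemalloc_match(const struct toy_page *page, unsigned gfpflags)
{
        if (page->pfmemalloc)
                return gfpflags & GFP_MEMALLOC;
        return true;
}

int main(void)
{
        struct toy_page normal = { false }, reserve = { true };

        printf("normal slab, normal alloc:   %d\n", pfmemalloc_match(&normal, 0));
        printf("reserve slab, normal alloc:  %d\n", pfmemalloc_match(&reserve, 0));
        printf("reserve slab, memalloc flag: %d\n", pfmemalloc_match(&reserve, GFP_MEMALLOC));
        return 0;
}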
@@ -2160,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2160 * The page is still frozen if the return value is not NULL. 2146 * The page is still frozen if the return value is not NULL.
2161 * 2147 *
2162 * If this function returns NULL then the page has been unfrozen. 2148 * If this function returns NULL then the page has been unfrozen.
2149 *
2150 * This function must be called with interrupt disabled.
2163 */ 2151 */
2164static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2152static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2165{ 2153{
@@ -2170,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2170 do { 2158 do {
2171 freelist = page->freelist; 2159 freelist = page->freelist;
2172 counters = page->counters; 2160 counters = page->counters;
2161
2173 new.counters = counters; 2162 new.counters = counters;
2174 VM_BUG_ON(!new.frozen); 2163 VM_BUG_ON(!new.frozen);
2175 2164
2176 new.inuse = page->objects; 2165 new.inuse = page->objects;
2177 new.frozen = freelist != NULL; 2166 new.frozen = freelist != NULL;
2178 2167
2179 } while (!cmpxchg_double_slab(s, page, 2168 } while (!__cmpxchg_double_slab(s, page,
2180 freelist, counters, 2169 freelist, counters,
2181 NULL, new.counters, 2170 NULL, new.counters,
2182 "get_freelist")); 2171 "get_freelist"));
@@ -2203,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2203static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2192static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2204 unsigned long addr, struct kmem_cache_cpu *c) 2193 unsigned long addr, struct kmem_cache_cpu *c)
2205{ 2194{
2206 void **object; 2195 void *freelist;
2196 struct page *page;
2207 unsigned long flags; 2197 unsigned long flags;
2208 2198
2209 local_irq_save(flags); 2199 local_irq_save(flags);
@@ -2216,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2216 c = this_cpu_ptr(s->cpu_slab); 2206 c = this_cpu_ptr(s->cpu_slab);
2217#endif 2207#endif
2218 2208
2219 if (!c->page) 2209 page = c->page;
2210 if (!page)
2220 goto new_slab; 2211 goto new_slab;
2221redo: 2212redo:
2222 if (unlikely(!node_match(c, node))) { 2213
2214 if (unlikely(!node_match(page, node))) {
2223 stat(s, ALLOC_NODE_MISMATCH); 2215 stat(s, ALLOC_NODE_MISMATCH);
2224 deactivate_slab(s, c); 2216 deactivate_slab(s, page, c->freelist);
2217 c->page = NULL;
2218 c->freelist = NULL;
2219 goto new_slab;
2220 }
2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2225 goto new_slab; 2231 goto new_slab;
2226 } 2232 }
2227 2233
2228 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2229 object = c->freelist; 2235 freelist = c->freelist;
2230 if (object) 2236 if (freelist)
2231 goto load_freelist; 2237 goto load_freelist;
2232 2238
2233 stat(s, ALLOC_SLOWPATH); 2239 stat(s, ALLOC_SLOWPATH);
2234 2240
2235 object = get_freelist(s, c->page); 2241 freelist = get_freelist(s, page);
2236 2242
2237 if (!object) { 2243 if (!freelist) {
2238 c->page = NULL; 2244 c->page = NULL;
2239 stat(s, DEACTIVATE_BYPASS); 2245 stat(s, DEACTIVATE_BYPASS);
2240 goto new_slab; 2246 goto new_slab;
@@ -2243,50 +2249,50 @@ redo:
2243 stat(s, ALLOC_REFILL); 2249 stat(s, ALLOC_REFILL);
2244 2250
2245load_freelist: 2251load_freelist:
2246 c->freelist = get_freepointer(s, object); 2252 /*
2253 * freelist is pointing to the list of objects to be used.
2254 * page is pointing to the page from which the objects are obtained.
2255 * That page must be frozen for per cpu allocations to work.
2256 */
2257 VM_BUG_ON(!c->page->frozen);
2258 c->freelist = get_freepointer(s, freelist);
2247 c->tid = next_tid(c->tid); 2259 c->tid = next_tid(c->tid);
2248 local_irq_restore(flags); 2260 local_irq_restore(flags);
2249 return object; 2261 return freelist;
2250 2262
2251new_slab: 2263new_slab:
2252 2264
2253 if (c->partial) { 2265 if (c->partial) {
2254 c->page = c->partial; 2266 page = c->page = c->partial;
2255 c->partial = c->page->next; 2267 c->partial = page->next;
2256 c->node = page_to_nid(c->page);
2257 stat(s, CPU_PARTIAL_ALLOC); 2268 stat(s, CPU_PARTIAL_ALLOC);
2258 c->freelist = NULL; 2269 c->freelist = NULL;
2259 goto redo; 2270 goto redo;
2260 } 2271 }
2261 2272
2262 /* Then do expensive stuff like retrieving pages from the partial lists */ 2273 freelist = new_slab_objects(s, gfpflags, node, &c);
2263 object = get_partial(s, gfpflags, node, c);
2264
2265 if (unlikely(!object)) {
2266 2274
2267 object = new_slab_objects(s, gfpflags, node, &c); 2275 if (unlikely(!freelist)) {
2276 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2277 slab_out_of_memory(s, gfpflags, node);
2268 2278
2269 if (unlikely(!object)) { 2279 local_irq_restore(flags);
2270 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2280 return NULL;
2271 slab_out_of_memory(s, gfpflags, node);
2272
2273 local_irq_restore(flags);
2274 return NULL;
2275 }
2276 } 2281 }
2277 2282
2278 if (likely(!kmem_cache_debug(s))) 2283 page = c->page;
2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2279 goto load_freelist; 2285 goto load_freelist;
2280 2286
2281 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2282 if (!alloc_debug_processing(s, c->page, object, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2283 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2284 2290
2285 c->freelist = get_freepointer(s, object); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
2286 deactivate_slab(s, c); 2292 c->page = NULL;
2287 c->node = NUMA_NO_NODE; 2293 c->freelist = NULL;
2288 local_irq_restore(flags); 2294 local_irq_restore(flags);
2289 return object; 2295 return freelist;
2290} 2296}
2291 2297
2292/* 2298/*
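After this rework, __slab_alloc() operates on an explicit (page, freelist) pair: it checks the node and pfmemalloc constraints, tries to refill from the current page's freelist, falls back to the per-cpu partial list, and only then calls new_slab_objects(), which itself tries the node partial lists before allocating a fresh slab. The outline below compresses that decision ladder into user-space pseudostructure; every helper is a stub standing in for the kernel function of the corresponding name, so only the control flow is meaningful.

#include <stdio.h>
#include <stddef.h>

struct cpu_slab { void *page; void *freelist; void *partial; };

/* Stubs for node_match(), pfmemalloc_match(), get_freelist() and friends. */
static int   node_ok(void *page)        { return page != NULL; }
static int   reserves_ok(void *page)    { return 1; }
static void *refill_from_page(void *pg) { return NULL; }   /* page exhausted */
static void *take_partial(void *page)   { return page; }
static void *new_slab_objects(void)     { return "fresh-slab-object"; }

static void *slow_alloc(struct cpu_slab *c)
{
        if (c->page && node_ok(c->page) && reserves_ok(c->page)) {
                void *obj = c->freelist ? c->freelist : refill_from_page(c->page);
                if (obj)
                        return obj;             /* load_freelist: refill worked */
        }
        if (c->partial) {                       /* next choice: per-cpu partial slabs */
                c->page = c->partial;
                c->partial = NULL;
                return take_partial(c->page);
        }
        return new_slab_objects();              /* node partials, then a brand new slab */
}

int main(void)
{
        struct cpu_slab c = { NULL, NULL, NULL };
        printf("allocated: %s\n", (char *)slow_alloc(&c));
        return 0;
}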
@@ -2304,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
2304{ 2310{
2305 void **object; 2311 void **object;
2306 struct kmem_cache_cpu *c; 2312 struct kmem_cache_cpu *c;
2313 struct page *page;
2307 unsigned long tid; 2314 unsigned long tid;
2308 2315
2309 if (slab_pre_alloc_hook(s, gfpflags)) 2316 if (slab_pre_alloc_hook(s, gfpflags))
@@ -2329,8 +2336,8 @@ redo:
2329 barrier(); 2336 barrier();
2330 2337
2331 object = c->freelist; 2338 object = c->freelist;
2332 if (unlikely(!object || !node_match(c, node))) 2339 page = c->page;
2333 2340 if (unlikely(!object || !node_match(page, node)))
2334 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2335 2342
2336 else { 2343 else {
@@ -2361,7 +2368,7 @@ redo:
2361 } 2368 }
2362 2369
2363 if (unlikely(gfpflags & __GFP_ZERO) && object) 2370 if (unlikely(gfpflags & __GFP_ZERO) && object)
2364 memset(object, 0, s->objsize); 2371 memset(object, 0, s->object_size);
2365 2372
2366 slab_post_alloc_hook(s, gfpflags, object); 2373 slab_post_alloc_hook(s, gfpflags, object);
2367 2374
@@ -2372,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2372{ 2379{
2373 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2380 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2374 2381
2375 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2382 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2376 2383
2377 return ret; 2384 return ret;
2378} 2385}
@@ -2402,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2402 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2409 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2403 2410
2404 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2411 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2405 s->objsize, s->size, gfpflags, node); 2412 s->object_size, s->size, gfpflags, node);
2406 2413
2407 return ret; 2414 return ret;
2408} 2415}
@@ -2766,7 +2773,7 @@ static unsigned long calculate_alignment(unsigned long flags,
2766} 2773}
2767 2774
2768static void 2775static void
2769init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2776init_kmem_cache_node(struct kmem_cache_node *n)
2770{ 2777{
2771 n->nr_partial = 0; 2778 n->nr_partial = 0;
2772 spin_lock_init(&n->list_lock); 2779 spin_lock_init(&n->list_lock);
@@ -2836,7 +2843,7 @@ static void early_kmem_cache_node_alloc(int node)
2836 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2843 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2837 init_tracking(kmem_cache_node, n); 2844 init_tracking(kmem_cache_node, n);
2838#endif 2845#endif
2839 init_kmem_cache_node(n, kmem_cache_node); 2846 init_kmem_cache_node(n);
2840 inc_slabs_node(kmem_cache_node, node, page->objects); 2847 inc_slabs_node(kmem_cache_node, node, page->objects);
2841 2848
2842 add_partial(n, page, DEACTIVATE_TO_HEAD); 2849 add_partial(n, page, DEACTIVATE_TO_HEAD);
@@ -2876,7 +2883,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
2876 } 2883 }
2877 2884
2878 s->node[node] = n; 2885 s->node[node] = n;
2879 init_kmem_cache_node(n, s); 2886 init_kmem_cache_node(n);
2880 } 2887 }
2881 return 1; 2888 return 1;
2882} 2889}
@@ -2897,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2897static int calculate_sizes(struct kmem_cache *s, int forced_order) 2904static int calculate_sizes(struct kmem_cache *s, int forced_order)
2898{ 2905{
2899 unsigned long flags = s->flags; 2906 unsigned long flags = s->flags;
2900 unsigned long size = s->objsize; 2907 unsigned long size = s->object_size;
2901 unsigned long align = s->align; 2908 unsigned long align = s->align;
2902 int order; 2909 int order;
2903 2910
@@ -2926,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2926 * end of the object and the free pointer. If not then add an 2933 * end of the object and the free pointer. If not then add an
2927 * additional word to have some bytes to store Redzone information. 2934 * additional word to have some bytes to store Redzone information.
2928 */ 2935 */
2929 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2936 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2930 size += sizeof(void *); 2937 size += sizeof(void *);
2931#endif 2938#endif
2932 2939
@@ -2974,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2974 * user specified and the dynamic determination of cache line size 2981 * user specified and the dynamic determination of cache line size
2975 * on bootup. 2982 * on bootup.
2976 */ 2983 */
2977 align = calculate_alignment(flags, align, s->objsize); 2984 align = calculate_alignment(flags, align, s->object_size);
2978 s->align = align; 2985 s->align = align;
2979 2986
2980 /* 2987 /*
@@ -3022,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3022 memset(s, 0, kmem_size); 3029 memset(s, 0, kmem_size);
3023 s->name = name; 3030 s->name = name;
3024 s->ctor = ctor; 3031 s->ctor = ctor;
3025 s->objsize = size; 3032 s->object_size = size;
3026 s->align = align; 3033 s->align = align;
3027 s->flags = kmem_cache_flags(size, flags, name, ctor); 3034 s->flags = kmem_cache_flags(size, flags, name, ctor);
3028 s->reserved = 0; 3035 s->reserved = 0;
@@ -3037,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3037 * Disable debugging flags that store metadata if the min slab 3044 * Disable debugging flags that store metadata if the min slab
3038 * order increased. 3045 * order increased.
3039 */ 3046 */
3040 if (get_order(s->size) > get_order(s->objsize)) { 3047 if (get_order(s->size) > get_order(s->object_size)) {
3041 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->flags &= ~DEBUG_METADATA_FLAGS;
3042 s->offset = 0; 3049 s->offset = 0;
3043 if (!calculate_sizes(s, -1)) 3050 if (!calculate_sizes(s, -1))
@@ -3111,7 +3118,7 @@ error:
3111 */ 3118 */
3112unsigned int kmem_cache_size(struct kmem_cache *s) 3119unsigned int kmem_cache_size(struct kmem_cache *s)
3113{ 3120{
3114 return s->objsize; 3121 return s->object_size;
3115} 3122}
3116EXPORT_SYMBOL(kmem_cache_size); 3123EXPORT_SYMBOL(kmem_cache_size);
3117 3124
@@ -3189,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3189 */ 3196 */
3190void kmem_cache_destroy(struct kmem_cache *s) 3197void kmem_cache_destroy(struct kmem_cache *s)
3191{ 3198{
3192 down_write(&slub_lock); 3199 mutex_lock(&slab_mutex);
3193 s->refcount--; 3200 s->refcount--;
3194 if (!s->refcount) { 3201 if (!s->refcount) {
3195 list_del(&s->list); 3202 list_del(&s->list);
3196 up_write(&slub_lock); 3203 mutex_unlock(&slab_mutex);
3197 if (kmem_cache_close(s)) { 3204 if (kmem_cache_close(s)) {
3198 printk(KERN_ERR "SLUB %s: %s called for cache that " 3205 printk(KERN_ERR "SLUB %s: %s called for cache that "
3199 "still has objects.\n", s->name, __func__); 3206 "still has objects.\n", s->name, __func__);
@@ -3203,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
3203 rcu_barrier(); 3210 rcu_barrier();
3204 sysfs_slab_remove(s); 3211 sysfs_slab_remove(s);
3205 } else 3212 } else
3206 up_write(&slub_lock); 3213 mutex_unlock(&slab_mutex);
3207} 3214}
3208EXPORT_SYMBOL(kmem_cache_destroy); 3215EXPORT_SYMBOL(kmem_cache_destroy);
3209 3216
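kmem_cache_destroy() keeps the same shape with slab_mutex in place of the semaphore: drop a reference while holding the lock, and only the final reference unlinks the cache and tears it down after the lock is released. The refcount-under-mutex pattern in isolation, as a pthread sketch with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;

struct toy_cache {
        const char *name;
        int refcount;          /* aliases created by cache merging bump this */
};

static void cache_destroy(struct toy_cache *c)
{
        int last;

        pthread_mutex_lock(&slab_mutex);
        last = (--c->refcount == 0);   /* unlinking from the registry would happen here */
        pthread_mutex_unlock(&slab_mutex);

        if (last)
                printf("%s: tearing down outside the mutex\n", c->name);
        else
                printf("%s: still in use\n", c->name);
}

int main(void)
{
        struct toy_cache c = { "toy-cache", 2 };
        cache_destroy(&c);     /* an alias drops its reference */
        cache_destroy(&c);     /* the last user actually destroys it */
        return 0;
}

Doing the expensive teardown outside the mutex keeps the registry lock hold time short, which matters once every allocator shares the same slab_mutex.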
@@ -3265,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 3272
3266 /* 3273 /*
3267 * This function is called with IRQs disabled during early-boot on 3274 * This function is called with IRQs disabled during early-boot on
3268 * single CPU so there's no need to take slub_lock here. 3275 * single CPU so there's no need to take slab_mutex here.
3269 */ 3276 */
3270 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3277 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3271 flags, NULL)) 3278 flags, NULL))
@@ -3550,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
3550{ 3557{
3551 struct kmem_cache *s; 3558 struct kmem_cache *s;
3552 3559
3553 down_read(&slub_lock); 3560 mutex_lock(&slab_mutex);
3554 list_for_each_entry(s, &slab_caches, list) 3561 list_for_each_entry(s, &slab_caches, list)
3555 kmem_cache_shrink(s); 3562 kmem_cache_shrink(s);
3556 up_read(&slub_lock); 3563 mutex_unlock(&slab_mutex);
3557 3564
3558 return 0; 3565 return 0;
3559} 3566}
@@ -3574,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
3574 if (offline_node < 0) 3581 if (offline_node < 0)
3575 return; 3582 return;
3576 3583
3577 down_read(&slub_lock); 3584 mutex_lock(&slab_mutex);
3578 list_for_each_entry(s, &slab_caches, list) { 3585 list_for_each_entry(s, &slab_caches, list) {
3579 n = get_node(s, offline_node); 3586 n = get_node(s, offline_node);
3580 if (n) { 3587 if (n) {
@@ -3590,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
3590 kmem_cache_free(kmem_cache_node, n); 3597 kmem_cache_free(kmem_cache_node, n);
3591 } 3598 }
3592 } 3599 }
3593 up_read(&slub_lock); 3600 mutex_unlock(&slab_mutex);
3594} 3601}
3595 3602
3596static int slab_mem_going_online_callback(void *arg) 3603static int slab_mem_going_online_callback(void *arg)
@@ -3613,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
3613 * allocate a kmem_cache_node structure in order to bring the node 3620 * allocate a kmem_cache_node structure in order to bring the node
3614 * online. 3621 * online.
3615 */ 3622 */
3616 down_read(&slub_lock); 3623 mutex_lock(&slab_mutex);
3617 list_for_each_entry(s, &slab_caches, list) { 3624 list_for_each_entry(s, &slab_caches, list) {
3618 /* 3625 /*
3619 * XXX: kmem_cache_alloc_node will fallback to other nodes 3626 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3625,11 +3632,11 @@ static int slab_mem_going_online_callback(void *arg)
3625 ret = -ENOMEM; 3632 ret = -ENOMEM;
3626 goto out; 3633 goto out;
3627 } 3634 }
3628 init_kmem_cache_node(n, s); 3635 init_kmem_cache_node(n);
3629 s->node[nid] = n; 3636 s->node[nid] = n;
3630 } 3637 }
3631out: 3638out:
3632 up_read(&slub_lock); 3639 mutex_unlock(&slab_mutex);
3633 return ret; 3640 return ret;
3634} 3641}
3635 3642
@@ -3840,11 +3847,11 @@ void __init kmem_cache_init(void)
3840 3847
3841 if (s && s->size) { 3848 if (s && s->size) {
3842 char *name = kasprintf(GFP_NOWAIT, 3849 char *name = kasprintf(GFP_NOWAIT,
3843 "dma-kmalloc-%d", s->objsize); 3850 "dma-kmalloc-%d", s->object_size);
3844 3851
3845 BUG_ON(!name); 3852 BUG_ON(!name);
3846 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3853 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3847 s->objsize, SLAB_CACHE_DMA); 3854 s->object_size, SLAB_CACHE_DMA);
3848 } 3855 }
3849 } 3856 }
3850#endif 3857#endif
@@ -3921,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
3921 return NULL; 3928 return NULL;
3922} 3929}
3923 3930
3924struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3931struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3925 size_t align, unsigned long flags, void (*ctor)(void *)) 3932 size_t align, unsigned long flags, void (*ctor)(void *))
3926{ 3933{
3927 struct kmem_cache *s; 3934 struct kmem_cache *s;
3928 char *n; 3935 char *n;
3929 3936
3930 if (WARN_ON(!name))
3931 return NULL;
3932
3933 down_write(&slub_lock);
3934 s = find_mergeable(size, align, flags, name, ctor); 3937 s = find_mergeable(size, align, flags, name, ctor);
3935 if (s) { 3938 if (s) {
3936 s->refcount++; 3939 s->refcount++;
@@ -3938,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3938 * Adjust the object sizes so that we clear 3941 * Adjust the object sizes so that we clear
3939 * the complete object on kzalloc. 3942 * the complete object on kzalloc.
3940 */ 3943 */
3941 s->objsize = max(s->objsize, (int)size); 3944 s->object_size = max(s->object_size, (int)size);
3942 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3943 3946
3944 if (sysfs_slab_alias(s, name)) { 3947 if (sysfs_slab_alias(s, name)) {
3945 s->refcount--; 3948 s->refcount--;
3946 goto err; 3949 return NULL;
3947 } 3950 }
3948 up_write(&slub_lock);
3949 return s; 3951 return s;
3950 } 3952 }
3951 3953
3952 n = kstrdup(name, GFP_KERNEL); 3954 n = kstrdup(name, GFP_KERNEL);
3953 if (!n) 3955 if (!n)
3954 goto err; 3956 return NULL;
3955 3957
3956 s = kmalloc(kmem_size, GFP_KERNEL); 3958 s = kmalloc(kmem_size, GFP_KERNEL);
3957 if (s) { 3959 if (s) {
3958 if (kmem_cache_open(s, n, 3960 if (kmem_cache_open(s, n,
3959 size, align, flags, ctor)) { 3961 size, align, flags, ctor)) {
3962 int r;
3963
3960 list_add(&s->list, &slab_caches); 3964 list_add(&s->list, &slab_caches);
3961 up_write(&slub_lock); 3965 mutex_unlock(&slab_mutex);
3962 if (sysfs_slab_add(s)) { 3966 r = sysfs_slab_add(s);
3963 down_write(&slub_lock); 3967 mutex_lock(&slab_mutex);
3964 list_del(&s->list); 3968
3965 kfree(n); 3969 if (!r)
3966 kfree(s); 3970 return s;
3967 goto err; 3971
3968 } 3972 list_del(&s->list);
3969 return s; 3973 kmem_cache_close(s);
3970 } 3974 }
3971 kfree(n);
3972 kfree(s); 3975 kfree(s);
3973 } 3976 }
3974err: 3977 kfree(n);
3975 up_write(&slub_lock); 3978 return NULL;
3976
3977 if (flags & SLAB_PANIC)
3978 panic("Cannot create slabcache %s\n", name);
3979 else
3980 s = NULL;
3981 return s;
3982} 3979}
3983EXPORT_SYMBOL(kmem_cache_create);
3984 3980
3985#ifdef CONFIG_SMP 3981#ifdef CONFIG_SMP
3986/* 3982/*
@@ -3999,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3999 case CPU_UP_CANCELED_FROZEN: 3995 case CPU_UP_CANCELED_FROZEN:
4000 case CPU_DEAD: 3996 case CPU_DEAD:
4001 case CPU_DEAD_FROZEN: 3997 case CPU_DEAD_FROZEN:
4002 down_read(&slub_lock); 3998 mutex_lock(&slab_mutex);
4003 list_for_each_entry(s, &slab_caches, list) { 3999 list_for_each_entry(s, &slab_caches, list) {
4004 local_irq_save(flags); 4000 local_irq_save(flags);
4005 __flush_cpu_slab(s, cpu); 4001 __flush_cpu_slab(s, cpu);
4006 local_irq_restore(flags); 4002 local_irq_restore(flags);
4007 } 4003 }
4008 up_read(&slub_lock); 4004 mutex_unlock(&slab_mutex);
4009 break; 4005 break;
4010 default: 4006 default:
4011 break; 4007 break;
@@ -4497,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4497 4493
4498 for_each_possible_cpu(cpu) { 4494 for_each_possible_cpu(cpu) {
4499 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4495 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4500 int node = ACCESS_ONCE(c->node); 4496 int node;
4501 struct page *page; 4497 struct page *page;
4502 4498
4503 if (node < 0)
4504 continue;
4505 page = ACCESS_ONCE(c->page); 4499 page = ACCESS_ONCE(c->page);
4506 if (page) { 4500 if (!page)
4507 if (flags & SO_TOTAL) 4501 continue;
4508 x = page->objects;
4509 else if (flags & SO_OBJECTS)
4510 x = page->inuse;
4511 else
4512 x = 1;
4513 4502
4514 total += x; 4503 node = page_to_nid(page);
4515 nodes[node] += x; 4504 if (flags & SO_TOTAL)
4516 } 4505 x = page->objects;
4517 page = c->partial; 4506 else if (flags & SO_OBJECTS)
4507 x = page->inuse;
4508 else
4509 x = 1;
4518 4510
4511 total += x;
4512 nodes[node] += x;
4513
4514 page = ACCESS_ONCE(c->partial);
4519 if (page) { 4515 if (page) {
4520 x = page->pobjects; 4516 x = page->pobjects;
4521 total += x; 4517 total += x;
4522 nodes[node] += x; 4518 nodes[node] += x;
4523 } 4519 }
4520
4524 per_cpu[node]++; 4521 per_cpu[node]++;
4525 } 4522 }
4526 } 4523 }
@@ -4620,7 +4617,7 @@ SLAB_ATTR_RO(align);
4620 4617
4621static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4618static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4622{ 4619{
4623 return sprintf(buf, "%d\n", s->objsize); 4620 return sprintf(buf, "%d\n", s->object_size);
4624} 4621}
4625SLAB_ATTR_RO(object_size); 4622SLAB_ATTR_RO(object_size);
4626 4623
@@ -5283,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5283 const char *name; 5280 const char *name;
5284 int unmergeable; 5281 int unmergeable;
5285 5282
5286 if (slab_state < SYSFS) 5283 if (slab_state < FULL)
5287 /* Defer until later */ 5284 /* Defer until later */
5288 return 0; 5285 return 0;
5289 5286
@@ -5328,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5328 5325
5329static void sysfs_slab_remove(struct kmem_cache *s) 5326static void sysfs_slab_remove(struct kmem_cache *s)
5330{ 5327{
5331 if (slab_state < SYSFS) 5328 if (slab_state < FULL)
5332 /* 5329 /*
5333 * Sysfs has not been setup yet so no need to remove the 5330 * Sysfs has not been setup yet so no need to remove the
5334 * cache from sysfs. 5331 * cache from sysfs.
@@ -5356,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5356{ 5353{
5357 struct saved_alias *al; 5354 struct saved_alias *al;
5358 5355
5359 if (slab_state == SYSFS) { 5356 if (slab_state == FULL) {
5360 /* 5357 /*
5361 * If we have a leftover link then remove it. 5358 * If we have a leftover link then remove it.
5362 */ 5359 */
@@ -5380,16 +5377,16 @@ static int __init slab_sysfs_init(void)
5380 struct kmem_cache *s; 5377 struct kmem_cache *s;
5381 int err; 5378 int err;
5382 5379
5383 down_write(&slub_lock); 5380 mutex_lock(&slab_mutex);
5384 5381
5385 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5382 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5386 if (!slab_kset) { 5383 if (!slab_kset) {
5387 up_write(&slub_lock); 5384 mutex_unlock(&slab_mutex);
5388 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5385 printk(KERN_ERR "Cannot register slab subsystem.\n");
5389 return -ENOSYS; 5386 return -ENOSYS;
5390 } 5387 }
5391 5388
5392 slab_state = SYSFS; 5389 slab_state = FULL;
5393 5390
5394 list_for_each_entry(s, &slab_caches, list) { 5391 list_for_each_entry(s, &slab_caches, list) {
5395 err = sysfs_slab_add(s); 5392 err = sysfs_slab_add(s);
@@ -5405,11 +5402,11 @@ static int __init slab_sysfs_init(void)
5405 err = sysfs_slab_alias(al->s, al->name); 5402 err = sysfs_slab_alias(al->s, al->name);
5406 if (err) 5403 if (err)
5407 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5404 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5408 " %s to sysfs\n", s->name); 5405 " %s to sysfs\n", al->name);
5409 kfree(al); 5406 kfree(al);
5410 } 5407 }
5411 5408
5412 up_write(&slub_lock); 5409 mutex_unlock(&slab_mutex);
5413 resiliency_test(); 5410 resiliency_test();
5414 return 0; 5411 return 0;
5415} 5412}
@@ -5424,7 +5421,7 @@ __initcall(slab_sysfs_init);
5424static void print_slabinfo_header(struct seq_file *m) 5421static void print_slabinfo_header(struct seq_file *m)
5425{ 5422{
5426 seq_puts(m, "slabinfo - version: 2.1\n"); 5423 seq_puts(m, "slabinfo - version: 2.1\n");
5427 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5424 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5428 "<objperslab> <pagesperslab>"); 5425 "<objperslab> <pagesperslab>");
5429 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5426 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5430 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5427 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5435,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
5435{ 5432{
5436 loff_t n = *pos; 5433 loff_t n = *pos;
5437 5434
5438 down_read(&slub_lock); 5435 mutex_lock(&slab_mutex);
5439 if (!n) 5436 if (!n)
5440 print_slabinfo_header(m); 5437 print_slabinfo_header(m);
5441 5438
@@ -5449,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5449 5446
5450static void s_stop(struct seq_file *m, void *p) 5447static void s_stop(struct seq_file *m, void *p)
5451{ 5448{
5452 up_read(&slub_lock); 5449 mutex_unlock(&slab_mutex);
5453} 5450}
5454 5451
5455static int s_show(struct seq_file *m, void *p) 5452static int s_show(struct seq_file *m, void *p)
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -273,10 +261,11 @@ static unsigned long *__kmalloc_section_usemap(void)
273#ifdef CONFIG_MEMORY_HOTREMOVE 261#ifdef CONFIG_MEMORY_HOTREMOVE
274static unsigned long * __init 262static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 263sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count) 264 unsigned long size)
277{ 265{
278 unsigned long section_nr; 266 unsigned long goal, limit;
279 267 unsigned long *p;
268 int nid;
280 /* 269 /*
281 * A page may contain usemaps for other sections preventing the 270 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 271 * page being freed and making a section unremovable while
@@ -287,8 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 276 * from the same section as the pgdat where possible to avoid
288 * this problem. 277 * this problem.
289 */ 278 */
290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 279 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 return alloc_bootmem_section(usemap_size() * count, section_nr); 280 limit = goal + (1UL << PA_SECTION_SHIFT);
281 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
282again:
283 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
284 SMP_CACHE_BYTES, goal, limit);
285 if (!p && limit) {
286 limit = 0;
287 goto again;
288 }
289 return p;
292} 290}
293 291
294static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
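The usemap allocation above now aims at the memory section containing the pgdat (goal and limit bracket that section's physical range) and, if the boot allocator cannot satisfy the request there, retries with the limit dropped so the allocation may land anywhere. The retry shape on its own is sketched below; bootmem_alloc() is a stand-in for ___alloc_bootmem_node_nopanic(), and the toy version simply pretends the preferred window is full.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in allocator: the constrained window [goal, limit) is "full" when limit != 0. */
static void *bootmem_alloc(size_t size, unsigned long goal, unsigned long limit)
{
        if (limit)              /* constrained attempt fails in this toy run */
                return NULL;
        return malloc(size);    /* unconstrained attempt succeeds */
}

static void *alloc_usemap(size_t size, unsigned long goal, unsigned long limit)
{
        void *p;
again:
        p = bootmem_alloc(size, goal, limit);
        if (!p && limit) {      /* could not keep it in the pgdat's section: relax and retry */
                limit = 0;
                goto again;
        }
        return p;
}

int main(void)
{
        void *p = alloc_usemap(4096, 0x100000, 0x140000);
        printf("usemap %sallocated\n", p ? "" : "not ");
        free(p);
        return 0;
}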
@@ -332,9 +330,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
332#else 330#else
333static unsigned long * __init 331static unsigned long * __init
334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count) 333 unsigned long size)
336{ 334{
337 return NULL; 335 return alloc_bootmem_node_nopanic(pgdat, size);
338} 336}
339 337
340static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +350,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
352 int size = usemap_size(); 350 int size = usemap_size();
353 351
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 352 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 353 size * usemap_count);
356 if (!usemap) { 354 if (!usemap) {
357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 355 printk(KERN_WARNING "%s: allocation failed\n", __func__);
358 if (!usemap) { 356 return;
359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 return;
361 }
362 } 357 }
363 358
364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 359 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
@@ -486,6 +481,9 @@ void __init sparse_init(void)
486 struct page **map_map; 481 struct page **map_map;
487#endif 482#endif
488 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
489 /* 487 /*
490 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
491 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
47static void __page_cache_release(struct page *page) 47static void __page_cache_release(struct page *page)
48{ 48{
49 if (PageLRU(page)) { 49 if (PageLRU(page)) {
50 unsigned long flags;
51 struct zone *zone = page_zone(page); 50 struct zone *zone = page_zone(page);
51 struct lruvec *lruvec;
52 unsigned long flags;
52 53
53 spin_lock_irqsave(&zone->lru_lock, flags); 54 spin_lock_irqsave(&zone->lru_lock, flags);
55 lruvec = mem_cgroup_page_lruvec(page, zone);
54 VM_BUG_ON(!PageLRU(page)); 56 VM_BUG_ON(!PageLRU(page));
55 __ClearPageLRU(page); 57 __ClearPageLRU(page);
56 del_page_from_lru_list(zone, page, page_off_lru(page)); 58 del_page_from_lru_list(page, lruvec, page_off_lru(page));
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 59 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 60 }
59} 61}
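
This is the pattern the whole series converts mm/swap.c to: the zone lock is taken as before, but the per-zone, per-memcg lruvec is resolved with mem_cgroup_page_lruvec() and passed to the list helpers instead of the bare zone. A hedged sketch of a caller following the same steps (hypothetical helper, mirroring __page_cache_release() above):

static void lru_remove_page_example(struct page *page)
{
        struct zone *zone = page_zone(page);
        struct lruvec *lruvec;
        unsigned long flags;

        spin_lock_irqsave(&zone->lru_lock, flags);
        /* resolve the lruvec under zone->lru_lock, then hand it to the
         * list helpers, which no longer take a zone argument */
        lruvec = mem_cgroup_page_lruvec(page, zone);
        if (PageLRU(page)) {
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
        }
        spin_unlock_irqrestore(&zone->lru_lock, flags);
}
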
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page)
82 if (likely(page != page_head && 84 if (likely(page != page_head &&
83 get_page_unless_zero(page_head))) { 85 get_page_unless_zero(page_head))) {
84 unsigned long flags; 86 unsigned long flags;
87
88 /*
89 * THP can not break up slab pages so avoid taking
90 * compound_lock(). Slab performs non-atomic bit ops
91 * on page->flags for better performance. In particular
92 * slab_unlock() in slub used to be a hot path. It is
93 * still hot on arches that do not support
94 * this_cpu_cmpxchg_double().
95 */
96 if (PageSlab(page_head)) {
97 if (PageTail(page)) {
98 if (put_page_testzero(page_head))
99 VM_BUG_ON(1);
100
101 atomic_dec(&page->_mapcount);
102 goto skip_lock_tail;
103 } else
104 goto skip_lock;
105 }
85 /* 106 /*
86 * page_head wasn't a dangling pointer but it 107 * page_head wasn't a dangling pointer but it
87 * may not be a head page anymore by the time 108 * may not be a head page anymore by the time
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page)
92 if (unlikely(!PageTail(page))) { 113 if (unlikely(!PageTail(page))) {
93 /* __split_huge_page_refcount run before us */ 114 /* __split_huge_page_refcount run before us */
94 compound_unlock_irqrestore(page_head, flags); 115 compound_unlock_irqrestore(page_head, flags);
95 VM_BUG_ON(PageHead(page_head)); 116skip_lock:
96 if (put_page_testzero(page_head)) 117 if (put_page_testzero(page_head))
97 __put_single_page(page_head); 118 __put_single_page(page_head);
98 out_put_single: 119out_put_single:
99 if (put_page_testzero(page)) 120 if (put_page_testzero(page))
100 __put_single_page(page); 121 __put_single_page(page);
101 return; 122 return;
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page)
115 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 136 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
116 VM_BUG_ON(atomic_read(&page->_count) != 0); 137 VM_BUG_ON(atomic_read(&page->_count) != 0);
117 compound_unlock_irqrestore(page_head, flags); 138 compound_unlock_irqrestore(page_head, flags);
139
140skip_lock_tail:
118 if (put_page_testzero(page_head)) { 141 if (put_page_testzero(page_head)) {
119 if (PageHead(page_head)) 142 if (PageHead(page_head))
120 __put_compound_page(page_head); 143 __put_compound_page(page_head);
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page)
162 struct page *page_head = compound_trans_head(page); 185 struct page *page_head = compound_trans_head(page);
163 186
164 if (likely(page != page_head && get_page_unless_zero(page_head))) { 187 if (likely(page != page_head && get_page_unless_zero(page_head))) {
188
 189 /* See the comment in put_compound_page(). */
190 if (PageSlab(page_head)) {
191 if (likely(PageTail(page))) {
192 __get_page_tail_foll(page, false);
193 return true;
194 } else {
195 put_page(page_head);
196 return false;
197 }
198 }
199
165 /* 200 /*
166 * page_head wasn't a dangling pointer but it 201 * page_head wasn't a dangling pointer but it
167 * may not be a head page anymore by the time 202 * may not be a head page anymore by the time
@@ -201,12 +236,65 @@ void put_pages_list(struct list_head *pages)
201} 236}
202EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
203 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
 248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
 274 * Must hold at least one page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
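
A hedged usage sketch for the two helpers added above (the caller and buffer names are hypothetical): pin a pair of page-sized kernel buffers, use them, then drop the references. get_kernel_pages() insists on iov_len == PAGE_SIZE and returns how many segments it actually pinned.

static int pin_two_kernel_buffers(void *buf_a, void *buf_b,
                                  struct page **pages)
{
        const struct kvec kiov[2] = {
                { .iov_base = buf_a, .iov_len = PAGE_SIZE },
                { .iov_base = buf_b, .iov_len = PAGE_SIZE },
        };
        int i, pinned;

        pinned = get_kernel_pages(kiov, 2, 0, pages);

        /* ... hand pages[0..pinned-1] to the I/O path here ... */

        /* once the I/O is done, drop the extra references */
        for (i = 0; i < pinned; i++)
                put_page(pages[i]);

        return pinned;
}
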
204static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
205 void (*move_fn)(struct page *page, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
206 void *arg) 293 void *arg)
207{ 294{
208 int i; 295 int i;
209 struct zone *zone = NULL; 296 struct zone *zone = NULL;
297 struct lruvec *lruvec;
210 unsigned long flags = 0; 298 unsigned long flags = 0;
211 299
212 for (i = 0; i < pagevec_count(pvec); i++) { 300 for (i = 0; i < pagevec_count(pvec); i++) {
@@ -220,7 +308,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
220 spin_lock_irqsave(&zone->lru_lock, flags); 308 spin_lock_irqsave(&zone->lru_lock, flags);
221 } 309 }
222 310
223 (*move_fn)(page, arg); 311 lruvec = mem_cgroup_page_lruvec(page, zone);
312 (*move_fn)(page, lruvec, arg);
224 } 313 }
225 if (zone) 314 if (zone)
226 spin_unlock_irqrestore(&zone->lru_lock, flags); 315 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -228,16 +317,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
228 pagevec_reinit(pvec); 317 pagevec_reinit(pvec);
229} 318}
230 319
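
With the new prototype, every move_fn callback receives the lruvec that pagevec_lru_move_fn() resolved under zone->lru_lock, as pagevec_move_tail_fn() below shows. A trivial hypothetical callback and caller, just to illustrate the calling convention:

static void count_lru_fn(struct page *page, struct lruvec *lruvec, void *arg)
{
        int *nr_on_lru = arg;

        /* lruvec->lists[] could be manipulated here; this example only
         * counts pages that are still on an LRU list */
        if (PageLRU(page))
                (*nr_on_lru)++;
}

static int count_lru_pages(struct pagevec *pvec)
{
        int nr_on_lru = 0;

        pagevec_lru_move_fn(pvec, count_lru_fn, &nr_on_lru);
        return nr_on_lru;
}
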
231static void pagevec_move_tail_fn(struct page *page, void *arg) 320static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
321 void *arg)
232{ 322{
233 int *pgmoved = arg; 323 int *pgmoved = arg;
234 324
235 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 325 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
236 enum lru_list lru = page_lru_base_type(page); 326 enum lru_list lru = page_lru_base_type(page);
237 struct lruvec *lruvec;
238
239 lruvec = mem_cgroup_lru_move_lists(page_zone(page),
240 page, lru, lru);
241 list_move_tail(&page->lru, &lruvec->lists[lru]); 327 list_move_tail(&page->lru, &lruvec->lists[lru]);
242 (*pgmoved)++; 328 (*pgmoved)++;
243 } 329 }
@@ -276,41 +362,30 @@ void rotate_reclaimable_page(struct page *page)
276 } 362 }
277} 363}
278 364
279static void update_page_reclaim_stat(struct zone *zone, struct page *page, 365static void update_page_reclaim_stat(struct lruvec *lruvec,
280 int file, int rotated) 366 int file, int rotated)
281{ 367{
282 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; 368 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
283 struct zone_reclaim_stat *memcg_reclaim_stat;
284
285 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
286 369
287 reclaim_stat->recent_scanned[file]++; 370 reclaim_stat->recent_scanned[file]++;
288 if (rotated) 371 if (rotated)
289 reclaim_stat->recent_rotated[file]++; 372 reclaim_stat->recent_rotated[file]++;
290
291 if (!memcg_reclaim_stat)
292 return;
293
294 memcg_reclaim_stat->recent_scanned[file]++;
295 if (rotated)
296 memcg_reclaim_stat->recent_rotated[file]++;
297} 373}
298 374
299static void __activate_page(struct page *page, void *arg) 375static void __activate_page(struct page *page, struct lruvec *lruvec,
376 void *arg)
300{ 377{
301 struct zone *zone = page_zone(page);
302
303 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 378 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
304 int file = page_is_file_cache(page); 379 int file = page_is_file_cache(page);
305 int lru = page_lru_base_type(page); 380 int lru = page_lru_base_type(page);
306 del_page_from_lru_list(zone, page, lru);
307 381
382 del_page_from_lru_list(page, lruvec, lru);
308 SetPageActive(page); 383 SetPageActive(page);
309 lru += LRU_ACTIVE; 384 lru += LRU_ACTIVE;
310 add_page_to_lru_list(zone, page, lru); 385 add_page_to_lru_list(page, lruvec, lru);
311 __count_vm_event(PGACTIVATE);
312 386
313 update_page_reclaim_stat(zone, page, file, 1); 387 __count_vm_event(PGACTIVATE);
388 update_page_reclaim_stat(lruvec, file, 1);
314 } 389 }
315} 390}
316 391
@@ -347,7 +422,7 @@ void activate_page(struct page *page)
347 struct zone *zone = page_zone(page); 422 struct zone *zone = page_zone(page);
348 423
349 spin_lock_irq(&zone->lru_lock); 424 spin_lock_irq(&zone->lru_lock);
350 __activate_page(page, NULL); 425 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
351 spin_unlock_irq(&zone->lru_lock); 426 spin_unlock_irq(&zone->lru_lock);
352} 427}
353#endif 428#endif
@@ -414,11 +489,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
414void add_page_to_unevictable_list(struct page *page) 489void add_page_to_unevictable_list(struct page *page)
415{ 490{
416 struct zone *zone = page_zone(page); 491 struct zone *zone = page_zone(page);
492 struct lruvec *lruvec;
417 493
418 spin_lock_irq(&zone->lru_lock); 494 spin_lock_irq(&zone->lru_lock);
495 lruvec = mem_cgroup_page_lruvec(page, zone);
419 SetPageUnevictable(page); 496 SetPageUnevictable(page);
420 SetPageLRU(page); 497 SetPageLRU(page);
421 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); 498 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
422 spin_unlock_irq(&zone->lru_lock); 499 spin_unlock_irq(&zone->lru_lock);
423} 500}
424 501
@@ -443,11 +520,11 @@ void add_page_to_unevictable_list(struct page *page)
443 * be write it out by flusher threads as this is much more effective 520 * be write it out by flusher threads as this is much more effective
444 * than the single-page writeout from reclaim. 521 * than the single-page writeout from reclaim.
445 */ 522 */
446static void lru_deactivate_fn(struct page *page, void *arg) 523static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
524 void *arg)
447{ 525{
448 int lru, file; 526 int lru, file;
449 bool active; 527 bool active;
450 struct zone *zone = page_zone(page);
451 528
452 if (!PageLRU(page)) 529 if (!PageLRU(page))
453 return; 530 return;
@@ -460,13 +537,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
460 return; 537 return;
461 538
462 active = PageActive(page); 539 active = PageActive(page);
463
464 file = page_is_file_cache(page); 540 file = page_is_file_cache(page);
465 lru = page_lru_base_type(page); 541 lru = page_lru_base_type(page);
466 del_page_from_lru_list(zone, page, lru + active); 542
543 del_page_from_lru_list(page, lruvec, lru + active);
467 ClearPageActive(page); 544 ClearPageActive(page);
468 ClearPageReferenced(page); 545 ClearPageReferenced(page);
469 add_page_to_lru_list(zone, page, lru); 546 add_page_to_lru_list(page, lruvec, lru);
470 547
471 if (PageWriteback(page) || PageDirty(page)) { 548 if (PageWriteback(page) || PageDirty(page)) {
472 /* 549 /*
@@ -476,19 +553,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
476 */ 553 */
477 SetPageReclaim(page); 554 SetPageReclaim(page);
478 } else { 555 } else {
479 struct lruvec *lruvec;
480 /* 556 /*
 481 * The page's writeback has finished while the page sat in the 557 * The page's writeback has finished while the page sat in the
 482 * pagevec, so move it to the tail of the inactive list. 558 * pagevec, so move it to the tail of the inactive list.
483 */ 559 */
484 lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
485 list_move_tail(&page->lru, &lruvec->lists[lru]); 560 list_move_tail(&page->lru, &lruvec->lists[lru]);
486 __count_vm_event(PGROTATED); 561 __count_vm_event(PGROTATED);
487 } 562 }
488 563
489 if (active) 564 if (active)
490 __count_vm_event(PGDEACTIVATE); 565 __count_vm_event(PGDEACTIVATE);
491 update_page_reclaim_stat(zone, page, file, 0); 566 update_page_reclaim_stat(lruvec, file, 0);
492} 567}
493 568
494/* 569/*
@@ -588,6 +663,7 @@ void release_pages(struct page **pages, int nr, int cold)
588 int i; 663 int i;
589 LIST_HEAD(pages_to_free); 664 LIST_HEAD(pages_to_free);
590 struct zone *zone = NULL; 665 struct zone *zone = NULL;
666 struct lruvec *lruvec;
591 unsigned long uninitialized_var(flags); 667 unsigned long uninitialized_var(flags);
592 668
593 for (i = 0; i < nr; i++) { 669 for (i = 0; i < nr; i++) {
@@ -615,9 +691,11 @@ void release_pages(struct page **pages, int nr, int cold)
615 zone = pagezone; 691 zone = pagezone;
616 spin_lock_irqsave(&zone->lru_lock, flags); 692 spin_lock_irqsave(&zone->lru_lock, flags);
617 } 693 }
694
695 lruvec = mem_cgroup_page_lruvec(page, zone);
618 VM_BUG_ON(!PageLRU(page)); 696 VM_BUG_ON(!PageLRU(page));
619 __ClearPageLRU(page); 697 __ClearPageLRU(page);
620 del_page_from_lru_list(zone, page, page_off_lru(page)); 698 del_page_from_lru_list(page, lruvec, page_off_lru(page));
621 } 699 }
622 700
623 list_add(&page->lru, &pages_to_free); 701 list_add(&page->lru, &pages_to_free);
@@ -649,8 +727,8 @@ EXPORT_SYMBOL(__pagevec_release);
649 727
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE 728#ifdef CONFIG_TRANSPARENT_HUGEPAGE
651/* used by __split_huge_page_refcount() */ 729/* used by __split_huge_page_refcount() */
652void lru_add_page_tail(struct zone* zone, 730void lru_add_page_tail(struct page *page, struct page *page_tail,
653 struct page *page, struct page *page_tail) 731 struct lruvec *lruvec)
654{ 732{
655 int uninitialized_var(active); 733 int uninitialized_var(active);
656 enum lru_list lru; 734 enum lru_list lru;
@@ -659,7 +737,8 @@ void lru_add_page_tail(struct zone* zone,
659 VM_BUG_ON(!PageHead(page)); 737 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 738 VM_BUG_ON(PageCompound(page_tail));
661 VM_BUG_ON(PageLRU(page_tail)); 739 VM_BUG_ON(PageLRU(page_tail));
662 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); 740 VM_BUG_ON(NR_CPUS != 1 &&
741 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
663 742
664 SetPageLRU(page_tail); 743 SetPageLRU(page_tail);
665 744
@@ -688,20 +767,20 @@ void lru_add_page_tail(struct zone* zone,
688 * Use the standard add function to put page_tail on the list, 767 * Use the standard add function to put page_tail on the list,
689 * but then correct its position so they all end up in order. 768 * but then correct its position so they all end up in order.
690 */ 769 */
691 add_page_to_lru_list(zone, page_tail, lru); 770 add_page_to_lru_list(page_tail, lruvec, lru);
692 list_head = page_tail->lru.prev; 771 list_head = page_tail->lru.prev;
693 list_move_tail(&page_tail->lru, list_head); 772 list_move_tail(&page_tail->lru, list_head);
694 } 773 }
695 774
696 if (!PageUnevictable(page)) 775 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active); 776 update_page_reclaim_stat(lruvec, file, active);
698} 777}
699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 778#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
700 779
701static void __pagevec_lru_add_fn(struct page *page, void *arg) 780static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
781 void *arg)
702{ 782{
703 enum lru_list lru = (enum lru_list)arg; 783 enum lru_list lru = (enum lru_list)arg;
704 struct zone *zone = page_zone(page);
705 int file = is_file_lru(lru); 784 int file = is_file_lru(lru);
706 int active = is_active_lru(lru); 785 int active = is_active_lru(lru);
707 786
@@ -712,8 +791,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
712 SetPageLRU(page); 791 SetPageLRU(page);
713 if (active) 792 if (active)
714 SetPageActive(page); 793 SetPageActive(page);
715 add_page_to_lru_list(zone, page, lru); 794 add_page_to_lru_list(page, lruvec, lru);
716 update_page_reclaim_stat(zone, page, file, active); 795 update_page_reclaim_stat(lruvec, file, active);
717} 796}
718 797
719/* 798/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9d3dd3763cf7..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_nobuffers, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
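
The plug added above lets the per-slot asynchronous reads queue up on the task and be merged before they reach the block layer. The same pattern in isolation (hypothetical helper, reusing the calls visible in the hunk):

static void swap_readahead_batch(swp_entry_t *entries, int nr,
                                 gfp_t gfp_mask,
                                 struct vm_area_struct *vma,
                                 unsigned long addr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++) {
                /* reads started while plugged sit on the task's plug
                 * list, giving the block layer a chance to merge them */
                struct page *page = read_swap_cache_async(entries[i],
                                                gfp_mask, vma, addr);
                if (page)
                        page_cache_release(page);
        }
        blk_finish_plug(&plug);         /* submit the whole batch */
}
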
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,9 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
34 37
35#include <asm/pgtable.h> 38#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 45static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 47
45static DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
47long nr_swap_pages; 50long nr_swap_pages;
48long total_swap_pages; 51long total_swap_pages;
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 56static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 57static const char Unused_offset[] = "Unused swap offset entry ";
55 58
56static struct swap_list_t swap_list = {-1, -1}; 59struct swap_list_t swap_list = {-1, -1};
57 60
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 61struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 62
60static DEFINE_MUTEX(swapon_mutex); 63static DEFINE_MUTEX(swapon_mutex);
61 64
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
546 549
547 /* free if no reference */ 550 /* free if no reference */
548 if (!usage) { 551 if (!usage) {
549 struct gendisk *disk = p->bdev->bd_disk;
550 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
551 p->lowest_bit = offset; 553 p->lowest_bit = offset;
552 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
559 if ((p->flags & SWP_BLKDEV) && 561 frontswap_invalidate_page(p->type, offset);
560 disk->fops->swap_slot_free_notify) 562 if (p->flags & SWP_BLKDEV) {
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 563 struct gendisk *disk = p->bdev->bd_disk;
564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
562 } 568 }
563 569
564 return usage; 570 return usage;
@@ -601,7 +607,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
601 * This does not give an exact answer when swap count is continued, 607 * This does not give an exact answer when swap count is continued,
602 * but does include the high COUNT_CONTINUED flag to allow for that. 608 * but does include the high COUNT_CONTINUED flag to allow for that.
603 */ 609 */
604static inline int page_swapcount(struct page *page) 610int page_swapcount(struct page *page)
605{ 611{
606 int count = 0; 612 int count = 0;
607 struct swap_info_struct *p; 613 struct swap_info_struct *p;
@@ -717,37 +723,6 @@ int free_swap_and_cache(swp_entry_t entry)
717 return p != NULL; 723 return p != NULL;
718} 724}
719 725
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721/**
722 * mem_cgroup_count_swap_user - count the user of a swap entry
723 * @ent: the swap entry to be checked
724 * @pagep: the pointer for the swap cache page of the entry to be stored
725 *
726 * Returns the number of the user of the swap entry. The number is valid only
727 * for swaps of anonymous pages.
728 * If the entry is found on swap cache, the page is stored to pagep with
729 * refcount of it being incremented.
730 */
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION 726#ifdef CONFIG_HIBERNATION
752/* 727/*
753 * Find the swap type that corresponds to given device (if any). 728 * Find the swap type that corresponds to given device (if any).
@@ -860,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
860 835
861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
863 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
864 mem_cgroup_cancel_charge_swapin(memcg);
865 ret = 0; 839 ret = 0;
866 goto out; 840 goto out;
867 } 841 }
@@ -1016,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm,
1016} 990}
1017 991
1018/* 992/*
1019 * Scan swap_map from current position to next entry still in use. 993 * Scan swap_map (or frontswap_map if frontswap parameter is true)
994 * from current position to next entry still in use.
1020 * Recycle to start on reaching the end, returning 0 when empty. 995 * Recycle to start on reaching the end, returning 0 when empty.
1021 */ 996 */
1022static unsigned int find_next_to_unuse(struct swap_info_struct *si, 997static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1023 unsigned int prev) 998 unsigned int prev, bool frontswap)
1024{ 999{
1025 unsigned int max = si->max; 1000 unsigned int max = si->max;
1026 unsigned int i = prev; 1001 unsigned int i = prev;
@@ -1046,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1046 prev = 0; 1021 prev = 0;
1047 i = 1; 1022 i = 1;
1048 } 1023 }
1024 if (frontswap) {
1025 if (frontswap_test(si, i))
1026 break;
1027 else
1028 continue;
1029 }
1049 count = si->swap_map[i]; 1030 count = si->swap_map[i];
1050 if (count && swap_count(count) != SWAP_MAP_BAD) 1031 if (count && swap_count(count) != SWAP_MAP_BAD)
1051 break; 1032 break;
@@ -1057,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1057 * We completely avoid races by reading each swap page in advance, 1038 * We completely avoid races by reading each swap page in advance,
1058 * and then search for the process using it. All the necessary 1039 * and then search for the process using it. All the necessary
1059 * page table adjustments can then be made atomically. 1040 * page table adjustments can then be made atomically.
1041 *
1042 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1043 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1060 */ 1044 */
1061static int try_to_unuse(unsigned int type) 1045int try_to_unuse(unsigned int type, bool frontswap,
1046 unsigned long pages_to_unuse)
1062{ 1047{
1063 struct swap_info_struct *si = swap_info[type]; 1048 struct swap_info_struct *si = swap_info[type];
1064 struct mm_struct *start_mm; 1049 struct mm_struct *start_mm;
@@ -1091,7 +1076,7 @@ static int try_to_unuse(unsigned int type)
1091 * one pass through swap_map is enough, but not necessarily: 1076 * one pass through swap_map is enough, but not necessarily:
1092 * there are races when an instance of an entry might be missed. 1077 * there are races when an instance of an entry might be missed.
1093 */ 1078 */
1094 while ((i = find_next_to_unuse(si, i)) != 0) { 1079 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1095 if (signal_pending(current)) { 1080 if (signal_pending(current)) {
1096 retval = -EINTR; 1081 retval = -EINTR;
1097 break; 1082 break;
@@ -1258,6 +1243,10 @@ static int try_to_unuse(unsigned int type)
1258 * interactive performance. 1243 * interactive performance.
1259 */ 1244 */
1260 cond_resched(); 1245 cond_resched();
1246 if (frontswap && pages_to_unuse > 0) {
1247 if (!--pages_to_unuse)
1248 break;
1249 }
1261 } 1250 }
1262 1251
1263 mmput(start_mm); 1252 mmput(start_mm);
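
try_to_unuse() now has a frontswap mode: when the flag is true, find_next_to_unuse() only visits slots whose bit is set in the frontswap map, and the loop stops once pages_to_unuse pages have been pulled back in (0 meaning no limit). A hedged sketch of how such a caller might look (not the actual frontswap code); swapoff keeps the old behaviour by passing false/0:

static int unuse_some_frontswap_pages(int type, unsigned long nr)
{
        /* only slots marked in the frontswap map are considered;
         * stop after 'nr' pages, or sweep them all if nr == 0 */
        return try_to_unuse(type, true, nr);
}
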
@@ -1341,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1341 list_del(&se->list); 1330 list_del(&se->list);
1342 kfree(se); 1331 kfree(se);
1343 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1344} 1341}
1345 1342
1346/* 1343/*
@@ -1349,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1349 * 1346 *
1350 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1351 */ 1348 */
1352static int 1349int
1353add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1354 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1355{ 1352{
@@ -1422,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1422 */ 1419 */
1423static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1424{ 1421{
1425 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1426 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1427 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1428 unsigned blkbits;
1429 sector_t probe_block;
1430 sector_t last_block;
1431 sector_t lowest_block = -1;
1432 sector_t highest_block = 0;
1433 int nr_extents = 0;
1434 int ret; 1425 int ret;
1435 1426
1436 inode = sis->swap_file->f_mapping->host;
1437 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1438 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1439 *span = sis->pages; 1429 *span = sis->pages;
1440 goto out; 1430 return ret;
1441 } 1431 }
1442 1432
1443 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1444 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1445 1435 if (!ret) {
1446 /* 1436 sis->flags |= SWP_FILE;
1447 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1448 * to be very smart. 1438 *span = sis->pages;
1449 */
1450 probe_block = 0;
1451 page_no = 0;
1452 last_block = i_size_read(inode) >> blkbits;
1453 while ((probe_block + blocks_per_page) <= last_block &&
1454 page_no < sis->max) {
1455 unsigned block_in_page;
1456 sector_t first_block;
1457
1458 first_block = bmap(inode, probe_block);
1459 if (first_block == 0)
1460 goto bad_bmap;
1461
1462 /*
1463 * It must be PAGE_SIZE aligned on-disk
1464 */
1465 if (first_block & (blocks_per_page - 1)) {
1466 probe_block++;
1467 goto reprobe;
1468 }
1469
1470 for (block_in_page = 1; block_in_page < blocks_per_page;
1471 block_in_page++) {
1472 sector_t block;
1473
1474 block = bmap(inode, probe_block + block_in_page);
1475 if (block == 0)
1476 goto bad_bmap;
1477 if (block != first_block + block_in_page) {
1478 /* Discontiguity */
1479 probe_block++;
1480 goto reprobe;
1481 }
1482 }
1483
1484 first_block >>= (PAGE_SHIFT - blkbits);
1485 if (page_no) { /* exclude the header page */
1486 if (first_block < lowest_block)
1487 lowest_block = first_block;
1488 if (first_block > highest_block)
1489 highest_block = first_block;
1490 } 1439 }
1440 return ret;
1441 }
1491 1442
1492 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1493 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1494 */
1495 ret = add_swap_extent(sis, page_no, 1, first_block);
1496 if (ret < 0)
1497 goto out;
1498 nr_extents += ret;
1499 page_no++;
1500 probe_block += blocks_per_page;
1501reprobe:
1502 continue;
1503 }
1504 ret = nr_extents;
1505 *span = 1 + highest_block - lowest_block;
1506 if (page_no == 0)
1507 page_no = 1; /* force Empty message */
1508 sis->max = page_no;
1509 sis->pages = page_no - 1;
1510 sis->highest_bit = page_no - 1;
1511out:
1512 return ret;
1513bad_bmap:
1514 printk(KERN_ERR "swapon: swapfile has holes\n");
1515 ret = -EINVAL;
1516 goto out;
1517} 1444}
1518 1445
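
setup_swap_extents() now gives the backing filesystem first refusal: if a_ops->swap_activate() succeeds, the swapfile is flagged SWP_FILE and covered by a single extent, and destroy_swap_extents() later calls a_ops->swap_deactivate(). Only when the hook is absent does the old bmap-walking path (now generic_swapfile_activate()) run. A hedged stub of the filesystem side, with hypothetical names and the signatures inferred from the call sites above:

static int examplefs_swap_activate(struct swap_info_struct *sis,
                                   struct file *swap_file, sector_t *span)
{
        /* a real filesystem would pin the file's layout here and
         * remember whatever its swap I/O paths need later */
        *span = sis->pages;     /* extent span reported back to swapon */
        return 0;               /* 0 => core sets SWP_FILE, adds one extent */
}

static void examplefs_swap_deactivate(struct file *swap_file)
{
        /* undo whatever swap_activate() set up */
}
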
1519static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
1520 unsigned char *swap_map) 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map)
1521{ 1449{
1522 int i, prev; 1450 int i, prev;
1523 1451
@@ -1527,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1527 else 1455 else
1528 p->prio = --least_priority; 1456 p->prio = --least_priority;
1529 p->swap_map = swap_map; 1457 p->swap_map = swap_map;
1458 frontswap_map_set(p, frontswap_map);
1530 p->flags |= SWP_WRITEOK; 1459 p->flags |= SWP_WRITEOK;
1531 nr_swap_pages += p->pages; 1460 nr_swap_pages += p->pages;
1532 total_swap_pages += p->pages; 1461 total_swap_pages += p->pages;
@@ -1543,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1543 swap_list.head = swap_list.next = p->type; 1472 swap_list.head = swap_list.next = p->type;
1544 else 1473 else
1545 swap_info[prev]->next = p->type; 1474 swap_info[prev]->next = p->type;
1475 frontswap_init(p->type);
1546 spin_unlock(&swap_lock); 1476 spin_unlock(&swap_lock);
1547} 1477}
1548 1478
@@ -1616,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1616 spin_unlock(&swap_lock); 1546 spin_unlock(&swap_lock);
1617 1547
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1548 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1549 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1620 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1550 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1551
1622 if (err) { 1552 if (err) {
@@ -1627,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1627 * sys_swapoff for this swap_info_struct at this point. 1557 * sys_swapoff for this swap_info_struct at this point.
1628 */ 1558 */
1629 /* re-insert swap space back into swap_list */ 1559 /* re-insert swap space back into swap_list */
1630 enable_swap_info(p, p->prio, p->swap_map); 1560 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1631 goto out_dput; 1561 goto out_dput;
1632 } 1562 }
1633 1563
@@ -1653,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1653 swap_map = p->swap_map; 1583 swap_map = p->swap_map;
1654 p->swap_map = NULL; 1584 p->swap_map = NULL;
1655 p->flags = 0; 1585 p->flags = 0;
1586 frontswap_invalidate_area(type);
1656 spin_unlock(&swap_lock); 1587 spin_unlock(&swap_lock);
1657 mutex_unlock(&swapon_mutex); 1588 mutex_unlock(&swapon_mutex);
1658 vfree(swap_map); 1589 vfree(swap_map);
1590 vfree(frontswap_map_get(p));
 1659 /* Destroy swap account information */ 1591 /* Destroy swap account information */
1660 swap_cgroup_swapoff(type); 1592 swap_cgroup_swapoff(type);
1661 1593
@@ -1924,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1856
1925 /* 1857 /*
1926 * Find out how many pages are allowed for a single swap 1858 * Find out how many pages are allowed for a single swap
1927 * device. There are three limiting factors: 1) the number 1859 * device. There are two limiting factors: 1) the number
1928 * of bits for the swap offset in the swp_entry_t type, and 1860 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the swap pte as defined by the 1861 * 2) the number of bits in the swap pte as defined by the
1930 * the different architectures, and 3) the number of free bits 1862 * different architectures. In order to find the
1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0 1863 * largest possible bit mask, a swap entry with swap type 0
1933 * and swap offset ~0UL is created, encoded to a swap pte, 1864 * and swap offset ~0UL is created, encoded to a swap pte,
1934 * decoded to a swp_entry_t again, and finally the swap 1865 * decoded to a swp_entry_t again, and finally the swap
1935 * offset is extracted. This will mask all the bits from 1866 * offset is extracted. This will mask all the bits from
1936 * the initial ~0UL mask that can't be encoded in either 1867 * the initial ~0UL mask that can't be encoded in either
1937 * the swp_entry_t or the architecture definition of a 1868 * the swp_entry_t or the architecture definition of a
1938 * swap pte. Then the same is done for a radix_tree entry. 1869 * swap pte.
1939 */ 1870 */
1940 maxpages = swp_offset(pte_to_swp_entry( 1871 maxpages = swp_offset(pte_to_swp_entry(
1941 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1872 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1945 if (maxpages > swap_header->info.last_page) { 1873 if (maxpages > swap_header->info.last_page) {
1946 maxpages = swap_header->info.last_page + 1; 1874 maxpages = swap_header->info.last_page + 1;
1947 /* p->max is an unsigned int: don't overflow it */ 1875 /* p->max is an unsigned int: don't overflow it */
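
With the exceptional-radix-tree clamp gone, the offset limit comes solely from the swap-pte round trip. A sketch restating that computation on its own:

static unsigned long max_swap_offset_pages(void)
{
        /* start from type 0 with every offset bit set ... */
        swp_entry_t ent = swp_entry(0, ~0UL);

        /* ... encode to a swap pte and decode again: bits that do not
         * survive the architecture's pte format are masked away */
        unsigned long max_off =
                swp_offset(pte_to_swp_entry(swp_entry_to_pte(ent)));

        return max_off + 1;     /* offsets are zero-based */
}
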
@@ -2019,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2019 sector_t span; 1947 sector_t span;
2020 unsigned long maxpages; 1948 unsigned long maxpages;
2021 unsigned char *swap_map = NULL; 1949 unsigned char *swap_map = NULL;
1950 unsigned long *frontswap_map = NULL;
2022 struct page *page = NULL; 1951 struct page *page = NULL;
2023 struct inode *inode = NULL; 1952 struct inode *inode = NULL;
2024 1953
@@ -2102,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2102 error = nr_extents; 2031 error = nr_extents;
2103 goto bad_swap; 2032 goto bad_swap;
2104 } 2033 }
2034 /* frontswap enabled? set up bit-per-page map for frontswap */
2035 if (frontswap_enabled)
2036 frontswap_map = vzalloc(maxpages / sizeof(long));
2105 2037
2106 if (p->bdev) { 2038 if (p->bdev) {
2107 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2039 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2117,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2117 if (swap_flags & SWAP_FLAG_PREFER) 2049 if (swap_flags & SWAP_FLAG_PREFER)
2118 prio = 2050 prio =
2119 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2051 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2120 enable_swap_info(p, prio, swap_map); 2052 enable_swap_info(p, prio, swap_map, frontswap_map);
2121 2053
2122 printk(KERN_INFO "Adding %uk swap on %s. " 2054 printk(KERN_INFO "Adding %uk swap on %s. "
2123 "Priority:%d extents:%d across:%lluk %s%s\n", 2055 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2124 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2056 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2125 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2126 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2127 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2059 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2060 (frontswap_map) ? "FS" : "");
2128 2061
2129 mutex_unlock(&swapon_mutex); 2062 mutex_unlock(&swapon_mutex);
2130 atomic_inc(&proc_poll_event); 2063 atomic_inc(&proc_poll_event);
@@ -2292,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2292 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2293} 2226}
2294 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
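
These out-of-line helpers exist so swap-over-file I/O can reach the backing file's mapping and the page's slot in the swapfile without dragging swap internals into mm.h. A hedged consumer sketch (hypothetical helper), assuming a single-extent SWP_FILE swapfile where the swap offset is also the page's index in the backing file:

static loff_t swap_page_file_pos(struct page *page)
{
        struct address_space *mapping = __page_file_mapping(page);
        pgoff_t index = __page_file_index(page);

        (void)mapping;  /* e.g. hand mapping + offset to a_ops-based I/O */
        return (loff_t)index << PAGE_SHIFT;
}
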
2295/* 2253/*
2296 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2297 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
18 */
19
20#include <linux/jiffies.h>
21#include <linux/mm.h>
22#include <linux/sched.h>
23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
29
30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm;
32static struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
51
52void grab_swap_token(struct mm_struct *mm)
53{
54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58
59 global_faults++;
60
61 current_interval = global_faults - mm->faultstamp;
62
63 if (!spin_trylock(&swap_token_lock))
64 return;
65
66 /* First come first served */
67 if (!swap_token_mm)
68 goto replace_token;
69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * makes priority decrease quickly. But there is one exception. If the
73 * token owner task is sleeping, it never make long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * of priority aging are
76 * 1) An aging interval is reasonable enough long. Too short aging
77 * interval makes quick swap token lost and decrease performance.
78 * 2) The swap token owner task have to get priority aging even if
79 * it's under sleep.
80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
84 }
85
86 if (mm == swap_token_mm) {
87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
96 }
97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
105out:
106 mm->faultstamp = global_faults;
107 mm->last_interval = current_interval;
108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
118}
119
120/* Called on process exit. */
121void __put_swap_token(struct mm_struct *mm)
122{
123 spin_lock(&swap_token_lock);
124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
129 spin_unlock(&swap_token_lock);
130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
145 /* memcg reclaim don't disable unrelated mm token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
602} 602}
603EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
604 604
605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
606{
607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
610
611 /*
612 * If the underlying filesystem is not going to provide
613 * a way to truncate a range of blocks (punch a hole) -
614 * we should return failure right now.
615 */
616 if (!inode->i_op->truncate_range)
617 return -ENOSYS;
618
619 mutex_lock(&inode->i_mutex);
620 inode_dio_wait(inode);
621 unmap_mapping_range(mapping, holebegin, holelen, 1);
622 inode->i_op->truncate_range(inode, lstart, lend);
623 /* unmap again to remove racily COWed private pages */
624 unmap_mapping_range(mapping, holebegin, holelen, 1);
625 mutex_unlock(&inode->i_mutex);
626
627 return 0;
628}
629
630/** 605/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 606 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode 607 * @inode: inode
diff --git a/mm/util.c b/mm/util.c
index ae962b31de88..8c7265afa29f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9#include "internal.h" 10#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
341} 342}
342EXPORT_SYMBOL_GPL(get_user_pages_fast); 343EXPORT_SYMBOL_GPL(get_user_pages_fast);
343 344
345unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
346 unsigned long len, unsigned long prot,
347 unsigned long flag, unsigned long pgoff)
348{
349 unsigned long ret;
350 struct mm_struct *mm = current->mm;
351
352 ret = security_mmap_file(file, prot, flag);
353 if (!ret) {
354 down_write(&mm->mmap_sem);
355 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
356 up_write(&mm->mmap_sem);
357 }
358 return ret;
359}
360
361unsigned long vm_mmap(struct file *file, unsigned long addr,
362 unsigned long len, unsigned long prot,
363 unsigned long flag, unsigned long offset)
364{
365 if (unlikely(offset + PAGE_ALIGN(len) < offset))
366 return -EINVAL;
367 if (unlikely(offset & ~PAGE_MASK))
368 return -EINVAL;
369
370 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
371}
372EXPORT_SYMBOL(vm_mmap);
373
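
vm_mmap() and vm_mmap_pgoff() bundle the security_mmap_file() check and the mmap_sem handling so callers outside mm/ no longer open-code down_write()/do_mmap()/up_write(). A hedged caller sketch (hypothetical driver code):

static unsigned long map_file_read_only(struct file *filp, unsigned long len)
{
        /* addr hint 0, page offset 0; the security check and mmap_sem
         * locking happen inside vm_mmap(); errors come back as -errno
         * encoded in the returned value (check with IS_ERR_VALUE()) */
        return vm_mmap(filp, 0, len, PROT_READ, MAP_SHARED, 0);
}
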
344/* Tracepoints definitions. */ 374/* Tracepoints definitions. */
345EXPORT_TRACEPOINT_SYMBOL(kmalloc); 375EXPORT_TRACEPOINT_SYMBOL(kmalloc);
346EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 376EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what caller wants since
910 * get_order(0) returns funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
@@ -1185,9 +1193,10 @@ void __init vmalloc_init(void)
1185 /* Import existing vmlist entries. */ 1193 /* Import existing vmlist entries. */
1186 for (tmp = vmlist; tmp; tmp = tmp->next) { 1194 for (tmp = vmlist; tmp; tmp = tmp->next) {
1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1195 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1188 va->flags = tmp->flags | VM_VM_AREA; 1196 va->flags = VM_VM_AREA;
1189 va->va_start = (unsigned long)tmp->addr; 1197 va->va_start = (unsigned long)tmp->addr;
1190 va->va_end = va->va_start + tmp->size; 1198 va->va_end = va->va_start + tmp->size;
1199 va->vm = tmp;
1191 __insert_vmap_area(va); 1200 __insert_vmap_area(va);
1192 } 1201 }
1193 1202
@@ -1279,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
1279struct vm_struct *vmlist; 1288struct vm_struct *vmlist;
1280 1289
1281static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1282 unsigned long flags, void *caller) 1291 unsigned long flags, const void *caller)
1283{ 1292{
1284 vm->flags = flags; 1293 vm->flags = flags;
1285 vm->addr = (void *)va->va_start; 1294 vm->addr = (void *)va->va_start;
@@ -1305,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
1305} 1314}
1306 1315
1307static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1308 unsigned long flags, void *caller) 1317 unsigned long flags, const void *caller)
1309{ 1318{
1310 setup_vmalloc_vm(vm, va, flags, caller); 1319 setup_vmalloc_vm(vm, va, flags, caller);
1311 insert_vmalloc_vmlist(vm); 1320 insert_vmalloc_vmlist(vm);
@@ -1313,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1313 1322
1314static struct vm_struct *__get_vm_area_node(unsigned long size, 1323static struct vm_struct *__get_vm_area_node(unsigned long size,
1315 unsigned long align, unsigned long flags, unsigned long start, 1324 unsigned long align, unsigned long flags, unsigned long start,
1316 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1325 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1317{ 1326{
1318 struct vmap_area *va; 1327 struct vmap_area *va;
1319 struct vm_struct *area; 1328 struct vm_struct *area;
@@ -1374,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
1374 1383
1375struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1384struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1376 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1377 void *caller) 1386 const void *caller)
1378{ 1387{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1380 caller); 1389 caller);
@@ -1396,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1396} 1405}
1397 1406
1398struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1399 void *caller) 1408 const void *caller)
1400{ 1409{
1401 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1402 -1, GFP_KERNEL, caller); 1411 -1, GFP_KERNEL, caller);
1403} 1412}
1404 1413
1405static struct vm_struct *find_vm_area(const void *addr) 1414/**
1415 * find_vm_area - find a continuous kernel virtual area
1416 * @addr: base address
1417 *
1418 * Search for the kernel VM area starting at @addr, and return it.
1419 * It is up to the caller to do all required locking to keep the returned
1420 * pointer valid.
1421 */
1422struct vm_struct *find_vm_area(const void *addr)
1406{ 1423{
1407 struct vmap_area *va; 1424 struct vmap_area *va;
1408 1425
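
find_vm_area() loses its static and gains the kernel-doc above, so other subsystems can map a vmalloc address back to its vm_struct. A hedged usage sketch (hypothetical caller); keeping the area alive is, per the comment, the caller's job:

static unsigned long vm_area_size_of(const void *addr)
{
        struct vm_struct *area = find_vm_area(addr);

        return area ? area->size : 0;   /* size recorded for the area */
}
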
@@ -1567,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
1567 1584
1568static void *__vmalloc_node(unsigned long size, unsigned long align, 1585static void *__vmalloc_node(unsigned long size, unsigned long align,
1569 gfp_t gfp_mask, pgprot_t prot, 1586 gfp_t gfp_mask, pgprot_t prot,
1570 int node, void *caller); 1587 int node, const void *caller);
1571static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1588static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1572 pgprot_t prot, int node, void *caller) 1589 pgprot_t prot, int node, const void *caller)
1573{ 1590{
1574 const int order = 0; 1591 const int order = 0;
1575 struct page **pages; 1592 struct page **pages;
@@ -1642,7 +1659,7 @@ fail:
1642 */ 1659 */
1643void *__vmalloc_node_range(unsigned long size, unsigned long align, 1660void *__vmalloc_node_range(unsigned long size, unsigned long align,
1644 unsigned long start, unsigned long end, gfp_t gfp_mask, 1661 unsigned long start, unsigned long end, gfp_t gfp_mask,
1645 pgprot_t prot, int node, void *caller) 1662 pgprot_t prot, int node, const void *caller)
1646{ 1663{
1647 struct vm_struct *area; 1664 struct vm_struct *area;
1648 void *addr; 1665 void *addr;
@@ -1698,7 +1715,7 @@ fail:
1698 */ 1715 */
1699static void *__vmalloc_node(unsigned long size, unsigned long align, 1716static void *__vmalloc_node(unsigned long size, unsigned long align,
1700 gfp_t gfp_mask, pgprot_t prot, 1717 gfp_t gfp_mask, pgprot_t prot,
1701 int node, void *caller) 1718 int node, const void *caller)
1702{ 1719{
1703 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1720 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1704 gfp_mask, prot, node, caller); 1721 gfp_mask, prot, node, caller);
@@ -1974,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1974 * IOREMAP area is treated as memory hole and no copy is done. 1991 * IOREMAP area is treated as memory hole and no copy is done.
1975 * 1992 *
1976 * If [addr...addr+count) doesn't include any intersection with a live 1993
1977 * vm_struct area, returns 0. 1994 * vm_struct area, returns 0. @buf should be kernel's buffer.
1978 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1979 * the caller should guarantee KM_USER0 is not used.
1980 * 1995 *
1981 * Note: In usual ops, vread() is never necessary because the caller 1996 * Note: In usual ops, vread() is never necessary because the caller
1982 * should know vmalloc() area is valid and can use memcpy(). 1997 * should know vmalloc() area is valid and can use memcpy().
@@ -2050,9 +2065,7 @@ finished:
2050 * IOREMAP area is treated as memory hole and no copy is done. 2065 * IOREMAP area is treated as memory hole and no copy is done.
2051 * 2066 *
2052 * If [addr...addr+count) doesn't include any intersection with a live 2067
2053 * vm_struct area, returns 0. 2068 * vm_struct area, returns 0. @buf should be kernel's buffer.
2054 * @buf should be kernel's buffer. Because this function uses KM_USER0,
2055 * the caller should guarantee KM_USER0 is not used.
2056 * 2069 *
2057 * Note: In usual ops, vwrite() is never necessary because the caller 2070 * Note: In usual ops, vwrite() is never necessary because the caller
2058 * should know vmalloc() area is valid and can use memcpy(). 2071 * should know vmalloc() area is valid and can use memcpy().
@@ -2375,8 +2388,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2375 return NULL; 2388 return NULL;
2376 } 2389 }
2377 2390
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2391 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2392 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2380 if (!vas || !vms) 2393 if (!vas || !vms)
2381 goto err_free2; 2394 goto err_free2;
2382 2395
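The last vmalloc.c hunk above swaps two open-coded kzalloc(sizeof(x[0]) * nr_vms) calls for kcalloc(), which zero-fills and refuses the allocation when the count-times-size multiplication would overflow, instead of silently wrapping to an undersized buffer. A rough userspace illustration of the same pattern, with calloc() standing in for kcalloc(); the struct and the count below are invented for the example, not taken from the kernel:

#include <stdio.h>
#include <stdlib.h>

struct vm_area_stub {			/* illustrative stand-in, not vm_struct */
	unsigned long addr;
	unsigned long size;
};

int main(void)
{
	size_t nr_vms = 4;		/* arbitrary example count */

	/*
	 * calloc(nr, size), like kcalloc(), zero-fills and must fail rather
	 * than wrap when nr * size overflows, whereas the old
	 * kzalloc(sizeof(x[0]) * nr) form multiplies before the allocator
	 * ever sees the values.
	 */
	struct vm_area_stub *vms = calloc(nr_vms, sizeof(vms[0]));
	if (!vms)
		return 1;

	for (size_t i = 0; i < nr_vms; i++)	/* zero-filled, so size == 0 */
		printf("vms[%zu].size = %lu\n", i, vms[i].size);

	free(vms);
	return 0;
}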
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33c332bbab73..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
96 78
97 int order; 79 int order;
98 80
99 /* 81 /* Scan (total_size >> priority) pages at once */
100 * Intend to reclaim enough continuous memory rather than reclaim 82 int priority;
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104 83
105 /* 84 /*
106 * The memory cgroup that hit its limit and as a result is the 85 * The memory cgroup that hit its limit and as a result is the
@@ -115,11 +94,6 @@ struct scan_control {
115 nodemask_t *nodemask; 94 nodemask_t *nodemask;
116}; 95};
117 96
118struct mem_cgroup_zone {
119 struct mem_cgroup *mem_cgroup;
120 struct zone *zone;
121};
122
123#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 97#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
124 98
125#ifdef ARCH_HAS_PREFETCH 99#ifdef ARCH_HAS_PREFETCH
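The two hunks above fold the old per-call priority argument into scan_control as sc->priority and drop the mem_cgroup_zone wrapper in favour of lruvecs. The comment on the new field is the heart of the reclaim loop: each pass scans roughly (list size >> priority) pages, with priority counting down from DEF_PRIORITY while the zone stays under pressure. A standalone sketch of that batching, assuming DEF_PRIORITY is 12 as in this kernel and using an invented list size:

#include <stdio.h>

#define DEF_PRIORITY 12			/* matches include/linux/mmzone.h */

int main(void)
{
	unsigned long lru_size = 1UL << 20;	/* pretend 1M pages on one LRU */

	/* Higher priority value => gentler scan; 0 => scan the whole list. */
	for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
		unsigned long scan = lru_size >> priority;
		printf("priority %2d: scan %lu pages this pass\n",
		       priority, scan);
	}
	return 0;
}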
@@ -159,49 +133,26 @@ long vm_total_pages; /* The total number of pages which the VM controls */
159static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
160static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
161 135
162#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
163static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
164{ 138{
165 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
166} 140}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 141#else
173static bool global_reclaim(struct scan_control *sc) 142static bool global_reclaim(struct scan_control *sc)
174{ 143{
175 return true; 144 return true;
176} 145}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 146#endif
183 147
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
185{ 149{
186 if (!scanning_global_lru(mz)) 150 if (!mem_cgroup_disabled())
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); 151 return mem_cgroup_get_lru_size(lruvec, lru);
188 152
189 return &mz->zone->reclaim_stat; 153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
190} 154}
191 155
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru)
194{
195 if (!scanning_global_lru(mz))
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone),
198 zone_idx(mz->zone),
199 BIT(lru));
200
201 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
202}
203
204
205/* 156/*
206 * Add a shrinker callback to be called from the vm 157 * Add a shrinker callback to be called from the vm
207 */ 158 */
@@ -364,39 +315,6 @@ out:
364 return ret; 315 return ret;
365} 316}
366 317
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction. Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 318static inline int is_page_cache_freeable(struct page *page)
401{ 319{
402 /* 320 /*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 334 return 1;
417 if (bdi == current->backing_dev_info) 335 if (bdi == current->backing_dev_info)
418 return 1; 336 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 337 return 0;
424} 338}
425 339
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 437 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 438 ClearPageReclaim(page);
525 } 439 }
526 trace_mm_vmscan_writepage(page, 440 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
530 } 443 }
@@ -701,19 +614,15 @@ enum page_references {
701}; 614};
702 615
703static enum page_references page_check_references(struct page *page, 616static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc) 617 struct scan_control *sc)
706{ 618{
707 int referenced_ptes, referenced_page; 619 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 620 unsigned long vm_flags;
709 621
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 624 referenced_page = TestClearPageReferenced(page);
712 625
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 626 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 627 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 628 * move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 631 return PAGEREF_RECLAIM;
723 632
724 if (referenced_ptes) { 633 if (referenced_ptes) {
725 if (PageAnon(page)) 634 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 635 return PAGEREF_ACTIVATE;
727 /* 636 /*
728 * All mapped pages start out with page table 637 * All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
763 * shrink_page_list() returns the number of reclaimed pages 672 * shrink_page_list() returns the number of reclaimed pages
764 */ 673 */
765static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
766 struct mem_cgroup_zone *mz, 675 struct zone *zone,
767 struct scan_control *sc, 676 struct scan_control *sc,
768 int priority,
769 unsigned long *ret_nr_dirty, 677 unsigned long *ret_nr_dirty,
770 unsigned long *ret_nr_writeback) 678 unsigned long *ret_nr_writeback)
771{ 679{
@@ -779,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
779 687
780 cond_resched(); 688 cond_resched();
781 689
690 mem_cgroup_uncharge_start();
782 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
783 enum page_references references; 692 enum page_references references;
784 struct address_space *mapping; 693 struct address_space *mapping;
@@ -794,7 +703,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 goto keep; 703 goto keep;
795 704
796 VM_BUG_ON(PageActive(page)); 705 VM_BUG_ON(PageActive(page));
797 VM_BUG_ON(page_zone(page) != mz->zone); 706 VM_BUG_ON(page_zone(page) != zone);
798 707
799 sc->nr_scanned++; 708 sc->nr_scanned++;
800 709
@@ -812,23 +721,44 @@ static unsigned long shrink_page_list(struct list_head *page_list,
812 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
813 722
814 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
815 nr_writeback++;
816 /* 724 /*
817 * Synchronous reclaim cannot queue pages for 725 * memcg doesn't have any dirty pages throttling so we
818 * writeback due to the possibility of stack overflow 726 * could easily OOM just because too many pages are in
819 * but if it encounters a page under writeback, wait 727 * writeback and there is nothing else to reclaim.
820 * for the IO to complete. 728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
737 * Worryingly, ext4 gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
821 */ 740 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 741 if (global_reclaim(sc) ||
823 may_enter_fs) 742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
824 wait_on_page_writeback(page); 743 /*
825 else { 744 * This is slightly racy - end_page_writeback()
826 unlock_page(page); 745 * might have just cleared PageReclaim, then
827 goto keep_lumpy; 746 * setting PageReclaim here end up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
828 } 757 }
758 wait_on_page_writeback(page);
829 } 759 }
830 760
831 references = page_check_references(page, mz, sc); 761 references = page_check_references(page, sc);
832 switch (references) { 762 switch (references) {
833 case PAGEREF_ACTIVATE: 763 case PAGEREF_ACTIVATE:
834 goto activate_locked; 764 goto activate_locked;
@@ -879,7 +809,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
879 * unless under significant pressure. 809 * unless under significant pressure.
880 */ 810 */
881 if (page_is_file_cache(page) && 811 if (page_is_file_cache(page) &&
882 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { 812 (!current_is_kswapd() ||
813 sc->priority >= DEF_PRIORITY - 2)) {
883 /* 814 /*
884 * Immediately reclaim when written back. 815 * Immediately reclaim when written back.
885 * Similar in principle to deactivate_page() 816
@@ -908,7 +839,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 839 goto activate_locked;
909 case PAGE_SUCCESS: 840 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 841 if (PageWriteback(page))
911 goto keep_lumpy; 842 goto keep;
912 if (PageDirty(page)) 843 if (PageDirty(page))
913 goto keep; 844 goto keep;
914 845
@@ -994,7 +925,6 @@ cull_mlocked:
994 try_to_free_swap(page); 925 try_to_free_swap(page);
995 unlock_page(page); 926 unlock_page(page);
996 putback_lru_page(page); 927 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 928 continue;
999 929
1000activate_locked: 930activate_locked:
@@ -1007,8 +937,6 @@ activate_locked:
1007keep_locked: 937keep_locked:
1008 unlock_page(page); 938 unlock_page(page);
1009keep: 939keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 940 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 941 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 942 }
@@ -1020,12 +948,13 @@ keep_lumpy:
1020 * will encounter the same problem 948 * will encounter the same problem
1021 */ 949 */
1022 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) 950 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
1023 zone_set_flag(mz->zone, ZONE_CONGESTED); 951 zone_set_flag(zone, ZONE_CONGESTED);
1024 952
1025 free_hot_cold_page_list(&free_pages, 1); 953 free_hot_cold_page_list(&free_pages, 1);
1026 954
1027 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
1028 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
1029 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
1030 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
1031 return nr_reclaimed; 960 return nr_reclaimed;
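The rewritten PageWriteback branch earlier in this hunk chooses between deferring and stalling: global reclaim, and any pass that finds the page without PageReclaim set or without __GFP_IO, tags the page with PageReclaim and skips it, while memcg reclaim that meets the same tagged page again with __GFP_IO allowed sleeps in wait_on_page_writeback(), which is what keeps a writeback-flooded memcg from declaring OOM. A tiny userspace model of just that decision; the type and function names below are mine, only the boolean logic mirrors the patch:

#include <stdbool.h>
#include <stdio.h>

struct fake_page {			/* illustrative, not struct page */
	bool writeback;
	bool reclaim;			/* models PageReclaim */
};

/* Returns true when the caller would sleep in wait_on_page_writeback(). */
static bool should_wait_on_writeback(struct fake_page *page,
				     bool global_reclaim, bool gfp_io)
{
	if (!page->writeback)
		return false;
	if (global_reclaim || !page->reclaim || !gfp_io) {
		page->reclaim = true;	/* remember it for the next pass */
		return false;		/* keep_locked: defer, don't stall */
	}
	return true;			/* memcg revisit: wait for the I/O */
}

int main(void)
{
	struct fake_page page = { .writeback = true, .reclaim = false };

	/* First memcg pass: tag and skip.  Second pass over the same page: wait. */
	printf("first pass waits:  %d\n",
	       should_wait_on_writeback(&page, false, true));
	printf("second pass waits: %d\n",
	       should_wait_on_writeback(&page, false, true));
	return 0;
}

Run as-is it prints 0 then 1: the first encounter only marks the page, the revisit blocks until writeback completes.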
@@ -1041,34 +970,15 @@ keep_lumpy:
1041 * 970 *
1042 * returns 0 on success, -ve errno on failure. 971 * returns 0 on success, -ve errno on failure.
1043 */ 972 */
1044int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 973int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1045{ 974{
1046 bool all_lru_mode;
1047 int ret = -EINVAL; 975 int ret = -EINVAL;
1048 976
1049 /* Only take pages on the LRU. */ 977 /* Only take pages on the LRU. */
1050 if (!PageLRU(page)) 978 if (!PageLRU(page))
1051 return ret; 979 return ret;
1052 980
1053 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == 981 /* Do not give back unevictable pages for compaction */
1054 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1055
1056 /*
1057 * When checking the active state, we need to be sure we are
1058 * dealing with comparable boolean values. Take the logical not
1059 * of each.
1060 */
1061 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1062 return ret;
1063
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret;
1066
1067 /*
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 982 if (PageUnevictable(page))
1073 return ret; 983 return ret;
1074 984
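With lumpy reclaim gone, the part of __isolate_lru_page() shown above shrinks to a plain filter: pages that are not on an LRU, or that are unevictable (compaction must not be handed those), are refused before the ISOLATE_* mode-flag checks later in the function. A boiled-down model of that filter; fake_page and its fields are illustrative, not kernel types:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {			/* illustrative, not struct page */
	bool on_lru;
	bool unevictable;
};

/* 0 = may be isolated, -EINVAL = leave the page where it is. */
static int isolate_filter(const struct fake_page *page)
{
	if (!page->on_lru)
		return -EINVAL;
	if (page->unevictable)		/* never hand these to reclaim */
		return -EINVAL;
	return 0;
}

int main(void)
{
	struct fake_page evictable = { .on_lru = true, .unevictable = false };
	struct fake_page mlocked   = { .on_lru = true, .unevictable = true };

	printf("%d %d\n", isolate_filter(&evictable), isolate_filter(&mlocked));
	return 0;
}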
@@ -1135,54 +1045,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1135 * Appropriate locks must be held before calling this function. 1045 * Appropriate locks must be held before calling this function.
1136 * 1046 *
1137 * @nr_to_scan: The number of pages to look through on the list. 1047 * @nr_to_scan: The number of pages to look through on the list.
1138 * @mz: The mem_cgroup_zone to pull pages from. 1048 * @lruvec: The LRU vector to pull pages from.
1139 * @dst: The temp list to put pages on to. 1049 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1050 * @nr_scanned: The number of pages that were scanned.
1141 * @sc: The scan_control struct for this reclaim session 1051 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1052 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1053 * @lru: LRU list id for isolating
1144 * @file: True [1] if isolating file [!anon] pages
1145 * 1054 *
1146 * returns how many pages were moved onto *@dst. 1055 * returns how many pages were moved onto *@dst.
1147 */ 1056 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1057static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1058 struct lruvec *lruvec, struct list_head *dst,
1150 unsigned long *nr_scanned, struct scan_control *sc, 1059 unsigned long *nr_scanned, struct scan_control *sc,
1151 isolate_mode_t mode, int active, int file) 1060 isolate_mode_t mode, enum lru_list lru)
1152{ 1061{
1153 struct lruvec *lruvec; 1062 struct list_head *src = &lruvec->lists[lru];
1154 struct list_head *src;
1155 unsigned long nr_taken = 0; 1063 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1064 unsigned long scan;
1160 int lru = LRU_BASE;
1161
1162 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1163 if (active)
1164 lru += LRU_ACTIVE;
1165 if (file)
1166 lru += LRU_FILE;
1167 src = &lruvec->lists[lru];
1168 1065
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1066 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1067 struct page *page;
1171 unsigned long pfn; 1068 int nr_pages;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1069
1176 page = lru_to_page(src); 1070 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1071 prefetchw_prev_lru_page(page, src, flags);
1178 1072
1179 VM_BUG_ON(!PageLRU(page)); 1073 VM_BUG_ON(!PageLRU(page));
1180 1074
1181 switch (__isolate_lru_page(page, mode, file)) { 1075 switch (__isolate_lru_page(page, mode)) {
1182 case 0: 1076 case 0:
1183 mem_cgroup_lru_del(page); 1077 nr_pages = hpage_nr_pages(page);
1078 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1184 list_move(&page->lru, dst); 1079 list_move(&page->lru, dst);
1185 nr_taken += hpage_nr_pages(page); 1080 nr_taken += nr_pages;
1186 break; 1081 break;
1187 1082
1188 case -EBUSY: 1083 case -EBUSY:
@@ -1193,93 +1088,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1088 default:
1194 BUG(); 1089 BUG();
1195 } 1090 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1091 }
1275 1092
1276 *nr_scanned = scan; 1093 *nr_scanned = scan;
1277 1094 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1095 nr_taken, mode, is_file_lru(lru));
1279 nr_to_scan, scan,
1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file);
1283 return nr_taken; 1096 return nr_taken;
1284} 1097}
1285 1098
@@ -1316,15 +1129,16 @@ int isolate_lru_page(struct page *page)
1316 1129
1317 if (PageLRU(page)) { 1130 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page); 1131 struct zone *zone = page_zone(page);
1132 struct lruvec *lruvec;
1319 1133
1320 spin_lock_irq(&zone->lru_lock); 1134 spin_lock_irq(&zone->lru_lock);
1135 lruvec = mem_cgroup_page_lruvec(page, zone);
1321 if (PageLRU(page)) { 1136 if (PageLRU(page)) {
1322 int lru = page_lru(page); 1137 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page); 1138 get_page(page);
1325 ClearPageLRU(page); 1139 ClearPageLRU(page);
1326 1140 del_page_from_lru_list(page, lruvec, lru);
1327 del_page_from_lru_list(zone, page, lru); 1141 ret = 0;
1328 } 1142 }
1329 spin_unlock_irq(&zone->lru_lock); 1143 spin_unlock_irq(&zone->lru_lock);
1330 } 1144 }
@@ -1357,11 +1171,10 @@ static int too_many_isolated(struct zone *zone, int file,
1357} 1171}
1358 1172
1359static noinline_for_stack void 1173static noinline_for_stack void
1360putback_inactive_pages(struct mem_cgroup_zone *mz, 1174putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1361 struct list_head *page_list)
1362{ 1175{
1363 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1176 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1364 struct zone *zone = mz->zone; 1177 struct zone *zone = lruvec_zone(lruvec);
1365 LIST_HEAD(pages_to_free); 1178 LIST_HEAD(pages_to_free);
1366 1179
1367 /* 1180 /*
@@ -1379,9 +1192,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1379 spin_lock_irq(&zone->lru_lock); 1192 spin_lock_irq(&zone->lru_lock);
1380 continue; 1193 continue;
1381 } 1194 }
1195
1196 lruvec = mem_cgroup_page_lruvec(page, zone);
1197
1382 SetPageLRU(page); 1198 SetPageLRU(page);
1383 lru = page_lru(page); 1199 lru = page_lru(page);
1384 add_page_to_lru_list(zone, page, lru); 1200 add_page_to_lru_list(page, lruvec, lru);
1201
1385 if (is_active_lru(lru)) { 1202 if (is_active_lru(lru)) {
1386 int file = is_file_lru(lru); 1203 int file = is_file_lru(lru);
1387 int numpages = hpage_nr_pages(page); 1204 int numpages = hpage_nr_pages(page);
@@ -1390,7 +1207,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1390 if (put_page_testzero(page)) { 1207 if (put_page_testzero(page)) {
1391 __ClearPageLRU(page); 1208 __ClearPageLRU(page);
1392 __ClearPageActive(page); 1209 __ClearPageActive(page);
1393 del_page_from_lru_list(zone, page, lru); 1210 del_page_from_lru_list(page, lruvec, lru);
1394 1211
1395 if (unlikely(PageCompound(page))) { 1212 if (unlikely(PageCompound(page))) {
1396 spin_unlock_irq(&zone->lru_lock); 1213 spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1224,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1407 list_splice(&pages_to_free, page_list); 1224 list_splice(&pages_to_free, page_list);
1408} 1225}
1409 1226
1410static noinline_for_stack void
1411update_isolated_counts(struct mem_cgroup_zone *mz,
1412 struct list_head *page_list,
1413 unsigned long *nr_anon,
1414 unsigned long *nr_file)
1415{
1416 struct zone *zone = mz->zone;
1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1418 unsigned long nr_active = 0;
1419 struct page *page;
1420 int lru;
1421
1422 /*
1423 * Count pages and clear active flags
1424 */
1425 list_for_each_entry(page, page_list, lru) {
1426 int numpages = hpage_nr_pages(page);
1427 lru = page_lru_base_type(page);
1428 if (PageActive(page)) {
1429 lru += LRU_ACTIVE;
1430 ClearPageActive(page);
1431 nr_active += numpages;
1432 }
1433 count[lru] += numpages;
1434 }
1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1440 -count[LRU_ACTIVE_FILE]);
1441 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1442 -count[LRU_INACTIVE_FILE]);
1443 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1444 -count[LRU_ACTIVE_ANON]);
1445 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1446 -count[LRU_INACTIVE_ANON]);
1447
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450
1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1454}
1455
1456/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately where as lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/* 1227/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1228 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1229 * of reclaimed pages
1500 */ 1230 */
1501static noinline_for_stack unsigned long 1231static noinline_for_stack unsigned long
1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, 1232shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 struct scan_control *sc, int priority, int file) 1233 struct scan_control *sc, enum lru_list lru)
1504{ 1234{
1505 LIST_HEAD(page_list); 1235 LIST_HEAD(page_list);
1506 unsigned long nr_scanned; 1236 unsigned long nr_scanned;
1507 unsigned long nr_reclaimed = 0; 1237 unsigned long nr_reclaimed = 0;
1508 unsigned long nr_taken; 1238 unsigned long nr_taken;
1509 unsigned long nr_anon;
1510 unsigned long nr_file;
1511 unsigned long nr_dirty = 0; 1239 unsigned long nr_dirty = 0;
1512 unsigned long nr_writeback = 0; 1240 unsigned long nr_writeback = 0;
1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE; 1241 isolate_mode_t isolate_mode = 0;
1514 struct zone *zone = mz->zone; 1242 int file = is_file_lru(lru);
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1243 struct zone *zone = lruvec_zone(lruvec);
1244 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1516 1245
1517 while (unlikely(too_many_isolated(zone, file, sc))) { 1246 while (unlikely(too_many_isolated(zone, file, sc))) {
1518 congestion_wait(BLK_RW_ASYNC, HZ/10); 1247 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1251,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1251 return SWAP_CLUSTER_MAX;
1523 } 1252 }
1524 1253
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1254 lru_add_drain();
1530 1255
1531 if (!sc->may_unmap) 1256 if (!sc->may_unmap)
@@ -1535,47 +1260,43 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1535 1260
1536 spin_lock_irq(&zone->lru_lock); 1261 spin_lock_irq(&zone->lru_lock);
1537 1262
1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, 1263 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1539 sc, isolate_mode, 0, file); 1264 &nr_scanned, sc, isolate_mode, lru);
1265
1266 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1267 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1268
1540 if (global_reclaim(sc)) { 1269 if (global_reclaim(sc)) {
1541 zone->pages_scanned += nr_scanned; 1270 zone->pages_scanned += nr_scanned;
1542 if (current_is_kswapd()) 1271 if (current_is_kswapd())
1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1272 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1544 nr_scanned);
1545 else 1273 else
1546 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1274 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1547 nr_scanned);
1548 } 1275 }
1549 spin_unlock_irq(&zone->lru_lock); 1276 spin_unlock_irq(&zone->lru_lock);
1550 1277
1551 if (nr_taken == 0) 1278 if (nr_taken == 0)
1552 return 0; 1279 return 0;
1553 1280
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1281 nr_reclaimed = shrink_page_list(&page_list, zone, sc,
1555
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1282 &nr_dirty, &nr_writeback);
1558 1283
1559 /* Check if we should synchronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1284 spin_lock_irq(&zone->lru_lock);
1567 1285
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1286 reclaim_stat->recent_scanned[file] += nr_taken;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570 1287
1571 if (current_is_kswapd()) 1288 if (global_reclaim(sc)) {
1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1289 if (current_is_kswapd())
1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1290 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1291 nr_reclaimed);
1292 else
1293 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1294 nr_reclaimed);
1295 }
1574 1296
1575 putback_inactive_pages(mz, &page_list); 1297 putback_inactive_pages(lruvec, &page_list);
1576 1298
1577 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1299 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1578 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1579 1300
1580 spin_unlock_irq(&zone->lru_lock); 1301 spin_unlock_irq(&zone->lru_lock);
1581 1302
@@ -1604,14 +1325,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1604 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1325 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1605 * isolated page is PageWriteback 1326 * isolated page is PageWriteback
1606 */ 1327 */
1607 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) 1328 if (nr_writeback && nr_writeback >=
1329 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1608 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1330 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1609 1331
1610 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1332 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1611 zone_idx(zone), 1333 zone_idx(zone),
1612 nr_scanned, nr_reclaimed, 1334 nr_scanned, nr_reclaimed,
1613 priority, 1335 sc->priority,
1614 trace_shrink_flags(file, sc->reclaim_mode)); 1336 trace_shrink_flags(file));
1615 return nr_reclaimed; 1337 return nr_reclaimed;
1616} 1338}
1617 1339
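The end of shrink_inactive_list() above now derives its writeback throttle from sc->priority: once nr_writeback reaches nr_taken >> (DEF_PRIORITY - priority), the zone is treated as flooding and the task backs off in wait_iff_congested(). A worked example of how that threshold tightens as priority drops, assuming DEF_PRIORITY of 12 and a SWAP_CLUSTER_MAX-sized (32 page) isolation batch as in the comment block that precedes the check:

#include <stdio.h>

#define DEF_PRIORITY		12
#define SWAP_CLUSTER_MAX	32UL	/* typical isolation batch size */

int main(void)
{
	unsigned long nr_taken = SWAP_CLUSTER_MAX;

	for (int priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
		/* threshold in: nr_writeback >= nr_taken >> (DEF_PRIORITY - priority) */
		unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

		/* By DEF_PRIORITY-6 the threshold reaches 0: any page under
		 * writeback throttles, since the real check also requires
		 * nr_writeback to be non-zero. */
		printf("priority %2d: throttle once %lu of %lu isolated pages are under writeback\n",
		       priority, threshold, nr_taken);
	}
	return 0;
}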
@@ -1633,30 +1355,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1633 * But we had to alter page->flags anyway. 1355 * But we had to alter page->flags anyway.
1634 */ 1356 */
1635 1357
1636static void move_active_pages_to_lru(struct zone *zone, 1358static void move_active_pages_to_lru(struct lruvec *lruvec,
1637 struct list_head *list, 1359 struct list_head *list,
1638 struct list_head *pages_to_free, 1360 struct list_head *pages_to_free,
1639 enum lru_list lru) 1361 enum lru_list lru)
1640{ 1362{
1363 struct zone *zone = lruvec_zone(lruvec);
1641 unsigned long pgmoved = 0; 1364 unsigned long pgmoved = 0;
1642 struct page *page; 1365 struct page *page;
1366 int nr_pages;
1643 1367
1644 while (!list_empty(list)) { 1368 while (!list_empty(list)) {
1645 struct lruvec *lruvec;
1646
1647 page = lru_to_page(list); 1369 page = lru_to_page(list);
1370 lruvec = mem_cgroup_page_lruvec(page, zone);
1648 1371
1649 VM_BUG_ON(PageLRU(page)); 1372 VM_BUG_ON(PageLRU(page));
1650 SetPageLRU(page); 1373 SetPageLRU(page);
1651 1374
1652 lruvec = mem_cgroup_lru_add_list(zone, page, lru); 1375 nr_pages = hpage_nr_pages(page);
1376 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1653 list_move(&page->lru, &lruvec->lists[lru]); 1377 list_move(&page->lru, &lruvec->lists[lru]);
1654 pgmoved += hpage_nr_pages(page); 1378 pgmoved += nr_pages;
1655 1379
1656 if (put_page_testzero(page)) { 1380 if (put_page_testzero(page)) {
1657 __ClearPageLRU(page); 1381 __ClearPageLRU(page);
1658 __ClearPageActive(page); 1382 __ClearPageActive(page);
1659 del_page_from_lru_list(zone, page, lru); 1383 del_page_from_lru_list(page, lruvec, lru);
1660 1384
1661 if (unlikely(PageCompound(page))) { 1385 if (unlikely(PageCompound(page))) {
1662 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
@@ -1672,9 +1396,9 @@ static void move_active_pages_to_lru(struct zone *zone,
1672} 1396}
1673 1397
1674static void shrink_active_list(unsigned long nr_to_scan, 1398static void shrink_active_list(unsigned long nr_to_scan,
1675 struct mem_cgroup_zone *mz, 1399 struct lruvec *lruvec,
1676 struct scan_control *sc, 1400 struct scan_control *sc,
1677 int priority, int file) 1401 enum lru_list lru)
1678{ 1402{
1679 unsigned long nr_taken; 1403 unsigned long nr_taken;
1680 unsigned long nr_scanned; 1404 unsigned long nr_scanned;
@@ -1683,15 +1407,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1683 LIST_HEAD(l_active); 1407 LIST_HEAD(l_active);
1684 LIST_HEAD(l_inactive); 1408 LIST_HEAD(l_inactive);
1685 struct page *page; 1409 struct page *page;
1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1410 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1687 unsigned long nr_rotated = 0; 1411 unsigned long nr_rotated = 0;
1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE; 1412 isolate_mode_t isolate_mode = 0;
1689 struct zone *zone = mz->zone; 1413 int file = is_file_lru(lru);
1414 struct zone *zone = lruvec_zone(lruvec);
1690 1415
1691 lru_add_drain(); 1416 lru_add_drain();
1692 1417
1693 reset_reclaim_mode(sc);
1694
1695 if (!sc->may_unmap) 1418 if (!sc->may_unmap)
1696 isolate_mode |= ISOLATE_UNMAPPED; 1419 isolate_mode |= ISOLATE_UNMAPPED;
1697 if (!sc->may_writepage) 1420 if (!sc->may_writepage)
@@ -1699,18 +1422,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 1422
1700 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1701 1424
1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, 1425 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1703 isolate_mode, 1, file); 1426 &nr_scanned, sc, isolate_mode, lru);
1704 if (global_reclaim(sc)) 1427 if (global_reclaim(sc))
1705 zone->pages_scanned += nr_scanned; 1428 zone->pages_scanned += nr_scanned;
1706 1429
1707 reclaim_stat->recent_scanned[file] += nr_taken; 1430 reclaim_stat->recent_scanned[file] += nr_taken;
1708 1431
1709 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1432 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1710 if (file) 1433 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1711 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1712 else
1713 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1714 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1434 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1715 spin_unlock_irq(&zone->lru_lock); 1435 spin_unlock_irq(&zone->lru_lock);
1716 1436
@@ -1732,7 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1732 } 1452 }
1733 } 1453 }
1734 1454
1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1455 if (page_referenced(page, 0, sc->target_mem_cgroup,
1456 &vm_flags)) {
1736 nr_rotated += hpage_nr_pages(page); 1457 nr_rotated += hpage_nr_pages(page);
1737 /* 1458 /*
1738 * Identify referenced, file-backed active pages and 1459 * Identify referenced, file-backed active pages and
@@ -1765,10 +1486,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1765 */ 1486 */
1766 reclaim_stat->recent_rotated[file] += nr_rotated; 1487 reclaim_stat->recent_rotated[file] += nr_rotated;
1767 1488
1768 move_active_pages_to_lru(zone, &l_active, &l_hold, 1489 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1769 LRU_ACTIVE + file * LRU_FILE); 1490 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1770 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1771 LRU_BASE + file * LRU_FILE);
1772 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1491 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1773 spin_unlock_irq(&zone->lru_lock); 1492 spin_unlock_irq(&zone->lru_lock);
1774 1493
@@ -1791,13 +1510,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
1791 1510
1792/** 1511/**
1793 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1512 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1794 * @zone: zone to check 1513 * @lruvec: LRU vector to check
1795 * @sc: scan control of this context
1796 * 1514 *
1797 * Returns true if the zone does not have enough inactive anon pages, 1515 * Returns true if the zone does not have enough inactive anon pages,
1798 * meaning some active anon pages need to be deactivated. 1516 * meaning some active anon pages need to be deactivated.
1799 */ 1517 */
1800static int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1518static int inactive_anon_is_low(struct lruvec *lruvec)
1801{ 1519{
1802 /* 1520 /*
1803 * If we don't have swap space, anonymous page deactivation 1521 * If we don't have swap space, anonymous page deactivation
@@ -1806,14 +1524,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1806 if (!total_swap_pages) 1524 if (!total_swap_pages)
1807 return 0; 1525 return 0;
1808 1526
1809 if (!scanning_global_lru(mz)) 1527 if (!mem_cgroup_disabled())
1810 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1528 return mem_cgroup_inactive_anon_is_low(lruvec);
1811 mz->zone);
1812 1529
1813 return inactive_anon_is_low_global(mz->zone); 1530 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1814} 1531}
1815#else 1532#else
1816static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1533static inline int inactive_anon_is_low(struct lruvec *lruvec)
1817{ 1534{
1818 return 0; 1535 return 0;
1819} 1536}
@@ -1831,7 +1548,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1831 1548
1832/** 1549/**
1833 * inactive_file_is_low - check if file pages need to be deactivated 1550 * inactive_file_is_low - check if file pages need to be deactivated
1834 * @mz: memory cgroup and zone to check 1551 * @lruvec: LRU vector to check
1835 * 1552 *
1836 * When the system is doing streaming IO, memory pressure here 1553 * When the system is doing streaming IO, memory pressure here
1837 * ensures that active file pages get deactivated, until more 1554 * ensures that active file pages get deactivated, until more
@@ -1843,44 +1560,39 @@ static int inactive_file_is_low_global(struct zone *zone)
1843 * This uses a different ratio than the anonymous pages, because 1560 * This uses a different ratio than the anonymous pages, because
1844 * the page cache uses a use-once replacement algorithm. 1561 * the page cache uses a use-once replacement algorithm.
1845 */ 1562 */
1846static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1563static int inactive_file_is_low(struct lruvec *lruvec)
1847{ 1564{
1848 if (!scanning_global_lru(mz)) 1565 if (!mem_cgroup_disabled())
1849 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1566 return mem_cgroup_inactive_file_is_low(lruvec);
1850 mz->zone);
1851 1567
1852 return inactive_file_is_low_global(mz->zone); 1568 return inactive_file_is_low_global(lruvec_zone(lruvec));
1853} 1569}
1854 1570
1855static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) 1571static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1856{ 1572{
1857 if (file) 1573 if (is_file_lru(lru))
1858 return inactive_file_is_low(mz); 1574 return inactive_file_is_low(lruvec);
1859 else 1575 else
1860 return inactive_anon_is_low(mz); 1576 return inactive_anon_is_low(lruvec);
1861} 1577}
1862 1578
1863static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1579static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1864 struct mem_cgroup_zone *mz, 1580 struct lruvec *lruvec, struct scan_control *sc)
1865 struct scan_control *sc, int priority)
1866{ 1581{
1867 int file = is_file_lru(lru);
1868
1869 if (is_active_lru(lru)) { 1582 if (is_active_lru(lru)) {
1870 if (inactive_list_is_low(mz, file)) 1583 if (inactive_list_is_low(lruvec, lru))
1871 shrink_active_list(nr_to_scan, mz, sc, priority, file); 1584 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1872 return 0; 1585 return 0;
1873 } 1586 }
1874 1587
1875 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); 1588 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1876} 1589}
1877 1590
1878static int vmscan_swappiness(struct mem_cgroup_zone *mz, 1591static int vmscan_swappiness(struct scan_control *sc)
1879 struct scan_control *sc)
1880{ 1592{
1881 if (global_reclaim(sc)) 1593 if (global_reclaim(sc))
1882 return vm_swappiness; 1594 return vm_swappiness;
1883 return mem_cgroup_swappiness(mz->mem_cgroup); 1595 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1884} 1596}
1885 1597
1886/* 1598/*
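shrink_list(), now taking a lruvec, dispatches per list: an active LRU is only aged via shrink_active_list() when the matching inactive list has become too small, and contributes nothing to the reclaim count; inactive LRUs go straight to shrink_inactive_list(). A skeleton of that dispatch with the real shrinkers and the low-inactive test replaced by stubs (everything below is illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
		LRU_INACTIVE_FILE, LRU_ACTIVE_FILE };

static bool is_active_lru(enum lru_list lru)
{
	return lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE;
}

/* Stubs standing in for the real shrinkers and the ratio check. */
static bool inactive_list_is_low(enum lru_list lru) { (void)lru; return true; }
static void shrink_active_list(unsigned long nr)
{
	printf("age %lu active pages\n", nr);
}
static unsigned long shrink_inactive_list(unsigned long nr)
{
	printf("reclaim from %lu inactive pages\n", nr);
	return nr / 2;			/* pretend half were freed */
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan)
{
	if (is_active_lru(lru)) {
		if (inactive_list_is_low(lru))
			shrink_active_list(nr_to_scan);
		return 0;		/* aging an active list frees nothing itself */
	}
	return shrink_inactive_list(nr_to_scan);
}

int main(void)
{
	printf("freed %lu\n", shrink_list(LRU_ACTIVE_FILE, 32));
	printf("freed %lu\n", shrink_list(LRU_INACTIVE_FILE, 32));
	return 0;
}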
@@ -1889,19 +1601,21 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1889 * by looking at the fraction of the pages scanned we did rotate back 1601 * by looking at the fraction of the pages scanned we did rotate back
1890 * onto the active list instead of evict. 1602 * onto the active list instead of evict.
1891 * 1603 *
1892 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1604 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1605 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1893 */ 1606 */
1894static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, 1607static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1895 unsigned long *nr, int priority) 1608 unsigned long *nr)
1896{ 1609{
1897 unsigned long anon, file, free; 1610 unsigned long anon, file, free;
1898 unsigned long anon_prio, file_prio; 1611 unsigned long anon_prio, file_prio;
1899 unsigned long ap, fp; 1612 unsigned long ap, fp;
1900 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1613 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1901 u64 fraction[2], denominator; 1614 u64 fraction[2], denominator;
1902 enum lru_list lru; 1615 enum lru_list lru;
1903 int noswap = 0; 1616 int noswap = 0;
1904 bool force_scan = false; 1617 bool force_scan = false;
1618 struct zone *zone = lruvec_zone(lruvec);
1905 1619
1906 /* 1620 /*
1907 * If the zone or memcg is small, nr[l] can be 0. This 1621 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1913,7 +1627,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1913 * latencies, so it's better to scan a minimum amount there as 1627 * latencies, so it's better to scan a minimum amount there as
1914 * well. 1628 * well.
1915 */ 1629 */
1916 if (current_is_kswapd() && mz->zone->all_unreclaimable) 1630 if (current_is_kswapd() && zone->all_unreclaimable)
1917 force_scan = true; 1631 force_scan = true;
1918 if (!global_reclaim(sc)) 1632 if (!global_reclaim(sc))
1919 force_scan = true; 1633 force_scan = true;
@@ -1927,16 +1641,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1927 goto out; 1641 goto out;
1928 } 1642 }
1929 1643
1930 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + 1644 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1931 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1645 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1932 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + 1646 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1933 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1647 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1934 1648
1935 if (global_reclaim(sc)) { 1649 if (global_reclaim(sc)) {
1936 free = zone_page_state(mz->zone, NR_FREE_PAGES); 1650 free = zone_page_state(zone, NR_FREE_PAGES);
1937 /* If we have very few page cache pages, 1651 /* If we have very few page cache pages,
1938 force-scan anon pages. */ 1652 force-scan anon pages. */
1939 if (unlikely(file + free <= high_wmark_pages(mz->zone))) { 1653 if (unlikely(file + free <= high_wmark_pages(zone))) {
1940 fraction[0] = 1; 1654 fraction[0] = 1;
1941 fraction[1] = 0; 1655 fraction[1] = 0;
1942 denominator = 1; 1656 denominator = 1;
@@ -1948,8 +1662,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1948 * With swappiness at 100, anonymous and file have the same priority. 1662 * With swappiness at 100, anonymous and file have the same priority.
1949 * This scanning priority is essentially the inverse of IO cost. 1663 * This scanning priority is essentially the inverse of IO cost.
1950 */ 1664 */
1951 anon_prio = vmscan_swappiness(mz, sc); 1665 anon_prio = vmscan_swappiness(sc);
1952 file_prio = 200 - vmscan_swappiness(mz, sc); 1666 file_prio = 200 - anon_prio;
1953 1667
1954 /* 1668 /*
1955 * OK, so we have swap space and a fair amount of page cache 1669 * OK, so we have swap space and a fair amount of page cache
@@ -1962,7 +1676,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1962 * 1676 *
1963 * anon in [0], file in [1] 1677 * anon in [0], file in [1]
1964 */ 1678 */
1965 spin_lock_irq(&mz->zone->lru_lock); 1679 spin_lock_irq(&zone->lru_lock);
1966 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1680 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1967 reclaim_stat->recent_scanned[0] /= 2; 1681 reclaim_stat->recent_scanned[0] /= 2;
1968 reclaim_stat->recent_rotated[0] /= 2; 1682 reclaim_stat->recent_rotated[0] /= 2;
@@ -1978,12 +1692,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1978 * proportional to the fraction of recently scanned pages on 1692 * proportional to the fraction of recently scanned pages on
1979 * each list that were recently referenced and in active use. 1693 * each list that were recently referenced and in active use.
1980 */ 1694 */
1981 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1695 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1982 ap /= reclaim_stat->recent_rotated[0] + 1; 1696 ap /= reclaim_stat->recent_rotated[0] + 1;
1983 1697
1984 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1698 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1985 fp /= reclaim_stat->recent_rotated[1] + 1; 1699 fp /= reclaim_stat->recent_rotated[1] + 1;
1986 spin_unlock_irq(&mz->zone->lru_lock); 1700 spin_unlock_irq(&zone->lru_lock);
1987 1701
1988 fraction[0] = ap; 1702 fraction[0] = ap;
1989 fraction[1] = fp; 1703 fraction[1] = fp;
@@ -1993,9 +1707,9 @@ out:
1993 int file = is_file_lru(lru); 1707 int file = is_file_lru(lru);
1994 unsigned long scan; 1708 unsigned long scan;
1995 1709
1996 scan = zone_nr_lru_pages(mz, lru); 1710 scan = get_lru_size(lruvec, lru);
1997 if (priority || noswap) { 1711 if (sc->priority || noswap || !vmscan_swappiness(sc)) {
1998 scan >>= priority; 1712 scan >>= sc->priority;
1999 if (!scan && force_scan) 1713 if (!scan && force_scan)
2000 scan = SWAP_CLUSTER_MAX; 1714 scan = SWAP_CLUSTER_MAX;
2001 scan = div64_u64(scan * fraction[file], denominator); 1715 scan = div64_u64(scan * fraction[file], denominator);
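get_scan_count() above splits each pass between the anon and file lists: anon_prio is the swappiness, file_prio is 200 minus it, both are weighted by each list's recent_scanned/recent_rotated ratio, and the per-list target becomes (lru_size >> sc->priority) * fraction / denominator, where denominator is ap + fp + 1 just past the lines shown. A self-contained sketch of that arithmetic; the LRU sizes and rotation counters are invented, and swappiness is taken at its usual default of 60:

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;			/* vm_swappiness default */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;

	/* Pretend reclaim_stat counters: recent_scanned / recent_rotated. */
	unsigned long anon_scanned = 1000, anon_rotated = 400;
	unsigned long file_scanned = 5000, file_rotated = 250;

	/* ap/fp as computed by get_scan_count() after this patch. */
	unsigned long long ap = (unsigned long long)anon_prio * (anon_scanned + 1)
				/ (anon_rotated + 1);
	unsigned long long fp = (unsigned long long)file_prio * (file_scanned + 1)
				/ (file_rotated + 1);
	unsigned long long denominator = ap + fp + 1;

	unsigned long lru_size[2] = { 200000, 800000 };	/* [0]=anon, [1]=file */
	unsigned long long fraction[2] = { ap, fp };
	int priority = 12;				/* DEF_PRIORITY */

	for (int file = 0; file < 2; file++) {
		unsigned long long scan = lru_size[file] >> priority;

		scan = scan * fraction[file] / denominator;
		printf("%s: scan %llu pages this pass\n",
		       file ? "file" : "anon", scan);
	}
	return 0;
}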
@@ -2004,14 +1718,25 @@ out:
2004 } 1718 }
2005} 1719}
2006 1720
1721/* Use reclaim/compaction for costly allocs or under memory pressure */
1722static bool in_reclaim_compaction(struct scan_control *sc)
1723{
1724 if (COMPACTION_BUILD && sc->order &&
1725 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1726 sc->priority < DEF_PRIORITY - 2))
1727 return true;
1728
1729 return false;
1730}
1731
2007/* 1732/*
2008 * Reclaim/compaction depends on a number of pages being freed. To avoid 1733 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2009 * disruption to the system, a small number of order-0 pages continue to be 1734 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2010 * rotated and reclaimed in the normal fashion. However, by the time we get 1735 * true if more pages should be reclaimed such that when the page allocator
2011 * back to the allocator and call try_to_compact_zone(), we ensure that 1736 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2012 * there are enough free pages for it to be likely successful 1737 * It will give up earlier than that if there is difficulty reclaiming pages.
2013 */ 1738 */
2014static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1739static inline bool should_continue_reclaim(struct lruvec *lruvec,
2015 unsigned long nr_reclaimed, 1740 unsigned long nr_reclaimed,
2016 unsigned long nr_scanned, 1741 unsigned long nr_scanned,
2017 struct scan_control *sc) 1742 struct scan_control *sc)
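in_reclaim_compaction() above is the direct replacement for the RECLAIM_MODE_COMPACTION flag: reclaim/compaction applies only when compaction is compiled in, the request has a non-zero order, and either that order is costly (above PAGE_ALLOC_COSTLY_ORDER) or priority has dropped below DEF_PRIORITY - 2. The same predicate restated as a standalone program; COMPACTION_BUILD is reduced to a constant standing in for CONFIG_COMPACTION=y:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY		12
#define PAGE_ALLOC_COSTLY_ORDER	3
#define COMPACTION_BUILD	1	/* stand-in for CONFIG_COMPACTION=y */

/* Mirrors the predicate introduced by this patch. */
static bool in_reclaim_compaction(int order, int priority)
{
	return COMPACTION_BUILD && order &&
	       (order > PAGE_ALLOC_COSTLY_ORDER ||
		priority < DEF_PRIORITY - 2);
}

int main(void)
{
	/* order-0: never; order-2 at full priority: no; order-2 once priority
	 * falls to 9: yes; order-4 (costly): always. */
	printf("%d %d %d %d\n",
	       in_reclaim_compaction(0, DEF_PRIORITY),
	       in_reclaim_compaction(2, DEF_PRIORITY),
	       in_reclaim_compaction(2, 9),
	       in_reclaim_compaction(4, DEF_PRIORITY));
	return 0;
}

Output is 0 0 1 1, the same order/priority tests that set_reclaim_mode() used to apply, minus the lumpy fallback.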
@@ -2020,7 +1745,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2020 unsigned long inactive_lru_pages; 1745 unsigned long inactive_lru_pages;
2021 1746
2022 /* If not in reclaim/compaction mode, stop */ 1747 /* If not in reclaim/compaction mode, stop */
2023 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1748 if (!in_reclaim_compaction(sc))
2024 return false; 1749 return false;
2025 1750
2026 /* Consider stopping depending on scan and reclaim activity */ 1751 /* Consider stopping depending on scan and reclaim activity */
@@ -2051,15 +1776,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2051 * inactive lists are large enough, continue reclaiming 1776 * inactive lists are large enough, continue reclaiming
2052 */ 1777 */
2053 pages_for_compaction = (2UL << sc->order); 1778 pages_for_compaction = (2UL << sc->order);
2054 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1779 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
2055 if (nr_swap_pages > 0) 1780 if (nr_swap_pages > 0)
2056 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1781 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
2057 if (sc->nr_reclaimed < pages_for_compaction && 1782 if (sc->nr_reclaimed < pages_for_compaction &&
2058 inactive_lru_pages > pages_for_compaction) 1783 inactive_lru_pages > pages_for_compaction)
2059 return true; 1784 return true;
2060 1785
2061 /* If compaction would go ahead or the allocation would succeed, stop */ 1786 /* If compaction would go ahead or the allocation would succeed, stop */
2062 switch (compaction_suitable(mz->zone, sc->order)) { 1787 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
2063 case COMPACT_PARTIAL: 1788 case COMPACT_PARTIAL:
2064 case COMPACT_CONTINUE: 1789 case COMPACT_CONTINUE:
2065 return false; 1790 return false;
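should_continue_reclaim() above keeps the shrink_lruvec() loop going for a high-order request until either enough pages (2 << order) have been reclaimed or the inactive lists are no longer larger than that target, at which point compaction_suitable() gets the final word. A small model of the page-count half of that test; whether inactive anon counts depends on swap being available, and the sizes used here are invented:

#include <stdbool.h>
#include <stdio.h>

/*
 * Models only the "enough pages for compaction yet?" part of
 * should_continue_reclaim(); the compaction_suitable() call is left out.
 */
static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_file,
			    unsigned long inactive_anon, bool have_swap)
{
	unsigned long pages_for_compaction = 2UL << order;
	unsigned long inactive = inactive_file + (have_swap ? inactive_anon : 0);

	return nr_reclaimed < pages_for_compaction &&
	       inactive > pages_for_compaction;
}

int main(void)
{
	/* order-3 request needs 16 pages; only 5 reclaimed, lists still big. */
	printf("continue: %d\n", keep_reclaiming(3, 5, 4096, 1024, true));
	/* 20 >= 16 already reclaimed, so stop and let compaction have a go. */
	printf("continue: %d\n", keep_reclaiming(3, 20, 4096, 1024, true));
	return 0;
}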
@@ -2071,8 +1796,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2071/* 1796/*
2072 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1797 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2073 */ 1798 */
2074static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, 1799static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2075 struct scan_control *sc)
2076{ 1800{
2077 unsigned long nr[NR_LRU_LISTS]; 1801 unsigned long nr[NR_LRU_LISTS];
2078 unsigned long nr_to_scan; 1802 unsigned long nr_to_scan;
@@ -2084,7 +1808,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2084restart: 1808restart:
2085 nr_reclaimed = 0; 1809 nr_reclaimed = 0;
2086 nr_scanned = sc->nr_scanned; 1810 nr_scanned = sc->nr_scanned;
2087 get_scan_count(mz, sc, nr, priority); 1811 get_scan_count(lruvec, sc, nr);
2088 1812
2089 blk_start_plug(&plug); 1813 blk_start_plug(&plug);
2090 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2096,7 +1820,7 @@ restart:
2096 nr[lru] -= nr_to_scan; 1820 nr[lru] -= nr_to_scan;
2097 1821
2098 nr_reclaimed += shrink_list(lru, nr_to_scan, 1822 nr_reclaimed += shrink_list(lru, nr_to_scan,
2099 mz, sc, priority); 1823 lruvec, sc);
2100 } 1824 }
2101 } 1825 }
2102 /* 1826 /*
@@ -2107,12 +1831,8 @@ restart:
2107 * with multiple processes reclaiming pages, the total 1831 * with multiple processes reclaiming pages, the total
2108 * freeing target can get unreasonably large. 1832 * freeing target can get unreasonably large.
2109 */ 1833 */
2110 if (nr_reclaimed >= nr_to_reclaim) 1834 if (nr_reclaimed >= nr_to_reclaim &&
2111 nr_to_reclaim = 0; 1835 sc->priority < DEF_PRIORITY)
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 1836 break;
2117 } 1837 }
2118 blk_finish_plug(&plug); 1838 blk_finish_plug(&plug);
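
The rewritten break test in the scan loop above only aborts a pass early when both conditions hold: the reclaim target has been met and the scan is running below DEF_PRIORITY, so a first full-priority pass always finishes its proportional share of scanning even with many reclaimers active. A hedged sketch of just that predicate (DEF_PRIORITY mirrors the kernel value, the rest is illustrative):

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

/* The pass is only cut short once the target is met AND we are below
 * DEF_PRIORITY; a full-priority pass always completes. */
static bool abort_pass(unsigned long nr_reclaimed,
                       unsigned long nr_to_reclaim, int priority)
{
        return nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY;
}

int main(void)
{
        printf("%d\n", abort_pass(64, 32, DEF_PRIORITY));     /* 0: finish pass */
        printf("%d\n", abort_pass(64, 32, DEF_PRIORITY - 3)); /* 1: stop early */
        return 0;
}
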
@@ -2122,35 +1842,33 @@ restart:
2122 * Even if we did not try to evict anon pages at all, we want to 1842 * Even if we did not try to evict anon pages at all, we want to
2123 * rebalance the anon lru active/inactive ratio. 1843 * rebalance the anon lru active/inactive ratio.
2124 */ 1844 */
2125 if (inactive_anon_is_low(mz)) 1845 if (inactive_anon_is_low(lruvec))
2126 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); 1846 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1847 sc, LRU_ACTIVE_ANON);
2127 1848
2128 /* reclaim/compaction might need reclaim to continue */ 1849 /* reclaim/compaction might need reclaim to continue */
2129 if (should_continue_reclaim(mz, nr_reclaimed, 1850 if (should_continue_reclaim(lruvec, nr_reclaimed,
2130 sc->nr_scanned - nr_scanned, sc)) 1851 sc->nr_scanned - nr_scanned, sc))
2131 goto restart; 1852 goto restart;
2132 1853
2133 throttle_vm_writeout(sc->gfp_mask); 1854 throttle_vm_writeout(sc->gfp_mask);
2134} 1855}
2135 1856
2136static void shrink_zone(int priority, struct zone *zone, 1857static void shrink_zone(struct zone *zone, struct scan_control *sc)
2137 struct scan_control *sc)
2138{ 1858{
2139 struct mem_cgroup *root = sc->target_mem_cgroup; 1859 struct mem_cgroup *root = sc->target_mem_cgroup;
2140 struct mem_cgroup_reclaim_cookie reclaim = { 1860 struct mem_cgroup_reclaim_cookie reclaim = {
2141 .zone = zone, 1861 .zone = zone,
2142 .priority = priority, 1862 .priority = sc->priority,
2143 }; 1863 };
2144 struct mem_cgroup *memcg; 1864 struct mem_cgroup *memcg;
2145 1865
2146 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1866 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2147 do { 1867 do {
2148 struct mem_cgroup_zone mz = { 1868 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2149 .mem_cgroup = memcg, 1869
2150 .zone = zone, 1870 shrink_lruvec(lruvec, sc);
2151 };
2152 1871
2153 shrink_mem_cgroup_zone(priority, &mz, sc);
2154 /* 1872 /*
2155 * Limit reclaim has historically picked one memcg and 1873 * Limit reclaim has historically picked one memcg and
2156 * scanned it with decreasing priority levels until 1874 * scanned it with decreasing priority levels until
@@ -2226,8 +1944,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2226 * the caller that it should consider retrying the allocation instead of 1944 * the caller that it should consider retrying the allocation instead of
2227 * further reclaim. 1945 * further reclaim.
2228 */ 1946 */
2229static bool shrink_zones(int priority, struct zonelist *zonelist, 1947static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2230 struct scan_control *sc)
2231{ 1948{
2232 struct zoneref *z; 1949 struct zoneref *z;
2233 struct zone *zone; 1950 struct zone *zone;
@@ -2254,7 +1971,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2254 if (global_reclaim(sc)) { 1971 if (global_reclaim(sc)) {
2255 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1972 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2256 continue; 1973 continue;
2257 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1974 if (zone->all_unreclaimable &&
1975 sc->priority != DEF_PRIORITY)
2258 continue; /* Let kswapd poll it */ 1976 continue; /* Let kswapd poll it */
2259 if (COMPACTION_BUILD) { 1977 if (COMPACTION_BUILD) {
2260 /* 1978 /*
@@ -2286,7 +2004,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2286 /* need some check for avoid more shrink_zone() */ 2004 /* need some check for avoid more shrink_zone() */
2287 } 2005 }
2288 2006
2289 shrink_zone(priority, zone, sc); 2007 shrink_zone(zone, sc);
2290 } 2008 }
2291 2009
2292 return aborted_reclaim; 2010 return aborted_reclaim;
@@ -2337,7 +2055,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 struct scan_control *sc, 2055 struct scan_control *sc,
2338 struct shrink_control *shrink) 2056 struct shrink_control *shrink)
2339{ 2057{
2340 int priority;
2341 unsigned long total_scanned = 0; 2058 unsigned long total_scanned = 0;
2342 struct reclaim_state *reclaim_state = current->reclaim_state; 2059 struct reclaim_state *reclaim_state = current->reclaim_state;
2343 struct zoneref *z; 2060 struct zoneref *z;
@@ -2350,11 +2067,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2350 if (global_reclaim(sc)) 2067 if (global_reclaim(sc))
2351 count_vm_event(ALLOCSTALL); 2068 count_vm_event(ALLOCSTALL);
2352 2069
2353 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2070 do {
2354 sc->nr_scanned = 0; 2071 sc->nr_scanned = 0;
2355 if (!priority) 2072 aborted_reclaim = shrink_zones(zonelist, sc);
2356 disable_swap_token(sc->target_mem_cgroup);
2357 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2358 2073
2359 /* 2074 /*
2360 * Don't shrink slabs when reclaiming memory from 2075 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2111,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2396 2111
2397 /* Take a nap, wait for some writeback to complete */ 2112 /* Take a nap, wait for some writeback to complete */
2398 if (!sc->hibernation_mode && sc->nr_scanned && 2113 if (!sc->hibernation_mode && sc->nr_scanned &&
2399 priority < DEF_PRIORITY - 2) { 2114 sc->priority < DEF_PRIORITY - 2) {
2400 struct zone *preferred_zone; 2115 struct zone *preferred_zone;
2401 2116
2402 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2117 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2119,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2404 &preferred_zone); 2119 &preferred_zone);
2405 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2120 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2406 } 2121 }
2407 } 2122 } while (--sc->priority >= 0);
2408 2123
2409out: 2124out:
2410 delayacct_freepages_end(); 2125 delayacct_freepages_end();
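
With priority folded into struct scan_control, the driver loop above becomes a plain do/while that counts sc->priority down from DEF_PRIORITY to 0; each pass scans roughly lru_size >> priority pages, so pressure ramps up as the count falls. A standalone sketch of the loop shape (the scan_control here is a stub, not the kernel structure):

#include <stdio.h>

#define DEF_PRIORITY 12

struct scan_control {            /* stub with just the fields used here */
        int priority;
        unsigned long nr_scanned;
};

int main(void)
{
        struct scan_control sc = { .priority = DEF_PRIORITY };

        do {
                sc.nr_scanned = 0;
                /* shrink_zones(zonelist, &sc) would run here */
                printf("priority %2d scans ~1/%lu of each LRU\n",
                       sc.priority, 1UL << sc.priority);
        } while (--sc.priority >= 0);

        return 0;
}
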
@@ -2431,6 +2146,83 @@ out:
2431 return 0; 2146 return 0;
2432} 2147}
2433 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
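
pfmemalloc_watermark_ok() sums the min watermarks of the zones up to ZONE_NORMAL and reports trouble once free pages fall below half of that combined reserve (free_pages > pfmemalloc_reserve / 2 is the healthy case). A userspace sketch of the arithmetic with invented zone sizes:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the reserve check only; the zone values are made up and the
 * kswapd wakeup side effect is omitted. */
static bool watermark_ok(const unsigned long *min_wmark,
                         const unsigned long *free_pages, int nr_zones)
{
        unsigned long reserve = 0, free = 0;
        int i;

        for (i = 0; i < nr_zones; i++) {
                reserve += min_wmark[i];
                free += free_pages[i];
        }

        return free > reserve / 2;
}

int main(void)
{
        unsigned long min_wmark[] = { 128, 4096 };   /* DMA, NORMAL */
        unsigned long healthy[]   = { 500, 9000 };
        unsigned long depleted[]  = {  10, 1500 };

        /* reserve = 4224 pages, so the threshold is 2112 free pages */
        printf("%d %d\n", watermark_ok(min_wmark, healthy, 2),
                          watermark_ok(min_wmark, depleted, 2));
        return 0;
}
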
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
2192 * committing a transaction where throttling it could forcing other
2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
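
throttle_direct_reclaim() picks one of three behaviours: kernel threads (and callers on a node whose reserves are still fine) are never throttled, !__GFP_FS callers get a bounded one-second wait because they may hold filesystem locks kswapd itself needs, and everyone else sleeps killably until kswapd refills the reserves. A compact sketch of just that decision (the flag values are illustrative, not the kernel constants):

#include <stdbool.h>
#include <stdio.h>

#define PF_KTHREAD 0x1u   /* illustrative values, not the kernel constants */
#define __GFP_FS   0x2u

enum throttle { THROTTLE_NONE, THROTTLE_TIMEOUT, THROTTLE_UNTIL_KSWAPD };

/* Mirrors the policy, not the implementation: no waitqueues here. */
static enum throttle throttle_policy(unsigned int task_flags,
                                     unsigned int gfp_mask, bool wmark_ok)
{
        if (task_flags & PF_KTHREAD)
                return THROTTLE_NONE;       /* may be cleaning pages for us */
        if (wmark_ok)
                return THROTTLE_NONE;       /* reserves are still healthy */
        if (!(gfp_mask & __GFP_FS))
                return THROTTLE_TIMEOUT;    /* bounded wait, may hold fs locks */
        return THROTTLE_UNTIL_KSWAPD;       /* sleep until kswapd makes room */
}

int main(void)
{
        printf("%d %d %d\n",
               throttle_policy(PF_KTHREAD, 0, false),   /* 0 */
               throttle_policy(0, 0, false),            /* 1 */
               throttle_policy(0, __GFP_FS, false));    /* 2 */
        return 0;
}
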
2434unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2435 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2436{ 2228{
@@ -2442,6 +2234,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2442 .may_unmap = 1, 2234 .may_unmap = 1,
2443 .may_swap = 1, 2235 .may_swap = 1,
2444 .order = order, 2236 .order = order,
2237 .priority = DEF_PRIORITY,
2445 .target_mem_cgroup = NULL, 2238 .target_mem_cgroup = NULL,
2446 .nodemask = nodemask, 2239 .nodemask = nodemask,
2447 }; 2240 };
@@ -2449,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2449 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2450 }; 2243 };
2451 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2452 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2453 sc.may_writepage, 2255 sc.may_writepage,
2454 gfp_mask); 2256 gfp_mask);
@@ -2460,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2460 return nr_reclaimed; 2262 return nr_reclaimed;
2461} 2263}
2462 2264
2463#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2464 2266
2465unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2466 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2474,17 +2276,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2474 .may_unmap = 1, 2276 .may_unmap = 1,
2475 .may_swap = !noswap, 2277 .may_swap = !noswap,
2476 .order = 0, 2278 .order = 0,
2279 .priority = 0,
2477 .target_mem_cgroup = memcg, 2280 .target_mem_cgroup = memcg,
2478 }; 2281 };
2479 struct mem_cgroup_zone mz = { 2282 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2480 .mem_cgroup = memcg,
2481 .zone = zone,
2482 };
2483 2283
2484 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2284 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2485 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2285 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2486 2286
2487 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2287 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2488 sc.may_writepage, 2288 sc.may_writepage,
2489 sc.gfp_mask); 2289 sc.gfp_mask);
2490 2290
@@ -2495,7 +2295,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2495 * will pick up pages from other mem cgroup's as well. We hack 2295 * will pick up pages from other mem cgroup's as well. We hack
2496 * the priority and make it zero. 2296 * the priority and make it zero.
2497 */ 2297 */
2498 shrink_mem_cgroup_zone(0, &mz, &sc); 2298 shrink_lruvec(lruvec, &sc);
2499 2299
2500 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2300 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2501 2301
@@ -2516,6 +2316,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2516 .may_swap = !noswap, 2316 .may_swap = !noswap,
2517 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2317 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2518 .order = 0, 2318 .order = 0,
2319 .priority = DEF_PRIORITY,
2519 .target_mem_cgroup = memcg, 2320 .target_mem_cgroup = memcg,
2520 .nodemask = NULL, /* we don't care the placement */ 2321 .nodemask = NULL, /* we don't care the placement */
2521 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2322 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2347,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2546} 2347}
2547#endif 2348#endif
2548 2349
2549static void age_active_anon(struct zone *zone, struct scan_control *sc, 2350static void age_active_anon(struct zone *zone, struct scan_control *sc)
2550 int priority)
2551{ 2351{
2552 struct mem_cgroup *memcg; 2352 struct mem_cgroup *memcg;
2553 2353
@@ -2556,14 +2356,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
2556 2356
2557 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2357 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2558 do { 2358 do {
2559 struct mem_cgroup_zone mz = { 2359 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2560 .mem_cgroup = memcg,
2561 .zone = zone,
2562 };
2563 2360
2564 if (inactive_anon_is_low(&mz)) 2361 if (inactive_anon_is_low(lruvec))
2565 shrink_active_list(SWAP_CLUSTER_MAX, &mz, 2362 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2566 sc, priority, 0); 2363 sc, LRU_ACTIVE_ANON);
2567 2364
2568 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2365 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2569 } while (memcg); 2366 } while (memcg);
@@ -2598,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2598 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2599} 2396}
2600 2397
2601/* is kswapd sleeping prematurely? */ 2398/*
2602static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2603 int classzone_idx) 2405 int classzone_idx)
2604{ 2406{
2605 int i; 2407 int i;
@@ -2608,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2608 2410
2609 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2610 if (remaining) 2412 if (remaining)
2611 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
 2418 * processes get throttled, kswapd wakes, a large process exits thereby
2419 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2612 2428
2613 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2614 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2641,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2641 * must be balanced 2457 * must be balanced
2642 */ 2458 */
2643 if (order) 2459 if (order)
2644 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2645 else 2461 else
2646 return !all_zones_ok; 2462 return all_zones_ok;
2647} 2463}
2648 2464
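
The rename from sleeping_prematurely() to prepare_kswapd_sleep() also inverts the return convention: callers used to negate "is this sleep premature?" and now ask "is kswapd ready to sleep?" directly, with the new pfmemalloc waitqueue check forcing an early "no" so throttled reclaimers get woken first. A small sketch of the inverted predicate (plain booleans stand in for the pgdat state):

#include <stdbool.h>
#include <stdio.h>

/* Answers "ready to sleep?" directly instead of "was the sleep premature?". */
static bool prepare_sleep(bool woken_recently, bool reclaimers_throttled,
                          bool zones_balanced)
{
        if (woken_recently)
                return false;           /* a direct reclaimer needs us awake */
        if (reclaimers_throttled)
                return false;           /* wake pfmemalloc waiters first */
        return zones_balanced;          /* sleep only once watermarks are met */
}

int main(void)
{
        printf("%d\n", prepare_sleep(false, false, true));  /* 1: sleep */
        printf("%d\n", prepare_sleep(false, true, true));   /* 0: waiters pending */
        return 0;
}
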
2649/* 2465/*
@@ -2672,7 +2488,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2672{ 2488{
2673 int all_zones_ok; 2489 int all_zones_ok;
2674 unsigned long balanced; 2490 unsigned long balanced;
2675 int priority;
2676 int i; 2491 int i;
2677 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2492 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2678 unsigned long total_scanned; 2493 unsigned long total_scanned;
@@ -2696,18 +2511,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2696 }; 2511 };
2697loop_again: 2512loop_again:
2698 total_scanned = 0; 2513 total_scanned = 0;
2514 sc.priority = DEF_PRIORITY;
2699 sc.nr_reclaimed = 0; 2515 sc.nr_reclaimed = 0;
2700 sc.may_writepage = !laptop_mode; 2516 sc.may_writepage = !laptop_mode;
2701 count_vm_event(PAGEOUTRUN); 2517 count_vm_event(PAGEOUTRUN);
2702 2518
2703 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2519 do {
2704 unsigned long lru_pages = 0; 2520 unsigned long lru_pages = 0;
2705 int has_under_min_watermark_zone = 0; 2521 int has_under_min_watermark_zone = 0;
2706 2522
2707 /* The swap token gets in the way of swapout... */
2708 if (!priority)
2709 disable_swap_token(NULL);
2710
2711 all_zones_ok = 1; 2523 all_zones_ok = 1;
2712 balanced = 0; 2524 balanced = 0;
2713 2525
@@ -2721,14 +2533,15 @@ loop_again:
2721 if (!populated_zone(zone)) 2533 if (!populated_zone(zone))
2722 continue; 2534 continue;
2723 2535
2724 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2536 if (zone->all_unreclaimable &&
2537 sc.priority != DEF_PRIORITY)
2725 continue; 2538 continue;
2726 2539
2727 /* 2540 /*
2728 * Do some background aging of the anon list, to give 2541 * Do some background aging of the anon list, to give
2729 * pages a chance to be referenced before reclaiming. 2542 * pages a chance to be referenced before reclaiming.
2730 */ 2543 */
2731 age_active_anon(zone, &sc, priority); 2544 age_active_anon(zone, &sc);
2732 2545
2733 /* 2546 /*
2734 * If the number of buffer_heads in the machine 2547 * If the number of buffer_heads in the machine
@@ -2776,7 +2589,8 @@ loop_again:
2776 if (!populated_zone(zone)) 2589 if (!populated_zone(zone))
2777 continue; 2590 continue;
2778 2591
2779 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2592 if (zone->all_unreclaimable &&
2593 sc.priority != DEF_PRIORITY)
2780 continue; 2594 continue;
2781 2595
2782 sc.nr_scanned = 0; 2596 sc.nr_scanned = 0;
@@ -2820,7 +2634,7 @@ loop_again:
2820 !zone_watermark_ok_safe(zone, testorder, 2634 !zone_watermark_ok_safe(zone, testorder,
2821 high_wmark_pages(zone) + balance_gap, 2635 high_wmark_pages(zone) + balance_gap,
2822 end_zone, 0)) { 2636 end_zone, 0)) {
2823 shrink_zone(priority, zone, &sc); 2637 shrink_zone(zone, &sc);
2824 2638
2825 reclaim_state->reclaimed_slab = 0; 2639 reclaim_state->reclaimed_slab = 0;
2826 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2640 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2863,7 +2677,7 @@ loop_again:
2863 * consider it to be no longer congested. It's 2677 * consider it to be no longer congested. It's
2864 * possible there are dirty pages backed by 2678 * possible there are dirty pages backed by
2865 * congested BDIs but as pressure is relieved, 2679 * congested BDIs but as pressure is relieved,
2866 * spectulatively avoid congestion waits 2680 * speculatively avoid congestion waits
2867 */ 2681 */
2868 zone_clear_flag(zone, ZONE_CONGESTED); 2682 zone_clear_flag(zone, ZONE_CONGESTED);
2869 if (i <= *classzone_idx) 2683 if (i <= *classzone_idx)
@@ -2871,13 +2685,23 @@ loop_again:
2871 } 2685 }
2872 2686
2873 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2691 * to be throttled on pfmemalloc_wait as they should not be
2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2874 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2875 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2876 /* 2700 /*
2877 * OK, kswapd is getting into trouble. Take a nap, then take 2701 * OK, kswapd is getting into trouble. Take a nap, then take
2878 * another pass across the zones. 2702 * another pass across the zones.
2879 */ 2703 */
2880 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2704 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2881 if (has_under_min_watermark_zone) 2705 if (has_under_min_watermark_zone)
2882 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2706 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2883 else 2707 else
@@ -2892,7 +2716,7 @@ loop_again:
2892 */ 2716 */
2893 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2717 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2894 break; 2718 break;
2895 } 2719 } while (--sc.priority >= 0);
2896out: 2720out:
2897 2721
2898 /* 2722 /*
@@ -2942,7 +2766,8 @@ out:
2942 if (!populated_zone(zone)) 2766 if (!populated_zone(zone))
2943 continue; 2767 continue;
2944 2768
2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2769 if (zone->all_unreclaimable &&
2770 sc.priority != DEF_PRIORITY)
2946 continue; 2771 continue;
2947 2772
2948 /* Would compaction fail due to lack of free memory? */ 2773 /* Would compaction fail due to lack of free memory? */
@@ -2971,7 +2796,7 @@ out:
2971 } 2796 }
2972 2797
2973 /* 2798 /*
2974 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2975 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2976 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2977 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2991,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2991 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2992 2817
2993 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2994 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2995 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2996 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2997 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3001,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3001 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
3002 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
3003 */ 2828 */
3004 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3005 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3006 2831
3007 /* 2832 /*
@@ -3013,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3013 * them before going back to sleep. 2838 * them before going back to sleep.
3014 */ 2839 */
3015 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3016 schedule(); 2841
2842 if (!kthread_should_stop())
2843 schedule();
2844
3017 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2845 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3018 } else { 2846 } else {
3019 if (remaining) 2847 if (remaining)
@@ -3209,6 +3037,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3209 .nr_to_reclaim = nr_to_reclaim, 3037 .nr_to_reclaim = nr_to_reclaim,
3210 .hibernation_mode = 1, 3038 .hibernation_mode = 1,
3211 .order = 0, 3039 .order = 0,
3040 .priority = DEF_PRIORITY,
3212 }; 3041 };
3213 struct shrink_control shrink = { 3042 struct shrink_control shrink = {
3214 .gfp_mask = sc.gfp_mask, 3043 .gfp_mask = sc.gfp_mask,
@@ -3279,14 +3108,17 @@ int kswapd_run(int nid)
3279} 3108}
3280 3109
3281/* 3110/*
3282 * Called by memory hotplug when all memory in a node is offlined. 3111 * Called by memory hotplug when all memory in a node is offlined. Caller must
3112 * hold lock_memory_hotplug().
3283 */ 3113 */
3284void kswapd_stop(int nid) 3114void kswapd_stop(int nid)
3285{ 3115{
3286 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3116 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3287 3117
3288 if (kswapd) 3118 if (kswapd) {
3289 kthread_stop(kswapd); 3119 kthread_stop(kswapd);
3120 NODE_DATA(nid)->kswapd = NULL;
3121 }
3290} 3122}
3291 3123
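
Clearing NODE_DATA(nid)->kswapd after kthread_stop() matters for repeated hotplug cycles: a later offline would otherwise call kthread_stop() on a task that has already exited. The same guard pattern in plain C, with malloc/free standing in for the kthread (purely illustrative):

#include <stdlib.h>

struct node_data { void *kswapd; };     /* stand-in, not the kernel pg_data_t */

static void node_stop(struct node_data *n)
{
        if (n->kswapd) {
                free(n->kswapd);        /* plays the role of kthread_stop() */
                n->kswapd = NULL;       /* never act on the stale handle again */
        }
}

int main(void)
{
        struct node_data n = { .kswapd = malloc(16) };

        node_stop(&n);
        node_stop(&n);                  /* second offline: safely a no-op */
        return 0;
}
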
3292static int __init kswapd_init(void) 3124static int __init kswapd_init(void)
@@ -3386,7 +3218,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3386 const unsigned long nr_pages = 1 << order; 3218 const unsigned long nr_pages = 1 << order;
3387 struct task_struct *p = current; 3219 struct task_struct *p = current;
3388 struct reclaim_state reclaim_state; 3220 struct reclaim_state reclaim_state;
3389 int priority;
3390 struct scan_control sc = { 3221 struct scan_control sc = {
3391 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3222 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3392 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3223 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3226,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3395 SWAP_CLUSTER_MAX), 3226 SWAP_CLUSTER_MAX),
3396 .gfp_mask = gfp_mask, 3227 .gfp_mask = gfp_mask,
3397 .order = order, 3228 .order = order,
3229 .priority = ZONE_RECLAIM_PRIORITY,
3398 }; 3230 };
3399 struct shrink_control shrink = { 3231 struct shrink_control shrink = {
3400 .gfp_mask = sc.gfp_mask, 3232 .gfp_mask = sc.gfp_mask,
@@ -3417,11 +3249,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3417 * Free memory by calling shrink zone with increasing 3249 * Free memory by calling shrink zone with increasing
3418 * priorities until we have enough memory freed. 3250 * priorities until we have enough memory freed.
3419 */ 3251 */
3420 priority = ZONE_RECLAIM_PRIORITY;
3421 do { 3252 do {
3422 shrink_zone(priority, zone, &sc); 3253 shrink_zone(zone, &sc);
3423 priority--; 3254 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3424 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3425 } 3255 }
3426 3256
3427 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3257 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3366,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3536 if (mapping_unevictable(page_mapping(page))) 3366 if (mapping_unevictable(page_mapping(page)))
3537 return 0; 3367 return 0;
3538 3368
3539 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3369 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3540 return 0; 3370 return 0;
3541 3371
3542 return 1; 3372 return 1;
@@ -3572,6 +3402,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3572 zone = pagezone; 3402 zone = pagezone;
3573 spin_lock_irq(&zone->lru_lock); 3403 spin_lock_irq(&zone->lru_lock);
3574 } 3404 }
3405 lruvec = mem_cgroup_page_lruvec(page, zone);
3575 3406
3576 if (!PageLRU(page) || !PageUnevictable(page)) 3407 if (!PageLRU(page) || !PageUnevictable(page))
3577 continue; 3408 continue;
@@ -3581,11 +3412,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3581 3412
3582 VM_BUG_ON(PageActive(page)); 3413 VM_BUG_ON(PageActive(page));
3583 ClearPageUnevictable(page); 3414 ClearPageUnevictable(page);
3584 __dec_zone_state(zone, NR_UNEVICTABLE); 3415 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3585 lruvec = mem_cgroup_lru_move_lists(zone, page, 3416 add_page_to_lru_list(page, lruvec, lru);
3586 LRU_UNEVICTABLE, lru);
3587 list_move(&page->lru, &lruvec->lists[lru]);
3588 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3589 pgrescued++; 3417 pgrescued++;
3590 } 3418 }
3591 } 3419 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f600557a7659..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
613 "Reclaimable", 613 "Reclaimable",
614 "Movable", 614 "Movable",
615 "Reserve", 615 "Reserve",
616#ifdef CONFIG_CMA
617 "CMA",
618#endif
616 "Isolate", 619 "Isolate",
617}; 620};
618 621
@@ -738,16 +741,17 @@ const char * const vmstat_text[] = {
738 "pgmajfault", 741 "pgmajfault",
739 742
740 TEXTS_FOR_ZONES("pgrefill") 743 TEXTS_FOR_ZONES("pgrefill")
741 TEXTS_FOR_ZONES("pgsteal") 744 TEXTS_FOR_ZONES("pgsteal_kswapd")
745 TEXTS_FOR_ZONES("pgsteal_direct")
742 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
743 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
744 749
745#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
746 "zone_reclaim_failed", 751 "zone_reclaim_failed",
747#endif 752#endif
748 "pginodesteal", 753 "pginodesteal",
749 "slabs_scanned", 754 "slabs_scanned",
750 "kswapd_steal",
751 "kswapd_inodesteal", 755 "kswapd_inodesteal",
752 "kswapd_low_wmark_hit_quickly", 756 "kswapd_low_wmark_hit_quickly",
753 "kswapd_high_wmark_hit_quickly", 757 "kswapd_high_wmark_hit_quickly",
@@ -1220,7 +1224,6 @@ module_init(setup_vmstat)
1220#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1224#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1221#include <linux/debugfs.h> 1225#include <linux/debugfs.h>
1222 1226
1223static struct dentry *extfrag_debug_root;
1224 1227
1225/* 1228/*
1226 * Return an index indicating how much of the available free memory is 1229 * Return an index indicating how much of the available free memory is
@@ -1358,19 +1361,24 @@ static const struct file_operations extfrag_file_ops = {
1358 1361
1359static int __init extfrag_debug_init(void) 1362static int __init extfrag_debug_init(void)
1360{ 1363{
1364 struct dentry *extfrag_debug_root;
1365
1361 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 1366 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1362 if (!extfrag_debug_root) 1367 if (!extfrag_debug_root)
1363 return -ENOMEM; 1368 return -ENOMEM;
1364 1369
1365 if (!debugfs_create_file("unusable_index", 0444, 1370 if (!debugfs_create_file("unusable_index", 0444,
1366 extfrag_debug_root, NULL, &unusable_file_ops)) 1371 extfrag_debug_root, NULL, &unusable_file_ops))
1367 return -ENOMEM; 1372 goto fail;
1368 1373
1369 if (!debugfs_create_file("extfrag_index", 0444, 1374 if (!debugfs_create_file("extfrag_index", 0444,
1370 extfrag_debug_root, NULL, &extfrag_file_ops)) 1375 extfrag_debug_root, NULL, &extfrag_file_ops))
1371 return -ENOMEM; 1376 goto fail;
1372 1377
1373 return 0; 1378 return 0;
1379fail:
1380 debugfs_remove_recursive(extfrag_debug_root);
1381 return -ENOMEM;
1374} 1382}
1375 1383
1376module_init(extfrag_debug_init); 1384module_init(extfrag_debug_init);
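
The error path above stops leaking a half-populated debugfs directory: any failure after debugfs_create_dir() now unwinds through debugfs_remove_recursive(), and the dentry becomes a local because nothing needs it afterwards. The same unwind shape in a standalone userspace program, with mkdir/fopen standing in for the debugfs calls (illustrative only):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        FILE *unusable = NULL, *extfrag = NULL;

        if (mkdir("extfrag", 0755) && errno != EEXIST)
                return 1;                       /* debugfs_create_dir() failed */

        unusable = fopen("extfrag/unusable_index", "w");
        if (!unusable)
                goto fail;

        extfrag = fopen("extfrag/extfrag_index", "w");
        if (!extfrag)
                goto fail;

        fclose(unusable);
        fclose(extfrag);
        return 0;

fail:                                           /* debugfs_remove_recursive() */
        if (unusable)
                fclose(unusable);
        remove("extfrag/unusable_index");
        rmdir("extfrag");
        return 1;
}
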