aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig22
-rw-r--r--mm/Makefile8
-rw-r--r--mm/backing-dev.c26
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/bounce.c8
-rw-r--r--mm/compaction.c68
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/frontswap.c344
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/hugetlb.c195
-rw-r--r--mm/hugetlb_cgroup.c418
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h8
-rw-r--r--mm/madvise.c18
-rw-r--r--mm/memblock.c146
-rw-r--r--mm/memcontrol.c396
-rw-r--r--mm/memory-failure.c35
-rw-r--r--mm/memory.c30
-rw-r--r--mm/memory_hotplug.c22
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/migrate.c81
-rw-r--r--mm/mmap.c11
-rw-r--r--mm/mmu_notifier.c45
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nobootmem.c40
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c244
-rw-r--r--mm/page-writeback.c107
-rw-r--r--mm/page_alloc.c325
-rw-r--r--mm/page_cgroup.c6
-rw-r--r--mm/page_io.c157
-rw-r--r--mm/page_isolation.c93
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c1
-rw-r--r--mm/shmem.c256
-rw-r--r--mm/slab.c622
-rw-r--r--mm/slab.h33
-rw-r--r--mm/slab_common.c120
-rw-r--r--mm/slob.c152
-rw-r--r--mm/slub.c464
-rw-r--r--mm/sparse.c49
-rw-r--r--mm/swap.c52
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/swapfile.c211
-rw-r--r--mm/vmalloc.c52
-rw-r--r--mm/vmscan.c192
-rw-r--r--mm/vmstat.c1
48 files changed, 3410 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
@@ -389,3 +394,20 @@ config CLEANCACHE
389 in a negligible performance hit. 394 in a negligible performance hit.
390 395
391 If unsure, say Y to enable cleancache 396 If unsure, say Y to enable cleancache
397
398config FRONTSWAP
399 bool "Enable frontswap to cache swap pages if tmem is present"
400 depends on SWAP
401 default n
402 help
403 Frontswap is so named because it can be thought of as the opposite
404 of a "backing" store for a swap device. The data is stored into
405 "transcendent memory", memory that is not directly accessible or
406 addressable by the kernel and is of unknown and possibly
407 time-varying size. When space in transcendent memory is available,
408 a significant swap I/O reduction may be achieved. When none is
409 available, all frontswap calls are reduced to a single pointer-
410 compare-against-NULL resulting in a negligible performance hit
411 and swap data is stored as normal on the matching swap device.
412
413 If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o $(mmu-y)
20
20obj-y += init-mm.o 21obj-y += init-mm.o
21 22
22ifdef CONFIG_NO_BOOTMEM 23ifdef CONFIG_NO_BOOTMEM
@@ -29,6 +30,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
29 30
30obj-$(CONFIG_BOUNCE) += bounce.o 31obj-$(CONFIG_BOUNCE) += bounce.o
31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 32obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
33obj-$(CONFIG_FRONTSWAP) += frontswap.o
32obj-$(CONFIG_HAS_DMA) += dmapool.o 34obj-$(CONFIG_HAS_DMA) += dmapool.o
33obj-$(CONFIG_HUGETLBFS) += hugetlb.o 35obj-$(CONFIG_HUGETLBFS) += hugetlb.o
34obj-$(CONFIG_NUMA) += mempolicy.o 36obj-$(CONFIG_NUMA) += mempolicy.o
@@ -47,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
47obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
48obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
49obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
50obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
51obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
52obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
53obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
54obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
55obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 677
678 bdi->min_ratio = 0; 678 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 679 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 680 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 681 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 682 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 683 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 700 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 701 bdi->avg_write_bandwidth = INIT_BW;
702 702
703 err = prop_local_init_percpu(&bdi->completions); 703 err = fprop_local_init_percpu(&bdi->completions);
704 704
705 if (err) { 705 if (err) {
706err: 706err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 745 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 746
747 prop_local_destroy_percpu(&bdi->completions); 747 fprop_local_destroy_percpu(&bdi->completions);
748} 748}
749EXPORT_SYMBOL(bdi_destroy); 749EXPORT_SYMBOL(bdi_destroy);
750 750
@@ -886,3 +886,23 @@ out:
886 return ret; 886 return ret;
887} 887}
888EXPORT_SYMBOL(wait_iff_congested); 888EXPORT_SYMBOL(wait_iff_congested);
889
890int pdflush_proc_obsolete(struct ctl_table *table, int write,
891 void __user *buffer, size_t *lenp, loff_t *ppos)
892{
893 char kbuf[] = "0\n";
894
895 if (*ppos) {
896 *lenp = 0;
897 return 0;
898 }
899
900 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
901 return -EFAULT;
902 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
903 table->procname);
904
905 *lenp = 2;
906 *ppos += *lenp;
907 return 2;
908}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
698 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
699} 699}
700 700
701static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, 701void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
702 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
703 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
704{ 704{
@@ -710,6 +710,10 @@ again:
710 if (ptr) 710 if (ptr)
711 return ptr; 711 return ptr;
712 712
713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
713 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); 717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
714 if (ptr) 718 if (ptr)
715 return ptr; 719 return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
24 24
25static mempool_t *page_pool, *isa_page_pool; 25static mempool_t *page_pool, *isa_page_pool;
26 26
27#ifdef CONFIG_HIGHMEM 27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
29{ 29{
30#ifndef CONFIG_MEMORY_HOTPLUG 30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn) 31 if (max_pfn <= max_low_pfn)
32 return 0; 32 return 0;
33#endif 33#endif
34 34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
37 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38 38
39 return 0; 39 return 0;
40} 40}
41 41
42__initcall(init_emergency_pool); 42__initcall(init_emergency_pool);
43#endif
43 44
45#ifdef CONFIG_HIGHMEM
44/* 46/*
45 * highmem version, map in to vec 47 * highmem version, map in to vec
46 */ 48 */
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
422 pfn -= pageblock_nr_pages) { 422 pfn -= pageblock_nr_pages) {
423 unsigned long isolated; 423 unsigned long isolated;
424 424
425 /*
426 * Skip ahead if another thread is compacting in the area
427 * simultaneously. If we wrapped around, we can only skip
428 * ahead if zone->compact_cached_free_pfn also wrapped to
429 * above our starting point.
430 */
431 if (cc->order > 0 && (!cc->wrapped ||
432 zone->compact_cached_free_pfn >
433 cc->start_free_pfn))
434 pfn = min(pfn, zone->compact_cached_free_pfn);
435
425 if (!pfn_valid(pfn)) 436 if (!pfn_valid(pfn))
426 continue; 437 continue;
427 438
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
461 * looking for free pages, the search will restart here as 472 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 473 * page migration may have returned some pages to the allocator
463 */ 474 */
464 if (isolated) 475 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 476 high_pfn = max(high_pfn, pfn);
477 if (cc->order > 0)
478 zone->compact_cached_free_pfn = high_pfn;
479 }
466 } 480 }
467 481
468 /* split_free_page does not map the pages */ 482 /* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
556 return ISOLATE_SUCCESS; 570 return ISOLATE_SUCCESS;
557} 571}
558 572
573/*
574 * Returns the start pfn of the last page block in a zone. This is the starting
575 * point for full compaction of a zone. Compaction searches for free pages from
576 * the end of each zone, while isolate_freepages_block scans forward inside each
577 * page block.
578 */
579static unsigned long start_free_pfn(struct zone *zone)
580{
581 unsigned long free_pfn;
582 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
583 free_pfn &= ~(pageblock_nr_pages-1);
584 return free_pfn;
585}
586
559static int compact_finished(struct zone *zone, 587static int compact_finished(struct zone *zone,
560 struct compact_control *cc) 588 struct compact_control *cc)
561{ 589{
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
565 if (fatal_signal_pending(current)) 593 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 594 return COMPACT_PARTIAL;
567 595
568 /* Compaction run completes if the migrate and free scanner meet */ 596 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 597 * A full (order == -1) compaction run starts at the beginning and
598 * end of a zone; it completes when the migrate and free scanner meet.
599 * A partial (order > 0) compaction can start with the free scanner
600 * at a random point in the zone, and may have to restart.
601 */
602 if (cc->free_pfn <= cc->migrate_pfn) {
603 if (cc->order > 0 && !cc->wrapped) {
604 /* We started partway through; restart at the end. */
605 unsigned long free_pfn = start_free_pfn(zone);
606 zone->compact_cached_free_pfn = free_pfn;
607 cc->free_pfn = free_pfn;
608 cc->wrapped = 1;
609 return COMPACT_CONTINUE;
610 }
611 return COMPACT_COMPLETE;
612 }
613
614 /* We wrapped around and ended up where we started. */
615 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
570 return COMPACT_COMPLETE; 616 return COMPACT_COMPLETE;
571 617
572 /* 618 /*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
664 710
665 /* Setup to move all movable pages to the end of the zone */ 711 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 712 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 713
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 714 if (cc->order > 0) {
715 /* Incremental compaction. Start where the last one stopped. */
716 cc->free_pfn = zone->compact_cached_free_pfn;
717 cc->start_free_pfn = cc->free_pfn;
718 } else {
719 /* Order == -1 starts at the end of the zone. */
720 cc->free_pfn = start_free_pfn(zone);
721 }
669 722
670 migrate_prep_local(); 723 migrate_prep_local();
671 724
@@ -701,8 +754,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
701 if (err) { 754 if (err) {
702 putback_lru_pages(&cc->migratepages); 755 putback_lru_pages(&cc->migratepages);
703 cc->nr_migratepages = 0; 756 cc->nr_migratepages = 0;
757 if (err == -ENOMEM) {
758 ret = COMPACT_PARTIAL;
759 goto out;
760 }
704 } 761 }
705
706 } 762 }
707 763
708out: 764out:
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
111 start_index, 106 * Ignore return value because fadvise() shall return
112 nrpages); 107 * success even if filesystem can't retrieve a hint,
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..6b3e71a2cd48
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mman.h>
15#include <linux/swap.h>
16#include <linux/swapops.h>
17#include <linux/security.h>
18#include <linux/module.h>
19#include <linux/debugfs.h>
20#include <linux/frontswap.h>
21#include <linux/swapfile.h>
22
23/*
24 * frontswap_ops is set by frontswap_register_ops to contain the pointers
25 * to the frontswap "backend" implementation functions.
26 */
27static struct frontswap_ops frontswap_ops __read_mostly;
28
29/*
30 * This global enablement flag reduces overhead on systems where frontswap_ops
31 * has not been registered, so is preferred to the slower alternative: a
32 * function call that checks a non-global.
33 */
34bool frontswap_enabled __read_mostly;
35EXPORT_SYMBOL(frontswap_enabled);
36
37/*
38 * If enabled, frontswap_store will return failure even on success. As
39 * a result, the swap subsystem will always write the page to swap, in
40 * effect converting frontswap into a writethrough cache. In this mode,
41 * there is no direct reduction in swap writes, but a frontswap backend
42 * can unilaterally "reclaim" any pages in use with no data loss, thus
43 * providing increased control over maximum memory usage due to frontswap.
44 */
45static bool frontswap_writethrough_enabled __read_mostly;
46
47#ifdef CONFIG_DEBUG_FS
48/*
49 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
50 * properly configured). These are for information only so are not protected
51 * against increment races.
52 */
53static u64 frontswap_loads;
54static u64 frontswap_succ_stores;
55static u64 frontswap_failed_stores;
56static u64 frontswap_invalidates;
57
58static inline void inc_frontswap_loads(void) {
59 frontswap_loads++;
60}
61static inline void inc_frontswap_succ_stores(void) {
62 frontswap_succ_stores++;
63}
64static inline void inc_frontswap_failed_stores(void) {
65 frontswap_failed_stores++;
66}
67static inline void inc_frontswap_invalidates(void) {
68 frontswap_invalidates++;
69}
70#else
71static inline void inc_frontswap_loads(void) { }
72static inline void inc_frontswap_succ_stores(void) { }
73static inline void inc_frontswap_failed_stores(void) { }
74static inline void inc_frontswap_invalidates(void) { }
75#endif
76/*
77 * Register operations for frontswap, returning previous thus allowing
78 * detection of multiple backends and possible nesting.
79 */
80struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
81{
82 struct frontswap_ops old = frontswap_ops;
83
84 frontswap_ops = *ops;
85 frontswap_enabled = true;
86 return old;
87}
88EXPORT_SYMBOL(frontswap_register_ops);
89
90/*
91 * Enable/disable frontswap writethrough (see above).
92 */
93void frontswap_writethrough(bool enable)
94{
95 frontswap_writethrough_enabled = enable;
96}
97EXPORT_SYMBOL(frontswap_writethrough);
98
99/*
100 * Called when a swap device is swapon'd.
101 */
102void __frontswap_init(unsigned type)
103{
104 struct swap_info_struct *sis = swap_info[type];
105
106 BUG_ON(sis == NULL);
107 if (sis->frontswap_map == NULL)
108 return;
109 frontswap_ops.init(type);
110}
111EXPORT_SYMBOL(__frontswap_init);
112
113static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
114{
115 frontswap_clear(sis, offset);
116 atomic_dec(&sis->frontswap_pages);
117}
118
119/*
120 * "Store" data from a page to frontswap and associate it with the page's
121 * swaptype and offset. Page must be locked and in the swap cache.
122 * If frontswap already contains a page with matching swaptype and
123 * offset, the frontswap implementation may either overwrite the data and
124 * return success or invalidate the page from frontswap and return failure.
125 */
126int __frontswap_store(struct page *page)
127{
128 int ret = -1, dup = 0;
129 swp_entry_t entry = { .val = page_private(page), };
130 int type = swp_type(entry);
131 struct swap_info_struct *sis = swap_info[type];
132 pgoff_t offset = swp_offset(entry);
133
134 BUG_ON(!PageLocked(page));
135 BUG_ON(sis == NULL);
136 if (frontswap_test(sis, offset))
137 dup = 1;
138 ret = frontswap_ops.store(type, offset, page);
139 if (ret == 0) {
140 frontswap_set(sis, offset);
141 inc_frontswap_succ_stores();
142 if (!dup)
143 atomic_inc(&sis->frontswap_pages);
144 } else {
145 /*
146 failed dup always results in automatic invalidate of
147 the (older) page from frontswap
148 */
149 inc_frontswap_failed_stores();
150 if (dup)
151 __frontswap_clear(sis, offset);
152 }
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Get" data from frontswap associated with swaptype and offset that were
162 * specified when the data was put to frontswap and use it to fill the
163 * specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "get" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 frontswap_ops.invalidate_page(type, offset);
194 __frontswap_clear(sis, offset);
195 inc_frontswap_invalidates();
196 }
197}
198EXPORT_SYMBOL(__frontswap_invalidate_page);
199
200/*
201 * Invalidate all data from frontswap associated with all offsets for the
202 * specified swaptype.
203 */
204void __frontswap_invalidate_area(unsigned type)
205{
206 struct swap_info_struct *sis = swap_info[type];
207
208 BUG_ON(sis == NULL);
209 if (sis->frontswap_map == NULL)
210 return;
211 frontswap_ops.invalidate_area(type);
212 atomic_set(&sis->frontswap_pages, 0);
213 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
214}
215EXPORT_SYMBOL(__frontswap_invalidate_area);
216
217static unsigned long __frontswap_curr_pages(void)
218{
219 int type;
220 unsigned long totalpages = 0;
221 struct swap_info_struct *si = NULL;
222
223 assert_spin_locked(&swap_lock);
224 for (type = swap_list.head; type >= 0; type = si->next) {
225 si = swap_info[type];
226 totalpages += atomic_read(&si->frontswap_pages);
227 }
228 return totalpages;
229}
230
231static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
232 int *swapid)
233{
234 int ret = -EINVAL;
235 struct swap_info_struct *si = NULL;
236 int si_frontswap_pages;
237 unsigned long total_pages_to_unuse = total;
238 unsigned long pages = 0, pages_to_unuse = 0;
239 int type;
240
241 assert_spin_locked(&swap_lock);
242 for (type = swap_list.head; type >= 0; type = si->next) {
243 si = swap_info[type];
244 si_frontswap_pages = atomic_read(&si->frontswap_pages);
245 if (total_pages_to_unuse < si_frontswap_pages) {
246 pages = pages_to_unuse = total_pages_to_unuse;
247 } else {
248 pages = si_frontswap_pages;
249 pages_to_unuse = 0; /* unuse all */
250 }
251 /* ensure there is enough RAM to fetch pages from frontswap */
252 if (security_vm_enough_memory_mm(current->mm, pages)) {
253 ret = -ENOMEM;
254 continue;
255 }
256 vm_unacct_memory(pages);
257 *unused = pages_to_unuse;
258 *swapid = type;
259 ret = 0;
260 break;
261 }
262
263 return ret;
264}
265
266static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse,
268 int *type)
269{
270 unsigned long total_pages = 0, total_pages_to_unuse;
271
272 assert_spin_locked(&swap_lock);
273
274 total_pages = __frontswap_curr_pages();
275 if (total_pages <= target_pages) {
276 /* Nothing to do */
277 *pages_to_unuse = 0;
278 return 0;
279 }
280 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
282}
283
284/*
285 * Frontswap, like a true swap device, may unnecessarily retain pages
286 * under certain circumstances; "shrink" frontswap is essentially a
287 * "partial swapoff" and works by calling try_to_unuse to attempt to
288 * unuse enough frontswap pages to attempt to -- subject to memory
289 * constraints -- reduce the number of pages in frontswap to the
290 * number given in the parameter target_pages.
291 */
292void frontswap_shrink(unsigned long target_pages)
293{
294 unsigned long pages_to_unuse = 0;
295 int type, ret;
296
297 /*
298 * we don't want to hold swap_lock while doing a very
299 * lengthy try_to_unuse, but swap_list may change
300 * so restart scan from swap_list.head each time
301 */
302 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
304 spin_unlock(&swap_lock);
305 if (ret == 0 && pages_to_unuse)
306 try_to_unuse(type, true, pages_to_unuse);
307 return;
308}
309EXPORT_SYMBOL(frontswap_shrink);
310
311/*
312 * Count and return the number of frontswap pages across all
313 * swap devices. This is exported so that backend drivers can
314 * determine current usage without reading debugfs.
315 */
316unsigned long frontswap_curr_pages(void)
317{
318 unsigned long totalpages = 0;
319
320 spin_lock(&swap_lock);
321 totalpages = __frontswap_curr_pages();
322 spin_unlock(&swap_lock);
323
324 return totalpages;
325}
326EXPORT_SYMBOL(frontswap_curr_pages);
327
328static int __init init_frontswap(void)
329{
330#ifdef CONFIG_DEBUG_FS
331 struct dentry *root = debugfs_create_dir("frontswap", NULL);
332 if (root == NULL)
333 return -ENXIO;
334 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
335 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
336 debugfs_create_u64("failed_stores", S_IRUGO, root,
337 &frontswap_failed_stores);
338 debugfs_create_u64("invalidates", S_IRUGO,
339 root, &frontswap_invalidates);
340#endif
341 return 0;
342}
343
344module_init(init_frontswap);
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
1944 * page[2].lru.next for storing cgroup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2308 return 0; 2343 return 0;
2309} 2344}
2310 2345
2311void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2312 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2313{ 2349{
2350 int force_flush = 0;
2314 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2315 unsigned long address; 2352 unsigned long address;
2316 pte_t *ptep; 2353 pte_t *ptep;
2317 pte_t pte; 2354 pte_t pte;
2318 struct page *page; 2355 struct page *page;
2319 struct page *tmp;
2320 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2321 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2322 2358
2323 /*
2324 * A page gathering list, protected by per file i_mmap_mutex. The
2325 * lock is used to avoid list corruption from multiple unmapping
2326 * of the same page since we are using page->lru.
2327 */
2328 LIST_HEAD(page_list);
2329
2330 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2331 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2332 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2333 2362
2363 tlb_start_vma(tlb, vma);
2334 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2335 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2336 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2337 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2370 } 2401 }
2371 2402
2372 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2373 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2374 set_page_dirty(page); 2406 set_page_dirty(page);
2375 list_add(&page->lru, &page_list);
2376 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2377 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2378 if (ref_page) 2413 if (ref_page)
2379 break; 2414 break;
2380 } 2415 }
2381 flush_tlb_range(vma, start, end);
2382 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2383 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
2384 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2418 * mmu_gather ran out of room to batch pages, we break out of
2385 page_remove_rmap(page); 2419 * the PTE lock to avoid doing the potential expensive TLB invalidate
2386 list_del(&page->lru); 2420 * and page-free while holding it.
2387 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2388 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2389} 2449}
2390 2450
2391void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2392 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2393{ 2453{
2394 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2395 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2396 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2397} 2462}
2398 2463
2399/* 2464/*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2438 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2439 */ 2504 */
2440 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2441 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2442 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2443 page);
2444 } 2508 }
2445 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2446 2510
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
2496 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2497 2561
2498 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2499 page_cache_release(old_page); 2564 page_cache_release(old_page);
2500 2565
2501 /* 2566 /*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
2524 2589
2525 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2526 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2527 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2528 } 2596 }
2529 2597
2530 /* 2598 /*
@@ -2642,7 +2710,11 @@ retry:
2642 goto out; 2710 goto out;
2643 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2644 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2645 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2646 goto out; 2718 goto out;
2647 } 2719 }
2648 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
2679 */ 2751 */
2680 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2681 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2682 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2683 goto backout_unlocked; 2755 goto backout_unlocked;
2684 } 2756 }
2685 } 2757 }
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2752 return 0; 2824 return 0;
2753 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2754 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2755 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2756 } 2828 }
2757 2829
2758 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2959 } 3031 }
2960 } 3032 }
2961 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2962 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2963 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2964 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2965} 3042}
2966 3043
2967int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
/*
 * Per-cgroup state of the hugetlb controller: the generic cgroup subsystem
 * state plus one resource counter per possible huge page size. The
 * hugepage[] array is indexed by hstate index.
 */
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
/*
 * Encoding of cftype->private for the control files created in this file:
 * the hstate index is stored in the upper 16 bits and the res_counter
 * attribute (RES_LIMIT, RES_USAGE, ...) in the lower 16 bits.
 */
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
/* Subsystem descriptor for the hugetlb controller. */
33struct cgroup_subsys hugetlb_subsys __read_mostly;
/* hugetlb_cgroup of the root cgroup; assigned in hugetlb_cgroup_create(). */
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
/* cgroup ->destroy callback: free the controller state of @cgroup. */
static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
{
	kfree(hugetlb_cgroup_from_cgroup(cgroup));
}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot get moved from
116 * active list or uncharged from the cgroup, So no need to get
117 * page reference and test for page active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
132 * We can have pages in active list without any cgroup
133 * ie, hugepage with less than 3 pages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
195 * We don't charge any cgroup if the compound page have less
196 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
/*
 * Record @h_cg as the cgroup owning @page. Nothing is done when the
 * controller is disabled or @h_cg is NULL.
 *
 * Should be called with hugetlb_lock held.
 */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (!hugetlb_cgroup_disabled() && h_cg)
		set_hugetlb_cgroup(page, h_cg);
}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
297 /* This function does all necessary parse...reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
/*
 * Format the byte count @hsize into @buf (at most @size bytes) as a whole
 * number of GB, MB or KB, picking the largest unit that @hsize reaches.
 * The value is shifted, so any remainder is dropped. Returns @buf.
 */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize < (1UL << 20)) {
		snprintf(buf, size, "%luKB", hsize >> 10);
		return buf;
	}
	if (hsize < (1UL << 30)) {
		snprintf(buf, size, "%luMB", hsize >> 20);
		return buf;
	}
	snprintf(buf, size, "%luGB", hsize >> 30);
	return buf;
}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
372 /* Add the failcntfile */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
347extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 353extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 354 unsigned long, unsigned long,
349 unsigned long, unsigned long); 355 unsigned long, unsigned long);
356
357extern void set_pageblock_order(void);
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/file.h>
18 19
19/* 20/*
20 * Any behaviour which results in changes to the vma->vm_flags needs to 21 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
204{ 205{
205 loff_t offset; 206 loff_t offset;
206 int error; 207 int error;
208 struct file *f;
207 209
208 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 210 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
209 211
210 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 212 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
211 return -EINVAL; 213 return -EINVAL;
212 214
213 if (!vma->vm_file || !vma->vm_file->f_mapping 215 f = vma->vm_file;
214 || !vma->vm_file->f_mapping->host) { 216
217 if (!f || !f->f_mapping || !f->f_mapping->host) {
215 return -EINVAL; 218 return -EINVAL;
216 } 219 }
217 220
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
221 offset = (loff_t)(start - vma->vm_start) 224 offset = (loff_t)(start - vma->vm_start)
222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
223 226
224 /* filesystem's fallocate may need to take i_mutex */ 227 /*
228 * Filesystem's fallocate may need to take i_mutex. We need to
229 * explicitly grab a reference because the vma (and hence the
230 * vma's reference to the file) can go away as soon as we drop
231 * mmap_sem.
232 */
233 get_file(f);
225 up_read(&current->mm->mmap_sem); 234 up_read(&current->mm->mmap_sem);
226 error = do_fallocate(vma->vm_file, 235 error = do_fallocate(f,
227 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 236 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
228 offset, end - start); 237 offset, end - start);
238 fput(f);
229 down_read(&current->mm->mmap_sem); 239 down_read(&current->mm->mmap_sem);
230 return error; 240 return error;
231} 241}
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
143 MAX_NUMNODES); 143 MAX_NUMNODES);
144} 144}
145 145
146/*
147 * Free memblock.reserved.regions
148 */
149int __init_memblock memblock_free_reserved_regions(void)
150{
151 if (memblock.reserved.regions == memblock_reserved_init_regions)
152 return 0;
153
154 return memblock_free(__pa(memblock.reserved.regions),
155 sizeof(struct memblock_region) * memblock.reserved.max);
156}
157
158/*
159 * Reserve memblock.reserved.regions
160 */
161int __init_memblock memblock_reserve_reserved_regions(void)
162{
163 if (memblock.reserved.regions == memblock_reserved_init_regions)
164 return 0;
165
166 return memblock_reserve(__pa(memblock.reserved.regions),
167 sizeof(struct memblock_region) * memblock.reserved.max);
168}
169
170static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 146static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
171{ 147{
172 type->total_size -= type->regions[r].size; 148 type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
184 } 160 }
185} 161}
186 162
187static int __init_memblock memblock_double_array(struct memblock_type *type) 163phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
164 phys_addr_t *addr)
165{
166 if (memblock.reserved.regions == memblock_reserved_init_regions)
167 return 0;
168
169 *addr = __pa(memblock.reserved.regions);
170
171 return PAGE_ALIGN(sizeof(struct memblock_region) *
172 memblock.reserved.max);
173}
174
175/**
176 * memblock_double_array - double the size of the memblock regions array
177 * @type: memblock type of the regions array being doubled
178 * @new_area_start: starting address of memory range to avoid overlap with
179 * @new_area_size: size of memory range to avoid overlap with
180 *
181 * Double the size of the @type regions array. If memblock is being used to
182 * allocate memory for a new reserved regions array and there is a previously
183 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
184 * waiting to be reserved, ensure the memory used by the new array does
185 * not overlap.
186 *
187 * RETURNS:
188 * 0 on success, -1 on failure.
189 */
190static int __init_memblock memblock_double_array(struct memblock_type *type,
191 phys_addr_t new_area_start,
192 phys_addr_t new_area_size)
188{ 193{
189 struct memblock_region *new_array, *old_array; 194 struct memblock_region *new_array, *old_array;
195 phys_addr_t old_alloc_size, new_alloc_size;
190 phys_addr_t old_size, new_size, addr; 196 phys_addr_t old_size, new_size, addr;
191 int use_slab = slab_is_available(); 197 int use_slab = slab_is_available();
192 int *in_slab; 198 int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
200 /* Calculate new doubled size */ 206 /* Calculate new doubled size */
201 old_size = type->max * sizeof(struct memblock_region); 207 old_size = type->max * sizeof(struct memblock_region);
202 new_size = old_size << 1; 208 new_size = old_size << 1;
209 /*
210 * We need to allocated new one align to PAGE_SIZE,
211 * so we can free them completely later.
212 */
213 old_alloc_size = PAGE_ALIGN(old_size);
214 new_alloc_size = PAGE_ALIGN(new_size);
203 215
204 /* Retrieve the slab flag */ 216 /* Retrieve the slab flag */
205 if (type == &memblock.memory) 217 if (type == &memblock.memory)
@@ -210,19 +222,30 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
210 /* Try to find some space for it. 222 /* Try to find some space for it.
211 * 223 *
212 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
213 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
214 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
215 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
216 * 228 *
217 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
218 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
219 * active for memory hotplug operations 231 * is active for memory hotplug operations
220 */ 232 */
221 if (use_slab) { 233 if (use_slab) {
222 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
223 addr = new_array ? __pa(new_array) : 0; 235 addr = new_array ? __pa(new_array) : 0;
224 } else { 236 } else {
225 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 237 /* only exclude range when trying to double reserved.regions */
238 if (type != &memblock.reserved)
239 new_area_start = new_area_size = 0;
240
241 addr = memblock_find_in_range(new_area_start + new_area_size,
242 memblock.current_limit,
243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE);
248
226 new_array = addr ? __va(addr) : 0; 249 new_array = addr ? __va(addr) : 0;
227 } 250 }
228 if (!addr) { 251 if (!addr) {
@@ -231,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
231 return -1; 254 return -1;
232 } 255 }
233 256
234 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
235 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
236 260
237 /* Found space, we now need to move the array over before 261 /*
238 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
239 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
240 */ 265 */
241 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
242 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -244,20 +269,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
244 type->regions = new_array; 269 type->regions = new_array;
245 type->max <<= 1; 270 type->max <<= 1;
246 271
247 /* Free old array. We needn't free it if the array is the 272 /* Free old array. We needn't free it if the array is the static one */
248 * static one
249 */
250 if (*in_slab) 273 if (*in_slab)
251 kfree(old_array); 274 kfree(old_array);
252 else if (old_array != memblock_memory_init_regions && 275 else if (old_array != memblock_memory_init_regions &&
253 old_array != memblock_reserved_init_regions) 276 old_array != memblock_reserved_init_regions)
254 memblock_free(__pa(old_array), old_size); 277 memblock_free(__pa(old_array), old_alloc_size);
255 278
256 /* Reserve the new array if that comes from the memblock. 279 /*
257 * Otherwise, we needn't do it 280 * Reserve the new array if that comes from the memblock. Otherwise, we
281 * needn't do it
258 */ 282 */
259 if (!use_slab) 283 if (!use_slab)
260 BUG_ON(memblock_reserve(addr, new_size)); 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
261 285
262 /* Update slab flag */ 286 /* Update slab flag */
263 *in_slab = use_slab; 287 *in_slab = use_slab;
@@ -399,7 +423,7 @@ repeat:
399 */ 423 */
400 if (!insert) { 424 if (!insert) {
401 while (type->cnt + nr_new > type->max) 425 while (type->cnt + nr_new > type->max)
402 if (memblock_double_array(type) < 0) 426 if (memblock_double_array(type, obase, size) < 0)
403 return -ENOMEM; 427 return -ENOMEM;
404 insert = true; 428 insert = true;
405 goto repeat; 429 goto repeat;
@@ -450,7 +474,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
450 474
451 /* we'll create at most two more regions */ 475 /* we'll create at most two more regions */
452 while (type->cnt + 2 > type->max) 476 while (type->cnt + 2 > type->max)
453 if (memblock_double_array(type) < 0) 477 if (memblock_double_array(type, base, size) < 0)
454 return -ENOMEM; 478 return -ENOMEM;
455 479
456 for (i = 0; i < type->cnt; i++) { 480 for (i = 0; i < type->cnt; i++) {
@@ -540,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
540 * __next_free_mem_range - next function for for_each_free_mem_range() 564 * __next_free_mem_range - next function for for_each_free_mem_range()
541 * @idx: pointer to u64 loop variable 565 * @idx: pointer to u64 loop variable
542 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 566 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
543 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 567 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
544 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 568 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
545 * @p_nid: ptr to int for nid of the range, can be %NULL 569 * @out_nid: ptr to int for nid of the range, can be %NULL
546 * 570 *
547 * Find the first free area from *@idx which matches @nid, fill the out 571 * Find the first free area from *@idx which matches @nid, fill the out
548 * parameters, and update *@idx for the next iteration. The lower 32bit of 572 * parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -616,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
616 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 640 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
617 * @idx: pointer to u64 loop variable 641 * @idx: pointer to u64 loop variable
618 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 642 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
619 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 643 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
620 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 644 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
621 * @p_nid: ptr to int for nid of the range, can be %NULL 645 * @out_nid: ptr to int for nid of the range, can be %NULL
622 * 646 *
623 * Reverse of __next_free_mem_range(). 647 * Reverse of __next_free_mem_range().
624 */ 648 */
@@ -867,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
867 return memblock_search(&memblock.memory, addr) != -1; 891 return memblock_search(&memblock.memory, addr) != -1;
868} 892}
869 893
894/**
895 * memblock_is_region_memory - check if a region is a subset of memory
896 * @base: base of region to check
897 * @size: size of region to check
898 *
899 * Check if the region [@base, @base+@size) is a subset of a memory block.
900 *
901 * RETURNS:
902 * 0 if false, non-zero if true
903 */
870int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 904int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
871{ 905{
872 int idx = memblock_search(&memblock.memory, base); 906 int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
879 memblock.memory.regions[idx].size) >= end; 913 memblock.memory.regions[idx].size) >= end;
880} 914}
881 915
916/**
917 * memblock_is_region_reserved - check if a region intersects reserved memory
918 * @base: base of region to check
919 * @size: size of region to check
920 *
921 * Check if the region [@base, @base+@size) intersects a reserved memory block.
922 *
923 * RETURNS:
924 * 0 if false, non-zero if true
925 */
882int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 926int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
883{ 927{
884 memblock_cap_size(base, &size); 928 memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62static struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
68/* for remember boot option*/ 68/* for remember boot option*/
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
92}; 92};
93 93
@@ -378,9 +378,7 @@ static bool move_file(void)
378 378
379enum charge_type { 379enum charge_type {
380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
381 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
382 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
383 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
384 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
385 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
386 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
407static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
408static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
409 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
410/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
411#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
412#include <net/sock.h> 416#include <net/sock.h>
413#include <net/ip.h> 417#include <net/ip.h>
414 418
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
467} 471}
468EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
469#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
471 475
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) 476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg) 477static void disarm_sock_keys(struct mem_cgroup *memcg)
474{ 478{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
703 bool charge) 707 bool charge)
704{ 708{
705 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
706 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
707} 711}
708 712
709static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
864 868
865struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
866{ 870{
867 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
868 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
869 css);
870} 873}
871 874
872struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
879 if (unlikely(!p)) 882 if (unlikely(!p))
880 return NULL; 883 return NULL;
881 884
882 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
883 struct mem_cgroup, css);
884} 886}
885 887
886struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
966 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
967 if (css) { 969 if (css) {
968 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
969 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
970 struct mem_cgroup, css);
971 } else 972 } else
972 id = 0; 973 id = 0;
973 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1148,7 +1149,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1148{ 1149{
1149 if (root_memcg == memcg) 1150 if (root_memcg == memcg)
1150 return true; 1151 return true;
1151 if (!root_memcg->use_hierarchy) 1152 if (!root_memcg->use_hierarchy || !memcg)
1152 return false; 1153 return false;
1153 return css_is_ancestor(&memcg->css, &root_memcg->css); 1154 return css_is_ancestor(&memcg->css, &root_memcg->css);
1154} 1155}
@@ -1234,7 +1235,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1234 1235
1235/** 1236/**
1236 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1237 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1237 * @mem: the memory cgroup 1238 * @memcg: the memory cgroup
1238 * 1239 *
1239 * Returns the maximum amount of memory @mem can be charged with, in 1240 * Returns the maximum amount of memory @mem can be charged with, in
1240 * pages. 1241 * pages.
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1454/* 1455/*
1455 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1456 */ 1457 */
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
@@ -1508,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1508 1576
1509/** 1577/**
1510 * test_mem_cgroup_node_reclaimable 1578 * test_mem_cgroup_node_reclaimable
1511 * @mem: the target memcg 1579 * @memcg: the target memcg
1512 * @nid: the node ID to be checked. 1580 * @nid: the node ID to be checked.
1513 * @noswap : specify true here if the user wants flle only information. 1581 * @noswap : specify true here if the user wants flle only information.
1514 * 1582 *
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482 * we don't need page_cgroup_lock about tail pages, becase they are not 2546 * we don't need page_cgroup_lock about tail pages, becase they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarly. 3364 * of the page goes down to zero, temporarly.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages page themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e290..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
345 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
346 * wrong earlier. 346 * wrong earlier.
347 */ 347 */
348static void kill_procs(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn, 349 int fail, struct page *page, unsigned long pfn,
350 int flags) 350 int flags)
351{ 351{
352 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
353 353
354 list_for_each_entry_safe (tk, next, to_kill, nd) { 354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) { 355 if (forcekill) {
356 /* 356 /*
357 * In case something went wrong with munmapping 357 * In case something went wrong with munmapping
358 * make sure the process doesn't catch the 358 * make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
858 struct address_space *mapping; 858 struct address_space *mapping;
859 LIST_HEAD(tokill); 859 LIST_HEAD(tokill);
860 int ret; 860 int ret;
861 int kill = 1; 861 int kill = 1, forcekill;
862 struct page *hpage = compound_head(p); 862 struct page *hpage = compound_head(p);
863 struct page *ppage; 863 struct page *ppage;
864 864
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 * be called inside page lock (it's recommended but not enforced). 888 * be called inside page lock (it's recommended but not enforced).
889 */ 889 */
890 mapping = page_mapping(hpage); 890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping && 891 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) { 892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) { 893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage); 894 SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
965 * Now that the dirty bit has been propagated to the 965 * Now that the dirty bit has been propagated to the
966 * struct page and all unmaps done we can decide if 966 * struct page and all unmaps done we can decide if
967 * killing is needed or not. Only kill when the page 967 * killing is needed or not. Only kill when the page
968 * was dirty, otherwise the tokill list is merely 968 * was dirty or the process is not restartable,
969 * otherwise the tokill list is merely
969 * freed. When there was a problem unmapping earlier 970 * freed. When there was a problem unmapping earlier
970 * use a more force-full uncatchable kill to prevent 971 * use a more force-full uncatchable kill to prevent
971 * any accesses to the poisoned memory. 972 * any accesses to the poisoned memory.
972 */ 973 */
973 kill_procs(&tokill, !!PageDirty(ppage), trapno, 974 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
975 kill_procs(&tokill, forcekill, trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags); 976 ret != SWAP_SUCCESS, p, pfn, flags);
975 977
976 return ret; 978 return ret;
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1414 int ret; 1416 int ret;
1415 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418 1419
1419 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0) 1421 if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1429 } 1430 }
1430 1431
1431 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1432 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1433 list_add(&hpage->lru, &pagelist); 1434 MIGRATE_SYNC);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1435 put_page(hpage);
1435 true);
1436 if (ret) { 1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret; 1439 return ret;
1446 } 1440 }
1447done: 1441done:
1448 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1452 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
1561 page_is_file_cache(page)); 1556 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist); 1557 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1558 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC); 1559 false, MIGRATE_SYNC);
1565 if (ret) { 1560 if (ret) {
1566 putback_lru_pages(&pagelist); 1561 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1562 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..482f089765ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1225 next = pmd_addr_end(addr, end); 1234 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) { 1235 if (pmd_trans_huge(*pmd)) {
1227 if (next - addr != HPAGE_PMD_SIZE) { 1236 if (next - addr != HPAGE_PMD_SIZE) {
1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1237#ifdef CONFIG_DEBUG_VM
1238 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1239 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1240 __func__, addr, end,
1241 vma->vm_start,
1242 vma->vm_end);
1243 BUG();
1244 }
1245#endif
1229 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma->vm_mm, pmd);
1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1231 goto next; 1248 goto next;
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1326 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1328 */ 1345 */
1329 if (vma->vm_file) 1346 if (vma->vm_file) {
1330 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1331 } else 1351 } else
1332 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1333 } 1353 }
@@ -1366,7 +1386,7 @@ void unmap_vmas(struct mmu_gather *tlb,
1366/** 1386/**
1367 * zap_page_range - remove user pages in a given range 1387 * zap_page_range - remove user pages in a given range
1368 * @vma: vm_area_struct holding the applicable pages 1388 * @vma: vm_area_struct holding the applicable pages
1369 * @address: starting address of pages to zap 1389 * @start: starting address of pages to zap
1370 * @size: number of bytes to zap 1390 * @size: number of bytes to zap
1371 * @details: details of nonlinear truncation or shared cache invalidation 1391 * @details: details of nonlinear truncation or shared cache invalidation
1372 * 1392 *
@@ -3921,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3921 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3922 } 3942 }
3923 } 3943 }
3924 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3925} 3945}
3926 3946
3927#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -618,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
618 pgdat = hotadd_new_pgdat(nid, start); 619 pgdat = hotadd_new_pgdat(nid, start);
619 ret = -ENOMEM; 620 ret = -ENOMEM;
620 if (!pgdat) 621 if (!pgdat)
621 goto out; 622 goto error;
622 new_pgdat = 1; 623 new_pgdat = 1;
623 } 624 }
624 625
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..bd92431d4c49 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1177 if (!list_empty(&pagelist)) { 1177 if (!list_empty(&pagelist)) {
1178 nr_failed = migrate_pages(&pagelist, new_vma_page, 1178 nr_failed = migrate_pages(&pagelist, new_vma_page,
1179 (unsigned long)vma, 1179 (unsigned long)vma,
1180 false, true); 1180 false, MIGRATE_SYNC);
1181 if (nr_failed) 1181 if (nr_failed)
1182 putback_lru_pages(&pagelist); 1182 putback_lru_pages(&pagelist);
1183 } 1183 }
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1602 * task can change it's policy. The system default policy requires no 1602 * task can change it's policy. The system default policy requires no
1603 * such protection. 1603 * such protection.
1604 */ 1604 */
1605unsigned slab_node(struct mempolicy *policy) 1605unsigned slab_node(void)
1606{ 1606{
1607 struct mempolicy *policy;
1608
1609 if (in_interrupt())
1610 return numa_node_id();
1611
1612 policy = current->mempolicy;
1607 if (!policy || policy->flags & MPOL_F_LOCAL) 1613 if (!policy || policy->flags & MPOL_F_LOCAL)
1608 return numa_node_id(); 1614 return numa_node_id();
1609 1615
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56b..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdfa42d9..e3e86914f11a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1708 return -ENOMEM;
1708 1709
1709 /* Ok, everything looks good - let it rip */ 1710 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1711 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1712 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1889
1890 if (vma->vm_flags & VM_ACCOUNT) 1890 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1891 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1892 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1893 vma = remove_vma(vma);
1895 } while (vma); 1894 } while (vma);
@@ -2345,9 +2344,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2345 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2344 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2346 return -ENOMEM; 2345 return -ENOMEM;
2347 2346
2348 if (vma->vm_file && uprobe_mmap(vma))
2349 return -EINVAL;
2350
2351 vma_link(mm, vma, prev, rb_link, rb_parent); 2347 vma_link(mm, vma, prev, rb_link, rb_parent);
2352 return 0; 2348 return 0;
2353} 2349}
@@ -2418,9 +2414,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2418 if (new_vma->vm_file) { 2414 if (new_vma->vm_file) {
2419 get_file(new_vma->vm_file); 2415 get_file(new_vma->vm_file);
2420 2416
2421 if (uprobe_mmap(new_vma))
2422 goto out_free_mempol;
2423
2424 if (vma->vm_flags & VM_EXECUTABLE) 2417 if (vma->vm_flags & VM_EXECUTABLE)
2425 added_exe_file_vma(mm); 2418 added_exe_file_vma(mm);
2426 } 2419 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf09..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202ddad..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
105 __free_pages_bootmem(pfn_to_page(i), 0); 105 __free_pages_bootmem(pfn_to_page(i), 0);
106} 106}
107 107
108static unsigned long __init __free_memory_core(phys_addr_t start,
109 phys_addr_t end)
110{
111 unsigned long start_pfn = PFN_UP(start);
112 unsigned long end_pfn = min_t(unsigned long,
113 PFN_DOWN(end), max_low_pfn);
114
115 if (start_pfn > end_pfn)
116 return 0;
117
118 __free_pages_memory(start_pfn, end_pfn);
119
120 return end_pfn - start_pfn;
121}
122
108unsigned long __init free_low_memory_core_early(int nodeid) 123unsigned long __init free_low_memory_core_early(int nodeid)
109{ 124{
110 unsigned long count = 0; 125 unsigned long count = 0;
111 phys_addr_t start, end; 126 phys_addr_t start, end, size;
112 u64 i; 127 u64 i;
113 128
114 /* free reserved array temporarily so that it's treated as free area */ 129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
115 memblock_free_reserved_regions(); 130 count += __free_memory_core(start, end);
116 131
117 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 132 /* free range that is used for reserved array if we allocate it */
118 unsigned long start_pfn = PFN_UP(start); 133 size = get_allocated_memblock_reserved_regions_info(&start);
119 unsigned long end_pfn = min_t(unsigned long, 134 if (size)
120 PFN_DOWN(end), max_low_pfn); 135 count += __free_memory_core(start, start + size);
121 if (start_pfn < end_pfn) {
122 __free_pages_memory(start_pfn, end_pfn);
123 count += end_pfn - start_pfn;
124 }
125 }
126 136
127 /* put region array back? */
128 memblock_reserve_reserved_regions();
129 return count; 137 return count;
130} 138}
131 139
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
274 return ___alloc_bootmem(size, align, goal, limit); 282 return ___alloc_bootmem(size, align, goal, limit);
275} 283}
276 284
277static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, 285void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
278 unsigned long size, 286 unsigned long size,
279 unsigned long align, 287 unsigned long align,
280 unsigned long goal, 288 unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1486 1486
1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1488 1488
1489 ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1489 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1490 1490
1491 if (file) 1491 if (file)
1492 fput(file); 1492 fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 unsigned long points; 186 long points;
187 long adj;
187 188
188 if (oom_unkillable_task(p, memcg, nodemask)) 189 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 190 return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
192 if (!p) 193 if (!p)
193 return 0; 194 return 0;
194 195
195 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 196 adj = p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) {
196 task_unlock(p); 198 task_unlock(p);
197 return 0; 199 return 0;
198 } 200 }
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
210 * implementation used by LSMs. 212 * implementation used by LSMs.
211 */ 213 */
212 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 214 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
213 points -= 30 * totalpages / 1000; 215 adj -= 30;
214 216
215 /* 217 /* Normalize to oom_score_adj units */
216 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 218 adj *= totalpages / 1000;
217 * either completely disable oom killing or always prefer a certain 219 points += adj;
218 * task.
219 */
220 points += p->signal->oom_score_adj * totalpages / 1000;
221 220
222 /* 221 /*
223 * Never return 0 for an eligible task regardless of the root bonus and 222 * Never return 0 for an eligible task regardless of the root bonus and
224 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). 223 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
225 */ 224 */
226 return points ? points : 1; 225 return points > 0 ? points : 1;
227} 226}
228 227
229/* 228/*
@@ -289,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
289} 288}
290#endif 289#endif
291 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
292/* 337/*
293 * Simple selection loop. We chose the process with the highest 338 * Simple selection loop. We chose the process with the highest
294 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
295 * 340 *
296 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
297 */ 342 */
298static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
299 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
300 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
301{ 346{
302 struct task_struct *g, *p; 347 struct task_struct *g, *p;
303 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
304 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
305 350
351 rcu_read_lock();
306 do_each_thread(g, p) { 352 do_each_thread(g, p) {
307 unsigned int points; 353 unsigned int points;
308 354
309 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
310 continue; 356 force_kill)) {
311 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
312 continue; 358 chosen = p;
313 359 chosen_points = ULONG_MAX;
314 /* 360 /* fall through */
315 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
316 * being killed. Don't allow any other task access to the
317 * memory reserve.
318 *
319 * Note: this may have a chance of deadlock if it gets
320 * blocked waiting for another task which itself is waiting
321 * for memory. Is there a better alternative?
322 */
323 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
324 if (unlikely(frozen(p)))
325 __thaw_task(p);
326 if (!force_kill)
327 return ERR_PTR(-1UL);
328 }
329 if (!p->mm)
330 continue; 362 continue;
331 363 case OOM_SCAN_ABORT:
332 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
333 /* 365 return ERR_PTR(-1UL);
334 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
335 * releasing memory, we allow the "kill" to set 367 break;
336 * TIF_MEMDIE, which will allow it to gain access to 368 };
337 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
338 *
339 * The loop isn't broken here, however, in case other
340 * threads are found to have already been oom killed.
341 */
342 if (p == current) {
343 chosen = p;
344 chosen_points = ULONG_MAX;
345 } else if (!force_kill) {
346 /*
347 * If this task is not being ptraced on exit,
348 * then wait for it to finish before killing
349 * some other task unnecessarily.
350 */
351 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
352 return ERR_PTR(-1UL);
353 }
354 }
355
356 points = oom_badness(p, memcg, nodemask, totalpages);
357 if (points > chosen_points) { 370 if (points > chosen_points) {
358 chosen = p; 371 chosen = p;
359 chosen_points = points; 372 chosen_points = points;
360 } 373 }
361 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
362 378
363 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
364 return chosen; 380 return chosen;
@@ -366,23 +382,22 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
366 382
367/** 383/**
368 * dump_tasks - dump current memory state of all system tasks 384 * dump_tasks - dump current memory state of all system tasks
369 * @mem: current's memory controller, if constrained 385 * @memcg: current's memory controller, if constrained
370 * @nodemask: nodemask passed to page allocator for mempolicy ooms 386 * @nodemask: nodemask passed to page allocator for mempolicy ooms
371 * 387 *
372 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
373 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
374 * are not shown. 390 * are not shown.
375 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
376 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
377 *
378 * Call with tasklist_lock read-locked.
379 */ 393 */
380static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
381{ 395{
382 struct task_struct *p; 396 struct task_struct *p;
383 struct task_struct *task; 397 struct task_struct *task;
384 398
385 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
386 for_each_process(p) { 401 for_each_process(p) {
387 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
388 continue; 403 continue;
@@ -397,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
397 continue; 412 continue;
398 } 413 }
399 414
400 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
401 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
402 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
403 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
404 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
405 task_unlock(task); 421 task_unlock(task);
406 } 422 }
423 rcu_read_unlock();
407} 424}
408 425
409static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -424,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
424} 441}
425 442
426#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
427static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
428 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
429 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
430 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
431{ 452{
432 struct task_struct *victim = p; 453 struct task_struct *victim = p;
433 struct task_struct *child; 454 struct task_struct *child;
@@ -443,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
443 */ 464 */
444 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
445 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
446 return; 468 return;
447 } 469 }
448 470
@@ -460,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
460 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
461 * still freeing memory. 483 * still freeing memory.
462 */ 484 */
485 read_lock(&tasklist_lock);
463 do { 486 do {
464 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
465 unsigned int child_points; 488 unsigned int child_points;
@@ -472,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
473 totalpages); 496 totalpages);
474 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
475 victim = child; 499 victim = child;
476 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
477 } 502 }
478 } 503 }
479 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
480 506
481 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
482 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
483 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
484 518
485 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
486 mm = victim->mm; 520 mm = victim->mm;
@@ -511,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
511 task_unlock(p); 545 task_unlock(p);
512 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
513 } 547 }
548 rcu_read_unlock();
514 549
515 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
516 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
517} 553}
518#undef K 554#undef K
519 555
520/* 556/*
521 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
522 */ 558 */
523static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
524 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
525{ 561{
526 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
527 return; 563 return;
@@ -534,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
534 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
535 return; 571 return;
536 } 572 }
537 read_lock(&tasklist_lock);
538 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
539 read_unlock(&tasklist_lock);
540 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
541 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
542} 576}
543 577
544#ifdef CONFIG_CGROUP_MEM_RES_CTLR
545void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
546 int order)
547{
548 unsigned long limit;
549 unsigned int points = 0;
550 struct task_struct *p;
551
552 /*
553 * If current has a pending SIGKILL, then automatically select it. The
554 * goal is to allow it to allocate so that it may quickly exit and free
555 * its memory.
556 */
557 if (fatal_signal_pending(current)) {
558 set_thread_flag(TIF_MEMDIE);
559 return;
560 }
561
562 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
563 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
564 read_lock(&tasklist_lock);
565 p = select_bad_process(&points, limit, memcg, NULL, false);
566 if (p && PTR_ERR(p) != -1UL)
567 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
568 "Memory cgroup out of memory");
569 read_unlock(&tasklist_lock);
570}
571#endif
572
573static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
574 579
575int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -691,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
691 struct task_struct *p; 696 struct task_struct *p;
692 unsigned long totalpages; 697 unsigned long totalpages;
693 unsigned long freed = 0; 698 unsigned long freed = 0;
694 unsigned int points; 699 unsigned int uninitialized_var(points);
695 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
696 int killed = 0; 701 int killed = 0;
697 702
@@ -719,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
719 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
720 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
721 726
722 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
723 if (sysctl_oom_kill_allocating_task &&
724 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
725 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
726 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
727 nodemask, 732 nodemask,
728 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
729 goto out; 734 goto out;
730 } 735 }
731 736
732 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
733 force_kill);
734 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
735 if (!p) { 739 if (!p) {
736 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
737 read_unlock(&tasklist_lock);
738 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
739 } 742 }
740 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -743,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
743 killed = 1; 746 killed = 1;
744 } 747 }
745out: 748out:
746 read_unlock(&tasklist_lock);
747
748 /* 749 /*
749 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
750 * retry to allocate memory unless "p" is current 751 * allocate memory again.
751 */ 752 */
752 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
753 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
754} 755}
755 756
756/* 757/*
@@ -765,6 +766,5 @@ void pagefault_out_of_memory(void)
765 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
766 clear_system_oom(); 767 clear_system_oom();
767 } 768 }
768 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
769 schedule_timeout_uninterruptible(1);
770} 770}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..e5363f34e025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
445 * On idle system, we can be called long after we scheduled because we use
446 * deferred timers so count with missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
918 * bdi->dirty_ratelimit = balanced_dirty_ratelimit; 946 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
919 * 947 *
920 * However to get a more stable dirty_ratelimit, the below elaborated 948 * However to get a more stable dirty_ratelimit, the below elaborated
921 * code makes use of task_ratelimit to filter out sigular points and 949 * code makes use of task_ratelimit to filter out singular points and
922 * limit the step size. 950 * limit the step size.
923 * 951 *
924 * The below code essentially only uses the relative value of 952 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
941 * feel and care are stable dirty rate and small position error. 969 * feel and care are stable dirty rate and small position error.
942 * 970 *
943 * |task_ratelimit - dirty_ratelimit| is used to limit the step size 971 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
944 * and filter out the sigular points of balanced_dirty_ratelimit. Which 972 * and filter out the singular points of balanced_dirty_ratelimit. Which
945 * keeps jumping around randomly and can even leap far away at times 973 * keeps jumping around randomly and can even leap far away at times
946 * due to the small 200ms estimation period of dirty_rate (we want to 974 * due to the small 200ms estimation period of dirty_rate (we want to
947 * keep that period small to reduce time lags). 975 * keep that period small to reduce time lags).
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1606 */ 1634 */
1607void __init page_writeback_init(void) 1635void __init page_writeback_init(void)
1608{ 1636{
1609 int shift;
1610
1611 writeback_set_ratelimit(); 1637 writeback_set_ratelimit();
1612 register_cpu_notifier(&ratelimit_nb); 1638 register_cpu_notifier(&ratelimit_nb);
1613 1639
1614 shift = calc_period_shift(); 1640 fprop_global_init(&writeout_completions);
1615 prop_descriptor_init(&vm_completions, shift);
1616} 1641}
1617 1642
1618/** 1643/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages my go negative - that's OK */ 1601 /* free_pages my go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2116
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2119 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2120 preferred_zone, migratetype);
2092 if (page) { 2121 if (page) {
2093 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2209retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2212 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2213 preferred_zone, migratetype);
2185 2214
2186 /* 2215 /*
2187 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2266 2295
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2297 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2305 }
2273 2306
2274 return alloc_flags; 2307 return alloc_flags;
2275} 2308}
2276 2309
2310bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2311{
2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2313}
2314
2277static inline struct page * 2315static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
2340 2378
2341 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2381 /*
2382 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2383 * the allocation is high priority and these type of
2384 * allocations are system rather than user orientated
2385 */
2386 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2387
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2388 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2389 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2390 preferred_zone, migratetype);
2346 if (page) 2391 if (page) {
2392 /*
2393 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2394 * necessary to allocate the page. The expectation is
2395 * that the caller is taking steps that will free more
2396 * memory. The caller should avoid the page being used
2397 * for !PFMEMALLOC purposes.
2398 */
2399 page->pfmemalloc = true;
2347 goto got_pg; 2400 goto got_pg;
2401 }
2348 } 2402 }
2349 2403
2350 /* Atomic allocations - we can't balance anything */ 2404 /* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
2463got_pg: 2517got_pg:
2464 if (kmemcheck_enabled) 2518 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2519 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2520
2521 return page;
2468} 2522}
2469 2523
2470/* 2524/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
2515 page = __alloc_pages_slowpath(gfp_mask, order, 2569 page = __alloc_pages_slowpath(gfp_mask, order,
2516 zonelist, high_zoneidx, nodemask, 2570 zonelist, high_zoneidx, nodemask,
2517 preferred_zone, migratetype); 2571 preferred_zone, migratetype);
2572 else
2573 page->pfmemalloc = false;
2518 2574
2519 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2575 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2520 2576
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3086 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3087 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3088 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3089 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3090 mutex_unlock(&zonelists_mutex);
3035 } 3091 }
3036 } 3092 }
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3465DEFINE_MUTEX(zonelists_mutex);
3410 3466
3411/* return values int ....just for stop_machine() */ 3467/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3468static int __build_all_zonelists(void *data)
3413{ 3469{
3414 int nid; 3470 int nid;
3415 int cpu; 3471 int cpu;
3472 pg_data_t *self = data;
3416 3473
3417#ifdef CONFIG_NUMA 3474#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3475 memset(node_load, 0, sizeof(node_load));
3419#endif 3476#endif
3477
3478 if (self && !node_online(self->node_id)) {
3479 build_zonelists(self);
3480 build_zonelist_cache(self);
3481 }
3482
3420 for_each_online_node(nid) { 3483 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3484 pg_data_t *pgdat = NODE_DATA(nid);
3422 3485
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3524 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3525 * unless system_state == SYSTEM_BOOTING.
3463 */ 3526 */
3464void __ref build_all_zonelists(void *data) 3527void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3528{
3466 set_zonelist_order(); 3529 set_zonelist_order();
3467 3530
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3536 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3537 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3538#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3539 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3540 setup_zone_pageset(zone);
3478#endif 3541#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3542 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3543 /* cpuset refresh routine should be here */
3481 } 3544 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3545 vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3809 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3810#endif
3748 3811
3749static int zone_batchsize(struct zone *zone) 3812static int __meminit zone_batchsize(struct zone *zone)
3750{ 3813{
3751#ifdef CONFIG_MMU 3814#ifdef CONFIG_MMU
3752 int batch; 3815 int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3891 pcp->batch = PAGE_SHIFT * 8;
3829} 3892}
3830 3893
3831static void setup_zone_pageset(struct zone *zone) 3894static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3895{
3833 int cpu; 3896 int cpu;
3834 3897
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3964 return 0;
3902} 3965}
3903 3966
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3967static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3968{
3932 /* 3969 /*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3979 zone_batchsize(zone));
3943} 3980}
3944 3981
3945__meminit int init_currently_empty_zone(struct zone *zone, 3982int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3983 unsigned long zone_start_pfn,
3947 unsigned long size, 3984 unsigned long size,
3948 enum memmap_context context) 3985 enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4338#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4339
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4340/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4341void __init set_pageblock_order(void)
4305{ 4342{
4306 unsigned int order; 4343 unsigned int order;
4307 4344
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4366 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4367 * the kernel config
4331 */ 4368 */
4332static inline void set_pageblock_order(void) 4369void __init set_pageblock_order(void)
4333{ 4370{
4334} 4371}
4335 4372
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4377 * - mark all pages reserved
4341 * - mark all memory queues empty 4378 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4379 * - clear the memory bitmaps
4380 *
4381 * NOTE: pgdat should get zeroed by caller.
4343 */ 4382 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4383static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4384 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4389 int ret;
4351 4390
4352 pgdat_resize_init(pgdat); 4391 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4392 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4393 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4394 pgdat_page_cgroup_init(pgdat);
4357 4395
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4396 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4432
4395 zone->spanned_pages = size; 4433 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4434 zone->present_pages = realsize;
4435#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4436 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4437 zone->spanned_pages;
4438 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4439#endif
4397#ifdef CONFIG_NUMA 4440#ifdef CONFIG_NUMA
4398 zone->node = nid; 4441 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4442 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4451
4409 zone_pcp_init(zone); 4452 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4453 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4454 if (!size)
4414 continue; 4455 continue;
4415 4456
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4510{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4511 pg_data_t *pgdat = NODE_DATA(nid);
4471 4512
4513 /* pg_data_t should be reset to zero when it's allocated */
4514 WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
4515
4472 pgdat->node_id = nid; 4516 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4517 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4518 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
4750} 4794}
4751 4795
4752/* Any regular memory on that node ? */ 4796/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4797static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4798{
4755#ifdef CONFIG_HIGHMEM 4799#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4800 enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5512}
5469 5513
5470/* 5514/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5515 * This function checks whether pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE. 5516 * If @count is not zero, it is okay to include less @count unmovable pages
5473 * page allocater never alloc memory from ISOLATE block. 5517 *
5518 * PageLRU check wihtout isolation or lru_lock could race so that
5519 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5520 * expect this function should be exact.
5474 */ 5521 */
5475 5522bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5523{
5479 unsigned long pfn, iter, found; 5524 unsigned long pfn, iter, found;
5480 int mt; 5525 int mt;
5481 5526
5482 /* 5527 /*
5483 * For avoiding noise data, lru_add_drain_all() should be called 5528 * For avoiding noise data, lru_add_drain_all() should be called
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5529 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5530 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5531 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5532 return false;
5488 mt = get_pageblock_migratetype(page); 5533 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5534 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5535 return false;
5491 5536
5492 pfn = page_to_pfn(page); 5537 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5538 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5542 continue;
5498 5543
5499 page = pfn_to_page(check); 5544 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5545 /*
5546 * We can't use page_count without pin a page
5547 * because another CPU can free compound page.
5548 * This check already skips compound tails of THP
5549 * because their page->_count is zero at all time.
5550 */
5551 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5552 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5553 iter += (1 << page_order(page)) - 1;
5503 continue; 5554 continue;
5504 } 5555 }
5556
5505 if (!PageLRU(page)) 5557 if (!PageLRU(page))
5506 found++; 5558 found++;
5507 /* 5559 /*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5570 * page at boot.
5519 */ 5571 */
5520 if (found > count) 5572 if (found > count)
5521 return false; 5573 return true;
5522 } 5574 }
5523 return true; 5575 return false;
5524} 5576}
5525 5577
5526bool is_pageblock_removable_nolock(struct page *page) 5578bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5596 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5597 return false;
5546 5598
5547 return __count_immobile_pages(zone, page, 0); 5599 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5600}
5619 5601
5620#ifdef CONFIG_CMA 5602#ifdef CONFIG_CMA
@@ -5635,7 +5617,12 @@ static struct page *
5635__alloc_contig_migrate_alloc(struct page *page, unsigned long private, 5617__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5636 int **resultp) 5618 int **resultp)
5637{ 5619{
5638 return alloc_page(GFP_HIGHUSER_MOVABLE); 5620 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5621
5622 if (PageHighMem(page))
5623 gfp_mask |= __GFP_HIGHMEM;
5624
5625 return alloc_page(gfp_mask);
5639} 5626}
5640 5627
5641/* [start, end) must belong to a single zone. */ 5628/* [start, end) must belong to a single zone. */
@@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5864} 5851}
5865#endif 5852#endif
5866 5853
5854#ifdef CONFIG_MEMORY_HOTPLUG
5855static int __meminit __zone_pcp_update(void *data)
5856{
5857 struct zone *zone = data;
5858 int cpu;
5859 unsigned long batch = zone_batchsize(zone), flags;
5860
5861 for_each_possible_cpu(cpu) {
5862 struct per_cpu_pageset *pset;
5863 struct per_cpu_pages *pcp;
5864
5865 pset = per_cpu_ptr(zone->pageset, cpu);
5866 pcp = &pset->pcp;
5867
5868 local_irq_save(flags);
5869 if (pcp->count > 0)
5870 free_pcppages_bulk(zone, pcp->count, pcp);
5871 setup_pageset(pset, batch);
5872 local_irq_restore(flags);
5873 }
5874 return 0;
5875}
5876
5877void __meminit zone_pcp_update(struct zone *zone)
5878{
5879 stop_machine(__zone_pcp_update, zone, NULL);
5880}
5881#endif
5882
5867#ifdef CONFIG_MEMORY_HOTREMOVE 5883#ifdef CONFIG_MEMORY_HOTREMOVE
5884void zone_pcp_reset(struct zone *zone)
5885{
5886 unsigned long flags;
5887
5888 /* avoid races with drain_pages() */
5889 local_irq_save(flags);
5890 if (zone->pageset != &boot_pageset) {
5891 free_percpu(zone->pageset);
5892 zone->pageset = &boot_pageset;
5893 }
5894 local_irq_restore(flags);
5895}
5896
5868/* 5897/*
5869 * All pages in the range must be isolated before calling this. 5898 * All pages in the range must be isolated before calling this.
5870 */ 5899 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
392 392
393/** 393/**
394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
395 * @end: swap entry to be cmpxchged 395 * @ent: swap entry to be cmpxchged
396 * @old: old id 396 * @old: old id
397 * @new: new id 397 * @new: new id
398 * 398 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
422/** 422/**
423 * swap_cgroup_record - record mem_cgroup for this swp_entry. 423 * swap_cgroup_record - record mem_cgroup for this swp_entry.
424 * @ent: swap entry to be recorded into 424 * @ent: swap entry to be recorded into
425 * @mem: mem_cgroup to be recorded 425 * @id: mem_cgroup to be recorded
426 * 426 *
427 * Returns old value at success, 0 at failure. 427 * Returns old value at success, 0 at failure.
428 * (Of course, old value can be 0.) 428 * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,7 +17,9 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h>
21#include <asm/pgtable.h> 23#include <asm/pgtable.h>
22 24
23static struct bio *get_swap_bio(gfp_t gfp_flags, 25static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
85 bio_put(bio); 87 bio_put(bio);
86} 88}
87 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
88/* 182/*
89 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
90 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
93{ 187{
94 struct bio *bio; 188 struct bio *bio;
95 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
96 191
97 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
98 unlock_page(page); 193 unlock_page(page);
99 goto out; 194 goto out;
100 } 195 }
196 if (frontswap_store(page) == 0) {
197 set_page_writeback(page);
198 unlock_page(page);
199 end_page_writeback(page);
200 goto out;
201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 230 if (bio == NULL) {
103 set_page_dirty(page); 231 set_page_dirty(page);
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page)
119{ 247{
120 struct bio *bio; 248 struct bio *bio;
121 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
122 251
123 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
254 if (frontswap_load(page) == 0) {
255 SetPageUptodate(page);
256 unlock_page(page);
257 goto out;
258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 271 if (bio == NULL) {
127 unlock_page(page); 272 unlock_page(page);
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page)
133out: 278out:
134 return ret; 279 return ret;
135} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b8..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
73 * immobile means "not-on-lru" paes. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
162 162
163/** 163/**
164 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
165 * @mm: memory map to walk
166 * @addr: starting address 165 * @addr: starting address
167 * @end: ending address 166 * @end: ending address
168 * @walk: set of callbacks to invoke for each level of the tree 167 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
360 * @chunk: chunk to depopulate 360 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 361 * @off: offset to the area to depopulate
362 * @size: size of the area to depopulate in bytes 362 * @size: size of the area to depopulate in bytes
363 * @flush: whether to flush cache and tlb or not
364 * 363 *
365 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 364 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
366 * from @chunk. If @flush is true, vcache is flushed before unmapping 365 * from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
264} 264}
265 265
266/* 266/*
267 * Sometimes, before we decide whether to proceed or to fail, we must check
268 * that an entry was not already brought back from swap by a racing thread.
269 *
270 * Checking page is not enough: by the time a SwapCache page is locked, it
271 * might be reused, and again be SwapCache, using the same swap as before.
272 */
273static bool shmem_confirm_swap(struct address_space *mapping,
274 pgoff_t index, swp_entry_t swap)
275{
276 void *item;
277
278 rcu_read_lock();
279 item = radix_tree_lookup(&mapping->page_tree, index);
280 rcu_read_unlock();
281 return item == swp_to_radix_entry(swap);
282}
283
284/*
267 * Like add_to_page_cache_locked, but error if expected item has gone. 285 * Like add_to_page_cache_locked, but error if expected item has gone.
268 */ 286 */
269static int shmem_add_to_page_cache(struct page *page, 287static int shmem_add_to_page_cache(struct page *page,
270 struct address_space *mapping, 288 struct address_space *mapping,
271 pgoff_t index, gfp_t gfp, void *expected) 289 pgoff_t index, gfp_t gfp, void *expected)
272{ 290{
273 int error = 0; 291 int error;
274 292
275 VM_BUG_ON(!PageLocked(page)); 293 VM_BUG_ON(!PageLocked(page));
276 VM_BUG_ON(!PageSwapBacked(page)); 294 VM_BUG_ON(!PageSwapBacked(page));
277 295
296 page_cache_get(page);
297 page->mapping = mapping;
298 page->index = index;
299
300 spin_lock_irq(&mapping->tree_lock);
278 if (!expected) 301 if (!expected)
279 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 302 error = radix_tree_insert(&mapping->page_tree, index, page);
303 else
304 error = shmem_radix_tree_replace(mapping, index, expected,
305 page);
280 if (!error) { 306 if (!error) {
281 page_cache_get(page); 307 mapping->nrpages++;
282 page->mapping = mapping; 308 __inc_zone_page_state(page, NR_FILE_PAGES);
283 page->index = index; 309 __inc_zone_page_state(page, NR_SHMEM);
284 310 spin_unlock_irq(&mapping->tree_lock);
285 spin_lock_irq(&mapping->tree_lock); 311 } else {
286 if (!expected) 312 page->mapping = NULL;
287 error = radix_tree_insert(&mapping->page_tree, 313 spin_unlock_irq(&mapping->tree_lock);
288 index, page); 314 page_cache_release(page);
289 else
290 error = shmem_radix_tree_replace(mapping, index,
291 expected, page);
292 if (!error) {
293 mapping->nrpages++;
294 __inc_zone_page_state(page, NR_FILE_PAGES);
295 __inc_zone_page_state(page, NR_SHMEM);
296 spin_unlock_irq(&mapping->tree_lock);
297 } else {
298 page->mapping = NULL;
299 spin_unlock_irq(&mapping->tree_lock);
300 page_cache_release(page);
301 }
302 if (!expected)
303 radix_tree_preload_end();
304 } 315 }
305 if (error)
306 mem_cgroup_uncharge_cache_page(page);
307 return error; 316 return error;
308} 317}
309 318
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
683 mutex_lock(&shmem_swaplist_mutex); 692 mutex_lock(&shmem_swaplist_mutex);
684 /* 693 /*
685 * We needed to drop mutex to make that restrictive page 694 * We needed to drop mutex to make that restrictive page
686 * allocation; but the inode might already be freed by now, 695 * allocation, but the inode might have been freed while we
687 * and we cannot refer to inode or mapping or info to check. 696 * dropped it: although a racing shmem_evict_inode() cannot
688 * However, we do hold page lock on the PageSwapCache page, 697 * complete without emptying the radix_tree, our page lock
689 * so can check if that still has our reference remaining. 698 * on this swapcache page is not enough to prevent that -
699 * free_swap_and_cache() of our swap entry will only
700 * trylock_page(), removing swap from radix_tree whatever.
701 *
702 * We must not proceed to shmem_add_to_page_cache() if the
703 * inode has been freed, but of course we cannot rely on
704 * inode or mapping or info to check that. However, we can
705 * safely check if our swap entry is still in use (and here
706 * it can't have got reused for another page): if it's still
707 * in use, then the inode cannot have been freed yet, and we
708 * can safely proceed (if it's no longer in use, that tells
709 * nothing about the inode, but we don't need to unuse swap).
690 */ 710 */
691 if (!page_swapcount(*pagep)) 711 if (!page_swapcount(*pagep))
692 error = -ENOENT; 712 error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
730 750
731 /* 751 /*
732 * There's a faint possibility that swap page was replaced before 752 * There's a faint possibility that swap page was replaced before
733 * caller locked it: it will come back later with the right page. 753 * caller locked it: caller will come back later with the right page.
734 */ 754 */
735 if (unlikely(!PageSwapCache(page))) 755 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
736 goto out; 756 goto out;
737 757
738 /* 758 /*
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
909 929
910 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
911 pvma.vm_start = 0; 931 pvma.vm_start = 0;
912 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
913 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
914 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
915 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
922 943
923 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
924 pvma.vm_start = 0; 945 pvma.vm_start = 0;
925 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
926 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
927 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
928 950
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
995 newpage = shmem_alloc_page(gfp, info, index); 1017 newpage = shmem_alloc_page(gfp, info, index);
996 if (!newpage) 1018 if (!newpage)
997 return -ENOMEM; 1019 return -ENOMEM;
998 VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
999 1020
1000 *pagep = newpage;
1001 page_cache_get(newpage); 1021 page_cache_get(newpage);
1002 copy_highpage(newpage, oldpage); 1022 copy_highpage(newpage, oldpage);
1023 flush_dcache_page(newpage);
1003 1024
1004 VM_BUG_ON(!PageLocked(oldpage));
1005 __set_page_locked(newpage); 1025 __set_page_locked(newpage);
1006 VM_BUG_ON(!PageUptodate(oldpage));
1007 SetPageUptodate(newpage); 1026 SetPageUptodate(newpage);
1008 VM_BUG_ON(!PageSwapBacked(oldpage));
1009 SetPageSwapBacked(newpage); 1027 SetPageSwapBacked(newpage);
1010 VM_BUG_ON(!swap_index);
1011 set_page_private(newpage, swap_index); 1028 set_page_private(newpage, swap_index);
1012 VM_BUG_ON(!PageSwapCache(oldpage));
1013 SetPageSwapCache(newpage); 1029 SetPageSwapCache(newpage);
1014 1030
1015 /* 1031 /*
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1019 spin_lock_irq(&swap_mapping->tree_lock); 1035 spin_lock_irq(&swap_mapping->tree_lock);
1020 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1036 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1021 newpage); 1037 newpage);
1022 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1038 if (!error) {
1023 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1039 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1040 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1041 }
1024 spin_unlock_irq(&swap_mapping->tree_lock); 1042 spin_unlock_irq(&swap_mapping->tree_lock);
1025 BUG_ON(error);
1026 1043
1027 mem_cgroup_replace_page_cache(oldpage, newpage); 1044 if (unlikely(error)) {
1028 lru_cache_add_anon(newpage); 1045 /*
1046 * Is this possible? I think not, now that our callers check
1047 * both PageSwapCache and page_private after getting page lock;
1048 * but be defensive. Reverse old to newpage for clear and free.
1049 */
1050 oldpage = newpage;
1051 } else {
1052 mem_cgroup_replace_page_cache(oldpage, newpage);
1053 lru_cache_add_anon(newpage);
1054 *pagep = newpage;
1055 }
1029 1056
1030 ClearPageSwapCache(oldpage); 1057 ClearPageSwapCache(oldpage);
1031 set_page_private(oldpage, 0); 1058 set_page_private(oldpage, 0);
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1033 unlock_page(oldpage); 1060 unlock_page(oldpage);
1034 page_cache_release(oldpage); 1061 page_cache_release(oldpage);
1035 page_cache_release(oldpage); 1062 page_cache_release(oldpage);
1036 return 0; 1063 return error;
1037} 1064}
1038 1065
1039/* 1066/*
@@ -1107,9 +1134,10 @@ repeat:
1107 1134
1108 /* We have to do this with page locked to prevent races */ 1135 /* We have to do this with page locked to prevent races */
1109 lock_page(page); 1136 lock_page(page);
1110 if (!PageSwapCache(page) || page->mapping) { 1137 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1138 !shmem_confirm_swap(mapping, index, swap)) {
1111 error = -EEXIST; /* try again */ 1139 error = -EEXIST; /* try again */
1112 goto failed; 1140 goto unlock;
1113 } 1141 }
1114 if (!PageUptodate(page)) { 1142 if (!PageUptodate(page)) {
1115 error = -EIO; 1143 error = -EIO;
@@ -1125,9 +1153,12 @@ repeat:
1125 1153
1126 error = mem_cgroup_cache_charge(page, current->mm, 1154 error = mem_cgroup_cache_charge(page, current->mm,
1127 gfp & GFP_RECLAIM_MASK); 1155 gfp & GFP_RECLAIM_MASK);
1128 if (!error) 1156 if (!error) {
1129 error = shmem_add_to_page_cache(page, mapping, index, 1157 error = shmem_add_to_page_cache(page, mapping, index,
1130 gfp, swp_to_radix_entry(swap)); 1158 gfp, swp_to_radix_entry(swap));
1159 /* We already confirmed swap, and make no allocation */
1160 VM_BUG_ON(error);
1161 }
1131 if (error) 1162 if (error)
1132 goto failed; 1163 goto failed;
1133 1164
@@ -1164,11 +1195,18 @@ repeat:
1164 __set_page_locked(page); 1195 __set_page_locked(page);
1165 error = mem_cgroup_cache_charge(page, current->mm, 1196 error = mem_cgroup_cache_charge(page, current->mm,
1166 gfp & GFP_RECLAIM_MASK); 1197 gfp & GFP_RECLAIM_MASK);
1167 if (!error)
1168 error = shmem_add_to_page_cache(page, mapping, index,
1169 gfp, NULL);
1170 if (error) 1198 if (error)
1171 goto decused; 1199 goto decused;
1200 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1201 if (!error) {
1202 error = shmem_add_to_page_cache(page, mapping, index,
1203 gfp, NULL);
1204 radix_tree_preload_end();
1205 }
1206 if (error) {
1207 mem_cgroup_uncharge_cache_page(page);
1208 goto decused;
1209 }
1172 lru_cache_add_anon(page); 1210 lru_cache_add_anon(page);
1173 1211
1174 spin_lock(&info->lock); 1212 spin_lock(&info->lock);
@@ -1228,14 +1266,10 @@ decused:
1228unacct: 1266unacct:
1229 shmem_unacct_blocks(info->flags, 1); 1267 shmem_unacct_blocks(info->flags, 1);
1230failed: 1268failed:
1231 if (swap.val && error != -EINVAL) { 1269 if (swap.val && error != -EINVAL &&
1232 struct page *test = find_get_page(mapping, index); 1270 !shmem_confirm_swap(mapping, index, swap))
1233 if (test && !radix_tree_exceptional_entry(test)) 1271 error = -EEXIST;
1234 page_cache_release(test); 1272unlock:
1235 /* Have another try if the entry has changed */
1236 if (test != swp_to_radix_entry(swap))
1237 error = -EEXIST;
1238 }
1239 if (page) { 1273 if (page) {
1240 unlock_page(page); 1274 unlock_page(page);
1241 page_cache_release(page); 1275 page_cache_release(page);
@@ -1247,7 +1281,7 @@ failed:
1247 spin_unlock(&info->lock); 1281 spin_unlock(&info->lock);
1248 goto repeat; 1282 goto repeat;
1249 } 1283 }
1250 if (error == -EEXIST) 1284 if (error == -EEXIST) /* from above or from radix_tree_insert */
1251 goto repeat; 1285 goto repeat;
1252 return error; 1286 return error;
1253} 1287}
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1675 return error; 1709 return error;
1676} 1710}
1677 1711
1678/*
1679 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1680 */
1681static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1682 pgoff_t index, pgoff_t end, int origin)
1683{
1684 struct page *page;
1685 struct pagevec pvec;
1686 pgoff_t indices[PAGEVEC_SIZE];
1687 bool done = false;
1688 int i;
1689
1690 pagevec_init(&pvec, 0);
1691 pvec.nr = 1; /* start small: we may be there already */
1692 while (!done) {
1693 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1694 pvec.nr, pvec.pages, indices);
1695 if (!pvec.nr) {
1696 if (origin == SEEK_DATA)
1697 index = end;
1698 break;
1699 }
1700 for (i = 0; i < pvec.nr; i++, index++) {
1701 if (index < indices[i]) {
1702 if (origin == SEEK_HOLE) {
1703 done = true;
1704 break;
1705 }
1706 index = indices[i];
1707 }
1708 page = pvec.pages[i];
1709 if (page && !radix_tree_exceptional_entry(page)) {
1710 if (!PageUptodate(page))
1711 page = NULL;
1712 }
1713 if (index >= end ||
1714 (page && origin == SEEK_DATA) ||
1715 (!page && origin == SEEK_HOLE)) {
1716 done = true;
1717 break;
1718 }
1719 }
1720 shmem_deswap_pagevec(&pvec);
1721 pagevec_release(&pvec);
1722 pvec.nr = PAGEVEC_SIZE;
1723 cond_resched();
1724 }
1725 return index;
1726}
1727
1728static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
1729{
1730 struct address_space *mapping;
1731 struct inode *inode;
1732 pgoff_t start, end;
1733 loff_t new_offset;
1734
1735 if (origin != SEEK_DATA && origin != SEEK_HOLE)
1736 return generic_file_llseek_size(file, offset, origin,
1737 MAX_LFS_FILESIZE);
1738 mapping = file->f_mapping;
1739 inode = mapping->host;
1740 mutex_lock(&inode->i_mutex);
1741 /* We're holding i_mutex so we can access i_size directly */
1742
1743 if (offset < 0)
1744 offset = -EINVAL;
1745 else if (offset >= inode->i_size)
1746 offset = -ENXIO;
1747 else {
1748 start = offset >> PAGE_CACHE_SHIFT;
1749 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1750 new_offset = shmem_seek_hole_data(mapping, start, end, origin);
1751 new_offset <<= PAGE_CACHE_SHIFT;
1752 if (new_offset > offset) {
1753 if (new_offset < inode->i_size)
1754 offset = new_offset;
1755 else if (origin == SEEK_DATA)
1756 offset = -ENXIO;
1757 else
1758 offset = inode->i_size;
1759 }
1760 }
1761
1762 if (offset >= 0 && offset != file->f_pos) {
1763 file->f_pos = offset;
1764 file->f_version = 0;
1765 }
1766 mutex_unlock(&inode->i_mutex);
1767 return offset;
1768}
1769
1770static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1712static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1771 loff_t len) 1713 loff_t len)
1772{ 1714{
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1937} 1879}
1938 1880
1939static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1881static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1940 struct nameidata *nd) 1882 bool excl)
1941{ 1883{
1942 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1884 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1943} 1885}
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = {
2770static const struct file_operations shmem_file_operations = { 2712static const struct file_operations shmem_file_operations = {
2771 .mmap = shmem_mmap, 2713 .mmap = shmem_mmap,
2772#ifdef CONFIG_TMPFS 2714#ifdef CONFIG_TMPFS
2773 .llseek = shmem_file_llseek, 2715 .llseek = generic_file_llseek,
2774 .read = do_sync_read, 2716 .read = do_sync_read,
2775 .write = do_sync_write, 2717 .write = do_sync_write,
2776 .aio_read = shmem_file_aio_read, 2718 .aio_read = shmem_file_aio_read,
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e2520..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
90#include <linux/mm.h> 91#include <linux/mm.h>
91#include <linux/poison.h> 92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
@@ -117,12 +118,16 @@
117#include <linux/memory.h> 118#include <linux/memory.h>
118#include <linux/prefetch.h> 119#include <linux/prefetch.h>
119 120
121#include <net/sock.h>
122
120#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
121#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
122#include <asm/page.h> 125#include <asm/page.h>
123 126
124#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
125 128
129#include "internal.h"
130
126/* 131/*
127 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
128 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
151#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
152#endif 157#endif
153 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
154/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
155#if DEBUG 166#if DEBUG
156# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
256 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
257 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
258 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
273 * have the lower bits set SLAB_OBJ_PFMEMALLOC
259 */ 274 */
260}; 275};
261 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
262/* 294/*
263 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
264 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
424 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 456 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
425 * redzone word. 457 * redzone word.
426 * cachep->obj_offset: The real object. 458 * cachep->obj_offset: The real object.
427 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 459 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
428 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 460 * cachep->size - 1* BYTES_PER_WORD: last caller address
429 * [BYTES_PER_WORD long] 461 * [BYTES_PER_WORD long]
430 */ 462 */
431static int obj_offset(struct kmem_cache *cachep) 463static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
433 return cachep->obj_offset; 465 return cachep->obj_offset;
434} 466}
435 467
436static int obj_size(struct kmem_cache *cachep)
437{
438 return cachep->obj_size;
439}
440
441static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 468static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
442{ 469{
443 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 470 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
449{ 476{
450 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 477 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
451 if (cachep->flags & SLAB_STORE_USER) 478 if (cachep->flags & SLAB_STORE_USER)
452 return (unsigned long long *)(objp + cachep->buffer_size - 479 return (unsigned long long *)(objp + cachep->size -
453 sizeof(unsigned long long) - 480 sizeof(unsigned long long) -
454 REDZONE_ALIGN); 481 REDZONE_ALIGN);
455 return (unsigned long long *) (objp + cachep->buffer_size - 482 return (unsigned long long *) (objp + cachep->size -
456 sizeof(unsigned long long)); 483 sizeof(unsigned long long));
457} 484}
458 485
459static void **dbg_userword(struct kmem_cache *cachep, void *objp) 486static void **dbg_userword(struct kmem_cache *cachep, void *objp)
460{ 487{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 488 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 489 return (void **)(objp + cachep->size - BYTES_PER_WORD);
463} 490}
464 491
465#else 492#else
466 493
467#define obj_offset(x) 0 494#define obj_offset(x) 0
468#define obj_size(cachep) (cachep->buffer_size)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 495#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 496#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 497#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
475#ifdef CONFIG_TRACING 501#ifdef CONFIG_TRACING
476size_t slab_buffer_size(struct kmem_cache *cachep) 502size_t slab_buffer_size(struct kmem_cache *cachep)
477{ 503{
478 return cachep->buffer_size; 504 return cachep->size;
479} 505}
480EXPORT_SYMBOL(slab_buffer_size); 506EXPORT_SYMBOL(slab_buffer_size);
481#endif 507#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
489static int slab_max_order = SLAB_MAX_ORDER_LO; 515static int slab_max_order = SLAB_MAX_ORDER_LO;
490static bool slab_max_order_set __initdata; 516static bool slab_max_order_set __initdata;
491 517
492/*
493 * Functions for storing/retrieving the cachep and or slab from the page
494 * allocator. These are used to find the slab an obj belongs to. With kfree(),
495 * these are used to find the cache which an obj belongs to.
496 */
497static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
498{
499 page->lru.next = (struct list_head *)cache;
500}
501
502static inline struct kmem_cache *page_get_cache(struct page *page) 518static inline struct kmem_cache *page_get_cache(struct page *page)
503{ 519{
504 page = compound_head(page); 520 page = compound_head(page);
505 BUG_ON(!PageSlab(page)); 521 BUG_ON(!PageSlab(page));
506 return (struct kmem_cache *)page->lru.next; 522 return page->slab_cache;
507}
508
509static inline void page_set_slab(struct page *page, struct slab *slab)
510{
511 page->lru.prev = (struct list_head *)slab;
512}
513
514static inline struct slab *page_get_slab(struct page *page)
515{
516 BUG_ON(!PageSlab(page));
517 return (struct slab *)page->lru.prev;
518} 523}
519 524
520static inline struct kmem_cache *virt_to_cache(const void *obj) 525static inline struct kmem_cache *virt_to_cache(const void *obj)
521{ 526{
522 struct page *page = virt_to_head_page(obj); 527 struct page *page = virt_to_head_page(obj);
523 return page_get_cache(page); 528 return page->slab_cache;
524} 529}
525 530
526static inline struct slab *virt_to_slab(const void *obj) 531static inline struct slab *virt_to_slab(const void *obj)
527{ 532{
528 struct page *page = virt_to_head_page(obj); 533 struct page *page = virt_to_head_page(obj);
529 return page_get_slab(page); 534
535 VM_BUG_ON(!PageSlab(page));
536 return page->slab_page;
530} 537}
531 538
532static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 539static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
533 unsigned int idx) 540 unsigned int idx)
534{ 541{
535 return slab->s_mem + cache->buffer_size * idx; 542 return slab->s_mem + cache->size * idx;
536} 543}
537 544
538/* 545/*
539 * We want to avoid an expensive divide : (offset / cache->buffer_size) 546 * We want to avoid an expensive divide : (offset / cache->size)
540 * Using the fact that buffer_size is a constant for a particular cache, 547 * Using the fact that size is a constant for a particular cache,
541 * we can replace (offset / cache->buffer_size) by 548 * we can replace (offset / cache->size) by
542 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 549 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
543 */ 550 */
544static inline unsigned int obj_to_index(const struct kmem_cache *cache, 551static inline unsigned int obj_to_index(const struct kmem_cache *cache,
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
584 .batchcount = 1, 591 .batchcount = 1,
585 .limit = BOOT_CPUCACHE_ENTRIES, 592 .limit = BOOT_CPUCACHE_ENTRIES,
586 .shared = 1, 593 .shared = 1,
587 .buffer_size = sizeof(struct kmem_cache), 594 .size = sizeof(struct kmem_cache),
588 .name = "kmem_cache", 595 .name = "kmem_cache",
589}; 596};
590 597
591#define BAD_ALIEN_MAGIC 0x01020304ul 598#define BAD_ALIEN_MAGIC 0x01020304ul
592 599
593/*
594 * chicken and egg problem: delay the per-cpu array allocation
595 * until the general caches are up.
596 */
597static enum {
598 NONE,
599 PARTIAL_AC,
600 PARTIAL_L3,
601 EARLY,
602 LATE,
603 FULL
604} g_cpucache_up;
605
606/*
607 * used by boot code to determine if it can use slab based allocator
608 */
609int slab_is_available(void)
610{
611 return g_cpucache_up >= EARLY;
612}
613
614#ifdef CONFIG_LOCKDEP 600#ifdef CONFIG_LOCKDEP
615 601
616/* 602/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
676{ 662{
677 struct cache_sizes *s = malloc_sizes; 663 struct cache_sizes *s = malloc_sizes;
678 664
679 if (g_cpucache_up < LATE) 665 if (slab_state < UP)
680 return; 666 return;
681 667
682 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 668 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
716} 702}
717#endif 703#endif
718 704
719/*
720 * Guard access to the cache-chain.
721 */
722static DEFINE_MUTEX(cache_chain_mutex);
723static struct list_head cache_chain;
724
725static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 705static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
726 706
727static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 707static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
951 return nc; 931 return nc;
952} 932}
953 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
941/* Clears pfmemalloc_active if no slabs have pfmalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
954/* 1052/*
955 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
956 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1127 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1128 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1129 } 1227 }
1130 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1131 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1132 } else { 1230 } else {
1133 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1145 * When hotplugging memory or a cpu, existing nodelists are not replaced if 1243 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1146 * already in use. 1244 * already in use.
1147 * 1245 *
1148 * Must hold cache_chain_mutex. 1246 * Must hold slab_mutex.
1149 */ 1247 */
1150static int init_cache_nodelists_node(int node) 1248static int init_cache_nodelists_node(int node)
1151{ 1249{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
1153 struct kmem_list3 *l3; 1251 struct kmem_list3 *l3;
1154 const int memsize = sizeof(struct kmem_list3); 1252 const int memsize = sizeof(struct kmem_list3);
1155 1253
1156 list_for_each_entry(cachep, &cache_chain, next) { 1254 list_for_each_entry(cachep, &slab_caches, list) {
1157 /* 1255 /*
1158 * Set up the size64 kmemlist for cpu before we can 1256 * Set up the size64 kmemlist for cpu before we can
1159 * begin anything. Make sure some other cpu on this 1257 * begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
1169 1267
1170 /* 1268 /*
1171 * The l3s don't come and go as CPUs come and 1269 * The l3s don't come and go as CPUs come and
1172 * go. cache_chain_mutex is sufficient 1270 * go. slab_mutex is sufficient
1173 * protection here. 1271 * protection here.
1174 */ 1272 */
1175 cachep->nodelists[node] = l3; 1273 cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1191 int node = cpu_to_mem(cpu); 1289 int node = cpu_to_mem(cpu);
1192 const struct cpumask *mask = cpumask_of_node(node); 1290 const struct cpumask *mask = cpumask_of_node(node);
1193 1291
1194 list_for_each_entry(cachep, &cache_chain, next) { 1292 list_for_each_entry(cachep, &slab_caches, list) {
1195 struct array_cache *nc; 1293 struct array_cache *nc;
1196 struct array_cache *shared; 1294 struct array_cache *shared;
1197 struct array_cache **alien; 1295 struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
1241 * the respective cache's slabs, now we can go ahead and 1339 * the respective cache's slabs, now we can go ahead and
1242 * shrink each nodelist to its limit. 1340 * shrink each nodelist to its limit.
1243 */ 1341 */
1244 list_for_each_entry(cachep, &cache_chain, next) { 1342 list_for_each_entry(cachep, &slab_caches, list) {
1245 l3 = cachep->nodelists[node]; 1343 l3 = cachep->nodelists[node];
1246 if (!l3) 1344 if (!l3)
1247 continue; 1345 continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1270 * Now we can go ahead with allocating the shared arrays and 1368 * Now we can go ahead with allocating the shared arrays and
1271 * array caches 1369 * array caches
1272 */ 1370 */
1273 list_for_each_entry(cachep, &cache_chain, next) { 1371 list_for_each_entry(cachep, &slab_caches, list) {
1274 struct array_cache *nc; 1372 struct array_cache *nc;
1275 struct array_cache *shared = NULL; 1373 struct array_cache *shared = NULL;
1276 struct array_cache **alien = NULL; 1374 struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1338 switch (action) { 1436 switch (action) {
1339 case CPU_UP_PREPARE: 1437 case CPU_UP_PREPARE:
1340 case CPU_UP_PREPARE_FROZEN: 1438 case CPU_UP_PREPARE_FROZEN:
1341 mutex_lock(&cache_chain_mutex); 1439 mutex_lock(&slab_mutex);
1342 err = cpuup_prepare(cpu); 1440 err = cpuup_prepare(cpu);
1343 mutex_unlock(&cache_chain_mutex); 1441 mutex_unlock(&slab_mutex);
1344 break; 1442 break;
1345 case CPU_ONLINE: 1443 case CPU_ONLINE:
1346 case CPU_ONLINE_FROZEN: 1444 case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1350 case CPU_DOWN_PREPARE: 1448 case CPU_DOWN_PREPARE:
1351 case CPU_DOWN_PREPARE_FROZEN: 1449 case CPU_DOWN_PREPARE_FROZEN:
1352 /* 1450 /*
1353 * Shutdown cache reaper. Note that the cache_chain_mutex is 1451 * Shutdown cache reaper. Note that the slab_mutex is
1354 * held so that if cache_reap() is invoked it cannot do 1452 * held so that if cache_reap() is invoked it cannot do
1355 * anything expensive but will only modify reap_work 1453 * anything expensive but will only modify reap_work
1356 * and reschedule the timer. 1454 * and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1377#endif 1475#endif
1378 case CPU_UP_CANCELED: 1476 case CPU_UP_CANCELED:
1379 case CPU_UP_CANCELED_FROZEN: 1477 case CPU_UP_CANCELED_FROZEN:
1380 mutex_lock(&cache_chain_mutex); 1478 mutex_lock(&slab_mutex);
1381 cpuup_canceled(cpu); 1479 cpuup_canceled(cpu);
1382 mutex_unlock(&cache_chain_mutex); 1480 mutex_unlock(&slab_mutex);
1383 break; 1481 break;
1384 } 1482 }
1385 return notifier_from_errno(err); 1483 return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1395 * Returns -EBUSY if all objects cannot be drained so that the node is not 1493 * Returns -EBUSY if all objects cannot be drained so that the node is not
1396 * removed. 1494 * removed.
1397 * 1495 *
1398 * Must hold cache_chain_mutex. 1496 * Must hold slab_mutex.
1399 */ 1497 */
1400static int __meminit drain_cache_nodelists_node(int node) 1498static int __meminit drain_cache_nodelists_node(int node)
1401{ 1499{
1402 struct kmem_cache *cachep; 1500 struct kmem_cache *cachep;
1403 int ret = 0; 1501 int ret = 0;
1404 1502
1405 list_for_each_entry(cachep, &cache_chain, next) { 1503 list_for_each_entry(cachep, &slab_caches, list) {
1406 struct kmem_list3 *l3; 1504 struct kmem_list3 *l3;
1407 1505
1408 l3 = cachep->nodelists[node]; 1506 l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1433 1531
1434 switch (action) { 1532 switch (action) {
1435 case MEM_GOING_ONLINE: 1533 case MEM_GOING_ONLINE:
1436 mutex_lock(&cache_chain_mutex); 1534 mutex_lock(&slab_mutex);
1437 ret = init_cache_nodelists_node(nid); 1535 ret = init_cache_nodelists_node(nid);
1438 mutex_unlock(&cache_chain_mutex); 1536 mutex_unlock(&slab_mutex);
1439 break; 1537 break;
1440 case MEM_GOING_OFFLINE: 1538 case MEM_GOING_OFFLINE:
1441 mutex_lock(&cache_chain_mutex); 1539 mutex_lock(&slab_mutex);
1442 ret = drain_cache_nodelists_node(nid); 1540 ret = drain_cache_nodelists_node(nid);
1443 mutex_unlock(&cache_chain_mutex); 1541 mutex_unlock(&slab_mutex);
1444 break; 1542 break;
1445 case MEM_ONLINE: 1543 case MEM_ONLINE:
1446 case MEM_OFFLINE: 1544 case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
1544 node = numa_mem_id(); 1642 node = numa_mem_id();
1545 1643
1546 /* 1) create the cache_cache */ 1644 /* 1) create the cache_cache */
1547 INIT_LIST_HEAD(&cache_chain); 1645 INIT_LIST_HEAD(&slab_caches);
1548 list_add(&cache_cache.next, &cache_chain); 1646 list_add(&cache_cache.list, &slab_caches);
1549 cache_cache.colour_off = cache_line_size(); 1647 cache_cache.colour_off = cache_line_size();
1550 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1551 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
1553 /* 1651 /*
1554 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1555 */ 1653 */
1556 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1557 nr_node_ids * sizeof(struct kmem_list3 *); 1655 nr_node_ids * sizeof(struct kmem_list3 *);
1558#if DEBUG 1656 cache_cache.object_size = cache_cache.size;
1559 cache_cache.obj_size = cache_cache.buffer_size; 1657 cache_cache.size = ALIGN(cache_cache.size,
1560#endif
1561 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1562 cache_line_size()); 1658 cache_line_size());
1563 cache_cache.reciprocal_buffer_size = 1659 cache_cache.reciprocal_buffer_size =
1564 reciprocal_value(cache_cache.buffer_size); 1660 reciprocal_value(cache_cache.size);
1565 1661
1566 for (order = 0; order < MAX_ORDER; order++) { 1662 for (order = 0; order < MAX_ORDER; order++) {
1567 cache_estimate(order, cache_cache.buffer_size, 1663 cache_estimate(order, cache_cache.size,
1568 cache_line_size(), 0, &left_over, &cache_cache.num); 1664 cache_line_size(), 0, &left_over, &cache_cache.num);
1569 if (cache_cache.num) 1665 if (cache_cache.num)
1570 break; 1666 break;
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
1585 * bug. 1681 * bug.
1586 */ 1682 */
1587 1683
1588 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
1589 sizes[INDEX_AC].cs_size, 1685 sizes[INDEX_AC].cs_size,
1590 ARCH_KMALLOC_MINALIGN, 1686 ARCH_KMALLOC_MINALIGN,
1591 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
1593 1689
1594 if (INDEX_AC != INDEX_L3) { 1690 if (INDEX_AC != INDEX_L3) {
1595 sizes[INDEX_L3].cs_cachep = 1691 sizes[INDEX_L3].cs_cachep =
1596 kmem_cache_create(names[INDEX_L3].name, 1692 __kmem_cache_create(names[INDEX_L3].name,
1597 sizes[INDEX_L3].cs_size, 1693 sizes[INDEX_L3].cs_size,
1598 ARCH_KMALLOC_MINALIGN, 1694 ARCH_KMALLOC_MINALIGN,
1599 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
1611 * allow tighter packing of the smaller caches. 1707 * allow tighter packing of the smaller caches.
1612 */ 1708 */
1613 if (!sizes->cs_cachep) { 1709 if (!sizes->cs_cachep) {
1614 sizes->cs_cachep = kmem_cache_create(names->name, 1710 sizes->cs_cachep = __kmem_cache_create(names->name,
1615 sizes->cs_size, 1711 sizes->cs_size,
1616 ARCH_KMALLOC_MINALIGN, 1712 ARCH_KMALLOC_MINALIGN,
1617 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1618 NULL); 1714 NULL);
1619 } 1715 }
1620#ifdef CONFIG_ZONE_DMA 1716#ifdef CONFIG_ZONE_DMA
1621 sizes->cs_dmacachep = kmem_cache_create( 1717 sizes->cs_dmacachep = __kmem_cache_create(
1622 names->name_dma, 1718 names->name_dma,
1623 sizes->cs_size, 1719 sizes->cs_size,
1624 ARCH_KMALLOC_MINALIGN, 1720 ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
1676 } 1772 }
1677 } 1773 }
1678 1774
1679 g_cpucache_up = EARLY; 1775 slab_state = UP;
1680} 1776}
1681 1777
1682void __init kmem_cache_init_late(void) 1778void __init kmem_cache_init_late(void)
1683{ 1779{
1684 struct kmem_cache *cachep; 1780 struct kmem_cache *cachep;
1685 1781
1686 g_cpucache_up = LATE; 1782 slab_state = UP;
1687 1783
1688 /* Annotate slab for lockdep -- annotate the malloc caches */ 1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1689 init_lock_keys(); 1785 init_lock_keys();
1690 1786
1691 /* 6) resize the head arrays to their final sizes */ 1787 /* 6) resize the head arrays to their final sizes */
1692 mutex_lock(&cache_chain_mutex); 1788 mutex_lock(&slab_mutex);
1693 list_for_each_entry(cachep, &cache_chain, next) 1789 list_for_each_entry(cachep, &slab_caches, list)
1694 if (enable_cpucache(cachep, GFP_NOWAIT)) 1790 if (enable_cpucache(cachep, GFP_NOWAIT))
1695 BUG(); 1791 BUG();
1696 mutex_unlock(&cache_chain_mutex); 1792 mutex_unlock(&slab_mutex);
1697 1793
1698 /* Done! */ 1794 /* Done! */
1699 g_cpucache_up = FULL; 1795 slab_state = FULL;
1700 1796
1701 /* 1797 /*
1702 * Register a cpu startup notifier callback that initializes 1798 * Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
1727 */ 1823 */
1728 for_each_online_cpu(cpu) 1824 for_each_online_cpu(cpu)
1729 start_cpu_timer(cpu); 1825 start_cpu_timer(cpu);
1826
1827 /* Done! */
1828 slab_state = FULL;
1730 return 0; 1829 return 0;
1731} 1830}
1732__initcall(cpucache_init); 1831__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1842 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags); 1843 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1844 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder); 1845 cachep->name, cachep->size, cachep->gfporder);
1747 1846
1748 for_each_online_node(node) { 1847 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1848 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1798 flags |= __GFP_COMP; 1897 flags |= __GFP_COMP;
1799#endif 1898#endif
1800 1899
1801 flags |= cachep->gfpflags; 1900 flags |= cachep->allocflags;
1802 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1901 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1803 flags |= __GFP_RECLAIMABLE; 1902 flags |= __GFP_RECLAIMABLE;
1804 1903
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1809 return NULL; 1908 return NULL;
1810 } 1909 }
1811 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1812 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1814 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1816 else 1919 else
1817 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1818 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1819 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1820 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1821 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1822 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1823 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1824 1931
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1850 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1851 while (i--) { 1958 while (i--) {
1852 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1853 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1854 page++; 1962 page++;
1855 } 1963 }
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1874static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1982static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1875 unsigned long caller) 1983 unsigned long caller)
1876{ 1984{
1877 int size = obj_size(cachep); 1985 int size = cachep->object_size;
1878 1986
1879 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1987 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1880 1988
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1906 2014
1907static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2015static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1908{ 2016{
1909 int size = obj_size(cachep); 2017 int size = cachep->object_size;
1910 addr = &((char *)addr)[obj_offset(cachep)]; 2018 addr = &((char *)addr)[obj_offset(cachep)];
1911 2019
1912 memset(addr, val, size); 2020 memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1966 printk("\n"); 2074 printk("\n");
1967 } 2075 }
1968 realobj = (char *)objp + obj_offset(cachep); 2076 realobj = (char *)objp + obj_offset(cachep);
1969 size = obj_size(cachep); 2077 size = cachep->object_size;
1970 for (i = 0; i < size && lines; i += 16, lines--) { 2078 for (i = 0; i < size && lines; i += 16, lines--) {
1971 int limit; 2079 int limit;
1972 limit = 16; 2080 limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1983 int lines = 0; 2091 int lines = 0;
1984 2092
1985 realobj = (char *)objp + obj_offset(cachep); 2093 realobj = (char *)objp + obj_offset(cachep);
1986 size = obj_size(cachep); 2094 size = cachep->object_size;
1987 2095
1988 for (i = 0; i < size; i++) { 2096 for (i = 0; i < size; i++) {
1989 char exp = POISON_FREE; 2097 char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2047 2155
2048 if (cachep->flags & SLAB_POISON) { 2156 if (cachep->flags & SLAB_POISON) {
2049#ifdef CONFIG_DEBUG_PAGEALLOC 2157#ifdef CONFIG_DEBUG_PAGEALLOC
2050 if (cachep->buffer_size % PAGE_SIZE == 0 && 2158 if (cachep->size % PAGE_SIZE == 0 &&
2051 OFF_SLAB(cachep)) 2159 OFF_SLAB(cachep))
2052 kernel_map_pages(virt_to_page(objp), 2160 kernel_map_pages(virt_to_page(objp),
2053 cachep->buffer_size / PAGE_SIZE, 1); 2161 cachep->size / PAGE_SIZE, 1);
2054 else 2162 else
2055 check_poison_obj(cachep, objp); 2163 check_poison_obj(cachep, objp);
2056#else 2164#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2194 2302
2195static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2303static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2196{ 2304{
2197 if (g_cpucache_up == FULL) 2305 if (slab_state >= FULL)
2198 return enable_cpucache(cachep, gfp); 2306 return enable_cpucache(cachep, gfp);
2199 2307
2200 if (g_cpucache_up == NONE) { 2308 if (slab_state == DOWN) {
2201 /* 2309 /*
2202 * Note: the first kmem_cache_create must create the cache 2310 * Note: the first kmem_cache_create must create the cache
2203 * that's used by kmalloc(24), otherwise the creation of 2311 * that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2212 */ 2320 */
2213 set_up_list3s(cachep, SIZE_AC); 2321 set_up_list3s(cachep, SIZE_AC);
2214 if (INDEX_AC == INDEX_L3) 2322 if (INDEX_AC == INDEX_L3)
2215 g_cpucache_up = PARTIAL_L3; 2323 slab_state = PARTIAL_L3;
2216 else 2324 else
2217 g_cpucache_up = PARTIAL_AC; 2325 slab_state = PARTIAL_ARRAYCACHE;
2218 } else { 2326 } else {
2219 cachep->array[smp_processor_id()] = 2327 cachep->array[smp_processor_id()] =
2220 kmalloc(sizeof(struct arraycache_init), gfp); 2328 kmalloc(sizeof(struct arraycache_init), gfp);
2221 2329
2222 if (g_cpucache_up == PARTIAL_AC) { 2330 if (slab_state == PARTIAL_ARRAYCACHE) {
2223 set_up_list3s(cachep, SIZE_L3); 2331 set_up_list3s(cachep, SIZE_L3);
2224 g_cpucache_up = PARTIAL_L3; 2332 slab_state = PARTIAL_L3;
2225 } else { 2333 } else {
2226 int node; 2334 int node;
2227 for_each_online_node(node) { 2335 for_each_online_node(node) {
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2247} 2355}
2248 2356
2249/** 2357/**
2250 * kmem_cache_create - Create a cache. 2358 * __kmem_cache_create - Create a cache.
2251 * @name: A string which is used in /proc/slabinfo to identify this cache. 2359 * @name: A string which is used in /proc/slabinfo to identify this cache.
2252 * @size: The size of objects to be created in this cache. 2360 * @size: The size of objects to be created in this cache.
2253 * @align: The required alignment for the objects. 2361 * @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2274 * as davem. 2382 * as davem.
2275 */ 2383 */
2276struct kmem_cache * 2384struct kmem_cache *
2277kmem_cache_create (const char *name, size_t size, size_t align, 2385__kmem_cache_create (const char *name, size_t size, size_t align,
2278 unsigned long flags, void (*ctor)(void *)) 2386 unsigned long flags, void (*ctor)(void *))
2279{ 2387{
2280 size_t left_over, slab_size, ralign; 2388 size_t left_over, slab_size, ralign;
2281 struct kmem_cache *cachep = NULL, *pc; 2389 struct kmem_cache *cachep = NULL;
2282 gfp_t gfp; 2390 gfp_t gfp;
2283 2391
2284 /*
2285 * Sanity checks... these are all serious usage bugs.
2286 */
2287 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2288 size > KMALLOC_MAX_SIZE) {
2289 printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2290 name);
2291 BUG();
2292 }
2293
2294 /*
2295 * We use cache_chain_mutex to ensure a consistent view of
2296 * cpu_online_mask as well. Please see cpuup_callback
2297 */
2298 if (slab_is_available()) {
2299 get_online_cpus();
2300 mutex_lock(&cache_chain_mutex);
2301 }
2302
2303 list_for_each_entry(pc, &cache_chain, next) {
2304 char tmp;
2305 int res;
2306
2307 /*
2308 * This happens when the module gets unloaded and doesn't
2309 * destroy its slab cache and no-one else reuses the vmalloc
2310 * area of the module. Print a warning.
2311 */
2312 res = probe_kernel_address(pc->name, tmp);
2313 if (res) {
2314 printk(KERN_ERR
2315 "SLAB: cache with size %d has lost its name\n",
2316 pc->buffer_size);
2317 continue;
2318 }
2319
2320 if (!strcmp(pc->name, name)) {
2321 printk(KERN_ERR
2322 "kmem_cache_create: duplicate cache %s\n", name);
2323 dump_stack();
2324 goto oops;
2325 }
2326 }
2327
2328#if DEBUG 2392#if DEBUG
2329 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2330#if FORCED_DEBUG 2393#if FORCED_DEBUG
2331 /* 2394 /*
2332 * Enable redzoning and last user accounting, except for caches with 2395 * Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2415 /* Get cache's description obj. */ 2478 /* Get cache's description obj. */
2416 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2417 if (!cachep) 2480 if (!cachep)
2418 goto oops; 2481 return NULL;
2419 2482
2420 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2421#if DEBUG 2486#if DEBUG
2422 cachep->obj_size = size;
2423 2487
2424 /* 2488 /*
2425 * Both debugging options require word-alignment which is calculated 2489 * Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2442 } 2506 }
2443#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2444 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2445 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2446 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2447 size = PAGE_SIZE; 2511 size = PAGE_SIZE;
2448 } 2512 }
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2471 printk(KERN_ERR 2535 printk(KERN_ERR
2472 "kmem_cache_create: couldn't create cache %s.\n", name); 2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2473 kmem_cache_free(&cache_cache, cachep); 2537 kmem_cache_free(&cache_cache, cachep);
2474 cachep = NULL; 2538 return NULL;
2475 goto oops;
2476 } 2539 }
2477 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2478 + sizeof(struct slab), align); 2541 + sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2508 cachep->colour = left_over / cachep->colour_off; 2571 cachep->colour = left_over / cachep->colour_off;
2509 cachep->slab_size = slab_size; 2572 cachep->slab_size = slab_size;
2510 cachep->flags = flags; 2573 cachep->flags = flags;
2511 cachep->gfpflags = 0; 2574 cachep->allocflags = 0;
2512 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2575 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2513 cachep->gfpflags |= GFP_DMA; 2576 cachep->allocflags |= GFP_DMA;
2514 cachep->buffer_size = size; 2577 cachep->size = size;
2515 cachep->reciprocal_buffer_size = reciprocal_value(size); 2578 cachep->reciprocal_buffer_size = reciprocal_value(size);
2516 2579
2517 if (flags & CFLGS_OFF_SLAB) { 2580 if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2530 2593
2531 if (setup_cpu_cache(cachep, gfp)) { 2594 if (setup_cpu_cache(cachep, gfp)) {
2532 __kmem_cache_destroy(cachep); 2595 __kmem_cache_destroy(cachep);
2533 cachep = NULL; 2596 return NULL;
2534 goto oops;
2535 } 2597 }
2536 2598
2537 if (flags & SLAB_DEBUG_OBJECTS) { 2599 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2545 } 2607 }
2546 2608
2547 /* cache setup completed, link it into the list */ 2609 /* cache setup completed, link it into the list */
2548 list_add(&cachep->next, &cache_chain); 2610 list_add(&cachep->list, &slab_caches);
2549oops:
2550 if (!cachep && (flags & SLAB_PANIC))
2551 panic("kmem_cache_create(): failed to create slab `%s'\n",
2552 name);
2553 if (slab_is_available()) {
2554 mutex_unlock(&cache_chain_mutex);
2555 put_online_cpus();
2556 }
2557 return cachep; 2611 return cachep;
2558} 2612}
2559EXPORT_SYMBOL(kmem_cache_create);
2560 2613
2561#if DEBUG 2614#if DEBUG
2562static void check_irq_off(void) 2615static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
2671 return nr_freed; 2724 return nr_freed;
2672} 2725}
2673 2726
2674/* Called with cache_chain_mutex held to protect against cpu hotplug */ 2727/* Called with slab_mutex held to protect against cpu hotplug */
2675static int __cache_shrink(struct kmem_cache *cachep) 2728static int __cache_shrink(struct kmem_cache *cachep)
2676{ 2729{
2677 int ret = 0, i = 0; 2730 int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2706 BUG_ON(!cachep || in_interrupt()); 2759 BUG_ON(!cachep || in_interrupt());
2707 2760
2708 get_online_cpus(); 2761 get_online_cpus();
2709 mutex_lock(&cache_chain_mutex); 2762 mutex_lock(&slab_mutex);
2710 ret = __cache_shrink(cachep); 2763 ret = __cache_shrink(cachep);
2711 mutex_unlock(&cache_chain_mutex); 2764 mutex_unlock(&slab_mutex);
2712 put_online_cpus(); 2765 put_online_cpus();
2713 return ret; 2766 return ret;
2714} 2767}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2736 2789
2737 /* Find the cache in the chain of caches. */ 2790 /* Find the cache in the chain of caches. */
2738 get_online_cpus(); 2791 get_online_cpus();
2739 mutex_lock(&cache_chain_mutex); 2792 mutex_lock(&slab_mutex);
2740 /* 2793 /*
2741 * the chain is never empty, cache_cache is never destroyed 2794 * the chain is never empty, cache_cache is never destroyed
2742 */ 2795 */
2743 list_del(&cachep->next); 2796 list_del(&cachep->list);
2744 if (__cache_shrink(cachep)) { 2797 if (__cache_shrink(cachep)) {
2745 slab_error(cachep, "Can't free all objects"); 2798 slab_error(cachep, "Can't free all objects");
2746 list_add(&cachep->next, &cache_chain); 2799 list_add(&cachep->list, &slab_caches);
2747 mutex_unlock(&cache_chain_mutex); 2800 mutex_unlock(&slab_mutex);
2748 put_online_cpus(); 2801 put_online_cpus();
2749 return; 2802 return;
2750 } 2803 }
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2753 rcu_barrier(); 2806 rcu_barrier();
2754 2807
2755 __kmem_cache_destroy(cachep); 2808 __kmem_cache_destroy(cachep);
2756 mutex_unlock(&cache_chain_mutex); 2809 mutex_unlock(&slab_mutex);
2757 put_online_cpus(); 2810 put_online_cpus();
2758} 2811}
2759EXPORT_SYMBOL(kmem_cache_destroy); 2812EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
2840 slab_error(cachep, "constructor overwrote the" 2893 slab_error(cachep, "constructor overwrote the"
2841 " start of an object"); 2894 " start of an object");
2842 } 2895 }
2843 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2896 if ((cachep->size % PAGE_SIZE) == 0 &&
2844 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2897 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2845 kernel_map_pages(virt_to_page(objp), 2898 kernel_map_pages(virt_to_page(objp),
2846 cachep->buffer_size / PAGE_SIZE, 0); 2899 cachep->size / PAGE_SIZE, 0);
2847#else 2900#else
2848 if (cachep->ctor) 2901 if (cachep->ctor)
2849 cachep->ctor(objp); 2902 cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2857{ 2910{
2858 if (CONFIG_ZONE_DMA_FLAG) { 2911 if (CONFIG_ZONE_DMA_FLAG) {
2859 if (flags & GFP_DMA) 2912 if (flags & GFP_DMA)
2860 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2913 BUG_ON(!(cachep->allocflags & GFP_DMA));
2861 else 2914 else
2862 BUG_ON(cachep->gfpflags & GFP_DMA); 2915 BUG_ON(cachep->allocflags & GFP_DMA);
2863 } 2916 }
2864} 2917}
2865 2918
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2918 nr_pages <<= cache->gfporder; 2971 nr_pages <<= cache->gfporder;
2919 2972
2920 do { 2973 do {
2921 page_set_cache(page, cache); 2974 page->slab_cache = cache;
2922 page_set_slab(page, slab); 2975 page->slab_page = slab;
2923 page++; 2976 page++;
2924 } while (--nr_pages); 2977 } while (--nr_pages);
2925} 2978}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3057 kfree_debugcheck(objp); 3110 kfree_debugcheck(objp);
3058 page = virt_to_head_page(objp); 3111 page = virt_to_head_page(objp);
3059 3112
3060 slabp = page_get_slab(page); 3113 slabp = page->slab_page;
3061 3114
3062 if (cachep->flags & SLAB_RED_ZONE) { 3115 if (cachep->flags & SLAB_RED_ZONE) {
3063 verify_redzone_free(cachep, objp); 3116 verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3077#endif 3130#endif
3078 if (cachep->flags & SLAB_POISON) { 3131 if (cachep->flags & SLAB_POISON) {
3079#ifdef CONFIG_DEBUG_PAGEALLOC 3132#ifdef CONFIG_DEBUG_PAGEALLOC
3080 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3081 store_stackinfo(cachep, objp, (unsigned long)caller); 3134 store_stackinfo(cachep, objp, (unsigned long)caller);
3082 kernel_map_pages(virt_to_page(objp), 3135 kernel_map_pages(virt_to_page(objp),
3083 cachep->buffer_size / PAGE_SIZE, 0); 3136 cachep->size / PAGE_SIZE, 0);
3084 } else { 3137 } else {
3085 poison_obj(cachep, objp, POISON_FREE); 3138 poison_obj(cachep, objp, POISON_FREE);
3086 } 3139 }
@@ -3120,16 +3173,19 @@ bad:
3120#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3121#endif 3174#endif
3122 3175
3123static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3124{ 3178{
3125 int batchcount; 3179 int batchcount;
3126 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3127 struct array_cache *ac; 3181 struct array_cache *ac;
3128 int node; 3182 int node;
3129 3183
3130retry:
3131 check_irq_off(); 3184 check_irq_off();
3132 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3133 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3134 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3135 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
3179 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3180 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3181 3237
3182 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3183 node); 3239 node));
3184 } 3240 }
3185 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3186 3242
@@ -3199,18 +3255,22 @@ alloc_done:
3199 3255
3200 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3201 int x; 3257 int x;
3258force_grow:
3202 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3203 3260
3204 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3205 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3206 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3207 return NULL; 3266 return NULL;
3208 3267
3209 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3210 goto retry; 3269 goto retry;
3211 } 3270 }
3212 ac->touched = 1; 3271 ac->touched = 1;
3213 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3214} 3274}
3215 3275
3216static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3230 return objp; 3290 return objp;
3231 if (cachep->flags & SLAB_POISON) { 3291 if (cachep->flags & SLAB_POISON) {
3232#ifdef CONFIG_DEBUG_PAGEALLOC 3292#ifdef CONFIG_DEBUG_PAGEALLOC
3233 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3293 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3234 kernel_map_pages(virt_to_page(objp), 3294 kernel_map_pages(virt_to_page(objp),
3235 cachep->buffer_size / PAGE_SIZE, 1); 3295 cachep->size / PAGE_SIZE, 1);
3236 else 3296 else
3237 check_poison_obj(cachep, objp); 3297 check_poison_obj(cachep, objp);
3238#else 3298#else
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3261 struct slab *slabp; 3321 struct slab *slabp;
3262 unsigned objnr; 3322 unsigned objnr;
3263 3323
3264 slabp = page_get_slab(virt_to_head_page(objp)); 3324 slabp = virt_to_head_page(objp)->slab_page;
3265 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3325 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3266 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3326 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3267 } 3327 }
3268#endif 3328#endif
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 if (cachep == &cache_cache) 3345 if (cachep == &cache_cache)
3286 return false; 3346 return false;
3287 3347
3288 return should_failslab(obj_size(cachep), flags, cachep->flags); 3348 return should_failslab(cachep->object_size, flags, cachep->flags);
3289} 3349}
3290 3350
3291static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3351static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3292{ 3352{
3293 void *objp; 3353 void *objp;
3294 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3295 3356
3296 check_irq_off(); 3357 check_irq_off();
3297 3358
3298 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3299 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3300 STATS_INC_ALLOCHIT(cachep);
3301 ac->touched = 1; 3361 ac->touched = 1;
3302 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3303 } else { 3363
3304 STATS_INC_ALLOCMISS(cachep);
3305 objp = cache_alloc_refill(cachep, flags);
3306 /* 3364 /*
3307 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3308 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3309 */ 3367 */
3310 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3311 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3312 /* 3384 /*
3313 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3314 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3408 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3337 nid_alloc = cpuset_slab_spread_node(); 3409 nid_alloc = cpuset_slab_spread_node();
3338 else if (current->mempolicy) 3410 else if (current->mempolicy)
3339 nid_alloc = slab_node(current->mempolicy); 3411 nid_alloc = slab_node();
3340 if (nid_alloc != nid_here) 3412 if (nid_alloc != nid_here)
3341 return ____cache_alloc_node(cachep, flags, nid_alloc); 3413 return ____cache_alloc_node(cachep, flags, nid_alloc);
3342 return NULL; 3414 return NULL;
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3368 3440
3369retry_cpuset: 3441retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed(); 3442 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3443 zonelist = node_zonelist(slab_node(), flags);
3372 3444
3373retry: 3445retry:
3374 /* 3446 /*
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3545 out: 3617 out:
3546 local_irq_restore(save_flags); 3618 local_irq_restore(save_flags);
3547 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3619 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3548 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3620 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3549 flags); 3621 flags);
3550 3622
3551 if (likely(ptr)) 3623 if (likely(ptr))
3552 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3624 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3553 3625
3554 if (unlikely((flags & __GFP_ZERO) && ptr)) 3626 if (unlikely((flags & __GFP_ZERO) && ptr))
3555 memset(ptr, 0, obj_size(cachep)); 3627 memset(ptr, 0, cachep->object_size);
3556 3628
3557 return ptr; 3629 return ptr;
3558} 3630}
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3607 objp = __do_cache_alloc(cachep, flags); 3679 objp = __do_cache_alloc(cachep, flags);
3608 local_irq_restore(save_flags); 3680 local_irq_restore(save_flags);
3609 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3681 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3610 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3682 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3611 flags); 3683 flags);
3612 prefetchw(objp); 3684 prefetchw(objp);
3613 3685
3614 if (likely(objp)) 3686 if (likely(objp))
3615 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3687 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3616 3688
3617 if (unlikely((flags & __GFP_ZERO) && objp)) 3689 if (unlikely((flags & __GFP_ZERO) && objp))
3618 memset(objp, 0, obj_size(cachep)); 3690 memset(objp, 0, cachep->object_size);
3619 3691
3620 return objp; 3692 return objp;
3621} 3693}
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3630 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3631 3703
3632 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3633 void *objp = objpp[i]; 3705 void *objp;
3634 struct slab *slabp; 3706 struct slab *slabp;
3635 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3636 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3637 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3638 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3731 kmemleak_free_recursive(objp, cachep->flags); 3806 kmemleak_free_recursive(objp, cachep->flags);
3732 objp = cache_free_debugcheck(cachep, objp, caller); 3807 objp = cache_free_debugcheck(cachep, objp, caller);
3733 3808
3734 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3809 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3735 3810
3736 /* 3811 /*
3737 * Skip calling cache_free_alien() when the platform is not numa. 3812 * Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3750 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3751 } 3826 }
3752 3827
3753 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3754} 3829}
3755 3830
3756/** 3831/**
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3766 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3841 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3767 3842
3768 trace_kmem_cache_alloc(_RET_IP_, ret, 3843 trace_kmem_cache_alloc(_RET_IP_, ret,
3769 obj_size(cachep), cachep->buffer_size, flags); 3844 cachep->object_size, cachep->size, flags);
3770 3845
3771 return ret; 3846 return ret;
3772} 3847}
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3794 __builtin_return_address(0)); 3869 __builtin_return_address(0));
3795 3870
3796 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3871 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3797 obj_size(cachep), cachep->buffer_size, 3872 cachep->object_size, cachep->size,
3798 flags, nodeid); 3873 flags, nodeid);
3799 3874
3800 return ret; 3875 return ret;
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3876 ret = __cache_alloc(cachep, flags, caller); 3951 ret = __cache_alloc(cachep, flags, caller);
3877 3952
3878 trace_kmalloc((unsigned long) caller, ret, 3953 trace_kmalloc((unsigned long) caller, ret,
3879 size, cachep->buffer_size, flags); 3954 size, cachep->size, flags);
3880 3955
3881 return ret; 3956 return ret;
3882} 3957}
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3916 unsigned long flags; 3991 unsigned long flags;
3917 3992
3918 local_irq_save(flags); 3993 local_irq_save(flags);
3919 debug_check_no_locks_freed(objp, obj_size(cachep)); 3994 debug_check_no_locks_freed(objp, cachep->object_size);
3920 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3995 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3921 debug_check_no_obj_freed(objp, obj_size(cachep)); 3996 debug_check_no_obj_freed(objp, cachep->object_size);
3922 __cache_free(cachep, objp, __builtin_return_address(0)); 3997 __cache_free(cachep, objp, __builtin_return_address(0));
3923 local_irq_restore(flags); 3998 local_irq_restore(flags);
3924 3999
@@ -3947,8 +4022,9 @@ void kfree(const void *objp)
3947 local_irq_save(flags); 4022 local_irq_save(flags);
3948 kfree_debugcheck(objp); 4023 kfree_debugcheck(objp);
3949 c = virt_to_cache(objp); 4024 c = virt_to_cache(objp);
3950 debug_check_no_locks_freed(objp, obj_size(c)); 4025 debug_check_no_locks_freed(objp, c->object_size);
3951 debug_check_no_obj_freed(objp, obj_size(c)); 4026
4027 debug_check_no_obj_freed(objp, c->object_size);
3952 __cache_free(c, (void *)objp, __builtin_return_address(0)); 4028 __cache_free(c, (void *)objp, __builtin_return_address(0));
3953 local_irq_restore(flags); 4029 local_irq_restore(flags);
3954} 4030}
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree);
3956 4032
3957unsigned int kmem_cache_size(struct kmem_cache *cachep) 4033unsigned int kmem_cache_size(struct kmem_cache *cachep)
3958{ 4034{
3959 return obj_size(cachep); 4035 return cachep->object_size;
3960} 4036}
3961EXPORT_SYMBOL(kmem_cache_size); 4037EXPORT_SYMBOL(kmem_cache_size);
3962 4038
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
4030 return 0; 4106 return 0;
4031 4107
4032fail: 4108fail:
4033 if (!cachep->next.next) { 4109 if (!cachep->list.next) {
4034 /* Cache is not active yet. Roll back what we did */ 4110 /* Cache is not active yet. Roll back what we did */
4035 node--; 4111 node--;
4036 while (node >= 0) { 4112 while (node >= 0) {
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info)
4065 new->new[smp_processor_id()] = old; 4141 new->new[smp_processor_id()] = old;
4066} 4142}
4067 4143
4068/* Always called with the cache_chain_mutex held */ 4144/* Always called with the slab_mutex held */
4069static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4145static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4070 int batchcount, int shared, gfp_t gfp) 4146 int batchcount, int shared, gfp_t gfp)
4071{ 4147{
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4109 return alloc_kmemlist(cachep, gfp); 4185 return alloc_kmemlist(cachep, gfp);
4110} 4186}
4111 4187
4112/* Called with cache_chain_mutex held always */ 4188/* Called with slab_mutex held always */
4113static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4189static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4114{ 4190{
4115 int err; 4191 int err;
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4124 * The numbers are guessed, we should auto-tune as described by 4200 * The numbers are guessed, we should auto-tune as described by
4125 * Bonwick. 4201 * Bonwick.
4126 */ 4202 */
4127 if (cachep->buffer_size > 131072) 4203 if (cachep->size > 131072)
4128 limit = 1; 4204 limit = 1;
4129 else if (cachep->buffer_size > PAGE_SIZE) 4205 else if (cachep->size > PAGE_SIZE)
4130 limit = 8; 4206 limit = 8;
4131 else if (cachep->buffer_size > 1024) 4207 else if (cachep->size > 1024)
4132 limit = 24; 4208 limit = 24;
4133 else if (cachep->buffer_size > 256) 4209 else if (cachep->size > 256)
4134 limit = 54; 4210 limit = 54;
4135 else 4211 else
4136 limit = 120; 4212 limit = 120;
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4145 * to a larger limit. Thus disabled by default. 4221 * to a larger limit. Thus disabled by default.
4146 */ 4222 */
4147 shared = 0; 4223 shared = 0;
4148 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 4224 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4149 shared = 8; 4225 shared = 8;
4150 4226
4151#if DEBUG 4227#if DEBUG
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w)
4211 int node = numa_mem_id(); 4287 int node = numa_mem_id();
4212 struct delayed_work *work = to_delayed_work(w); 4288 struct delayed_work *work = to_delayed_work(w);
4213 4289
4214 if (!mutex_trylock(&cache_chain_mutex)) 4290 if (!mutex_trylock(&slab_mutex))
4215 /* Give up. Setup the next iteration. */ 4291 /* Give up. Setup the next iteration. */
4216 goto out; 4292 goto out;
4217 4293
4218 list_for_each_entry(searchp, &cache_chain, next) { 4294 list_for_each_entry(searchp, &slab_caches, list) {
4219 check_irq_on(); 4295 check_irq_on();
4220 4296
4221 /* 4297 /*
@@ -4253,7 +4329,7 @@ next:
4253 cond_resched(); 4329 cond_resched();
4254 } 4330 }
4255 check_irq_on(); 4331 check_irq_on();
4256 mutex_unlock(&cache_chain_mutex); 4332 mutex_unlock(&slab_mutex);
4257 next_reap_node(); 4333 next_reap_node();
4258out: 4334out:
4259 /* Set up the next iteration */ 4335 /* Set up the next iteration */
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
4289{ 4365{
4290 loff_t n = *pos; 4366 loff_t n = *pos;
4291 4367
4292 mutex_lock(&cache_chain_mutex); 4368 mutex_lock(&slab_mutex);
4293 if (!n) 4369 if (!n)
4294 print_slabinfo_header(m); 4370 print_slabinfo_header(m);
4295 4371
4296 return seq_list_start(&cache_chain, *pos); 4372 return seq_list_start(&slab_caches, *pos);
4297} 4373}
4298 4374
4299static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4375static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4300{ 4376{
4301 return seq_list_next(p, &cache_chain, pos); 4377 return seq_list_next(p, &slab_caches, pos);
4302} 4378}
4303 4379
4304static void s_stop(struct seq_file *m, void *p) 4380static void s_stop(struct seq_file *m, void *p)
4305{ 4381{
4306 mutex_unlock(&cache_chain_mutex); 4382 mutex_unlock(&slab_mutex);
4307} 4383}
4308 4384
4309static int s_show(struct seq_file *m, void *p) 4385static int s_show(struct seq_file *m, void *p)
4310{ 4386{
4311 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4387 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4312 struct slab *slabp; 4388 struct slab *slabp;
4313 unsigned long active_objs; 4389 unsigned long active_objs;
4314 unsigned long num_objs; 4390 unsigned long num_objs;
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p)
4364 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4440 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4365 4441
4366 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4442 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4367 name, active_objs, num_objs, cachep->buffer_size, 4443 name, active_objs, num_objs, cachep->size,
4368 cachep->num, (1 << cachep->gfporder)); 4444 cachep->num, (1 << cachep->gfporder));
4369 seq_printf(m, " : tunables %4u %4u %4u", 4445 seq_printf(m, " : tunables %4u %4u %4u",
4370 cachep->limit, cachep->batchcount, cachep->shared); 4446 cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4454 return -EINVAL; 4530 return -EINVAL;
4455 4531
4456 /* Find the cache in the chain of caches. */ 4532 /* Find the cache in the chain of caches. */
4457 mutex_lock(&cache_chain_mutex); 4533 mutex_lock(&slab_mutex);
4458 res = -EINVAL; 4534 res = -EINVAL;
4459 list_for_each_entry(cachep, &cache_chain, next) { 4535 list_for_each_entry(cachep, &slab_caches, list) {
4460 if (!strcmp(cachep->name, kbuf)) { 4536 if (!strcmp(cachep->name, kbuf)) {
4461 if (limit < 1 || batchcount < 1 || 4537 if (limit < 1 || batchcount < 1 ||
4462 batchcount > limit || shared < 0) { 4538 batchcount > limit || shared < 0) {
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4469 break; 4545 break;
4470 } 4546 }
4471 } 4547 }
4472 mutex_unlock(&cache_chain_mutex); 4548 mutex_unlock(&slab_mutex);
4473 if (res >= 0) 4549 if (res >= 0)
4474 res = count; 4550 res = count;
4475 return res; 4551 return res;
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = {
4492 4568
4493static void *leaks_start(struct seq_file *m, loff_t *pos) 4569static void *leaks_start(struct seq_file *m, loff_t *pos)
4494{ 4570{
4495 mutex_lock(&cache_chain_mutex); 4571 mutex_lock(&slab_mutex);
4496 return seq_list_start(&cache_chain, *pos); 4572 return seq_list_start(&slab_caches, *pos);
4497} 4573}
4498 4574
4499static inline int add_caller(unsigned long *n, unsigned long v) 4575static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4532 int i; 4608 int i;
4533 if (n[0] == n[1]) 4609 if (n[0] == n[1])
4534 return; 4610 return;
4535 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4611 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4536 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4612 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4537 continue; 4613 continue;
4538 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4614 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4558 4634
4559static int leaks_show(struct seq_file *m, void *p) 4635static int leaks_show(struct seq_file *m, void *p)
4560{ 4636{
4561 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4637 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4562 struct slab *slabp; 4638 struct slab *slabp;
4563 struct kmem_list3 *l3; 4639 struct kmem_list3 *l3;
4564 const char *name; 4640 const char *name;
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p)
4592 name = cachep->name; 4668 name = cachep->name;
4593 if (n[0] == n[1]) { 4669 if (n[0] == n[1]) {
4594 /* Increase the buffer size */ 4670 /* Increase the buffer size */
4595 mutex_unlock(&cache_chain_mutex); 4671 mutex_unlock(&slab_mutex);
4596 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4672 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4597 if (!m->private) { 4673 if (!m->private) {
4598 /* Too bad, we are really out */ 4674 /* Too bad, we are really out */
4599 m->private = n; 4675 m->private = n;
4600 mutex_lock(&cache_chain_mutex); 4676 mutex_lock(&slab_mutex);
4601 return -ENOMEM; 4677 return -ENOMEM;
4602 } 4678 }
4603 *(unsigned long *)m->private = n[0] * 2; 4679 *(unsigned long *)m->private = n[0] * 2;
4604 kfree(n); 4680 kfree(n);
4605 mutex_lock(&cache_chain_mutex); 4681 mutex_lock(&slab_mutex);
4606 /* Now make sure this entry will be retried */ 4682 /* Now make sure this entry will be retried */
4607 m->count = m->size; 4683 m->count = m->size;
4608 return 0; 4684 return 0;
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp)
4677 if (unlikely(objp == ZERO_SIZE_PTR)) 4753 if (unlikely(objp == ZERO_SIZE_PTR))
4678 return 0; 4754 return 0;
4679 4755
4680 return obj_size(virt_to_cache(objp)); 4756 return virt_to_cache(objp)->object_size;
4681} 4757}
4682EXPORT_SYMBOL(ksize); 4758EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 000000000000..db7848caaa25
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
1#ifndef MM_SLAB_H
2#define MM_SLAB_H
3/*
4 * Internal slab definitions
5 */
6
7/*
8 * State of the slab allocator.
9 *
10 * This is used to describe the states of the allocator during bootup.
11 * Allocators use this to gradually bootstrap themselves. Most allocators
12 * have the problem that the structures used for managing slab caches are
13 * allocated from slab caches themselves.
14 */
15enum slab_state {
16 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
20 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */
22};
23
24extern enum slab_state slab_state;
25
26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex;
28extern struct list_head slab_caches;
29
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *));
32
33#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 000000000000..aa3ca5bb01b5
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
1/*
2 * Slab allocator functions that are independent of the allocator strategy
3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */
6#include <linux/slab.h>
7
8#include <linux/mm.h>
9#include <linux/poison.h>
10#include <linux/interrupt.h>
11#include <linux/memory.h>
12#include <linux/compiler.h>
13#include <linux/module.h>
14#include <linux/cpu.h>
15#include <linux/uaccess.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/page.h>
19
20#include "slab.h"
21
22enum slab_state slab_state;
23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex);
25
26/*
27 * kmem_cache_create - Create a cache.
28 * @name: A string which is used in /proc/slabinfo to identify this cache.
29 * @size: The size of objects to be created in this cache.
30 * @align: The required alignment for the objects.
31 * @flags: SLAB flags
32 * @ctor: A constructor for the objects.
33 *
34 * Returns a ptr to the cache on success, NULL on failure.
35 * Cannot be called within a interrupt, but can be interrupted.
36 * The @ctor is run when new pages are allocated by the cache.
37 *
38 * The flags are
39 *
40 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
41 * to catch references to uninitialised memory.
42 *
43 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
44 * for buffer overruns.
45 *
46 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
47 * cacheline. This can be beneficial if you're counting cycles as closely
48 * as davem.
49 */
50
51struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
52 unsigned long flags, void (*ctor)(void *))
53{
54 struct kmem_cache *s = NULL;
55
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64
65 get_online_cpus();
66 mutex_lock(&slab_mutex);
67
68#ifdef CONFIG_DEBUG_VM
69 list_for_each_entry(s, &slab_caches, list) {
70 char tmp;
71 int res;
72
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85
86 if (!strcmp(s->name, name)) {
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name"
88 " already exists.\n",
89 name);
90 dump_stack();
91 s = NULL;
92 goto oops;
93 }
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97#endif
98
99 s = __kmem_cache_create(name, size, align, flags, ctor);
100
101#ifdef CONFIG_DEBUG_VM
102oops:
103#endif
104 mutex_unlock(&slab_mutex);
105 put_online_cpus();
106
107#ifdef CONFIG_DEBUG_VM
108out:
109#endif
110 if (!s && (flags & SLAB_PANIC))
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name);
112
113 return s;
114}
115EXPORT_SYMBOL(kmem_cache_create);
116
117int slab_is_available(void)
118{
119 return slab_state >= UP;
120}
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad1..45d4ca79933a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
59 59
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include "slab.h"
63
62#include <linux/mm.h> 64#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */ 65#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 66#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
92typedef struct slob_block slob_t; 94typedef struct slob_block slob_t;
93 95
94/* 96/*
95 * We use struct page fields to manage some slob allocation aspects,
96 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
97 * just define our own struct page type variant here.
98 */
99struct slob_page {
100 union {
101 struct {
102 unsigned long flags; /* mandatory */
103 atomic_t _count; /* mandatory */
104 slobidx_t units; /* free units left in page */
105 unsigned long pad[2];
106 slob_t *free; /* first free slob_t in page */
107 struct list_head list; /* linked list of free pages */
108 };
109 struct page page;
110 };
111};
112static inline void struct_slob_page_wrong_size(void)
113{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
114
115/*
116 * free_slob_page: call before a slob_page is returned to the page allocator.
117 */
118static inline void free_slob_page(struct slob_page *sp)
119{
120 reset_page_mapcount(&sp->page);
121 sp->page.mapping = NULL;
122}
123
124/*
125 * All partially free slob pages go on these lists. 97 * All partially free slob pages go on these lists.
126 */ 98 */
127#define SLOB_BREAK1 256 99#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
131static LIST_HEAD(free_slob_large); 103static LIST_HEAD(free_slob_large);
132 104
133/* 105/*
134 * is_slob_page: True for all slob pages (false for bigblock pages)
135 */
136static inline int is_slob_page(struct slob_page *sp)
137{
138 return PageSlab((struct page *)sp);
139}
140
141static inline void set_slob_page(struct slob_page *sp)
142{
143 __SetPageSlab((struct page *)sp);
144}
145
146static inline void clear_slob_page(struct slob_page *sp)
147{
148 __ClearPageSlab((struct page *)sp);
149}
150
151static inline struct slob_page *slob_page(const void *addr)
152{
153 return (struct slob_page *)virt_to_page(addr);
154}
155
156/*
157 * slob_page_free: true for pages on free_slob_pages list. 106 * slob_page_free: true for pages on free_slob_pages list.
158 */ 107 */
159static inline int slob_page_free(struct slob_page *sp) 108static inline int slob_page_free(struct page *sp)
160{ 109{
161 return PageSlobFree((struct page *)sp); 110 return PageSlobFree(sp);
162} 111}
163 112
164static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 113static void set_slob_page_free(struct page *sp, struct list_head *list)
165{ 114{
166 list_add(&sp->list, list); 115 list_add(&sp->list, list);
167 __SetPageSlobFree((struct page *)sp); 116 __SetPageSlobFree(sp);
168} 117}
169 118
170static inline void clear_slob_page_free(struct slob_page *sp) 119static inline void clear_slob_page_free(struct page *sp)
171{ 120{
172 list_del(&sp->list); 121 list_del(&sp->list);
173 __ClearPageSlobFree((struct page *)sp); 122 __ClearPageSlobFree(sp);
174} 123}
175 124
176#define SLOB_UNIT sizeof(slob_t) 125#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
267/* 216/*
268 * Allocate a slob block within a given slob_page sp. 217 * Allocate a slob block within a given slob_page sp.
269 */ 218 */
270static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 219static void *slob_page_alloc(struct page *sp, size_t size, int align)
271{ 220{
272 slob_t *prev, *cur, *aligned = NULL; 221 slob_t *prev, *cur, *aligned = NULL;
273 int delta = 0, units = SLOB_UNITS(size); 222 int delta = 0, units = SLOB_UNITS(size);
274 223
275 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 224 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
276 slobidx_t avail = slob_units(cur); 225 slobidx_t avail = slob_units(cur);
277 226
278 if (align) { 227 if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
296 if (prev) 245 if (prev)
297 set_slob(prev, slob_units(prev), next); 246 set_slob(prev, slob_units(prev), next);
298 else 247 else
299 sp->free = next; 248 sp->freelist = next;
300 } else { /* fragment */ 249 } else { /* fragment */
301 if (prev) 250 if (prev)
302 set_slob(prev, slob_units(prev), cur + units); 251 set_slob(prev, slob_units(prev), cur + units);
303 else 252 else
304 sp->free = cur + units; 253 sp->freelist = cur + units;
305 set_slob(cur + units, avail - units, next); 254 set_slob(cur + units, avail - units, next);
306 } 255 }
307 256
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
320 */ 269 */
321static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 270static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
322{ 271{
323 struct slob_page *sp; 272 struct page *sp;
324 struct list_head *prev; 273 struct list_head *prev;
325 struct list_head *slob_list; 274 struct list_head *slob_list;
326 slob_t *b = NULL; 275 slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
342 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
343 */ 292 */
344 if (node != -1 && page_to_nid(&sp->page) != node) 293 if (node != -1 && page_to_nid(sp) != node)
345 continue; 294 continue;
346#endif 295#endif
347 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
369 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); 318 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
370 if (!b) 319 if (!b)
371 return NULL; 320 return NULL;
372 sp = slob_page(b); 321 sp = virt_to_page(b);
373 set_slob_page(sp); 322 __SetPageSlab(sp);
374 323
375 spin_lock_irqsave(&slob_lock, flags); 324 spin_lock_irqsave(&slob_lock, flags);
376 sp->units = SLOB_UNITS(PAGE_SIZE); 325 sp->units = SLOB_UNITS(PAGE_SIZE);
377 sp->free = b; 326 sp->freelist = b;
378 INIT_LIST_HEAD(&sp->list); 327 INIT_LIST_HEAD(&sp->list);
379 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 328 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
380 set_slob_page_free(sp, slob_list); 329 set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
392 */ 341 */
393static void slob_free(void *block, int size) 342static void slob_free(void *block, int size)
394{ 343{
395 struct slob_page *sp; 344 struct page *sp;
396 slob_t *prev, *next, *b = (slob_t *)block; 345 slob_t *prev, *next, *b = (slob_t *)block;
397 slobidx_t units; 346 slobidx_t units;
398 unsigned long flags; 347 unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
402 return; 351 return;
403 BUG_ON(!size); 352 BUG_ON(!size);
404 353
405 sp = slob_page(block); 354 sp = virt_to_page(block);
406 units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
407 356
408 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
412 if (slob_page_free(sp)) 361 if (slob_page_free(sp))
413 clear_slob_page_free(sp); 362 clear_slob_page_free(sp);
414 spin_unlock_irqrestore(&slob_lock, flags); 363 spin_unlock_irqrestore(&slob_lock, flags);
415 clear_slob_page(sp); 364 __ClearPageSlab(sp);
416 free_slob_page(sp); 365 reset_page_mapcount(sp);
417 slob_free_pages(b, 0); 366 slob_free_pages(b, 0);
418 return; 367 return;
419 } 368 }
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
421 if (!slob_page_free(sp)) { 370 if (!slob_page_free(sp)) {
422 /* This slob page is about to become partially free. Easy! */ 371 /* This slob page is about to become partially free. Easy! */
423 sp->units = units; 372 sp->units = units;
424 sp->free = b; 373 sp->freelist = b;
425 set_slob(b, units, 374 set_slob(b, units,
426 (void *)((unsigned long)(b + 375 (void *)((unsigned long)(b +
427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 376 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
441 */ 390 */
442 sp->units += units; 391 sp->units += units;
443 392
444 if (b < sp->free) { 393 if (b < (slob_t *)sp->freelist) {
445 if (b + units == sp->free) { 394 if (b + units == sp->freelist) {
446 units += slob_units(sp->free); 395 units += slob_units(sp->freelist);
447 sp->free = slob_next(sp->free); 396 sp->freelist = slob_next(sp->freelist);
448 } 397 }
449 set_slob(b, units, sp->free); 398 set_slob(b, units, sp->freelist);
450 sp->free = b; 399 sp->freelist = b;
451 } else { 400 } else {
452 prev = sp->free; 401 prev = sp->freelist;
453 next = slob_next(prev); 402 next = slob_next(prev);
454 while (b > next) { 403 while (b > next) {
455 prev = next; 404 prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
522 471
523void kfree(const void *block) 472void kfree(const void *block)
524{ 473{
525 struct slob_page *sp; 474 struct page *sp;
526 475
527 trace_kfree(_RET_IP_, block); 476 trace_kfree(_RET_IP_, block);
528 477
@@ -530,43 +479,36 @@ void kfree(const void *block)
530 return; 479 return;
531 kmemleak_free(block); 480 kmemleak_free(block);
532 481
533 sp = slob_page(block); 482 sp = virt_to_page(block);
534 if (is_slob_page(sp)) { 483 if (PageSlab(sp)) {
535 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
536 unsigned int *m = (unsigned int *)(block - align); 485 unsigned int *m = (unsigned int *)(block - align);
537 slob_free(m, *m + align); 486 slob_free(m, *m + align);
538 } else 487 } else
539 put_page(&sp->page); 488 put_page(sp);
540} 489}
541EXPORT_SYMBOL(kfree); 490EXPORT_SYMBOL(kfree);
542 491
543/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ 492/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
544size_t ksize(const void *block) 493size_t ksize(const void *block)
545{ 494{
546 struct slob_page *sp; 495 struct page *sp;
547 496
548 BUG_ON(!block); 497 BUG_ON(!block);
549 if (unlikely(block == ZERO_SIZE_PTR)) 498 if (unlikely(block == ZERO_SIZE_PTR))
550 return 0; 499 return 0;
551 500
552 sp = slob_page(block); 501 sp = virt_to_page(block);
553 if (is_slob_page(sp)) { 502 if (PageSlab(sp)) {
554 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
555 unsigned int *m = (unsigned int *)(block - align); 504 unsigned int *m = (unsigned int *)(block - align);
556 return SLOB_UNITS(*m) * SLOB_UNIT; 505 return SLOB_UNITS(*m) * SLOB_UNIT;
557 } else 506 } else
558 return sp->page.private; 507 return sp->private;
559} 508}
560EXPORT_SYMBOL(ksize); 509EXPORT_SYMBOL(ksize);
561 510
562struct kmem_cache { 511struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
563 unsigned int size, align;
564 unsigned long flags;
565 const char *name;
566 void (*ctor)(void *);
567};
568
569struct kmem_cache *kmem_cache_create(const char *name, size_t size,
570 size_t align, unsigned long flags, void (*ctor)(void *)) 512 size_t align, unsigned long flags, void (*ctor)(void *))
571{ 513{
572 struct kmem_cache *c; 514 struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
589 c->align = ARCH_SLAB_MINALIGN; 531 c->align = ARCH_SLAB_MINALIGN;
590 if (c->align < align) 532 if (c->align < align)
591 c->align = align; 533 c->align = align;
592 } else if (flags & SLAB_PANIC)
593 panic("Cannot create slab cache %s\n", name);
594 534
595 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); 535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 }
596 return c; 538 return c;
597} 539}
598EXPORT_SYMBOL(kmem_cache_create);
599 540
600void kmem_cache_destroy(struct kmem_cache *c) 541void kmem_cache_destroy(struct kmem_cache *c)
601{ 542{
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 619}
679EXPORT_SYMBOL(kmem_cache_shrink); 620EXPORT_SYMBOL(kmem_cache_shrink);
680 621
681static unsigned int slob_ready __read_mostly;
682
683int slab_is_available(void)
684{
685 return slob_ready;
686}
687
688void __init kmem_cache_init(void) 622void __init kmem_cache_init(void)
689{ 623{
690 slob_ready = 1; 624 slab_state = UP;
691} 625}
692 626
693void __init kmem_cache_init_late(void) 627void __init kmem_cache_init_late(void)
694{ 628{
695 /* Nothing to do */ 629 slab_state = FULL;
696} 630}
diff --git a/mm/slub.c b/mm/slub.c
index 8c691fa1cf3c..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
19#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/kmemcheck.h> 22#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
33 34
34#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
35 36
37#include "internal.h"
38
36/* 39/*
37 * Lock order: 40 * Lock order:
38 * 1. slub_lock (Global Semaphore) 41 * 1. slab_mutex (Global Mutex)
39 * 2. node->list_lock 42 * 2. node->list_lock
40 * 3. slab_lock(page) (Only on some arches and for debugging) 43 * 3. slab_lock(page) (Only on some arches and for debugging)
41 * 44 *
42 * slub_lock 45 * slab_mutex
43 * 46 *
44 * The role of the slub_lock is to protect the list of all the slabs 47 * The role of the slab_mutex is to protect the list of all the slabs
45 * and to synchronize major metadata changes to slab cache structures. 48 * and to synchronize major metadata changes to slab cache structures.
46 * 49 *
47 * The slab_lock is only used for debugging and on arches that do not 50 * The slab_lock is only used for debugging and on arches that do not
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
182static struct notifier_block slab_notifier; 185static struct notifier_block slab_notifier;
183#endif 186#endif
184 187
185static enum {
186 DOWN, /* No slab functionality available */
187 PARTIAL, /* Kmem_cache_node works */
188 UP, /* Everything works but does not show up in sysfs */
189 SYSFS /* Sysfs up */
190} slab_state = DOWN;
191
192/* A list of all slab caches on the system */
193static DECLARE_RWSEM(slub_lock);
194static LIST_HEAD(slab_caches);
195
196/* 188/*
197 * Tracking user of a slab. 189 * Tracking user of a slab.
198 */ 190 */
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
237 * Core slab cache functions 229 * Core slab cache functions
238 *******************************************************************/ 230 *******************************************************************/
239 231
240int slab_is_available(void)
241{
242 return slab_state >= UP;
243}
244
245static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 232static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246{ 233{
247 return s->node[node]; 234 return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
311 * and whatever may come after it. 298 * and whatever may come after it.
312 */ 299 */
313 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 300 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
314 return s->objsize; 301 return s->object_size;
315 302
316#endif 303#endif
317 /* 304 /*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
609 if (p > addr + 16) 596 if (p > addr + 16)
610 print_section("Bytes b4 ", p - 16, 16); 597 print_section("Bytes b4 ", p - 16, 16);
611 598
612 print_section("Object ", p, min_t(unsigned long, s->objsize, 599 print_section("Object ", p, min_t(unsigned long, s->object_size,
613 PAGE_SIZE)); 600 PAGE_SIZE));
614 if (s->flags & SLAB_RED_ZONE) 601 if (s->flags & SLAB_RED_ZONE)
615 print_section("Redzone ", p + s->objsize, 602 print_section("Redzone ", p + s->object_size,
616 s->inuse - s->objsize); 603 s->inuse - s->object_size);
617 604
618 if (s->offset) 605 if (s->offset)
619 off = s->offset + sizeof(void *); 606 off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
655 u8 *p = object; 642 u8 *p = object;
656 643
657 if (s->flags & __OBJECT_POISON) { 644 if (s->flags & __OBJECT_POISON) {
658 memset(p, POISON_FREE, s->objsize - 1); 645 memset(p, POISON_FREE, s->object_size - 1);
659 p[s->objsize - 1] = POISON_END; 646 p[s->object_size - 1] = POISON_END;
660 } 647 }
661 648
662 if (s->flags & SLAB_RED_ZONE) 649 if (s->flags & SLAB_RED_ZONE)
663 memset(p + s->objsize, val, s->inuse - s->objsize); 650 memset(p + s->object_size, val, s->inuse - s->object_size);
664} 651}
665 652
666static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 653static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
705 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 692 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
706 * 0xa5 (POISON_END) 693 * 0xa5 (POISON_END)
707 * 694 *
708 * object + s->objsize 695 * object + s->object_size
709 * Padding to reach word boundary. This is also used for Redzoning. 696 * Padding to reach word boundary. This is also used for Redzoning.
710 * Padding is extended by another word if Redzoning is enabled and 697 * Padding is extended by another word if Redzoning is enabled and
711 * objsize == inuse. 698 * object_size == inuse.
712 * 699 *
713 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 700 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
714 * 0xcc (RED_ACTIVE) for objects in use. 701 * 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
727 * object + s->size 714 * object + s->size
728 * Nothing is used beyond s->size. 715 * Nothing is used beyond s->size.
729 * 716 *
730 * If slabcaches are merged then the objsize and inuse boundaries are mostly 717 * If slabcaches are merged then the object_size and inuse boundaries are mostly
731 * ignored. And therefore no slab options that rely on these boundaries 718 * ignored. And therefore no slab options that rely on these boundaries
732 * may be used with merged slabcaches. 719 * may be used with merged slabcaches.
733 */ 720 */
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
787 void *object, u8 val) 774 void *object, u8 val)
788{ 775{
789 u8 *p = object; 776 u8 *p = object;
790 u8 *endobject = object + s->objsize; 777 u8 *endobject = object + s->object_size;
791 778
792 if (s->flags & SLAB_RED_ZONE) { 779 if (s->flags & SLAB_RED_ZONE) {
793 if (!check_bytes_and_report(s, page, object, "Redzone", 780 if (!check_bytes_and_report(s, page, object, "Redzone",
794 endobject, val, s->inuse - s->objsize)) 781 endobject, val, s->inuse - s->object_size))
795 return 0; 782 return 0;
796 } else { 783 } else {
797 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 784 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
798 check_bytes_and_report(s, page, p, "Alignment padding", 785 check_bytes_and_report(s, page, p, "Alignment padding",
799 endobject, POISON_INUSE, s->inuse - s->objsize); 786 endobject, POISON_INUSE, s->inuse - s->object_size);
800 } 787 }
801 } 788 }
802 789
803 if (s->flags & SLAB_POISON) { 790 if (s->flags & SLAB_POISON) {
804 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 791 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
805 (!check_bytes_and_report(s, page, p, "Poison", p, 792 (!check_bytes_and_report(s, page, p, "Poison", p,
806 POISON_FREE, s->objsize - 1) || 793 POISON_FREE, s->object_size - 1) ||
807 !check_bytes_and_report(s, page, p, "Poison", 794 !check_bytes_and_report(s, page, p, "Poison",
808 p + s->objsize - 1, POISON_END, 1))) 795 p + s->object_size - 1, POISON_END, 1)))
809 return 0; 796 return 0;
810 /* 797 /*
811 * check_pad_bytes cleans up on its own. 798 * check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
926 page->freelist); 913 page->freelist);
927 914
928 if (!alloc) 915 if (!alloc)
929 print_section("Object ", (void *)object, s->objsize); 916 print_section("Object ", (void *)object, s->object_size);
930 917
931 dump_stack(); 918 dump_stack();
932 } 919 }
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
942 lockdep_trace_alloc(flags); 929 lockdep_trace_alloc(flags);
943 might_sleep_if(flags & __GFP_WAIT); 930 might_sleep_if(flags & __GFP_WAIT);
944 931
945 return should_failslab(s->objsize, flags, s->flags); 932 return should_failslab(s->object_size, flags, s->flags);
946} 933}
947 934
948static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 935static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
949{ 936{
950 flags &= gfp_allowed_mask; 937 flags &= gfp_allowed_mask;
951 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 938 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
952 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 939 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
953} 940}
954 941
955static inline void slab_free_hook(struct kmem_cache *s, void *x) 942static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
966 unsigned long flags; 953 unsigned long flags;
967 954
968 local_irq_save(flags); 955 local_irq_save(flags);
969 kmemcheck_slab_free(s, x, s->objsize); 956 kmemcheck_slab_free(s, x, s->object_size);
970 debug_check_no_locks_freed(x, s->objsize); 957 debug_check_no_locks_freed(x, s->object_size);
971 local_irq_restore(flags); 958 local_irq_restore(flags);
972 } 959 }
973#endif 960#endif
974 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 961 if (!(s->flags & SLAB_DEBUG_OBJECTS))
975 debug_check_no_obj_freed(x, s->objsize); 962 debug_check_no_obj_freed(x, s->object_size);
976} 963}
977 964
978/* 965/*
@@ -1207,7 +1194,7 @@ out:
1207 1194
1208__setup("slub_debug", setup_slub_debug); 1195__setup("slub_debug", setup_slub_debug);
1209 1196
1210static unsigned long kmem_cache_flags(unsigned long objsize, 1197static unsigned long kmem_cache_flags(unsigned long object_size,
1211 unsigned long flags, const char *name, 1198 unsigned long flags, const char *name,
1212 void (*ctor)(void *)) 1199 void (*ctor)(void *))
1213{ 1200{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1237static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1238 struct page *page) {} 1225 struct page *page) {}
1239static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1226static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1240static inline unsigned long kmem_cache_flags(unsigned long objsize, 1227static inline unsigned long kmem_cache_flags(unsigned long object_size,
1241 unsigned long flags, const char *name, 1228 unsigned long flags, const char *name,
1242 void (*ctor)(void *)) 1229 void (*ctor)(void *))
1243{ 1230{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1314 stat(s, ORDER_FALLBACK); 1301 stat(s, ORDER_FALLBACK);
1315 } 1302 }
1316 1303
1317 if (flags & __GFP_WAIT) 1304 if (kmemcheck_enabled && page
1318 local_irq_disable();
1319
1320 if (!page)
1321 return NULL;
1322
1323 if (kmemcheck_enabled
1324 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1325 int pages = 1 << oo_order(oo); 1306 int pages = 1 << oo_order(oo);
1326 1307
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 kmemcheck_mark_unallocated_pages(page, pages); 1317 kmemcheck_mark_unallocated_pages(page, pages);
1337 } 1318 }
1338 1319
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1339 page->objects = oo_objects(oo); 1325 page->objects = oo_objects(oo);
1340 mod_zone_page_state(page_zone(page), 1326 mod_zone_page_state(page_zone(page),
1341 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1357 page->slab = s;
1372 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1373 1361
1374 start = page_address(page); 1362 start = page_address(page);
1375 1363
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1413 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1414 -pages); 1402 -pages);
1415 1403
1404 __ClearPageSlabPfmemalloc(page);
1416 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1417 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1418 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
1490} 1479}
1491 1480
1492/* 1481/*
1493 * Lock slab, remove from the partial list and put the object into the 1482 * Remove slab from the partial list, freeze it and
1494 * per cpu freelist. 1483 * return the pointer to the freelist.
1495 * 1484 *
1496 * Returns a list of objects or NULL if it fails. 1485 * Returns a list of objects or NULL if it fails.
1497 * 1486 *
1498 * Must hold list_lock. 1487 * Must hold list_lock since we modify the partial list.
1499 */ 1488 */
1500static inline void *acquire_slab(struct kmem_cache *s, 1489static inline void *acquire_slab(struct kmem_cache *s,
1501 struct kmem_cache_node *n, struct page *page, 1490 struct kmem_cache_node *n, struct page *page,
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
1510 * The old freelist is the list of objects for the 1499 * The old freelist is the list of objects for the
1511 * per cpu allocation list. 1500 * per cpu allocation list.
1512 */ 1501 */
1513 do { 1502 freelist = page->freelist;
1514 freelist = page->freelist; 1503 counters = page->counters;
1515 counters = page->counters; 1504 new.counters = counters;
1516 new.counters = counters; 1505 if (mode) {
1517 if (mode) { 1506 new.inuse = page->objects;
1518 new.inuse = page->objects; 1507 new.freelist = NULL;
1519 new.freelist = NULL; 1508 } else {
1520 } else { 1509 new.freelist = freelist;
1521 new.freelist = freelist; 1510 }
1522 }
1523 1511
1524 VM_BUG_ON(new.frozen); 1512 VM_BUG_ON(new.frozen);
1525 new.frozen = 1; 1513 new.frozen = 1;
1526 1514
1527 } while (!__cmpxchg_double_slab(s, page, 1515 if (!__cmpxchg_double_slab(s, page,
1528 freelist, counters, 1516 freelist, counters,
1529 new.freelist, new.counters, 1517 new.freelist, new.counters,
1530 "lock and freeze")); 1518 "acquire_slab"))
1519 return NULL;
1531 1520
1532 remove_partial(n, page); 1521 remove_partial(n, page);
1522 WARN_ON(!freelist);
1533 return freelist; 1523 return freelist;
1534} 1524}
1535 1525
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s,
1563 1553
1564 if (!object) { 1554 if (!object) {
1565 c->page = page; 1555 c->page = page;
1566 c->node = page_to_nid(page);
1567 stat(s, ALLOC_FROM_PARTIAL); 1556 stat(s, ALLOC_FROM_PARTIAL);
1568 object = t; 1557 object = t;
1569 available = page->objects - page->inuse; 1558 available = page->objects - page->inuse;
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1617 1606
1618 do { 1607 do {
1619 cpuset_mems_cookie = get_mems_allowed(); 1608 cpuset_mems_cookie = get_mems_allowed();
1620 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 zonelist = node_zonelist(slab_node(), flags);
1621 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1622 struct kmem_cache_node *n; 1611 struct kmem_cache_node *n;
1623 1612
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1731/* 1720/*
1732 * Remove the cpu slab 1721 * Remove the cpu slab
1733 */ 1722 */
1734static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1723static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1735{ 1724{
1736 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1725 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1737 struct page *page = c->page;
1738 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1726 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1739 int lock = 0; 1727 int lock = 0;
1740 enum slab_modes l = M_NONE, m = M_NONE; 1728 enum slab_modes l = M_NONE, m = M_NONE;
1741 void *freelist;
1742 void *nextfree; 1729 void *nextfree;
1743 int tail = DEACTIVATE_TO_HEAD; 1730 int tail = DEACTIVATE_TO_HEAD;
1744 struct page new; 1731 struct page new;
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1749 tail = DEACTIVATE_TO_TAIL; 1736 tail = DEACTIVATE_TO_TAIL;
1750 } 1737 }
1751 1738
1752 c->tid = next_tid(c->tid);
1753 c->page = NULL;
1754 freelist = c->freelist;
1755 c->freelist = NULL;
1756
1757 /* 1739 /*
1758 * Stage one: Free all available per cpu objects back 1740 * Stage one: Free all available per cpu objects back
1759 * to the page freelist while it is still frozen. Leave the 1741 * to the page freelist while it is still frozen. Leave the
@@ -1879,21 +1861,31 @@ redo:
1879 } 1861 }
1880} 1862}
1881 1863
1882/* Unfreeze all the cpu partial slabs */ 1864/*
1865 * Unfreeze all the cpu partial slabs.
1866 *
1867 * This function must be called with interrupt disabled.
1868 */
1883static void unfreeze_partials(struct kmem_cache *s) 1869static void unfreeze_partials(struct kmem_cache *s)
1884{ 1870{
1885 struct kmem_cache_node *n = NULL; 1871 struct kmem_cache_node *n = NULL, *n2 = NULL;
1886 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1872 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1887 struct page *page, *discard_page = NULL; 1873 struct page *page, *discard_page = NULL;
1888 1874
1889 while ((page = c->partial)) { 1875 while ((page = c->partial)) {
1890 enum slab_modes { M_PARTIAL, M_FREE };
1891 enum slab_modes l, m;
1892 struct page new; 1876 struct page new;
1893 struct page old; 1877 struct page old;
1894 1878
1895 c->partial = page->next; 1879 c->partial = page->next;
1896 l = M_FREE; 1880
1881 n2 = get_node(s, page_to_nid(page));
1882 if (n != n2) {
1883 if (n)
1884 spin_unlock(&n->list_lock);
1885
1886 n = n2;
1887 spin_lock(&n->list_lock);
1888 }
1897 1889
1898 do { 1890 do {
1899 1891
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
1906 1898
1907 new.frozen = 0; 1899 new.frozen = 0;
1908 1900
1909 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1901 } while (!__cmpxchg_double_slab(s, page,
1910 m = M_FREE;
1911 else {
1912 struct kmem_cache_node *n2 = get_node(s,
1913 page_to_nid(page));
1914
1915 m = M_PARTIAL;
1916 if (n != n2) {
1917 if (n)
1918 spin_unlock(&n->list_lock);
1919
1920 n = n2;
1921 spin_lock(&n->list_lock);
1922 }
1923 }
1924
1925 if (l != m) {
1926 if (l == M_PARTIAL) {
1927 remove_partial(n, page);
1928 stat(s, FREE_REMOVE_PARTIAL);
1929 } else {
1930 add_partial(n, page,
1931 DEACTIVATE_TO_TAIL);
1932 stat(s, FREE_ADD_PARTIAL);
1933 }
1934
1935 l = m;
1936 }
1937
1938 } while (!cmpxchg_double_slab(s, page,
1939 old.freelist, old.counters, 1902 old.freelist, old.counters,
1940 new.freelist, new.counters, 1903 new.freelist, new.counters,
1941 "unfreezing slab")); 1904 "unfreezing slab"));
1942 1905
1943 if (m == M_FREE) { 1906 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1944 page->next = discard_page; 1907 page->next = discard_page;
1945 discard_page = page; 1908 discard_page = page;
1909 } else {
1910 add_partial(n, page, DEACTIVATE_TO_TAIL);
1911 stat(s, FREE_ADD_PARTIAL);
1946 } 1912 }
1947 } 1913 }
1948 1914
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2011static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2012{ 1978{
2013 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
2014 deactivate_slab(s, c); 1980 deactivate_slab(s, c->page, c->freelist);
1981
1982 c->tid = next_tid(c->tid);
1983 c->page = NULL;
1984 c->freelist = NULL;
2015} 1985}
2016 1986
2017/* 1987/*
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
2055 * Check if the objects in a per cpu structure fit numa 2025 * Check if the objects in a per cpu structure fit numa
2056 * locality expectations. 2026 * locality expectations.
2057 */ 2027 */
2058static inline int node_match(struct kmem_cache_cpu *c, int node) 2028static inline int node_match(struct page *page, int node)
2059{ 2029{
2060#ifdef CONFIG_NUMA 2030#ifdef CONFIG_NUMA
2061 if (node != NUMA_NO_NODE && c->node != node) 2031 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2062 return 0; 2032 return 0;
2063#endif 2033#endif
2064 return 1; 2034 return 1;
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2101 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2071 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2102 nid, gfpflags); 2072 nid, gfpflags);
2103 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2073 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2104 "default order: %d, min order: %d\n", s->name, s->objsize, 2074 "default order: %d, min order: %d\n", s->name, s->object_size,
2105 s->size, oo_order(s->oo), oo_order(s->min)); 2075 s->size, oo_order(s->oo), oo_order(s->min));
2106 2076
2107 if (oo_order(s->min) > get_order(s->objsize)) 2077 if (oo_order(s->min) > get_order(s->object_size))
2108 printk(KERN_WARNING " %s debugging increased min order, use " 2078 printk(KERN_WARNING " %s debugging increased min order, use "
2109 "slub_debug=O to disable.\n", s->name); 2079 "slub_debug=O to disable.\n", s->name);
2110 2080
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2130static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2100static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2131 int node, struct kmem_cache_cpu **pc) 2101 int node, struct kmem_cache_cpu **pc)
2132{ 2102{
2133 void *object; 2103 void *freelist;
2134 struct kmem_cache_cpu *c; 2104 struct kmem_cache_cpu *c = *pc;
2135 struct page *page = new_slab(s, flags, node); 2105 struct page *page;
2136 2106
2107 freelist = get_partial(s, flags, node, c);
2108
2109 if (freelist)
2110 return freelist;
2111
2112 page = new_slab(s, flags, node);
2137 if (page) { 2113 if (page) {
2138 c = __this_cpu_ptr(s->cpu_slab); 2114 c = __this_cpu_ptr(s->cpu_slab);
2139 if (c->page) 2115 if (c->page)
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2143 * No other reference to the page yet so we can 2119 * No other reference to the page yet so we can
2144 * muck around with it freely without cmpxchg 2120 * muck around with it freely without cmpxchg
2145 */ 2121 */
2146 object = page->freelist; 2122 freelist = page->freelist;
2147 page->freelist = NULL; 2123 page->freelist = NULL;
2148 2124
2149 stat(s, ALLOC_SLAB); 2125 stat(s, ALLOC_SLAB);
2150 c->node = page_to_nid(page);
2151 c->page = page; 2126 c->page = page;
2152 *pc = c; 2127 *pc = c;
2153 } else 2128 } else
2154 object = NULL; 2129 freelist = NULL;
2155 2130
2156 return object; 2131 return freelist;
2132}
2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2157} 2140}
2158 2141
2159/* 2142/*
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2163 * The page is still frozen if the return value is not NULL. 2146 * The page is still frozen if the return value is not NULL.
2164 * 2147 *
2165 * If this function returns NULL then the page has been unfrozen. 2148 * If this function returns NULL then the page has been unfrozen.
2149 *
2150 * This function must be called with interrupt disabled.
2166 */ 2151 */
2167static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2152static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2168{ 2153{
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2173 do { 2158 do {
2174 freelist = page->freelist; 2159 freelist = page->freelist;
2175 counters = page->counters; 2160 counters = page->counters;
2161
2176 new.counters = counters; 2162 new.counters = counters;
2177 VM_BUG_ON(!new.frozen); 2163 VM_BUG_ON(!new.frozen);
2178 2164
2179 new.inuse = page->objects; 2165 new.inuse = page->objects;
2180 new.frozen = freelist != NULL; 2166 new.frozen = freelist != NULL;
2181 2167
2182 } while (!cmpxchg_double_slab(s, page, 2168 } while (!__cmpxchg_double_slab(s, page,
2183 freelist, counters, 2169 freelist, counters,
2184 NULL, new.counters, 2170 NULL, new.counters,
2185 "get_freelist")); 2171 "get_freelist"));
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2206static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2192static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2207 unsigned long addr, struct kmem_cache_cpu *c) 2193 unsigned long addr, struct kmem_cache_cpu *c)
2208{ 2194{
2209 void **object; 2195 void *freelist;
2196 struct page *page;
2210 unsigned long flags; 2197 unsigned long flags;
2211 2198
2212 local_irq_save(flags); 2199 local_irq_save(flags);
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2219 c = this_cpu_ptr(s->cpu_slab); 2206 c = this_cpu_ptr(s->cpu_slab);
2220#endif 2207#endif
2221 2208
2222 if (!c->page) 2209 page = c->page;
2210 if (!page)
2223 goto new_slab; 2211 goto new_slab;
2224redo: 2212redo:
2225 if (unlikely(!node_match(c, node))) { 2213
2214 if (unlikely(!node_match(page, node))) {
2226 stat(s, ALLOC_NODE_MISMATCH); 2215 stat(s, ALLOC_NODE_MISMATCH);
2227 deactivate_slab(s, c); 2216 deactivate_slab(s, page, c->freelist);
2217 c->page = NULL;
2218 c->freelist = NULL;
2219 goto new_slab;
2220 }
2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2228 goto new_slab; 2231 goto new_slab;
2229 } 2232 }
2230 2233
2231 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2232 object = c->freelist; 2235 freelist = c->freelist;
2233 if (object) 2236 if (freelist)
2234 goto load_freelist; 2237 goto load_freelist;
2235 2238
2236 stat(s, ALLOC_SLOWPATH); 2239 stat(s, ALLOC_SLOWPATH);
2237 2240
2238 object = get_freelist(s, c->page); 2241 freelist = get_freelist(s, page);
2239 2242
2240 if (!object) { 2243 if (!freelist) {
2241 c->page = NULL; 2244 c->page = NULL;
2242 stat(s, DEACTIVATE_BYPASS); 2245 stat(s, DEACTIVATE_BYPASS);
2243 goto new_slab; 2246 goto new_slab;
@@ -2246,50 +2249,50 @@ redo:
2246 stat(s, ALLOC_REFILL); 2249 stat(s, ALLOC_REFILL);
2247 2250
2248load_freelist: 2251load_freelist:
2249 c->freelist = get_freepointer(s, object); 2252 /*
2253 * freelist is pointing to the list of objects to be used.
2254 * page is pointing to the page from which the objects are obtained.
2255 * That page must be frozen for per cpu allocations to work.
2256 */
2257 VM_BUG_ON(!c->page->frozen);
2258 c->freelist = get_freepointer(s, freelist);
2250 c->tid = next_tid(c->tid); 2259 c->tid = next_tid(c->tid);
2251 local_irq_restore(flags); 2260 local_irq_restore(flags);
2252 return object; 2261 return freelist;
2253 2262
2254new_slab: 2263new_slab:
2255 2264
2256 if (c->partial) { 2265 if (c->partial) {
2257 c->page = c->partial; 2266 page = c->page = c->partial;
2258 c->partial = c->page->next; 2267 c->partial = page->next;
2259 c->node = page_to_nid(c->page);
2260 stat(s, CPU_PARTIAL_ALLOC); 2268 stat(s, CPU_PARTIAL_ALLOC);
2261 c->freelist = NULL; 2269 c->freelist = NULL;
2262 goto redo; 2270 goto redo;
2263 } 2271 }
2264 2272
2265 /* Then do expensive stuff like retrieving pages from the partial lists */ 2273 freelist = new_slab_objects(s, gfpflags, node, &c);
2266 object = get_partial(s, gfpflags, node, c);
2267
2268 if (unlikely(!object)) {
2269 2274
2270 object = new_slab_objects(s, gfpflags, node, &c); 2275 if (unlikely(!freelist)) {
2276 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2277 slab_out_of_memory(s, gfpflags, node);
2271 2278
2272 if (unlikely(!object)) { 2279 local_irq_restore(flags);
2273 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2280 return NULL;
2274 slab_out_of_memory(s, gfpflags, node);
2275
2276 local_irq_restore(flags);
2277 return NULL;
2278 }
2279 } 2281 }
2280 2282
2281 if (likely(!kmem_cache_debug(s))) 2283 page = c->page;
2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2282 goto load_freelist; 2285 goto load_freelist;
2283 2286
2284 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2285 if (!alloc_debug_processing(s, c->page, object, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2286 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2287 2290
2288 c->freelist = get_freepointer(s, object); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
2289 deactivate_slab(s, c); 2292 c->page = NULL;
2290 c->node = NUMA_NO_NODE; 2293 c->freelist = NULL;
2291 local_irq_restore(flags); 2294 local_irq_restore(flags);
2292 return object; 2295 return freelist;
2293} 2296}
2294 2297
2295/* 2298/*
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
2307{ 2310{
2308 void **object; 2311 void **object;
2309 struct kmem_cache_cpu *c; 2312 struct kmem_cache_cpu *c;
2313 struct page *page;
2310 unsigned long tid; 2314 unsigned long tid;
2311 2315
2312 if (slab_pre_alloc_hook(s, gfpflags)) 2316 if (slab_pre_alloc_hook(s, gfpflags))
@@ -2332,8 +2336,8 @@ redo:
2332 barrier(); 2336 barrier();
2333 2337
2334 object = c->freelist; 2338 object = c->freelist;
2335 if (unlikely(!object || !node_match(c, node))) 2339 page = c->page;
2336 2340 if (unlikely(!object || !node_match(page, node)))
2337 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2338 2342
2339 else { 2343 else {
@@ -2364,7 +2368,7 @@ redo:
2364 } 2368 }
2365 2369
2366 if (unlikely(gfpflags & __GFP_ZERO) && object) 2370 if (unlikely(gfpflags & __GFP_ZERO) && object)
2367 memset(object, 0, s->objsize); 2371 memset(object, 0, s->object_size);
2368 2372
2369 slab_post_alloc_hook(s, gfpflags, object); 2373 slab_post_alloc_hook(s, gfpflags, object);
2370 2374
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2375{ 2379{
2376 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2380 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2377 2381
2378 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2382 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2379 2383
2380 return ret; 2384 return ret;
2381} 2385}
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2405 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2409 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2406 2410
2407 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2411 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2408 s->objsize, s->size, gfpflags, node); 2412 s->object_size, s->size, gfpflags, node);
2409 2413
2410 return ret; 2414 return ret;
2411} 2415}
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2900static int calculate_sizes(struct kmem_cache *s, int forced_order) 2904static int calculate_sizes(struct kmem_cache *s, int forced_order)
2901{ 2905{
2902 unsigned long flags = s->flags; 2906 unsigned long flags = s->flags;
2903 unsigned long size = s->objsize; 2907 unsigned long size = s->object_size;
2904 unsigned long align = s->align; 2908 unsigned long align = s->align;
2905 int order; 2909 int order;
2906 2910
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2929 * end of the object and the free pointer. If not then add an 2933 * end of the object and the free pointer. If not then add an
2930 * additional word to have some bytes to store Redzone information. 2934 * additional word to have some bytes to store Redzone information.
2931 */ 2935 */
2932 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2936 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2933 size += sizeof(void *); 2937 size += sizeof(void *);
2934#endif 2938#endif
2935 2939
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2977 * user specified and the dynamic determination of cache line size 2981 * user specified and the dynamic determination of cache line size
2978 * on bootup. 2982 * on bootup.
2979 */ 2983 */
2980 align = calculate_alignment(flags, align, s->objsize); 2984 align = calculate_alignment(flags, align, s->object_size);
2981 s->align = align; 2985 s->align = align;
2982 2986
2983 /* 2987 /*
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3025 memset(s, 0, kmem_size); 3029 memset(s, 0, kmem_size);
3026 s->name = name; 3030 s->name = name;
3027 s->ctor = ctor; 3031 s->ctor = ctor;
3028 s->objsize = size; 3032 s->object_size = size;
3029 s->align = align; 3033 s->align = align;
3030 s->flags = kmem_cache_flags(size, flags, name, ctor); 3034 s->flags = kmem_cache_flags(size, flags, name, ctor);
3031 s->reserved = 0; 3035 s->reserved = 0;
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3040 * Disable debugging flags that store metadata if the min slab 3044 * Disable debugging flags that store metadata if the min slab
3041 * order increased. 3045 * order increased.
3042 */ 3046 */
3043 if (get_order(s->size) > get_order(s->objsize)) { 3047 if (get_order(s->size) > get_order(s->object_size)) {
3044 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->flags &= ~DEBUG_METADATA_FLAGS;
3045 s->offset = 0; 3049 s->offset = 0;
3046 if (!calculate_sizes(s, -1)) 3050 if (!calculate_sizes(s, -1))
@@ -3114,7 +3118,7 @@ error:
3114 */ 3118 */
3115unsigned int kmem_cache_size(struct kmem_cache *s) 3119unsigned int kmem_cache_size(struct kmem_cache *s)
3116{ 3120{
3117 return s->objsize; 3121 return s->object_size;
3118} 3122}
3119EXPORT_SYMBOL(kmem_cache_size); 3123EXPORT_SYMBOL(kmem_cache_size);
3120 3124
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3192 */ 3196 */
3193void kmem_cache_destroy(struct kmem_cache *s) 3197void kmem_cache_destroy(struct kmem_cache *s)
3194{ 3198{
3195 down_write(&slub_lock); 3199 mutex_lock(&slab_mutex);
3196 s->refcount--; 3200 s->refcount--;
3197 if (!s->refcount) { 3201 if (!s->refcount) {
3198 list_del(&s->list); 3202 list_del(&s->list);
3199 up_write(&slub_lock); 3203 mutex_unlock(&slab_mutex);
3200 if (kmem_cache_close(s)) { 3204 if (kmem_cache_close(s)) {
3201 printk(KERN_ERR "SLUB %s: %s called for cache that " 3205 printk(KERN_ERR "SLUB %s: %s called for cache that "
3202 "still has objects.\n", s->name, __func__); 3206 "still has objects.\n", s->name, __func__);
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
3206 rcu_barrier(); 3210 rcu_barrier();
3207 sysfs_slab_remove(s); 3211 sysfs_slab_remove(s);
3208 } else 3212 } else
3209 up_write(&slub_lock); 3213 mutex_unlock(&slab_mutex);
3210} 3214}
3211EXPORT_SYMBOL(kmem_cache_destroy); 3215EXPORT_SYMBOL(kmem_cache_destroy);
3212 3216
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3268 3272
3269 /* 3273 /*
3270 * This function is called with IRQs disabled during early-boot on 3274 * This function is called with IRQs disabled during early-boot on
3271 * single CPU so there's no need to take slub_lock here. 3275 * single CPU so there's no need to take slab_mutex here.
3272 */ 3276 */
3273 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3277 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3274 flags, NULL)) 3278 flags, NULL))
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
3553{ 3557{
3554 struct kmem_cache *s; 3558 struct kmem_cache *s;
3555 3559
3556 down_read(&slub_lock); 3560 mutex_lock(&slab_mutex);
3557 list_for_each_entry(s, &slab_caches, list) 3561 list_for_each_entry(s, &slab_caches, list)
3558 kmem_cache_shrink(s); 3562 kmem_cache_shrink(s);
3559 up_read(&slub_lock); 3563 mutex_unlock(&slab_mutex);
3560 3564
3561 return 0; 3565 return 0;
3562} 3566}
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
3577 if (offline_node < 0) 3581 if (offline_node < 0)
3578 return; 3582 return;
3579 3583
3580 down_read(&slub_lock); 3584 mutex_lock(&slab_mutex);
3581 list_for_each_entry(s, &slab_caches, list) { 3585 list_for_each_entry(s, &slab_caches, list) {
3582 n = get_node(s, offline_node); 3586 n = get_node(s, offline_node);
3583 if (n) { 3587 if (n) {
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
3593 kmem_cache_free(kmem_cache_node, n); 3597 kmem_cache_free(kmem_cache_node, n);
3594 } 3598 }
3595 } 3599 }
3596 up_read(&slub_lock); 3600 mutex_unlock(&slab_mutex);
3597} 3601}
3598 3602
3599static int slab_mem_going_online_callback(void *arg) 3603static int slab_mem_going_online_callback(void *arg)
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
3616 * allocate a kmem_cache_node structure in order to bring the node 3620 * allocate a kmem_cache_node structure in order to bring the node
3617 * online. 3621 * online.
3618 */ 3622 */
3619 down_read(&slub_lock); 3623 mutex_lock(&slab_mutex);
3620 list_for_each_entry(s, &slab_caches, list) { 3624 list_for_each_entry(s, &slab_caches, list) {
3621 /* 3625 /*
3622 * XXX: kmem_cache_alloc_node will fallback to other nodes 3626 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg)
3632 s->node[nid] = n; 3636 s->node[nid] = n;
3633 } 3637 }
3634out: 3638out:
3635 up_read(&slub_lock); 3639 mutex_unlock(&slab_mutex);
3636 return ret; 3640 return ret;
3637} 3641}
3638 3642
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void)
3843 3847
3844 if (s && s->size) { 3848 if (s && s->size) {
3845 char *name = kasprintf(GFP_NOWAIT, 3849 char *name = kasprintf(GFP_NOWAIT,
3846 "dma-kmalloc-%d", s->objsize); 3850 "dma-kmalloc-%d", s->object_size);
3847 3851
3848 BUG_ON(!name); 3852 BUG_ON(!name);
3849 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3853 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3850 s->objsize, SLAB_CACHE_DMA); 3854 s->object_size, SLAB_CACHE_DMA);
3851 } 3855 }
3852 } 3856 }
3853#endif 3857#endif
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
3924 return NULL; 3928 return NULL;
3925} 3929}
3926 3930
3927struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3931struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3928 size_t align, unsigned long flags, void (*ctor)(void *)) 3932 size_t align, unsigned long flags, void (*ctor)(void *))
3929{ 3933{
3930 struct kmem_cache *s; 3934 struct kmem_cache *s;
3931 char *n; 3935 char *n;
3932 3936
3933 if (WARN_ON(!name))
3934 return NULL;
3935
3936 down_write(&slub_lock);
3937 s = find_mergeable(size, align, flags, name, ctor); 3937 s = find_mergeable(size, align, flags, name, ctor);
3938 if (s) { 3938 if (s) {
3939 s->refcount++; 3939 s->refcount++;
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3941 * Adjust the object sizes so that we clear 3941 * Adjust the object sizes so that we clear
3942 * the complete object on kzalloc. 3942 * the complete object on kzalloc.
3943 */ 3943 */
3944 s->objsize = max(s->objsize, (int)size); 3944 s->object_size = max(s->object_size, (int)size);
3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3946 3946
3947 if (sysfs_slab_alias(s, name)) { 3947 if (sysfs_slab_alias(s, name)) {
3948 s->refcount--; 3948 s->refcount--;
3949 goto err; 3949 return NULL;
3950 } 3950 }
3951 up_write(&slub_lock);
3952 return s; 3951 return s;
3953 } 3952 }
3954 3953
3955 n = kstrdup(name, GFP_KERNEL); 3954 n = kstrdup(name, GFP_KERNEL);
3956 if (!n) 3955 if (!n)
3957 goto err; 3956 return NULL;
3958 3957
3959 s = kmalloc(kmem_size, GFP_KERNEL); 3958 s = kmalloc(kmem_size, GFP_KERNEL);
3960 if (s) { 3959 if (s) {
3961 if (kmem_cache_open(s, n, 3960 if (kmem_cache_open(s, n,
3962 size, align, flags, ctor)) { 3961 size, align, flags, ctor)) {
3962 int r;
3963
3963 list_add(&s->list, &slab_caches); 3964 list_add(&s->list, &slab_caches);
3964 up_write(&slub_lock); 3965 mutex_unlock(&slab_mutex);
3965 if (sysfs_slab_add(s)) { 3966 r = sysfs_slab_add(s);
3966 down_write(&slub_lock); 3967 mutex_lock(&slab_mutex);
3967 list_del(&s->list); 3968
3968 kfree(n); 3969 if (!r)
3969 kfree(s); 3970 return s;
3970 goto err; 3971
3971 } 3972 list_del(&s->list);
3972 return s; 3973 kmem_cache_close(s);
3973 } 3974 }
3974 kfree(s); 3975 kfree(s);
3975 } 3976 }
3976 kfree(n); 3977 kfree(n);
3977err: 3978 return NULL;
3978 up_write(&slub_lock);
3979
3980 if (flags & SLAB_PANIC)
3981 panic("Cannot create slabcache %s\n", name);
3982 else
3983 s = NULL;
3984 return s;
3985} 3979}
3986EXPORT_SYMBOL(kmem_cache_create);
3987 3980
3988#ifdef CONFIG_SMP 3981#ifdef CONFIG_SMP
3989/* 3982/*
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
4002 case CPU_UP_CANCELED_FROZEN: 3995 case CPU_UP_CANCELED_FROZEN:
4003 case CPU_DEAD: 3996 case CPU_DEAD:
4004 case CPU_DEAD_FROZEN: 3997 case CPU_DEAD_FROZEN:
4005 down_read(&slub_lock); 3998 mutex_lock(&slab_mutex);
4006 list_for_each_entry(s, &slab_caches, list) { 3999 list_for_each_entry(s, &slab_caches, list) {
4007 local_irq_save(flags); 4000 local_irq_save(flags);
4008 __flush_cpu_slab(s, cpu); 4001 __flush_cpu_slab(s, cpu);
4009 local_irq_restore(flags); 4002 local_irq_restore(flags);
4010 } 4003 }
4011 up_read(&slub_lock); 4004 mutex_unlock(&slab_mutex);
4012 break; 4005 break;
4013 default: 4006 default:
4014 break; 4007 break;
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4500 4493
4501 for_each_possible_cpu(cpu) { 4494 for_each_possible_cpu(cpu) {
4502 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4495 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4503 int node = ACCESS_ONCE(c->node); 4496 int node;
4504 struct page *page; 4497 struct page *page;
4505 4498
4506 if (node < 0)
4507 continue;
4508 page = ACCESS_ONCE(c->page); 4499 page = ACCESS_ONCE(c->page);
4509 if (page) { 4500 if (!page)
4510 if (flags & SO_TOTAL) 4501 continue;
4511 x = page->objects;
4512 else if (flags & SO_OBJECTS)
4513 x = page->inuse;
4514 else
4515 x = 1;
4516 4502
4517 total += x; 4503 node = page_to_nid(page);
4518 nodes[node] += x; 4504 if (flags & SO_TOTAL)
4519 } 4505 x = page->objects;
4520 page = c->partial; 4506 else if (flags & SO_OBJECTS)
4507 x = page->inuse;
4508 else
4509 x = 1;
4521 4510
4511 total += x;
4512 nodes[node] += x;
4513
4514 page = ACCESS_ONCE(c->partial);
4522 if (page) { 4515 if (page) {
4523 x = page->pobjects; 4516 x = page->pobjects;
4524 total += x; 4517 total += x;
4525 nodes[node] += x; 4518 nodes[node] += x;
4526 } 4519 }
4520
4527 per_cpu[node]++; 4521 per_cpu[node]++;
4528 } 4522 }
4529 } 4523 }
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align);
4623 4617
4624static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4618static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4625{ 4619{
4626 return sprintf(buf, "%d\n", s->objsize); 4620 return sprintf(buf, "%d\n", s->object_size);
4627} 4621}
4628SLAB_ATTR_RO(object_size); 4622SLAB_ATTR_RO(object_size);
4629 4623
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5286 const char *name; 5280 const char *name;
5287 int unmergeable; 5281 int unmergeable;
5288 5282
5289 if (slab_state < SYSFS) 5283 if (slab_state < FULL)
5290 /* Defer until later */ 5284 /* Defer until later */
5291 return 0; 5285 return 0;
5292 5286
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5331 5325
5332static void sysfs_slab_remove(struct kmem_cache *s) 5326static void sysfs_slab_remove(struct kmem_cache *s)
5333{ 5327{
5334 if (slab_state < SYSFS) 5328 if (slab_state < FULL)
5335 /* 5329 /*
5336 * Sysfs has not been setup yet so no need to remove the 5330 * Sysfs has not been setup yet so no need to remove the
5337 * cache from sysfs. 5331 * cache from sysfs.
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5359{ 5353{
5360 struct saved_alias *al; 5354 struct saved_alias *al;
5361 5355
5362 if (slab_state == SYSFS) { 5356 if (slab_state == FULL) {
5363 /* 5357 /*
5364 * If we have a leftover link then remove it. 5358 * If we have a leftover link then remove it.
5365 */ 5359 */
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void)
5383 struct kmem_cache *s; 5377 struct kmem_cache *s;
5384 int err; 5378 int err;
5385 5379
5386 down_write(&slub_lock); 5380 mutex_lock(&slab_mutex);
5387 5381
5388 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5382 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5389 if (!slab_kset) { 5383 if (!slab_kset) {
5390 up_write(&slub_lock); 5384 mutex_unlock(&slab_mutex);
5391 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5385 printk(KERN_ERR "Cannot register slab subsystem.\n");
5392 return -ENOSYS; 5386 return -ENOSYS;
5393 } 5387 }
5394 5388
5395 slab_state = SYSFS; 5389 slab_state = FULL;
5396 5390
5397 list_for_each_entry(s, &slab_caches, list) { 5391 list_for_each_entry(s, &slab_caches, list) {
5398 err = sysfs_slab_add(s); 5392 err = sysfs_slab_add(s);
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void)
5408 err = sysfs_slab_alias(al->s, al->name); 5402 err = sysfs_slab_alias(al->s, al->name);
5409 if (err) 5403 if (err)
5410 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5404 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5411 " %s to sysfs\n", s->name); 5405 " %s to sysfs\n", al->name);
5412 kfree(al); 5406 kfree(al);
5413 } 5407 }
5414 5408
5415 up_write(&slub_lock); 5409 mutex_unlock(&slab_mutex);
5416 resiliency_test(); 5410 resiliency_test();
5417 return 0; 5411 return 0;
5418} 5412}
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init);
5427static void print_slabinfo_header(struct seq_file *m) 5421static void print_slabinfo_header(struct seq_file *m)
5428{ 5422{
5429 seq_puts(m, "slabinfo - version: 2.1\n"); 5423 seq_puts(m, "slabinfo - version: 2.1\n");
5430 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5424 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5431 "<objperslab> <pagesperslab>"); 5425 "<objperslab> <pagesperslab>");
5432 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5426 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5433 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5427 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
5438{ 5432{
5439 loff_t n = *pos; 5433 loff_t n = *pos;
5440 5434
5441 down_read(&slub_lock); 5435 mutex_lock(&slab_mutex);
5442 if (!n) 5436 if (!n)
5443 print_slabinfo_header(m); 5437 print_slabinfo_header(m);
5444 5438
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5452 5446
5453static void s_stop(struct seq_file *m, void *p) 5447static void s_stop(struct seq_file *m, void *p)
5454{ 5448{
5455 up_read(&slub_lock); 5449 mutex_unlock(&slab_mutex);
5456} 5450}
5457 5451
5458static int s_show(struct seq_file *m, void *p) 5452static int s_show(struct seq_file *m, void *p)
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -275,8 +263,9 @@ static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 263sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long size) 264 unsigned long size)
277{ 265{
278 pg_data_t *host_pgdat; 266 unsigned long goal, limit;
279 unsigned long goal; 267 unsigned long *p;
268 int nid;
280 /* 269 /*
281 * A page may contain usemaps for other sections preventing the 270 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 271 * page being freed and making a section unremovable while
@@ -287,10 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 276 * from the same section as the pgdat where possible to avoid
288 * this problem. 277 * this problem.
289 */ 278 */
290 goal = __pa(pgdat) & PAGE_SECTION_MASK; 279 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); 280 limit = goal + (1UL << PA_SECTION_SHIFT);
292 return __alloc_bootmem_node_nopanic(host_pgdat, size, 281 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
293 SMP_CACHE_BYTES, goal); 282again:
283 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
284 SMP_CACHE_BYTES, goal, limit);
285 if (!p && limit) {
286 limit = 0;
287 goto again;
288 }
289 return p;
294} 290}
295 291
296static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -485,6 +481,9 @@ void __init sparse_init(void)
485 struct page **map_map; 481 struct page **map_map;
486#endif 482#endif
487 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
488 /* 487 /*
489 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
490 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec67078..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
274 * Must have room for at least one page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,9 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
34 37
35#include <asm/pgtable.h> 38#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 45static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 47
45static DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
47long nr_swap_pages; 50long nr_swap_pages;
48long total_swap_pages; 51long total_swap_pages;
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 56static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 57static const char Unused_offset[] = "Unused swap offset entry ";
55 58
56static struct swap_list_t swap_list = {-1, -1}; 59struct swap_list_t swap_list = {-1, -1};
57 60
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 61struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 62
60static DEFINE_MUTEX(swapon_mutex); 63static DEFINE_MUTEX(swapon_mutex);
61 64
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
546 549
547 /* free if no reference */ 550 /* free if no reference */
548 if (!usage) { 551 if (!usage) {
549 struct gendisk *disk = p->bdev->bd_disk;
550 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
551 p->lowest_bit = offset; 553 p->lowest_bit = offset;
552 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
559 if ((p->flags & SWP_BLKDEV) && 561 frontswap_invalidate_page(p->type, offset);
560 disk->fops->swap_slot_free_notify) 562 if (p->flags & SWP_BLKDEV) {
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 563 struct gendisk *disk = p->bdev->bd_disk;
564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
562 } 568 }
563 569
564 return usage; 570 return usage;
@@ -829,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
829 835
830 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
831 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
832 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
833 mem_cgroup_cancel_charge_swapin(memcg);
834 ret = 0; 839 ret = 0;
835 goto out; 840 goto out;
836 } 841 }
@@ -985,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm,
985} 990}
986 991
987/* 992/*
988 * Scan swap_map from current position to next entry still in use. 993 * Scan swap_map (or frontswap_map if frontswap parameter is true)
994 * from current position to next entry still in use.
989 * Recycle to start on reaching the end, returning 0 when empty. 995 * Recycle to start on reaching the end, returning 0 when empty.
990 */ 996 */
991static unsigned int find_next_to_unuse(struct swap_info_struct *si, 997static unsigned int find_next_to_unuse(struct swap_info_struct *si,
992 unsigned int prev) 998 unsigned int prev, bool frontswap)
993{ 999{
994 unsigned int max = si->max; 1000 unsigned int max = si->max;
995 unsigned int i = prev; 1001 unsigned int i = prev;
@@ -1015,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1015 prev = 0; 1021 prev = 0;
1016 i = 1; 1022 i = 1;
1017 } 1023 }
1024 if (frontswap) {
1025 if (frontswap_test(si, i))
1026 break;
1027 else
1028 continue;
1029 }
1018 count = si->swap_map[i]; 1030 count = si->swap_map[i];
1019 if (count && swap_count(count) != SWAP_MAP_BAD) 1031 if (count && swap_count(count) != SWAP_MAP_BAD)
1020 break; 1032 break;
@@ -1026,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1026 * We completely avoid races by reading each swap page in advance, 1038 * We completely avoid races by reading each swap page in advance,
1027 * and then search for the process using it. All the necessary 1039 * and then search for the process using it. All the necessary
1028 * page table adjustments can then be made atomically. 1040 * page table adjustments can then be made atomically.
1041 *
1042 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1043 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1029 */ 1044 */
1030static int try_to_unuse(unsigned int type) 1045int try_to_unuse(unsigned int type, bool frontswap,
1046 unsigned long pages_to_unuse)
1031{ 1047{
1032 struct swap_info_struct *si = swap_info[type]; 1048 struct swap_info_struct *si = swap_info[type];
1033 struct mm_struct *start_mm; 1049 struct mm_struct *start_mm;
@@ -1060,7 +1076,7 @@ static int try_to_unuse(unsigned int type)
1060 * one pass through swap_map is enough, but not necessarily: 1076 * one pass through swap_map is enough, but not necessarily:
1061 * there are races when an instance of an entry might be missed. 1077 * there are races when an instance of an entry might be missed.
1062 */ 1078 */
1063 while ((i = find_next_to_unuse(si, i)) != 0) { 1079 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1064 if (signal_pending(current)) { 1080 if (signal_pending(current)) {
1065 retval = -EINTR; 1081 retval = -EINTR;
1066 break; 1082 break;
@@ -1227,6 +1243,10 @@ static int try_to_unuse(unsigned int type)
1227 * interactive performance. 1243 * interactive performance.
1228 */ 1244 */
1229 cond_resched(); 1245 cond_resched();
1246 if (frontswap && pages_to_unuse > 0) {
1247 if (!--pages_to_unuse)
1248 break;
1249 }
1230 } 1250 }
1231 1251
1232 mmput(start_mm); 1252 mmput(start_mm);
@@ -1310,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1310 list_del(&se->list); 1330 list_del(&se->list);
1311 kfree(se); 1331 kfree(se);
1312 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1313} 1341}
1314 1342
1315/* 1343/*
@@ -1318,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1318 * 1346 *
1319 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1320 */ 1348 */
1321static int 1349int
1322add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1323 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1324{ 1352{
@@ -1391,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1391 */ 1419 */
1392static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1393{ 1421{
1394 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1395 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1396 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1397 unsigned blkbits;
1398 sector_t probe_block;
1399 sector_t last_block;
1400 sector_t lowest_block = -1;
1401 sector_t highest_block = 0;
1402 int nr_extents = 0;
1403 int ret; 1425 int ret;
1404 1426
1405 inode = sis->swap_file->f_mapping->host;
1406 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1407 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1408 *span = sis->pages; 1429 *span = sis->pages;
1409 goto out; 1430 return ret;
1410 } 1431 }
1411 1432
1412 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1413 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1414 1435 if (!ret) {
1415 /* 1436 sis->flags |= SWP_FILE;
1416 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1417 * to be very smart. 1438 *span = sis->pages;
1418 */
1419 probe_block = 0;
1420 page_no = 0;
1421 last_block = i_size_read(inode) >> blkbits;
1422 while ((probe_block + blocks_per_page) <= last_block &&
1423 page_no < sis->max) {
1424 unsigned block_in_page;
1425 sector_t first_block;
1426
1427 first_block = bmap(inode, probe_block);
1428 if (first_block == 0)
1429 goto bad_bmap;
1430
1431 /*
1432 * It must be PAGE_SIZE aligned on-disk
1433 */
1434 if (first_block & (blocks_per_page - 1)) {
1435 probe_block++;
1436 goto reprobe;
1437 }
1438
1439 for (block_in_page = 1; block_in_page < blocks_per_page;
1440 block_in_page++) {
1441 sector_t block;
1442
1443 block = bmap(inode, probe_block + block_in_page);
1444 if (block == 0)
1445 goto bad_bmap;
1446 if (block != first_block + block_in_page) {
1447 /* Discontiguity */
1448 probe_block++;
1449 goto reprobe;
1450 }
1451 }
1452
1453 first_block >>= (PAGE_SHIFT - blkbits);
1454 if (page_no) { /* exclude the header page */
1455 if (first_block < lowest_block)
1456 lowest_block = first_block;
1457 if (first_block > highest_block)
1458 highest_block = first_block;
1459 } 1439 }
1440 return ret;
1441 }
1460 1442
1461 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1462 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1463 */
1464 ret = add_swap_extent(sis, page_no, 1, first_block);
1465 if (ret < 0)
1466 goto out;
1467 nr_extents += ret;
1468 page_no++;
1469 probe_block += blocks_per_page;
1470reprobe:
1471 continue;
1472 }
1473 ret = nr_extents;
1474 *span = 1 + highest_block - lowest_block;
1475 if (page_no == 0)
1476 page_no = 1; /* force Empty message */
1477 sis->max = page_no;
1478 sis->pages = page_no - 1;
1479 sis->highest_bit = page_no - 1;
1480out:
1481 return ret;
1482bad_bmap:
1483 printk(KERN_ERR "swapon: swapfile has holes\n");
1484 ret = -EINVAL;
1485 goto out;
1486} 1444}
1487 1445
1488static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
1489 unsigned char *swap_map) 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map)
1490{ 1449{
1491 int i, prev; 1450 int i, prev;
1492 1451
@@ -1496,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1496 else 1455 else
1497 p->prio = --least_priority; 1456 p->prio = --least_priority;
1498 p->swap_map = swap_map; 1457 p->swap_map = swap_map;
1458 frontswap_map_set(p, frontswap_map);
1499 p->flags |= SWP_WRITEOK; 1459 p->flags |= SWP_WRITEOK;
1500 nr_swap_pages += p->pages; 1460 nr_swap_pages += p->pages;
1501 total_swap_pages += p->pages; 1461 total_swap_pages += p->pages;
@@ -1512,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1512 swap_list.head = swap_list.next = p->type; 1472 swap_list.head = swap_list.next = p->type;
1513 else 1473 else
1514 swap_info[prev]->next = p->type; 1474 swap_info[prev]->next = p->type;
1475 frontswap_init(p->type);
1515 spin_unlock(&swap_lock); 1476 spin_unlock(&swap_lock);
1516} 1477}
1517 1478
@@ -1585,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 spin_unlock(&swap_lock); 1546 spin_unlock(&swap_lock);
1586 1547
1587 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1548 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1588 err = try_to_unuse(type); 1549 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1589 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1550 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1590 1551
1591 if (err) { 1552 if (err) {
@@ -1596,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1596 * sys_swapoff for this swap_info_struct at this point. 1557 * sys_swapoff for this swap_info_struct at this point.
1597 */ 1558 */
1598 /* re-insert swap space back into swap_list */ 1559 /* re-insert swap space back into swap_list */
1599 enable_swap_info(p, p->prio, p->swap_map); 1560 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1600 goto out_dput; 1561 goto out_dput;
1601 } 1562 }
1602 1563
@@ -1622,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1622 swap_map = p->swap_map; 1583 swap_map = p->swap_map;
1623 p->swap_map = NULL; 1584 p->swap_map = NULL;
1624 p->flags = 0; 1585 p->flags = 0;
1586 frontswap_invalidate_area(type);
1625 spin_unlock(&swap_lock); 1587 spin_unlock(&swap_lock);
1626 mutex_unlock(&swapon_mutex); 1588 mutex_unlock(&swapon_mutex);
1627 vfree(swap_map); 1589 vfree(swap_map);
1590 vfree(frontswap_map_get(p));
1628 /* Destroy swap account information */ 1591 /* Destroy swap account information */
1629 swap_cgroup_swapoff(type); 1592 swap_cgroup_swapoff(type);
1630 1593
@@ -1893,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1893 1856
1894 /* 1857 /*
1895 * Find out how many pages are allowed for a single swap 1858 * Find out how many pages are allowed for a single swap
1896 * device. There are three limiting factors: 1) the number 1859 * device. There are two limiting factors: 1) the number
1897 * of bits for the swap offset in the swp_entry_t type, and 1860 * of bits for the swap offset in the swp_entry_t type, and
1898 * 2) the number of bits in the swap pte as defined by the 1861 * 2) the number of bits in the swap pte as defined by the
1899 * the different architectures, and 3) the number of free bits 1862 * different architectures. In order to find the
1900 * in an exceptional radix_tree entry. In order to find the
1901 * largest possible bit mask, a swap entry with swap type 0 1863 * largest possible bit mask, a swap entry with swap type 0
1902 * and swap offset ~0UL is created, encoded to a swap pte, 1864 * and swap offset ~0UL is created, encoded to a swap pte,
1903 * decoded to a swp_entry_t again, and finally the swap 1865 * decoded to a swp_entry_t again, and finally the swap
1904 * offset is extracted. This will mask all the bits from 1866 * offset is extracted. This will mask all the bits from
1905 * the initial ~0UL mask that can't be encoded in either 1867 * the initial ~0UL mask that can't be encoded in either
1906 * the swp_entry_t or the architecture definition of a 1868 * the swp_entry_t or the architecture definition of a
1907 * swap pte. Then the same is done for a radix_tree entry. 1869 * swap pte.
1908 */ 1870 */
1909 maxpages = swp_offset(pte_to_swp_entry( 1871 maxpages = swp_offset(pte_to_swp_entry(
1910 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1872 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1911 maxpages = swp_offset(radix_to_swp_entry(
1912 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1913
1914 if (maxpages > swap_header->info.last_page) { 1873 if (maxpages > swap_header->info.last_page) {
1915 maxpages = swap_header->info.last_page + 1; 1874 maxpages = swap_header->info.last_page + 1;
1916 /* p->max is an unsigned int: don't overflow it */ 1875 /* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1988 sector_t span; 1947 sector_t span;
1989 unsigned long maxpages; 1948 unsigned long maxpages;
1990 unsigned char *swap_map = NULL; 1949 unsigned char *swap_map = NULL;
1950 unsigned long *frontswap_map = NULL;
1991 struct page *page = NULL; 1951 struct page *page = NULL;
1992 struct inode *inode = NULL; 1952 struct inode *inode = NULL;
1993 1953
@@ -2071,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2071 error = nr_extents; 2031 error = nr_extents;
2072 goto bad_swap; 2032 goto bad_swap;
2073 } 2033 }
2034 /* frontswap enabled? set up bit-per-page map for frontswap */
2035 if (frontswap_enabled)
2036 frontswap_map = vzalloc(maxpages / sizeof(long));
2074 2037
2075 if (p->bdev) { 2038 if (p->bdev) {
2076 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2039 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2086 if (swap_flags & SWAP_FLAG_PREFER) 2049 if (swap_flags & SWAP_FLAG_PREFER)
2087 prio = 2050 prio =
2088 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2051 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2089 enable_swap_info(p, prio, swap_map); 2052 enable_swap_info(p, prio, swap_map, frontswap_map);
2090 2053
2091 printk(KERN_INFO "Adding %uk swap on %s. " 2054 printk(KERN_INFO "Adding %uk swap on %s. "
2092 "Priority:%d extents:%d across:%lluk %s%s\n", 2055 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2093 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2056 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2094 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2095 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2096 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2059 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2060 (frontswap_map) ? "FS" : "");
2097 2061
2098 mutex_unlock(&swapon_mutex); 2062 mutex_unlock(&swapon_mutex);
2099 atomic_inc(&proc_poll_event); 2063 atomic_inc(&proc_poll_event);
@@ -2261,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2261 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2262} 2226}
2263 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2264/* 2253/*
2265 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2266 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2aad49981b57..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what caller wants since
910 * get_order(0) returns funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
1280struct vm_struct *vmlist; 1288struct vm_struct *vmlist;
1281 1289
1282static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller) 1291 unsigned long flags, const void *caller)
1284{ 1292{
1285 vm->flags = flags; 1293 vm->flags = flags;
1286 vm->addr = (void *)va->va_start; 1294 vm->addr = (void *)va->va_start;
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
1306} 1314}
1307 1315
1308static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1309 unsigned long flags, void *caller) 1317 unsigned long flags, const void *caller)
1310{ 1318{
1311 setup_vmalloc_vm(vm, va, flags, caller); 1319 setup_vmalloc_vm(vm, va, flags, caller);
1312 insert_vmalloc_vmlist(vm); 1320 insert_vmalloc_vmlist(vm);
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1314 1322
1315static struct vm_struct *__get_vm_area_node(unsigned long size, 1323static struct vm_struct *__get_vm_area_node(unsigned long size,
1316 unsigned long align, unsigned long flags, unsigned long start, 1324 unsigned long align, unsigned long flags, unsigned long start,
1317 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1325 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1318{ 1326{
1319 struct vmap_area *va; 1327 struct vmap_area *va;
1320 struct vm_struct *area; 1328 struct vm_struct *area;
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
1375 1383
1376struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1384struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1378 void *caller) 1386 const void *caller)
1379{ 1387{
1380 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1381 caller); 1389 caller);
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1397} 1405}
1398 1406
1399struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1400 void *caller) 1408 const void *caller)
1401{ 1409{
1402 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1403 -1, GFP_KERNEL, caller); 1411 -1, GFP_KERNEL, caller);
1404} 1412}
1405 1413
1406static struct vm_struct *find_vm_area(const void *addr) 1414/**
1415 * find_vm_area - find a continuous kernel virtual area
1416 * @addr: base address
1417 *
1418 * Search for the kernel VM area starting at @addr, and return it.
1419 * It is up to the caller to do all required locking to keep the returned
1420 * pointer valid.
1421 */
1422struct vm_struct *find_vm_area(const void *addr)
1407{ 1423{
1408 struct vmap_area *va; 1424 struct vmap_area *va;
1409 1425
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
1568 1584
1569static void *__vmalloc_node(unsigned long size, unsigned long align, 1585static void *__vmalloc_node(unsigned long size, unsigned long align,
1570 gfp_t gfp_mask, pgprot_t prot, 1586 gfp_t gfp_mask, pgprot_t prot,
1571 int node, void *caller); 1587 int node, const void *caller);
1572static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1588static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1573 pgprot_t prot, int node, void *caller) 1589 pgprot_t prot, int node, const void *caller)
1574{ 1590{
1575 const int order = 0; 1591 const int order = 0;
1576 struct page **pages; 1592 struct page **pages;
@@ -1643,7 +1659,7 @@ fail:
1643 */ 1659 */
1644void *__vmalloc_node_range(unsigned long size, unsigned long align, 1660void *__vmalloc_node_range(unsigned long size, unsigned long align,
1645 unsigned long start, unsigned long end, gfp_t gfp_mask, 1661 unsigned long start, unsigned long end, gfp_t gfp_mask,
1646 pgprot_t prot, int node, void *caller) 1662 pgprot_t prot, int node, const void *caller)
1647{ 1663{
1648 struct vm_struct *area; 1664 struct vm_struct *area;
1649 void *addr; 1665 void *addr;
@@ -1699,7 +1715,7 @@ fail:
1699 */ 1715 */
1700static void *__vmalloc_node(unsigned long size, unsigned long align, 1716static void *__vmalloc_node(unsigned long size, unsigned long align,
1701 gfp_t gfp_mask, pgprot_t prot, 1717 gfp_t gfp_mask, pgprot_t prot,
1702 int node, void *caller) 1718 int node, const void *caller)
1703{ 1719{
1704 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1720 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1705 gfp_mask, prot, node, caller); 1721 gfp_mask, prot, node, caller);
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1975 * IOREMAP area is treated as memory hole and no copy is done. 1991 * IOREMAP area is treated as memory hole and no copy is done.
1976 * 1992 *
1977 * If [addr...addr+count) doesn't includes any intersects with alive 1993 * If [addr...addr+count) doesn't includes any intersects with alive
1978 * vm_struct area, returns 0. 1994 * vm_struct area, returns 0. @buf should be kernel's buffer.
1979 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1980 * the caller should guarantee KM_USER0 is not used.
1981 * 1995 *
1982 * Note: In usual ops, vread() is never necessary because the caller 1996 * Note: In usual ops, vread() is never necessary because the caller
1983 * should know vmalloc() area is valid and can use memcpy(). 1997 * should know vmalloc() area is valid and can use memcpy().
@@ -2051,9 +2065,7 @@ finished:
2051 * IOREMAP area is treated as memory hole and no copy is done. 2065 * IOREMAP area is treated as memory hole and no copy is done.
2052 * 2066 *
2053 * If [addr...addr+count) doesn't includes any intersects with alive 2067 * If [addr...addr+count) doesn't includes any intersects with alive
2054 * vm_struct area, returns 0. 2068 * vm_struct area, returns 0. @buf should be kernel's buffer.
2055 * @buf should be kernel's buffer. Because this function uses KM_USER0,
2056 * the caller should guarantee KM_USER0 is not used.
2057 * 2069 *
2058 * Note: In usual ops, vwrite() is never necessary because the caller 2070 * Note: In usual ops, vwrite() is never necessary because the caller
2059 * should know vmalloc() area is valid and can use memcpy(). 2071 * should know vmalloc() area is valid and can use memcpy().
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
737 * Worryingly, ext4 gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
746 * setting PageReclaim here end up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
1567 * by looking at the fraction of the pages scanned we did rotate back 1601 * by looking at the fraction of the pages scanned we did rotate back
1568 * onto the active list instead of evict. 1602 * onto the active list instead of evict.
1569 * 1603 *
1570 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1604 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1605 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1571 */ 1606 */
1572static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1607static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1573 unsigned long *nr) 1608 unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
2111 return 0; 2146 return 0;
2112} 2147}
2113 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2114unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2115 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2116{ 2228{
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2130 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2131 }; 2243 };
2132 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2133 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2134 sc.may_writepage, 2255 sc.may_writepage,
2135 gfp_mask); 2256 gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2141 return nr_reclaimed; 2262 return nr_reclaimed;
2142} 2263}
2143 2264
2144#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2145 2266
2146unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2147 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2274 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2275} 2396}
2276 2397
2277/* is kswapd sleeping prematurely? */ 2398/*
2278static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2279 int classzone_idx) 2405 int classzone_idx)
2280{ 2406{
2281 int i; 2407 int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2284 2410
2285 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2286 if (remaining) 2412 if (remaining)
2287 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
2418 * processes get throttled, kswapd wakes, a large process exits thereby
2419 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2288 2428
2289 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2290 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2317 * must be balanced 2457 * must be balanced
2318 */ 2458 */
2319 if (order) 2459 if (order)
2320 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2321 else 2461 else
2322 return !all_zones_ok; 2462 return all_zones_ok;
2323} 2463}
2324 2464
2325/* 2465/*
@@ -2537,7 +2677,7 @@ loop_again:
2537 * consider it to be no longer congested. It's 2677 * consider it to be no longer congested. It's
2538 * possible there are dirty pages backed by 2678 * possible there are dirty pages backed by
2539 * congested BDIs but as pressure is relieved, 2679 * congested BDIs but as pressure is relieved,
2540 * spectulatively avoid congestion waits 2680 * speculatively avoid congestion waits
2541 */ 2681 */
2542 zone_clear_flag(zone, ZONE_CONGESTED); 2682 zone_clear_flag(zone, ZONE_CONGESTED);
2543 if (i <= *classzone_idx) 2683 if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
2545 } 2685 }
2546 2686
2547 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2691 * to be throttled on pfmemalloc_wait as they should not be
2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2548 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2549 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2550 /* 2700 /*
@@ -2646,7 +2796,7 @@ out:
2646 } 2796 }
2647 2797
2648 /* 2798 /*
2649 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2650 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2651 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2652 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2666 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2667 2817
2668 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2669 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2670 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2671 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2672 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2676 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2677 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2678 */ 2828 */
2679 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2680 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2681 2831
2682 /* 2832 /*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2688 * them before going back to sleep. 2838 * them before going back to sleep.
2689 */ 2839 */
2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2691 schedule(); 2841
2842 if (!kthread_should_stop())
2843 schedule();
2844
2692 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2845 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2693 } else { 2846 } else {
2694 if (remaining) 2847 if (remaining)
@@ -2955,14 +3108,17 @@ int kswapd_run(int nid)
2955} 3108}
2956 3109
2957/* 3110/*
2958 * Called by memory hotplug when all memory in a node is offlined. 3111 * Called by memory hotplug when all memory in a node is offlined. Caller must
3112 * hold lock_memory_hotplug().
2959 */ 3113 */
2960void kswapd_stop(int nid) 3114void kswapd_stop(int nid)
2961{ 3115{
2962 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3116 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2963 3117
2964 if (kswapd) 3118 if (kswapd) {
2965 kthread_stop(kswapd); 3119 kthread_stop(kswapd);
3120 NODE_DATA(nid)->kswapd = NULL;
3121 }
2966} 3122}
2967 3123
2968static int __init kswapd_init(void) 3124static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776ad..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",