commit ada47b5fe13d89735805b566185f4885f5a3f750
tree   644b88f8a71896307d71438e9b3af49126ffb22b /mm
parent 43e98717ad40a4ae64545b5ba047c7b86aa44f4f
parent 3280f21d43ee541f97f8cda5792150d2dbec20d5
author    Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400

    Merge branch 'wip-2.6.34' into old-private-master
Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig            |   28
 -rw-r--r--  mm/Makefile           |    7
 -rw-r--r--  mm/allocpercpu.c      |  177
 -rw-r--r--  mm/backing-dev.c      |   39
 -rw-r--r--  mm/bootmem.c          |  228
 -rw-r--r--  mm/bounce.c           |    1
 -rw-r--r--  mm/fadvise.c          |   10
 -rw-r--r--  mm/failslab.c         |   19
 -rw-r--r--  mm/filemap.c          |  175
 -rw-r--r--  mm/filemap_xip.c      |    3
 -rw-r--r--  mm/fremap.c           |    2
 -rw-r--r--  mm/highmem.c          |    2
 -rw-r--r--  mm/hugetlb.c          |  565
 -rw-r--r--  mm/hwpoison-inject.c  |  113
 -rw-r--r--  mm/internal.h         |   35
 -rw-r--r--  mm/kmemleak.c         |  193
 -rw-r--r--  mm/ksm.c              |  971
 -rw-r--r--  mm/maccess.c          |   11
 -rw-r--r--  mm/madvise.c          |   21
 -rw-r--r--  mm/memcontrol.c       | 1835
 -rw-r--r--  mm/memory-failure.c   |  579
 -rw-r--r--  mm/memory.c           |  216
 -rw-r--r--  mm/memory_hotplug.c   |   26
 -rw-r--r--  mm/mempolicy.c        |  234
 -rw-r--r--  mm/migrate.c          |  177
 -rw-r--r--  mm/mincore.c          |   39
 -rw-r--r--  mm/mlock.c            |   57
 -rw-r--r--  mm/mmap.c             |  398
 -rw-r--r--  mm/mmu_context.c      |    4
 -rw-r--r--  mm/mmu_notifier.c     |    1
 -rw-r--r--  mm/mprotect.c         |    1
 -rw-r--r--  mm/mremap.c           |  249
 -rw-r--r--  mm/nommu.c            |  187
 -rw-r--r--  mm/oom_kill.c         |  116
 -rw-r--r--  mm/page-writeback.c   |   12
 -rw-r--r--  mm/page_alloc.c       |  527
 -rw-r--r--  mm/page_cgroup.c      |   42
 -rw-r--r--  mm/page_io.c          |   18
 -rw-r--r--  mm/pagewalk.c         |   59
 -rw-r--r--  mm/percpu.c           |   90
 -rw-r--r--  mm/percpu_up.c        |   30
 -rw-r--r--  mm/quicklist.c        |    1
 -rw-r--r--  mm/readahead.c        |   19
 -rw-r--r--  mm/rmap.c             |  568
 -rw-r--r--  mm/shmem.c            |   84
 -rw-r--r--  mm/shmem_acl.c        |  171
 -rw-r--r--  mm/slab.c             |  186
 -rw-r--r--  mm/slub.c             |  368
 -rw-r--r--  mm/sparse-vmemmap.c   |   77
 -rw-r--r--  mm/sparse.c           |  197
 -rw-r--r--  mm/swap.c             |    3
 -rw-r--r--  mm/swap_state.c       |    1
 -rw-r--r--  mm/swapfile.c         |  918
 -rw-r--r--  mm/truncate.c         |   39
 -rw-r--r--  mm/util.c             |   23
 -rw-r--r--  mm/vmalloc.c          |  125
 -rw-r--r--  mm/vmscan.c           |  464
 -rw-r--r--  mm/vmstat.c           |   28

 58 files changed, 7338 insertions(+), 3431 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 44cf6f0a3a6d..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
 config SPARSEMEM_VMEMMAP_ENABLE
 	bool
 
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+	def_bool y
+	depends on SPARSEMEM && X86_64
+
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
 	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -158,11 +162,13 @@ config PAGEFLAGS_EXTENDED
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
 #
 config SPLIT_PTLOCK_CPUS
 	int
-	default "4096" if ARM && !CPU_CACHE_VIPT
-	default "4096" if PARISC && !PA20
+	default "999999" if ARM && !CPU_CACHE_VIPT
+	default "999999" if PARISC && !PA20
+	default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
 	default "4"
 
 #
@@ -193,21 +199,13 @@ config BOUNCE
 config NR_QUICK
 	int
 	depends on QUICKLIST
-	default "2" if SUPERH || AVR32
+	default "2" if AVR32
 	default "1"
 
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
-config HAVE_MLOCK
-	bool
-	default y if MMU=y
-
-config HAVE_MLOCKED_PAGE_BIT
-	bool
-	default y if HAVE_MLOCK=y
-
 config MMU_NOTIFIER
 	bool
 
@@ -218,7 +216,7 @@ config KSM
 	  Enable Kernel Samepage Merging: KSM periodically scans those areas
 	  of an application's address space that an app has advised may be
 	  mergeable. When it finds pages of identical content, it replaces
-	  the many instances by a single resident page with that content, so
+	  the many instances by a single page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
@@ -227,6 +225,7 @@ config KSM
 
 config DEFAULT_MMAP_MIN_ADDR
 	int "Low address space to protect from user allocation"
+	depends on MMU
 	default 4096
 	help
 	  This is the portion of low virtual memory which should be protected
@@ -257,8 +256,9 @@ config MEMORY_FAILURE
 	  special hardware support and typically ECC memory.
 
 config HWPOISON_INJECT
-	tristate "Poison pages injector"
-	depends on MEMORY_FAILURE && DEBUG_KERNEL
+	tristate "HWPoison pages injector"
+	depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
+	select PROC_PAGE_MONITOR
 
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
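
A note on the "999999" defaults above: whether PTE locks are split per page table is decided at compile time by comparing NR_CPUS against this symbol, so an unreachably large value simply forces the fallback to the single mm->page_table_lock whenever a debug spinlock_t would bloat struct page. A paraphrased sketch of the gating macro (assuming the include/linux/mm_types.h of this era; illustrative, not verbatim):

/* Split PTE locks only when the configured CPU threshold is reached;
 * a "999999" default can never be met, so debug kernels keep the
 * single mm->page_table_lock and struct page stays small. */
#define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)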
diff --git a/mm/Makefile b/mm/Makefile
index ebf849042ed3..6c2a73a54a43 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
@@ -34,10 +33,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
-obj-$(CONFIG_SMP) += percpu.o
+ifdef CONFIG_SMP
+obj-y += percpu.o
 else
-obj-$(CONFIG_SMP) += allocpercpu.o
+obj-y += percpu_up.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * linux/mm/allocpercpu.c
- *
- * Separated from slab.c August 11, 2006 Christoph Lameter
- */
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/bootmem.h>
-#include <asm/sections.h>
-
-#ifndef cache_line_size
-#define cache_line_size()	L1_CACHE_BYTES
-#endif
-
-/**
- * percpu_depopulate - depopulate per-cpu data for given cpu
- * @__pdata: per-cpu data to depopulate
- * @cpu: depopulate per-cpu data for this cpu
- *
- * Depopulating per-cpu data for a cpu going offline would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- */
-static void percpu_depopulate(void *__pdata, int cpu)
-{
-	struct percpu_data *pdata = __percpu_disguise(__pdata);
-
-	kfree(pdata->ptrs[cpu]);
-	pdata->ptrs[cpu] = NULL;
-}
-
-/**
- * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
- * @__pdata: per-cpu data to depopulate
- * @mask: depopulate per-cpu data for cpu's selected through mask bits
- */
-static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
-{
-	int cpu;
-	for_each_cpu_mask_nr(cpu, *mask)
-		percpu_depopulate(__pdata, cpu);
-}
-
-#define percpu_depopulate_mask(__pdata, mask) \
-	__percpu_depopulate_mask((__pdata), &(mask))
-
-/**
- * percpu_populate - populate per-cpu data for given cpu
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @cpu: populate per-data for this cpu
- *
- * Populating per-cpu data for a cpu coming online would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- * Per-cpu object is populated with zeroed buffer.
- */
-static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
-{
-	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	int node = cpu_to_node(cpu);
-
-	/*
-	 * We should make sure each CPU gets private memory.
-	 */
-	size = roundup(size, cache_line_size());
-
-	BUG_ON(pdata->ptrs[cpu]);
-	if (node_online(node))
-		pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
-	else
-		pdata->ptrs[cpu] = kzalloc(size, gfp);
-	return pdata->ptrs[cpu];
-}
-
-/**
- * percpu_populate_mask - populate per-cpu data for more cpu's
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-cpu data for cpu's selected through mask bits
- *
- * Per-cpu objects are populated with zeroed buffers.
- */
-static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-				  cpumask_t *mask)
-{
-	cpumask_t populated;
-	int cpu;
-
-	cpus_clear(populated);
-	for_each_cpu_mask_nr(cpu, *mask)
-		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
-			__percpu_depopulate_mask(__pdata, &populated);
-			return -ENOMEM;
-		} else
-			cpu_set(cpu, populated);
-	return 0;
-}
-
-#define percpu_populate_mask(__pdata, size, gfp, mask) \
-	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
-
-/**
- * alloc_percpu - initial setup of per-cpu data
- * @size: size of per-cpu object
- * @align: alignment
- *
- * Allocate dynamic percpu area. Percpu objects are populated with
- * zeroed buffers.
- */
-void *__alloc_percpu(size_t size, size_t align)
-{
-	/*
-	 * We allocate whole cache lines to avoid false sharing
-	 */
-	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-	void *pdata = kzalloc(sz, GFP_KERNEL);
-	void *__pdata = __percpu_disguise(pdata);
-
-	/*
-	 * Can't easily make larger alignment work with kmalloc. WARN
-	 * on it. Larger alignment should only be used for module
-	 * percpu sections on SMP for which this path isn't used.
-	 */
-	WARN_ON_ONCE(align > SMP_CACHE_BYTES);
-
-	if (unlikely(!pdata))
-		return NULL;
-	if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
-					   &cpu_possible_map)))
-		return __pdata;
-	kfree(pdata);
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-/**
- * free_percpu - final cleanup of per-cpu data
- * @__pdata: object to clean up
- *
- * We simply clean up any per-cpu object left. No need for the client to
- * track and specify through a bis mask which per-cpu objects are to free.
- */
-void free_percpu(void *__pdata)
-{
-	if (unlikely(!__pdata))
-		return;
-	__percpu_depopulate_mask(__pdata, cpu_possible_mask);
-	kfree(__percpu_disguise(__pdata));
-}
-EXPORT_SYMBOL_GPL(free_percpu);
-
-/*
- * Generic percpu area setup.
- */
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-
-EXPORT_SYMBOL(__per_cpu_offset);
-
-void __init setup_per_cpu_areas(void)
-{
-	unsigned long size, i;
-	char *ptr;
-	unsigned long nr_possible_cpus = num_possible_cpus();
-
-	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-
-	for_each_possible_cpu(i) {
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-		ptr += size;
-	}
-}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
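
The deletion above retires the legacy allocpercpu implementation; per the Makefile change earlier, its role is taken by the unified dynamic percpu allocator (mm/percpu.c on SMP, mm/percpu_up.c on UP). A minimal sketch of the surviving caller-facing API, which is unchanged for alloc_percpu()/free_percpu() users (the example module is illustrative, not part of this commit):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>

struct hit_counter {
	long hits;
};

static struct hit_counter __percpu *counters;

static int __init counters_init(void)
{
	long total = 0;
	int cpu;

	counters = alloc_percpu(struct hit_counter);	/* zeroed on every CPU */
	if (!counters)
		return -ENOMEM;

	get_cpu_ptr(counters)->hits++;	/* fast path: this CPU's copy only */
	put_cpu_ptr(counters);

	for_each_possible_cpu(cpu)	/* slow path: walk every copy */
		total += per_cpu_ptr(counters, cpu)->hits;
	pr_info("hits so far: %ld\n", total);

	free_percpu(counters);
	return 0;
}
module_init(counters_init);

Unlike the removed code, the unified allocator needs no NR_CPUS-sized pointer array per object and no populate/depopulate bookkeeping; per-CPU chunk offsets replace the disguised pointer table.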
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 67a33a5a1a93..707d0dc6da0f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
 
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = {
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
+struct backing_dev_info noop_backing_dev_info = {
+	.name	= "noop",
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
 static struct class *bdi_class;
 
 /*
@@ -227,6 +234,9 @@ static struct device_attribute bdi_dev_attrs[] = {
 static __init int bdi_class_init(void)
 {
 	bdi_class = class_create(THIS_MODULE, "bdi");
+	if (IS_ERR(bdi_class))
+		return PTR_ERR(bdi_class);
+
 	bdi_class->dev_attrs = bdi_dev_attrs;
 	bdi_debug_init();
 	return 0;
@@ -609,7 +619,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	 * it would never exet if it is currently stuck in the refrigerator.
 	 */
 	list_for_each_entry(wb, &bdi->wb_list, list) {
-		wb->task->flags &= ~PF_FROZEN;
+		thaw_process(wb->task);
 		kthread_stop(wb->task);
 	}
 }
@@ -712,6 +722,33 @@ void bdi_destroy(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_destroy);
 
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+			   unsigned int cap)
+{
+	char tmp[32];
+	int err;
+
+	bdi->name = name;
+	bdi->capabilities = cap;
+	err = bdi_init(bdi);
+	if (err)
+		return err;
+
+	sprintf(tmp, "%.28s%s", name, "-%d");
+	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	if (err) {
+		bdi_destroy(bdi);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
+
 static wait_queue_head_t congestion_wqh[2] = {
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
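
A sketch of how the new helper is meant to be used from a filesystem's fill_super (the filesystem and its sb-info struct are hypothetical, and the per-super s_bdi assignment assumes kernels of this vintage that carry it):

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/slab.h>

struct example_sb_info {
	struct backing_dev_info bdi;
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	struct example_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	int err;

	if (!sbi)
		return -ENOMEM;

	/* one call replaces an open-coded bdi_init() + bdi_register()
	 * pair; the "-%d" template fed from bdi_seq keeps names unique */
	err = bdi_setup_and_register(&sbi->bdi, "example_fs", BDI_CAP_MAP_COPY);
	if (err) {
		kfree(sbi);
		return err;
	}

	sb->s_fs_info = sbi;
	sb->s_bdi = &sbi->bdi;	/* writeback targets this bdi */
	return 0;
}

Note the "%.28s" cap in bdi_setup_and_register: it leaves room in the 32-byte buffer for the "-%d" instance suffix, so longer names are silently truncated.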
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..58c66cc5056a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,9 +10,11 @@
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
+#include <linux/range.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -32,6 +34,7 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
+#ifndef CONFIG_NO_BOOTMEM
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +145,78 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	min_low_pfn = start;
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
+#endif
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long cursor, end;
 
+	kmemleak_free_part(__va(addr), size);
+
+	cursor = PFN_UP(addr);
+	end = PFN_DOWN(addr + size);
+
+	for (; cursor < end; cursor++) {
+		__free_pages_bootmem(pfn_to_page(cursor), 0);
+		totalram_pages++;
+	}
+}
+
+#ifdef CONFIG_NO_BOOTMEM
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	int i;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	if (end_aligned <= start_aligned) {
+		for (i = start; i < end; i++)
+			__free_pages_bootmem(pfn_to_page(i), 0);
+
+		return;
+	}
+
+	for (i = start; i < start_aligned; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+
+	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(i), order);
+
+	for (i = end_aligned; i < end; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	int i;
+	u64 start, end;
+	unsigned long count = 0;
+	struct range *range = NULL;
+	int nr_range;
+
+	nr_range = get_free_all_memory_range(&range, nodeid);
+
+	for (i = 0; i < nr_range; i++) {
+		start = range[i].start;
+		end = range[i].end;
+		count += end - start;
+		__free_pages_memory(start, end);
+	}
+
+	return count;
+}
+#else
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
@@ -203,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 	return count;
 }
+#endif
 
 /**
  * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -213,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
 	register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+	/* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+	return 0;
+#else
 	return free_all_bootmem_core(pgdat->bdata);
+#endif
 }
 
 /**
@@ -223,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 */
 unsigned long __init free_all_bootmem(void)
 {
-	return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#ifdef CONFIG_NO_BOOTMEM
+	/*
+	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 * because in some case like Node0 doesnt have RAM installed
+	 * low ram will be on Node1
+	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
+	 * will be used instead of only Node0 related
+	 */
+	return free_all_memory_core_early(MAX_NUMNODES);
+#else
+	unsigned long total_pages = 0;
+	bootmem_data_t *bdata;
+
+	list_for_each_entry(bdata, &bdata_list, list)
+		total_pages += free_all_bootmem_core(bdata);
+
+	return total_pages;
+#endif
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void __init __free(bootmem_data_t *bdata,
 			unsigned long sidx, unsigned long eidx)
 {
@@ -320,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 	}
 	BUG();
 }
+#endif
 
 /**
  * free_bootmem_node - mark a page range as usable
@@ -334,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 			      unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(physaddr, physaddr + size);
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(physaddr), size);
@@ -342,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	end = PFN_DOWN(physaddr + size);
 
 	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
 }
 
 /**
@@ -355,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 */
 void __init free_bootmem(unsigned long addr, unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(addr, addr + size);
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(addr), size);
@@ -363,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 	end = PFN_DOWN(addr + size);
 
 	mark_bootmem(start, end, 0, 0);
+#endif
 }
 
 /**
@@ -379,12 +486,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 				 unsigned long size, int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(physaddr);
 	end = PFN_UP(physaddr + size);
 
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
 }
 
 /**
@@ -400,16 +512,22 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 int __init reserve_bootmem(unsigned long addr, unsigned long size,
 			    int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(addr);
 	end = PFN_UP(addr + size);
 
 	return mark_bootmem(start, end, 1, flags);
+#endif
 }
 
-static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
-				unsigned long step)
+#ifndef CONFIG_NO_BOOTMEM
+static unsigned long __init align_idx(struct bootmem_data *bdata,
+				      unsigned long idx, unsigned long step)
 {
 	unsigned long base = bdata->node_min_pfn;
 
@@ -421,8 +539,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
 	return ALIGN(base + idx, step) - base;
 }
 
-static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
-				unsigned long align)
+static unsigned long __init align_off(struct bootmem_data *bdata,
+				      unsigned long off, unsigned long align)
 {
 	unsigned long base = PFN_PHYS(bdata->node_min_pfn);
 
@@ -558,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 #endif
 	return NULL;
 }
+#endif
 
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+	if (ptr)
+		return ptr;
+
+	if (goal != 0) {
+		goal = 0;
+		goto restart;
+	}
+
+	return NULL;
+#else
 	bootmem_data_t *bdata;
 	void *region;
 
@@ -589,6 +728,7 @@ restart:
 	}
 
 	return NULL;
+#endif
 }
 
 /**
@@ -607,7 +747,13 @@ restart:
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 					unsigned long goal)
 {
-	return ___alloc_bootmem_nopanic(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem_nopanic(size, align, goal, limit);
 }
 
 static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -641,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 			      unsigned long goal)
 {
-	return ___alloc_bootmem(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem(size, align, goal, limit);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
@@ -660,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 
 	return ___alloc_bootmem(size, align, goal, limit);
 }
+#endif
 
 /**
  * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -682,7 +836,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+					 goal, -1ULL);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+	unsigned long end_pfn;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	/* update goal according ...MAX_DMA32_PFN */
+	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+		void *ptr;
+		unsigned long new_goal;
+
+		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+		ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						new_goal, -1ULL);
+#else
+		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+						new_goal, 0);
+#endif
+		if (ptr)
+			return ptr;
+	}
+#endif
+
+	return __alloc_bootmem_node(pgdat, size, align, goal);
+
 }
 
 #ifdef CONFIG_SPARSEMEM
@@ -696,6 +889,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 void * __init alloc_bootmem_section(unsigned long size,
 				    unsigned long section_nr)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	unsigned long pfn, goal, limit;
+
+	pfn = section_nr_to_pfn(section_nr);
+	goal = pfn << PAGE_SHIFT;
+	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+					 SMP_CACHE_BYTES, goal, limit);
+#else
 	bootmem_data_t *bdata;
 	unsigned long pfn, goal, limit;
 
@@ -705,6 +908,7 @@ void * __init alloc_bootmem_section(unsigned long size,
 	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
 	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
 }
 #endif
 
@@ -716,11 +920,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 goal, -1ULL);
+#else
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
 
 	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
 	if (ptr)
 		return ptr;
 
@@ -771,6 +980,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+				goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align,
 				goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
 }
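
The BITS_PER_LONG batching in __free_pages_memory above is easier to see with concrete numbers: the range is split so the aligned middle is handed to the buddy allocator in order-ilog2(BITS_PER_LONG) blocks. A small userspace sketch of the same arithmetic (assuming 64-bit longs; the PFN range is made up):

#include <assert.h>

int main(void)
{
	unsigned long bits = 64;		/* BITS_PER_LONG */
	unsigned long start = 100, end = 300;	/* PFN range [100, 300) */

	/* round start up and end down to a 64-page boundary */
	unsigned long start_aligned = (start + (bits - 1)) & ~(bits - 1);
	unsigned long end_aligned = end & ~(bits - 1);

	assert(start_aligned == 128);	/* PFNs 100..127 freed one at a time */
	assert(end_aligned == 256);	/* PFNs 128..255 freed as order-6 blocks */
	return 0;			/* PFNs 256..299 freed one at a time */
}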
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..13b6dad1eed2 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 		file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		file->f_ra.ra_pages = 0;
+		spin_lock(&file->f_lock);
+		file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
 		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		if (!mapping->a_ops->readpage) {
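
From userspace nothing changes syntactically, but POSIX_FADV_RANDOM now flips FMODE_RANDOM on the struct file instead of zeroing the readahead window, so a later POSIX_FADV_NORMAL cleanly restores the default. An illustrative caller:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int err, fd = open("/var/tmp/datafile", O_RDONLY);

	if (fd < 0)
		return 1;
	/* len == 0 means "to end of file"; returns an errno value */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	/* ... scattered reads: the kernel now curbs readahead ... */
	posix_fadvise(fd, 0, 0, POSIX_FADV_NORMAL);	/* back to default */
	close(fd);
	return 0;
}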
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..c5f88f240ddc 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,21 @@
 #include <linux/fault-inject.h>
-#include <linux/gfp.h>
+#include <linux/slab.h>
 
 static struct {
 	struct fault_attr attr;
 	u32 ignore_gfp_wait;
+	int cache_filter;
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 	struct dentry *ignore_gfp_wait_file;
+	struct dentry *cache_filter_file;
 #endif
 } failslab = {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
+	.cache_filter = 0,
 };
 
-bool should_failslab(size_t size, gfp_t gfpflags)
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
 {
 	if (gfpflags & __GFP_NOFAIL)
 		return false;
@@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
 	if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
 		return false;
 
+	if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+		return false;
+
 	return should_fail(&failslab.attr, size);
 }
 
@@ -30,7 +36,6 @@ static int __init setup_failslab(char *str)
 __setup("failslab=", setup_failslab);
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
 static int __init failslab_debugfs_init(void)
 {
 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void)
 		debugfs_create_bool("ignore-gfp-wait", mode, dir,
 				&failslab.ignore_gfp_wait);
 
-	if (!failslab.ignore_gfp_wait_file) {
+	failslab.cache_filter_file =
+		debugfs_create_bool("cache-filter", mode, dir,
+				&failslab.cache_filter);
+
+	if (!failslab.ignore_gfp_wait_file ||
+	    !failslab.cache_filter_file) {
 		err = -ENOMEM;
+		debugfs_remove(failslab.cache_filter_file);
 		debugfs_remove(failslab.ignore_gfp_wait_file);
 		cleanup_fault_attr_dentries(&failslab.attr);
 	}
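
With the new cache-filter mode enabled (the "cache-filter" debugfs bool created above, next to ignore-gfp-wait under the failslab debugfs directory), only caches that opt in with SLAB_FAILSLAB see injected failures. A sketch of opting a cache in; the cache itself is illustrative:

#include <linux/slab.h>

static struct kmem_cache *fragile_cache;

static int __init fragile_init(void)
{
	/* SLAB_FAILSLAB marks this cache as a fault-injection target
	 * whenever failslab's cache-filter is set */
	fragile_cache = kmem_cache_create("fragile_objs", 128, 0,
					  SLAB_FAILSLAB, NULL);
	return fragile_cache ? 0 : -ENOMEM;
}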
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..140ebda9640f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
 EXPORT_SYMBOL(filemap_flush);
 
 /**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:	target address_space
- * @start:	beginning page index
- * @end:	ending page index
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:	address space structure to wait for
+ * @start_byte:	offset in bytes where the range starts
+ * @end_byte:	offset in bytes where the range ends (inclusive)
 *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
 */
-int wait_on_page_writeback_range(struct address_space *mapping,
-				pgoff_t start, pgoff_t end)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+			    loff_t end_byte)
 {
+	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
 	struct pagevec pvec;
 	int nr_pages;
 	int ret = 0;
-	pgoff_t index;
 
-	if (end < start)
+	if (end_byte < start_byte)
 		return 0;
 
 	pagevec_init(&pvec, 0);
-	index = start;
 	while ((index <= end) &&
 		(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			PAGECACHE_TAG_WRITEBACK,
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
 
 	return ret;
 }
-
-/**
- * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
- * @mapping:	address space structure to wait for
- * @start:	offset in bytes where the range starts
- * @end:	offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- *
- * This is just a simple wrapper so that callers don't have to convert offsets
- * to page indexes themselves
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
-			    loff_t end)
-{
-	return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
-			end >> PAGE_CACHE_SHIFT);
-}
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
 /**
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
 	if (i_size == 0)
 		return 0;
 
-	return wait_on_page_writeback_range(mapping, 0,
-			    (i_size - 1) >> PAGE_CACHE_SHIFT);
+	return filemap_fdatawait_range(mapping, 0, i_size - 1);
 }
 EXPORT_SYMBOL(filemap_fdatawait);
 
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 						 WB_SYNC_ALL);
 		/* See comment of filemap_write_and_wait() */
 		if (err != -EIO) {
-			int err2 = wait_on_page_writeback_range(mapping,
-						lstart >> PAGE_CACHE_SHIFT,
-						lend >> PAGE_CACHE_SHIFT);
+			int err2 = filemap_fdatawait_range(mapping,
+						lstart, lend);
 			if (!err)
 				err = err2;
 		}
@@ -1138,7 +1117,7 @@ readpage:
 		if (!PageUptodate(page)) {
 			if (page->mapping == NULL) {
 				/*
-				 * invalidate_inode_pages got it
+				 * invalidate_mapping_pages got it
 				 */
 				unlock_page(page);
 				page_cache_release(page);
@@ -1655,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 static struct page *__read_cache_page(struct address_space *mapping,
 				pgoff_t index,
 				int (*filler)(void *,struct page*),
-				void *data)
+				void *data,
+				gfp_t gfp)
 {
 	struct page *page;
 	int err;
 repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
-		page = page_cache_alloc_cold(mapping);
+		page = __page_cache_alloc(gfp | __GFP_COLD);
 		if (!page)
 			return ERR_PTR(-ENOMEM);
 		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1682,31 +1662,18 @@ repeat:
 	return page;
 }
 
-/**
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping:	the page's address_space
- * @index:	the page index
- * @filler:	function to perform the read
- * @data:	destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
 				pgoff_t index,
 				int (*filler)(void *,struct page*),
-				void *data)
+				void *data,
+				gfp_t gfp)
+
 {
 	struct page *page;
 	int err;
 
 retry:
-	page = __read_cache_page(mapping, index, filler, data);
+	page = __read_cache_page(mapping, index, filler, data, gfp);
 	if (IS_ERR(page))
 		return page;
 	if (PageUptodate(page))
@@ -1731,8 +1698,67 @@ out:
 	mark_page_accessed(page);
 	return page;
 }
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @filler:	function to perform the read
+ * @data:	destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+				pgoff_t index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
 EXPORT_SYMBOL(read_cache_page_async);
 
+static struct page *wait_on_page_read(struct page *page)
+{
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		if (!PageUptodate(page)) {
+			page_cache_release(page);
+			page = ERR_PTR(-EIO);
+		}
+	}
+	return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @gfp:	the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+				pgoff_t index,
+				gfp_t gfp)
+{
+	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+	return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
 /**
  * read_cache_page - read into page cache, fill it if needed
  * @mapping:	the page's address_space
@@ -1750,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
 		int (*filler)(void *,struct page*),
 		void *data)
 {
-	struct page *page;
-
-	page = read_cache_page_async(mapping, index, filler, data);
-	if (IS_ERR(page))
-		goto out;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page)) {
-		page_cache_release(page);
-		page = ERR_PTR(-EIO);
-	}
- out:
-	return page;
+	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
 }
 EXPORT_SYMBOL(read_cache_page);
 
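
The refactor above funnels all variants through do_read_cache_page() and adds read_cache_page_gfp() for callers that must constrain allocation context. A sketch from a caller's point of view (the helper is hypothetical; GFP_NOFS is one plausible mask):

#include <linux/pagemap.h>

static struct page *example_get_page(struct inode *inode, pgoff_t index)
{
	/* find or allocate-and-read page `index`; new pages are
	 * allocated with GFP_NOFS so the allocator cannot recurse
	 * into the filesystem, while ->readpage does the filling */
	return read_cache_page_gfp(inode->i_mapping, index, GFP_NOFS);
}

On success the page comes back uptodate with an elevated refcount (release it with page_cache_release()); on failure an ERR_PTR such as -EIO is returned.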
@@ -1844,7 +1859,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied. If a fault is encountered then return the number of
+ * were successfully copied. If a fault is encountered then return the number of
 * bytes which were copied.
 */
 size_t iov_iter_copy_from_user_atomic(struct page *page,
@@ -1971,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
 {
 	struct inode *inode = file->f_mapping->host;
-	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	unsigned long limit = rlimit(RLIMIT_FSIZE);
 
 	if (unlikely(*pos < 0))
 		return -EINVAL;
@@ -2217,6 +2232,9 @@ again:
 		if (unlikely(status))
 			break;
 
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
 		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 		pagefault_enable();
@@ -2261,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		size_t count, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
 	ssize_t status;
 	struct iov_iter i;
 
@@ -2273,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		*ppos = pos + status;
 	}
 
-	/*
-	 * If we get here for O_DIRECT writes then we must have fallen through
-	 * to buffered writes (block instantiation inside i_size). So we sync
-	 * the file data here, to try to honour O_DIRECT expectations.
-	 */
-	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait_range(mapping,
-					pos, pos + written - 1);
-
 	return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2380,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	 * semantics.
 	 */
 	endbyte = pos + written_buffered - written - 1;
-	err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
-				    SYNC_FILE_RANGE_WAIT_BEFORE|
-				    SYNC_FILE_RANGE_WRITE|
-				    SYNC_FILE_RANGE_WAIT_AFTER);
+	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
 	if (err == 0) {
 		written = written_buffered;
2389 | invalidate_mapping_pages(mapping, | 2394 | invalidate_mapping_pages(mapping, |
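
A minimal usage sketch for the read_cache_page_gfp() variant added above, assuming a hypothetical filesystem helper (read_meta_page is a placeholder name) that must avoid GFP_KERNEL recursion into the filesystem during reclaim:

	static struct page *read_meta_page(struct address_space *mapping,
					   pgoff_t index)
	{
		/*
		 * New page-cache pages are allocated with GFP_NOFS so the
		 * read cannot recurse into the filesystem under memory
		 * pressure; as the kerneldoc above notes, the radix tree
		 * insertion itself still uses GFP_KERNEL.
		 */
		struct page *page = read_cache_page_gfp(mapping, index, GFP_NOFS);

		/* On success the page is uptodate and the caller must
		 * page_cache_release() it; on failure it is ERR_PTR(-EIO). */
		return page;
	}
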
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb8..83364df74a33 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/seqlock.h> | 18 | #include <linux/seqlock.h> |
19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
20 | #include <linux/gfp.h> | ||
20 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
21 | #include <asm/io.h> | 22 | #include <asm/io.h> |
22 | 23 | ||
@@ -194,7 +195,7 @@ retry: | |||
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 195 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 196 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page); | 197 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 198 | dec_mm_counter(mm, MM_FILEPAGES); |
198 | BUG_ON(pte_dirty(pteval)); | 199 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 200 | pte_unmap_unlock(pte, ptl); |
200 | page_cache_release(page); | 201 | page_cache_release(page); |
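
This hunk, like the fremap.c hunk below, switches from the old file_rss field name to the MM_FILEPAGES index. A sketch of the indexed per-mm counter scheme the new calls assume; the authoritative definitions live in <linux/mm_types.h> and <linux/mm.h> and may differ in detail:

	/* Sketch only: counters become slots in one array rather than
	 * individually named fields, so helpers take an index. */
	enum {
		MM_FILEPAGES,		/* resident file-backed pages */
		MM_ANONPAGES,		/* resident anonymous pages */
		NR_MM_COUNTERS
	};

	static inline void dec_mm_counter_sketch(struct mm_struct *mm, int member)
	{
		atomic_long_dec(&mm->rss_stat.count[member]);
	}
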
diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb39..46f5dacf90a2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
40 | page_remove_rmap(page); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, MM_FILEPAGES); |
44 | } | 44 | } |
45 | } else { | 45 | } else { |
46 | if (!pte_file(pte)) | 46 | if (!pte_file(pte)) |
diff --git a/mm/highmem.c b/mm/highmem.c index 9c1e627f282e..bed8a8bfd01f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high); | |||
220 | * @page: &struct page to pin | 220 | * @page: &struct page to pin |
221 | * | 221 | * |
222 | * Returns the page's current virtual memory address, or NULL if no mapping | 222 | * Returns the page's current virtual memory address, or NULL if no mapping |
223 | * exists. When and only when a non null address is returned then a | 223 | * exists. If and only if a non-null address is returned then a |
224 | * matching call to kunmap_high() is necessary. | 224 | * matching call to kunmap_high() is necessary. |
225 | * | 225 | * |
226 | * This can be called from any context. | 226 | * This can be called from any context. |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b02874..4c9e6bbf3772 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2,7 +2,6 @@ | |||
2 | * Generic hugetlb support. | 2 | * Generic hugetlb support. |
3 | * (C) William Irwin, April 2004 | 3 | * (C) William Irwin, April 2004 |
4 | */ | 4 | */ |
5 | #include <linux/gfp.h> | ||
6 | #include <linux/list.h> | 5 | #include <linux/list.h> |
7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
@@ -18,12 +17,14 @@ | |||
18 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
20 | #include <linux/sysfs.h> | 19 | #include <linux/sysfs.h> |
20 | #include <linux/slab.h> | ||
21 | 21 | ||
22 | #include <asm/page.h> | 22 | #include <asm/page.h> |
23 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | 25 | ||
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/node.h> | ||
27 | #include "internal.h" | 28 | #include "internal.h" |
28 | 29 | ||
29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page, | |||
401 | { | 402 | { |
402 | int i; | 403 | int i; |
403 | 404 | ||
404 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { | 405 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { |
405 | clear_gigantic_page(page, addr, sz); | 406 | clear_gigantic_page(page, addr, sz); |
406 | return; | 407 | return; |
407 | } | 408 | } |
@@ -545,6 +546,7 @@ static void free_huge_page(struct page *page) | |||
545 | 546 | ||
546 | mapping = (struct address_space *) page_private(page); | 547 | mapping = (struct address_space *) page_private(page); |
547 | set_page_private(page, 0); | 548 | set_page_private(page, 0); |
549 | page->mapping = NULL; | ||
548 | BUG_ON(page_count(page)); | 550 | BUG_ON(page_count(page)); |
549 | INIT_LIST_HEAD(&page->lru); | 551 | INIT_LIST_HEAD(&page->lru); |
550 | 552 | ||
@@ -622,42 +624,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
622 | } | 624 | } |
623 | 625 | ||
624 | /* | 626 | /* |
625 | * Use a helper variable to find the next node and then | 627 | * common helper functions for hstate_next_node_to_{alloc|free}. |
626 | * copy it back to next_nid_to_alloc afterwards: | 628 | * We may have allocated or freed a huge page based on a different |
627 | * otherwise there's a window in which a racer might | 629 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 630 | * be outside of *nodes_allowed. Ensure that we use an allowed |
629 | * But we don't need to use a spin_lock here: it really | 631 | * node for alloc or free. |
630 | * doesn't matter if occasionally a racer chooses the | ||
631 | * same nid as we do. Move nid forward in the mask even | ||
632 | * if we just successfully allocated a hugepage so that | ||
633 | * the next caller gets hugepages on the next node. | ||
634 | */ | 632 | */ |
635 | static int hstate_next_node_to_alloc(struct hstate *h) | 633 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
636 | { | 634 | { |
637 | int next_nid; | 635 | nid = next_node(nid, *nodes_allowed); |
638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 636 | if (nid == MAX_NUMNODES) |
639 | if (next_nid == MAX_NUMNODES) | 637 | nid = first_node(*nodes_allowed); |
640 | next_nid = first_node(node_online_map); | 638 | VM_BUG_ON(nid >= MAX_NUMNODES); |
641 | h->next_nid_to_alloc = next_nid; | 639 | |
642 | return next_nid; | 640 | return nid; |
643 | } | 641 | } |
644 | 642 | ||
645 | static int alloc_fresh_huge_page(struct hstate *h) | 643 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) |
644 | { | ||
645 | if (!node_isset(nid, *nodes_allowed)) | ||
646 | nid = next_node_allowed(nid, nodes_allowed); | ||
647 | return nid; | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * returns the previously saved node ["this node"] from which to | ||
652 | * allocate a persistent huge page for the pool and advance the | ||
653 | * next node from which to allocate, handling wrap at end of node | ||
654 | * mask. | ||
655 | */ | ||
656 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
657 | nodemask_t *nodes_allowed) | ||
658 | { | ||
659 | int nid; | ||
660 | |||
661 | VM_BUG_ON(!nodes_allowed); | ||
662 | |||
663 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
664 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
665 | |||
666 | return nid; | ||
667 | } | ||
668 | |||
669 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | ||
646 | { | 670 | { |
647 | struct page *page; | 671 | struct page *page; |
648 | int start_nid; | 672 | int start_nid; |
649 | int next_nid; | 673 | int next_nid; |
650 | int ret = 0; | 674 | int ret = 0; |
651 | 675 | ||
652 | start_nid = h->next_nid_to_alloc; | 676 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
653 | next_nid = start_nid; | 677 | next_nid = start_nid; |
654 | 678 | ||
655 | do { | 679 | do { |
656 | page = alloc_fresh_huge_page_node(h, next_nid); | 680 | page = alloc_fresh_huge_page_node(h, next_nid); |
657 | if (page) | 681 | if (page) { |
658 | ret = 1; | 682 | ret = 1; |
659 | next_nid = hstate_next_node_to_alloc(h); | 683 | break; |
660 | } while (!page && next_nid != start_nid); | 684 | } |
685 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
686 | } while (next_nid != start_nid); | ||
661 | 687 | ||
662 | if (ret) | 688 | if (ret) |
663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 689 | count_vm_event(HTLB_BUDDY_PGALLOC); |
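
A worked example of the wrap-around that next_node_allowed() implements, assuming a sparse three-node mask (illustration only; not part of the patch):

	static void demo_wrap(void)
	{
		nodemask_t allowed = NODE_MASK_NONE;
		int nid = 5;

		node_set(0, allowed);
		node_set(2, allowed);
		node_set(5, allowed);

		nid = next_node(nid, allowed);	/* no node above 5: MAX_NUMNODES */
		if (nid == MAX_NUMNODES)
			nid = first_node(allowed);	/* wraps to node 0 */
		/* successive rounds now visit 0, 2, 5, 0, 2, 5, ... */
	}
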
@@ -668,17 +694,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
668 | } | 694 | } |
669 | 695 | ||
670 | /* | 696 | /* |
671 | * helper for free_pool_huge_page() - find next node | 697 | * helper for free_pool_huge_page() - return the previously saved |
672 | * from which to free a huge page | 698 | * node ["this node"] from which to free a huge page. Advance the |
699 | * next node id whether or not we find a free huge page to free so | ||
700 | * that the next attempt to free addresses the next node. | ||
673 | */ | 701 | */ |
674 | static int hstate_next_node_to_free(struct hstate *h) | 702 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
675 | { | 703 | { |
676 | int next_nid; | 704 | int nid; |
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 705 | |
678 | if (next_nid == MAX_NUMNODES) | 706 | VM_BUG_ON(!nodes_allowed); |
679 | next_nid = first_node(node_online_map); | 707 | |
680 | h->next_nid_to_free = next_nid; | 708 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
681 | return next_nid; | 709 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
710 | |||
711 | return nid; | ||
682 | } | 712 | } |
683 | 713 | ||
684 | /* | 714 | /* |
@@ -687,13 +717,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
687 | * balanced over allowed nodes. | 717 | * balanced over allowed nodes. |
688 | * Called with hugetlb_lock locked. | 718 | * Called with hugetlb_lock locked. |
689 | */ | 719 | */ |
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 720 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
721 | bool acct_surplus) | ||
691 | { | 722 | { |
692 | int start_nid; | 723 | int start_nid; |
693 | int next_nid; | 724 | int next_nid; |
694 | int ret = 0; | 725 | int ret = 0; |
695 | 726 | ||
696 | start_nid = h->next_nid_to_free; | 727 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
697 | next_nid = start_nid; | 728 | next_nid = start_nid; |
698 | 729 | ||
699 | do { | 730 | do { |
@@ -715,9 +746,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
715 | } | 746 | } |
716 | update_and_free_page(h, page); | 747 | update_and_free_page(h, page); |
717 | ret = 1; | 748 | ret = 1; |
749 | break; | ||
718 | } | 750 | } |
719 | next_nid = hstate_next_node_to_free(h); | 751 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
720 | } while (!ret && next_nid != start_nid); | 752 | } while (next_nid != start_nid); |
721 | 753 | ||
722 | return ret; | 754 | return ret; |
723 | } | 755 | } |
@@ -911,14 +943,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
911 | 943 | ||
912 | /* | 944 | /* |
913 | * We want to release as many surplus pages as possible, spread | 945 | * We want to release as many surplus pages as possible, spread |
914 | * evenly across all nodes. Iterate across all nodes until we | 946 | * evenly across all nodes with memory. Iterate across these nodes |
915 | * can no longer free unreserved surplus pages. This occurs when | 947 | * until we can no longer free unreserved surplus pages. This occurs |
916 | * the nodes with surplus pages have no free pages. | 948 | * when the nodes with surplus pages have no free pages. |
918 | * free_pool_huge_page() will balance the the frees across the | 950 | * free_pool_huge_page() will balance the freed pages across the |
918 | * on-line nodes for us and will handle the hstate accounting. | 950 | * on-line nodes with memory and will handle the hstate accounting. |
919 | */ | 951 | */ |
920 | while (nr_pages--) { | 952 | while (nr_pages--) { |
921 | if (!free_pool_huge_page(h, 1)) | 953 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
922 | break; | 954 | break; |
923 | } | 955 | } |
924 | } | 956 | } |
@@ -1007,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1007 | page = alloc_buddy_huge_page(h, vma, addr); | 1039 | page = alloc_buddy_huge_page(h, vma, addr); |
1008 | if (!page) { | 1040 | if (!page) { |
1009 | hugetlb_put_quota(inode->i_mapping, chg); | 1041 | hugetlb_put_quota(inode->i_mapping, chg); |
1010 | return ERR_PTR(-VM_FAULT_OOM); | 1042 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1011 | } | 1043 | } |
1012 | } | 1044 | } |
1013 | 1045 | ||
@@ -1022,16 +1054,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1054 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1023 | { | 1055 | { |
1024 | struct huge_bootmem_page *m; | 1056 | struct huge_bootmem_page *m; |
1025 | int nr_nodes = nodes_weight(node_online_map); | 1057 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
1026 | 1058 | ||
1027 | while (nr_nodes) { | 1059 | while (nr_nodes) { |
1028 | void *addr; | 1060 | void *addr; |
1029 | 1061 | ||
1030 | addr = __alloc_bootmem_node_nopanic( | 1062 | addr = __alloc_bootmem_node_nopanic( |
1031 | NODE_DATA(h->next_nid_to_alloc), | 1063 | NODE_DATA(hstate_next_node_to_alloc(h, |
1064 | &node_states[N_HIGH_MEMORY])), | ||
1032 | huge_page_size(h), huge_page_size(h), 0); | 1065 | huge_page_size(h), huge_page_size(h), 0); |
1033 | 1066 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1035 | if (addr) { | 1067 | if (addr) { |
1036 | /* | 1068 | /* |
1037 | * Use the beginning of the huge page to store the | 1069 | * Use the beginning of the huge page to store the |
@@ -1084,7 +1116,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1084 | if (h->order >= MAX_ORDER) { | 1116 | if (h->order >= MAX_ORDER) { |
1085 | if (!alloc_bootmem_huge_page(h)) | 1117 | if (!alloc_bootmem_huge_page(h)) |
1086 | break; | 1118 | break; |
1087 | } else if (!alloc_fresh_huge_page(h)) | 1119 | } else if (!alloc_fresh_huge_page(h, |
1120 | &node_states[N_HIGH_MEMORY])) | ||
1088 | break; | 1121 | break; |
1089 | } | 1122 | } |
1090 | h->max_huge_pages = i; | 1123 | h->max_huge_pages = i; |
@@ -1126,14 +1159,15 @@ static void __init report_hugepages(void) | |||
1126 | } | 1159 | } |
1127 | 1160 | ||
1128 | #ifdef CONFIG_HIGHMEM | 1161 | #ifdef CONFIG_HIGHMEM |
1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1162 | static void try_to_free_low(struct hstate *h, unsigned long count, |
1163 | nodemask_t *nodes_allowed) | ||
1130 | { | 1164 | { |
1131 | int i; | 1165 | int i; |
1132 | 1166 | ||
1133 | if (h->order >= MAX_ORDER) | 1167 | if (h->order >= MAX_ORDER) |
1134 | return; | 1168 | return; |
1135 | 1169 | ||
1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1170 | for_each_node_mask(i, *nodes_allowed) { |
1137 | struct page *page, *next; | 1171 | struct page *page, *next; |
1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1172 | struct list_head *freel = &h->hugepage_freelists[i]; |
1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1173 | list_for_each_entry_safe(page, next, freel, lru) { |
@@ -1149,7 +1183,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
1149 | } | 1183 | } |
1150 | } | 1184 | } |
1151 | #else | 1185 | #else |
1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1186 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
1187 | nodemask_t *nodes_allowed) | ||
1153 | { | 1188 | { |
1154 | } | 1189 | } |
1155 | #endif | 1190 | #endif |
@@ -1159,7 +1194,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1159 | * balanced by operating on them in a round-robin fashion. | 1194 | * balanced by operating on them in a round-robin fashion. |
1160 | * Returns 1 if an adjustment was made. | 1195 | * Returns 1 if an adjustment was made. |
1161 | */ | 1196 | */ |
1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1197 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1198 | int delta) | ||
1163 | { | 1199 | { |
1164 | int start_nid, next_nid; | 1200 | int start_nid, next_nid; |
1165 | int ret = 0; | 1201 | int ret = 0; |
@@ -1167,29 +1203,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1203 | VM_BUG_ON(delta != -1 && delta != 1); |
1168 | 1204 | ||
1169 | if (delta < 0) | 1205 | if (delta < 0) |
1170 | start_nid = h->next_nid_to_alloc; | 1206 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
1171 | else | 1207 | else |
1172 | start_nid = h->next_nid_to_free; | 1208 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
1173 | next_nid = start_nid; | 1209 | next_nid = start_nid; |
1174 | 1210 | ||
1175 | do { | 1211 | do { |
1176 | int nid = next_nid; | 1212 | int nid = next_nid; |
1177 | if (delta < 0) { | 1213 | if (delta < 0) { |
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | 1214 | /* |
1180 | * To shrink on this node, there must be a surplus page | 1215 | * To shrink on this node, there must be a surplus page |
1181 | */ | 1216 | */ |
1182 | if (!h->surplus_huge_pages_node[nid]) | 1217 | if (!h->surplus_huge_pages_node[nid]) { |
1218 | next_nid = hstate_next_node_to_alloc(h, | ||
1219 | nodes_allowed); | ||
1183 | continue; | 1220 | continue; |
1221 | } | ||
1184 | } | 1222 | } |
1185 | if (delta > 0) { | 1223 | if (delta > 0) { |
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | 1224 | /* |
1188 | * Surplus cannot exceed the total number of pages | 1225 | * Surplus cannot exceed the total number of pages |
1189 | */ | 1226 | */ |
1190 | if (h->surplus_huge_pages_node[nid] >= | 1227 | if (h->surplus_huge_pages_node[nid] >= |
1191 | h->nr_huge_pages_node[nid]) | 1228 | h->nr_huge_pages_node[nid]) { |
1229 | next_nid = hstate_next_node_to_free(h, | ||
1230 | nodes_allowed); | ||
1192 | continue; | 1231 | continue; |
1232 | } | ||
1193 | } | 1233 | } |
1194 | 1234 | ||
1195 | h->surplus_huge_pages += delta; | 1235 | h->surplus_huge_pages += delta; |
@@ -1202,7 +1242,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1202 | } | 1242 | } |
1203 | 1243 | ||
1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1244 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1245 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
1246 | nodemask_t *nodes_allowed) | ||
1206 | { | 1247 | { |
1207 | unsigned long min_count, ret; | 1248 | unsigned long min_count, ret; |
1208 | 1249 | ||
@@ -1222,7 +1263,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1222 | */ | 1263 | */ |
1223 | spin_lock(&hugetlb_lock); | 1264 | spin_lock(&hugetlb_lock); |
1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1265 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
1225 | if (!adjust_pool_surplus(h, -1)) | 1266 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
1226 | break; | 1267 | break; |
1227 | } | 1268 | } |
1228 | 1269 | ||
@@ -1233,11 +1274,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1233 | * and reducing the surplus. | 1274 | * and reducing the surplus. |
1234 | */ | 1275 | */ |
1235 | spin_unlock(&hugetlb_lock); | 1276 | spin_unlock(&hugetlb_lock); |
1236 | ret = alloc_fresh_huge_page(h); | 1277 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
1237 | spin_lock(&hugetlb_lock); | 1278 | spin_lock(&hugetlb_lock); |
1238 | if (!ret) | 1279 | if (!ret) |
1239 | goto out; | 1280 | goto out; |
1240 | 1281 | ||
1282 | /* Bail for signals. Probably ctrl-c from user */ | ||
1283 | if (signal_pending(current)) | ||
1284 | goto out; | ||
1241 | } | 1285 | } |
1242 | 1286 | ||
1243 | /* | 1287 | /* |
@@ -1257,13 +1301,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1257 | */ | 1301 | */ |
1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1302 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1259 | min_count = max(count, min_count); | 1303 | min_count = max(count, min_count); |
1260 | try_to_free_low(h, min_count); | 1304 | try_to_free_low(h, min_count, nodes_allowed); |
1261 | while (min_count < persistent_huge_pages(h)) { | 1305 | while (min_count < persistent_huge_pages(h)) { |
1262 | if (!free_pool_huge_page(h, 0)) | 1306 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1263 | break; | 1307 | break; |
1264 | } | 1308 | } |
1265 | while (count < persistent_huge_pages(h)) { | 1309 | while (count < persistent_huge_pages(h)) { |
1266 | if (!adjust_pool_surplus(h, 1)) | 1310 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
1267 | break; | 1311 | break; |
1268 | } | 1312 | } |
1269 | out: | 1313 | out: |
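
A condensed sketch of the pool-grow loop in set_max_huge_pages() after this change, showing why the lock is dropped around each allocation and where the new signal check bails out (simplified: the real loop uses goto out rather than break):

	while (count > persistent_huge_pages(h)) {
		/*
		 * Drop the lock: alloc_fresh_huge_page() may sleep in the
		 * buddy allocator, and sleeping under a spinlock is illegal.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page(h, nodes_allowed);
		spin_lock(&hugetlb_lock);
		if (!ret)
			break;			/* real code: goto out */

		/* Bail for signals, so ctrl-c interrupts a long resize. */
		if (signal_pending(current))
			break;			/* real code: goto out */
	}
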
@@ -1282,43 +1326,117 @@ out: | |||
1282 | static struct kobject *hugepages_kobj; | 1326 | static struct kobject *hugepages_kobj; |
1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1327 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
1284 | 1328 | ||
1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1329 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
1330 | |||
1331 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
1286 | { | 1332 | { |
1287 | int i; | 1333 | int i; |
1334 | |||
1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1335 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
1289 | if (hstate_kobjs[i] == kobj) | 1336 | if (hstate_kobjs[i] == kobj) { |
1337 | if (nidp) | ||
1338 | *nidp = NUMA_NO_NODE; | ||
1290 | return &hstates[i]; | 1339 | return &hstates[i]; |
1291 | BUG(); | 1340 | } |
1292 | return NULL; | 1341 | |
1342 | return kobj_to_node_hstate(kobj, nidp); | ||
1293 | } | 1343 | } |
1294 | 1344 | ||
1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1345 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
1296 | struct kobj_attribute *attr, char *buf) | 1346 | struct kobj_attribute *attr, char *buf) |
1297 | { | 1347 | { |
1298 | struct hstate *h = kobj_to_hstate(kobj); | 1348 | struct hstate *h; |
1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1349 | unsigned long nr_huge_pages; |
1350 | int nid; | ||
1351 | |||
1352 | h = kobj_to_hstate(kobj, &nid); | ||
1353 | if (nid == NUMA_NO_NODE) | ||
1354 | nr_huge_pages = h->nr_huge_pages; | ||
1355 | else | ||
1356 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
1357 | |||
1358 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
1300 | } | 1359 | } |
1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1360 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1361 | struct kobject *kobj, struct kobj_attribute *attr, |
1362 | const char *buf, size_t len) | ||
1303 | { | 1363 | { |
1304 | int err; | 1364 | int err; |
1305 | unsigned long input; | 1365 | int nid; |
1306 | struct hstate *h = kobj_to_hstate(kobj); | 1366 | unsigned long count; |
1367 | struct hstate *h; | ||
1368 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
1307 | 1369 | ||
1308 | err = strict_strtoul(buf, 10, &input); | 1370 | err = strict_strtoul(buf, 10, &count); |
1309 | if (err) | 1371 | if (err) |
1310 | return 0; | 1372 | return 0; |
1311 | 1373 | ||
1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1374 | h = kobj_to_hstate(kobj, &nid); |
1375 | if (nid == NUMA_NO_NODE) { | ||
1376 | /* | ||
1377 | * global hstate attribute | ||
1378 | */ | ||
1379 | if (!(obey_mempolicy && | ||
1380 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1381 | NODEMASK_FREE(nodes_allowed); | ||
1382 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1383 | } | ||
1384 | } else if (nodes_allowed) { | ||
1385 | /* | ||
1386 | * per node hstate attribute: adjust count to global, | ||
1387 | * but restrict alloc/free to the specified node. | ||
1388 | */ | ||
1389 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
1390 | init_nodemask_of_node(nodes_allowed, nid); | ||
1391 | } else | ||
1392 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1313 | 1393 | ||
1314 | return count; | 1394 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
1395 | |||
1396 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1397 | NODEMASK_FREE(nodes_allowed); | ||
1398 | |||
1399 | return len; | ||
1400 | } | ||
1401 | |||
1402 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1403 | struct kobj_attribute *attr, char *buf) | ||
1404 | { | ||
1405 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1406 | } | ||
1407 | |||
1408 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1409 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1410 | { | ||
1411 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
1315 | } | 1412 | } |
1316 | HSTATE_ATTR(nr_hugepages); | 1413 | HSTATE_ATTR(nr_hugepages); |
1317 | 1414 | ||
1415 | #ifdef CONFIG_NUMA | ||
1416 | |||
1417 | /* | ||
1418 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
1419 | * huge page alloc/free. | ||
1420 | */ | ||
1421 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
1422 | struct kobj_attribute *attr, char *buf) | ||
1423 | { | ||
1424 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1425 | } | ||
1426 | |||
1427 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
1428 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1429 | { | ||
1430 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
1431 | } | ||
1432 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
1433 | #endif | ||
1434 | |||
1435 | |||
1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1436 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
1319 | struct kobj_attribute *attr, char *buf) | 1437 | struct kobj_attribute *attr, char *buf) |
1320 | { | 1438 | { |
1321 | struct hstate *h = kobj_to_hstate(kobj); | 1439 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1440 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1323 | } | 1441 | } |
1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1442 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
@@ -1326,7 +1444,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1326 | { | 1444 | { |
1327 | int err; | 1445 | int err; |
1328 | unsigned long input; | 1446 | unsigned long input; |
1329 | struct hstate *h = kobj_to_hstate(kobj); | 1447 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1330 | 1448 | ||
1331 | err = strict_strtoul(buf, 10, &input); | 1449 | err = strict_strtoul(buf, 10, &input); |
1332 | if (err) | 1450 | if (err) |
@@ -1343,15 +1461,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1461 | static ssize_t free_hugepages_show(struct kobject *kobj, |
1344 | struct kobj_attribute *attr, char *buf) | 1462 | struct kobj_attribute *attr, char *buf) |
1345 | { | 1463 | { |
1346 | struct hstate *h = kobj_to_hstate(kobj); | 1464 | struct hstate *h; |
1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1465 | unsigned long free_huge_pages; |
1466 | int nid; | ||
1467 | |||
1468 | h = kobj_to_hstate(kobj, &nid); | ||
1469 | if (nid == NUMA_NO_NODE) | ||
1470 | free_huge_pages = h->free_huge_pages; | ||
1471 | else | ||
1472 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
1473 | |||
1474 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
1348 | } | 1475 | } |
1349 | HSTATE_ATTR_RO(free_hugepages); | 1476 | HSTATE_ATTR_RO(free_hugepages); |
1350 | 1477 | ||
1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1478 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
1352 | struct kobj_attribute *attr, char *buf) | 1479 | struct kobj_attribute *attr, char *buf) |
1353 | { | 1480 | { |
1354 | struct hstate *h = kobj_to_hstate(kobj); | 1481 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1482 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
1356 | } | 1483 | } |
1357 | HSTATE_ATTR_RO(resv_hugepages); | 1484 | HSTATE_ATTR_RO(resv_hugepages); |
@@ -1359,8 +1486,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1486 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
1360 | struct kobj_attribute *attr, char *buf) | 1487 | struct kobj_attribute *attr, char *buf) |
1361 | { | 1488 | { |
1362 | struct hstate *h = kobj_to_hstate(kobj); | 1489 | struct hstate *h; |
1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1490 | unsigned long surplus_huge_pages; |
1491 | int nid; | ||
1492 | |||
1493 | h = kobj_to_hstate(kobj, &nid); | ||
1494 | if (nid == NUMA_NO_NODE) | ||
1495 | surplus_huge_pages = h->surplus_huge_pages; | ||
1496 | else | ||
1497 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
1498 | |||
1499 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
1364 | } | 1500 | } |
1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1501 | HSTATE_ATTR_RO(surplus_hugepages); |
1366 | 1502 | ||
@@ -1370,6 +1506,9 @@ static struct attribute *hstate_attrs[] = { | |||
1370 | &free_hugepages_attr.attr, | 1506 | &free_hugepages_attr.attr, |
1371 | &resv_hugepages_attr.attr, | 1507 | &resv_hugepages_attr.attr, |
1372 | &surplus_hugepages_attr.attr, | 1508 | &surplus_hugepages_attr.attr, |
1509 | #ifdef CONFIG_NUMA | ||
1510 | &nr_hugepages_mempolicy_attr.attr, | ||
1511 | #endif | ||
1373 | NULL, | 1512 | NULL, |
1374 | }; | 1513 | }; |
1375 | 1514 | ||
@@ -1377,19 +1516,20 @@ static struct attribute_group hstate_attr_group = { | |||
1377 | .attrs = hstate_attrs, | 1516 | .attrs = hstate_attrs, |
1378 | }; | 1517 | }; |
1379 | 1518 | ||
1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1519 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, |
1520 | struct kobject **hstate_kobjs, | ||
1521 | struct attribute_group *hstate_attr_group) | ||
1381 | { | 1522 | { |
1382 | int retval; | 1523 | int retval; |
1524 | int hi = h - hstates; | ||
1383 | 1525 | ||
1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
1386 | if (!hstate_kobjs[h - hstates]) | ||
1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
1388 | 1529 | ||
1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
1390 | &hstate_attr_group); | ||
1391 | if (retval) | 1531 | if (retval) |
1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
1393 | 1533 | ||
1394 | return retval; | 1534 | return retval; |
1395 | } | 1535 | } |
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) | |||
1404 | return; | 1544 | return; |
1405 | 1545 | ||
1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1548 | hstate_kobjs, &hstate_attr_group); | ||
1408 | if (err) | 1549 | if (err) |
1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
1410 | h->name); | 1551 | h->name); |
1411 | } | 1552 | } |
1412 | } | 1553 | } |
1413 | 1554 | ||
1555 | #ifdef CONFIG_NUMA | ||
1556 | |||
1557 | /* | ||
1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
1560 | * index of a node sysdev or _hstate == node id. | ||
1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
1562 | * the base kernel, on the hugetlb module. | ||
1563 | */ | ||
1564 | struct node_hstate { | ||
1565 | struct kobject *hugepages_kobj; | ||
1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1567 | }; | ||
1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
1569 | |||
1570 | /* | ||
1571 | * A subset of global hstate attributes for node sysdevs | ||
1572 | */ | ||
1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
1574 | &nr_hugepages_attr.attr, | ||
1575 | &free_hugepages_attr.attr, | ||
1576 | &surplus_hugepages_attr.attr, | ||
1577 | NULL, | ||
1578 | }; | ||
1579 | |||
1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
1581 | .attrs = per_node_hstate_attrs, | ||
1582 | }; | ||
1583 | |||
1584 | /* | ||
1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
1586 | * Returns node id via non-NULL nidp. | ||
1587 | */ | ||
1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1589 | { | ||
1590 | int nid; | ||
1591 | |||
1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
1594 | int i; | ||
1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
1597 | if (nidp) | ||
1598 | *nidp = nid; | ||
1599 | return &hstates[i]; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | BUG(); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Unregister hstate attributes from a single node sysdev. | ||
1609 | * No-op if no hstate attributes attached. | ||
1610 | */ | ||
1611 | void hugetlb_unregister_node(struct node *node) | ||
1612 | { | ||
1613 | struct hstate *h; | ||
1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1615 | |||
1616 | if (!nhs->hugepages_kobj) | ||
1617 | return; /* no hstate attributes */ | ||
1618 | |||
1619 | for_each_hstate(h) | ||
1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
1623 | } | ||
1624 | |||
1625 | kobject_put(nhs->hugepages_kobj); | ||
1626 | nhs->hugepages_kobj = NULL; | ||
1627 | } | ||
1628 | |||
1629 | /* | ||
1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
1631 | * that have them. | ||
1632 | */ | ||
1633 | static void hugetlb_unregister_all_nodes(void) | ||
1634 | { | ||
1635 | int nid; | ||
1636 | |||
1637 | /* | ||
1638 | * disable node sysdev registrations. | ||
1639 | */ | ||
1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
1641 | |||
1642 | /* | ||
1643 | * remove hstate attributes from any nodes that have them. | ||
1644 | */ | ||
1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Register hstate attributes for a single node sysdev. | ||
1651 | * No-op if attributes already registered. | ||
1652 | */ | ||
1653 | void hugetlb_register_node(struct node *node) | ||
1654 | { | ||
1655 | struct hstate *h; | ||
1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1657 | int err; | ||
1658 | |||
1659 | if (nhs->hugepages_kobj) | ||
1660 | return; /* already allocated */ | ||
1661 | |||
1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
1663 | &node->sysdev.kobj); | ||
1664 | if (!nhs->hugepages_kobj) | ||
1665 | return; | ||
1666 | |||
1667 | for_each_hstate(h) { | ||
1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
1669 | nhs->hstate_kobjs, | ||
1670 | &per_node_hstate_attr_group); | ||
1671 | if (err) { | ||
1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
1673 | " for node %d\n", | ||
1674 | h->name, node->sysdev.id); | ||
1675 | hugetlb_unregister_node(node); | ||
1676 | break; | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * hugetlb init time: register hstate attributes for all registered node | ||
1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
1684 | * registered their associated sysdev by this time. | ||
1685 | */ | ||
1686 | static void hugetlb_register_all_nodes(void) | ||
1687 | { | ||
1688 | int nid; | ||
1689 | |||
1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1691 | struct node *node = &node_devices[nid]; | ||
1692 | if (node->sysdev.id == nid) | ||
1693 | hugetlb_register_node(node); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * Let the node sysdev driver know we're here so it can | ||
1698 | * [un]register hstate attributes on node hotplug. | ||
1699 | */ | ||
1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
1701 | hugetlb_unregister_node); | ||
1702 | } | ||
1703 | #else /* !CONFIG_NUMA */ | ||
1704 | |||
1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1706 | { | ||
1707 | BUG(); | ||
1708 | if (nidp) | ||
1709 | *nidp = -1; | ||
1710 | return NULL; | ||
1711 | } | ||
1712 | |||
1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
1714 | |||
1715 | static void hugetlb_register_all_nodes(void) { } | ||
1716 | |||
1717 | #endif | ||
1718 | |||
1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
1415 | { | 1720 | { |
1416 | struct hstate *h; | 1721 | struct hstate *h; |
1417 | 1722 | ||
1723 | hugetlb_unregister_all_nodes(); | ||
1724 | |||
1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
1420 | } | 1727 | } |
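
A hypothetical user-space reader of the per-node attributes registered above; the sysfs path layout is inferred from the registration code, and node0/hugepages-2048kB are placeholder choices:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/devices/system/node/node0/hugepages/"
				   "hugepages-2048kB/nr_hugepages";
		unsigned long n = 0;
		FILE *f = fopen(path, "r");

		if (f) {
			if (fscanf(f, "%lu", &n) != 1)
				n = 0;
			fclose(f);
		}
		printf("node0 2MB huge pages: %lu\n", n);
		return 0;
	}
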
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) | |||
1449 | 1756 | ||
1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
1451 | 1758 | ||
1759 | hugetlb_register_all_nodes(); | ||
1760 | |||
1452 | return 0; | 1761 | return 0; |
1453 | } | 1762 | } |
1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
1479 | 1788 | ||
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1536 | } | 1845 | } |
1537 | 1846 | ||
1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
1542 | { | 1851 | { |
1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1861 | ||
1553 | if (write) | 1862 | if (write) { |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
1864 | GFP_KERNEL | __GFP_NORETRY); | ||
1865 | if (!(obey_mempolicy && | ||
1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1867 | NODEMASK_FREE(nodes_allowed); | ||
1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1869 | } | ||
1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
1871 | |||
1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1873 | NODEMASK_FREE(nodes_allowed); | ||
1874 | } | ||
1555 | 1875 | ||
1556 | return 0; | 1876 | return 0; |
1557 | } | 1877 | } |
1558 | 1878 | ||
1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | |||
1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
1884 | buffer, length, ppos); | ||
1885 | } | ||
1886 | |||
1887 | #ifdef CONFIG_NUMA | ||
1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1890 | { | ||
1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
1892 | buffer, length, ppos); | ||
1893 | } | ||
1894 | #endif /* CONFIG_NUMA */ | ||
1895 | |||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | void __user *buffer, | 1897 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
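
A hypothetical ctl_table entry wiring up the new hugetlb_mempolicy_sysctl_handler(); the real entry lives outside this diff (in kernel/sysctl.c) and its exact fields may differ:

	static struct ctl_table hugetlb_table_sketch[] = {
		{
			.procname	= "nr_hugepages_mempolicy",
			.data		= NULL,	/* handler substitutes its own */
			.maxlen		= sizeof(unsigned long),
			.mode		= 0644,
			.proc_handler	= hugetlb_mempolicy_sysctl_handler,
		},
		{ }	/* sentinel */
	};
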
@@ -1751,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
1751 | 2088 | ||
1752 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2089 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
1753 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2090 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
1754 | update_mmu_cache(vma, address, entry); | 2091 | update_mmu_cache(vma, address, ptep); |
1755 | } | 2092 | } |
1756 | } | 2093 | } |
1757 | 2094 | ||
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
1905 | 2242 | ||
2243 | /* | ||
2244 | * Take the mapping lock for the duration of the table walk. Since | ||
2245 | * this mapping is shared between all the VMAs, call | ||
2246 | * __unmap_hugepage_range() directly, as the lock is already held | ||
2247 | */ | ||
2248 | spin_lock(&mapping->i_mmap_lock); | ||
1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
1917 | */ | 2260 | */ |
1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
1921 | page); | 2264 | page); |
1922 | } | 2265 | } |
2266 | spin_unlock(&mapping->i_mmap_lock); | ||
1923 | 2267 | ||
1924 | return 1; | 2268 | return 1; |
1925 | } | 2269 | } |
@@ -1959,6 +2303,9 @@ retry_avoidcopy: | |||
1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
1960 | 2304 | ||
1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
2306 | |||
2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
2308 | spin_unlock(&mm->page_table_lock); | ||
1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
1963 | 2310 | ||
1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
@@ -1976,19 +2323,25 @@ retry_avoidcopy: | |||
1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
2326 | spin_lock(&mm->page_table_lock); | ||
1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
1980 | } | 2328 | } |
1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
1982 | } | 2330 | } |
1983 | 2331 | ||
2332 | /* Caller expects lock to be held */ | ||
2333 | spin_lock(&mm->page_table_lock); | ||
1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
1985 | } | 2335 | } |
1986 | 2336 | ||
1987 | spin_unlock(&mm->page_table_lock); | ||
1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
1990 | spin_lock(&mm->page_table_lock); | ||
1991 | 2339 | ||
2340 | /* | ||
2341 | * Retake the page_table_lock to check for racing updates | ||
2342 | * before the page tables are altered | ||
2343 | */ | ||
2344 | spin_lock(&mm->page_table_lock); | ||
1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1994 | /* Break COW */ | 2347 | /* Break COW */ |
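
The locking protocol of this hunk, restated in condensed form: sleepable work happens outside page_table_lock, and the PTE is revalidated once the lock is retaken (error paths elided; all names come from the hunk above):

	spin_unlock(&mm->page_table_lock);	/* allocator and copy may sleep */
	new_page = alloc_huge_page(vma, address, outside_reserve);
	/* ... IS_ERR() handling, including the unmap_ref_private() retry ... */
	copy_huge_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);

	spin_lock(&mm->page_table_lock);	/* caller expects the lock held */
	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
	if (!pte_same(huge_ptep_get(ptep), pte)) {
		/* Raced with another fault: leave the page tables alone. */
	}
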
@@ -2095,8 +2448,10 @@ retry: | |||
2095 | spin_lock(&inode->i_lock); | 2448 | spin_lock(&inode->i_lock); |
2096 | inode->i_blocks += blocks_per_huge_page(h); | 2449 | inode->i_blocks += blocks_per_huge_page(h); |
2097 | spin_unlock(&inode->i_lock); | 2450 | spin_unlock(&inode->i_lock); |
2098 | } else | 2451 | } else { |
2099 | lock_page(page); | 2452 | lock_page(page); |
2453 | page->mapping = HUGETLB_POISON; | ||
2454 | } | ||
2100 | } | 2455 | } |
2101 | 2456 | ||
2102 | /* | 2457 | /* |
@@ -2206,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2206 | entry = pte_mkyoung(entry); | 2561 | entry = pte_mkyoung(entry); |
2207 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2562 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
2208 | flags & FAULT_FLAG_WRITE)) | 2563 | flags & FAULT_FLAG_WRITE)) |
2209 | update_mmu_cache(vma, address, entry); | 2564 | update_mmu_cache(vma, address, ptep); |
2210 | 2565 | ||
2211 | out_page_table_lock: | 2566 | out_page_table_lock: |
2212 | spin_unlock(&mm->page_table_lock); | 2567 | spin_unlock(&mm->page_table_lock); |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -3,18 +3,68 @@ | |||
3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/swap.h> | ||
7 | #include <linux/pagemap.h> | ||
8 | #include "internal.h" | ||
6 | 9 | ||
7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
8 | 11 | ||
9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
10 | { | 13 | { |
14 | unsigned long pfn = val; | ||
15 | struct page *p; | ||
16 | int err; | ||
17 | |||
18 | if (!capable(CAP_SYS_ADMIN)) | ||
19 | return -EPERM; | ||
20 | |||
21 | if (!hwpoison_filter_enable) | ||
22 | goto inject; | ||
23 | if (!pfn_valid(pfn)) | ||
24 | return -ENXIO; | ||
25 | |||
26 | p = pfn_to_page(pfn); | ||
27 | /* | ||
28 | * This implies we cannot support free buddy pages. | ||
29 | */ | ||
30 | if (!get_page_unless_zero(p)) | ||
31 | return 0; | ||
32 | |||
33 | if (!PageLRU(p)) | ||
34 | shake_page(p, 0); | ||
35 | /* | ||
36 | * This implies we cannot support non-LRU pages. | ||
37 | */ | ||
38 | if (!PageLRU(p)) | ||
39 | return 0; | ||
40 | |||
41 | /* | ||
42 | * Do a racy check with elevated page count, to make sure PG_hwpoison | ||
43 | * will only be set for the targeted owner (or on a free page). | ||
44 | * We temporarily take the page lock for try_get_mem_cgroup_from_page(). | ||
45 | * __memory_failure() will redo the check reliably under the page lock. | ||
46 | */ | ||
47 | lock_page(p); | ||
48 | err = hwpoison_filter(p); | ||
49 | unlock_page(p); | ||
50 | if (err) | ||
51 | return 0; | ||
52 | |||
53 | inject: | ||
54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
56 | } | ||
57 | |||
58 | static int hwpoison_unpoison(void *data, u64 val) | ||
59 | { | ||
11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
12 | return -EPERM; | 61 | return -EPERM; |
13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
15 | } | 64 | } |
16 | 65 | ||
17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
18 | 68 | ||
19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
20 | { | 70 | { |
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
24 | 74 | ||
25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
26 | { | 76 | { |
77 | struct dentry *dentry; | ||
78 | |||
27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
29 | return -ENOMEM; | 81 | return -ENOMEM; |
30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
83 | /* | ||
84 | * Note that the below poison/unpoison interfaces do not involve | ||
85 | * hardware status change, hence do not require hardware support. | ||
86 | * They are mainly for testing hwpoison at the software level. | ||
87 | */ | ||
88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
33 | pfn_inject_exit(); | 91 | goto fail; |
34 | return -ENOMEM; | 92 | |
35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
94 | NULL, &unpoison_fops); | ||
95 | if (!dentry) | ||
96 | goto fail; | ||
97 | |||
98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
99 | hwpoison_dir, &hwpoison_filter_enable); | ||
100 | if (!dentry) | ||
101 | goto fail; | ||
102 | |||
103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
105 | if (!dentry) | ||
106 | goto fail; | ||
107 | |||
108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
110 | if (!dentry) | ||
111 | goto fail; | ||
112 | |||
113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
115 | if (!dentry) | ||
116 | goto fail; | ||
117 | |||
118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
120 | if (!dentry) | ||
121 | goto fail; | ||
122 | |||
123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
126 | if (!dentry) | ||
127 | goto fail; | ||
128 | #endif | ||
129 | |||
36 | return 0; | 130 | return 0; |
131 | fail: | ||
132 | pfn_inject_exit(); | ||
133 | return -ENOMEM; | ||
37 | } | 134 | } |
38 | 135 | ||
39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
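
A hypothetical user-space test exercising the debugfs files created above; the paths assume debugfs is mounted at /sys/kernel/debug, and the pfn value is made up:

	#include <stdio.h>

	/* Write one value to a debugfs file; returns 0 on success. */
	static int write_val(const char *path, unsigned long long v)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%llu\n", v);
		return fclose(f);
	}

	int main(void)
	{
		/* Enable filtering, then inject at a made-up pfn. */
		write_val("/sys/kernel/debug/hwpoison/corrupt-filter-enable", 1);
		return write_val("/sys/kernel/debug/hwpoison/corrupt-pfn",
				 0x12345) ? 1 : 0;
	}
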
diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
50 | */ | 50 | */ |
51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
53 | #ifdef CONFIG_MEMORY_FAILURE | ||
54 | extern bool is_free_buddy_page(struct page *page); | ||
55 | #endif | ||
53 | 56 | ||
54 | 57 | ||
55 | /* | 58 | /* |
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page) | |||
63 | return page_private(page); | 66 | return page_private(page); |
64 | } | 67 | } |
65 | 68 | ||
66 | #ifdef CONFIG_HAVE_MLOCK | 69 | #ifdef CONFIG_MMU |
67 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 70 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
68 | unsigned long start, unsigned long end); | 71 | unsigned long start, unsigned long end); |
69 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 72 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
72 | { | 75 | { |
73 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | 76 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); |
74 | } | 77 | } |
75 | #endif | ||
76 | 78 | ||
77 | /* | 79 | /* |
78 | * unevictable_migrate_page() called only from migrate_page_copy() to | ||
79 | * migrate unevictable flag to new page. | ||
80 | * Note that the old page has been isolated from the LRU lists at this | ||
81 | * point so we don't need to worry about LRU statistics. | ||
82 | */ | ||
83 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
84 | { | ||
85 | if (TestClearPageUnevictable(old)) | ||
86 | SetPageUnevictable(new); | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
90 | /* | ||
91 | * Called only in fault path via page_evictable() for a new page | 80 | * Called only in fault path via page_evictable() for a new page |
92 | * to determine if it's being mapped into a LOCKED vma. | 81 | * to determine if it's being mapped into a LOCKED vma. |
93 | * If so, mark page as mlocked. | 82 | * If so, mark page as mlocked. |
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | |||
107 | } | 96 | } |
108 | 97 | ||
109 | /* | 98 | /* |
110 | * must be called with vma's mmap_sem held for read, and page locked. | 99 | * must be called with vma's mmap_sem held for read or write, and page locked. |
111 | */ | 100 | */ |
112 | extern void mlock_vma_page(struct page *page); | 101 | extern void mlock_vma_page(struct page *page); |
102 | extern void munlock_vma_page(struct page *page); | ||
113 | 103 | ||
114 | /* | 104 | /* |
115 | * Clear the page's PageMlocked(). This can be useful in a situation where | 105 | * Clear the page's PageMlocked(). This can be useful in a situation where |
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
144 | } | 134 | } |
145 | } | 135 | } |
146 | 136 | ||
147 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 137 | #else /* !CONFIG_MMU */ |
148 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
149 | { | 139 | { |
150 | return 0; | 140 | return 0; |
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } | |||
153 | static inline void mlock_vma_page(struct page *page) { } | 143 | static inline void mlock_vma_page(struct page *page) { } |
154 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 144 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
155 | 145 | ||
156 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #endif /* !CONFIG_MMU */ |
157 | 147 | ||
158 | /* | 148 | /* |
159 | * Return the mem_map entry representing the 'offset' subpage within | 149 | * Return the mem_map entry representing the 'offset' subpage within |
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
260 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
261 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
262 | #endif | 252 | #endif |
253 | |||
254 | extern int hwpoison_filter(struct page *p); | ||
255 | |||
256 | extern u32 hwpoison_filter_dev_major; | ||
257 | extern u32 hwpoison_filter_dev_minor; | ||
258 | extern u64 hwpoison_filter_flags_mask; | ||
259 | extern u64 hwpoison_filter_flags_value; | ||
260 | extern u64 hwpoison_filter_memcg; | ||
261 | extern u32 hwpoison_filter_enable; | ||
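These tunables feed hwpoison_filter() (defined in mm/memory-failure.c), which decides whether an injected error should be acted on. The flags mask/value pair is an ordinary mask-and-compare test; a user-space sketch of that test follows (names hypothetical — the real check runs on page flags):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t filter_flags_mask;	/* analogue of hwpoison_filter_flags_mask */
static uint64_t filter_flags_value;	/* analogue of hwpoison_filter_flags_value */

/* accept the page only if the masked flag bits equal the wanted value */
static bool flags_match(uint64_t page_flags)
{
	return (page_flags & filter_flags_mask) == filter_flags_value;
}

int main(void)
{
	filter_flags_mask  = 0x30;	/* look at two flag bits... */
	filter_flags_value = 0x10;	/* ...require one set, one clear */
	printf("%d %d\n", flags_match(0x13), flags_match(0x33));	/* 1 0 */
	return 0;
}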
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8bf765c4f58d..2c0d032ac898 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -72,7 +72,6 @@ | |||
72 | #include <linux/module.h> | 72 | #include <linux/module.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/prio_tree.h> |
75 | #include <linux/gfp.h> | ||
76 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
77 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
78 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
@@ -93,6 +92,7 @@ | |||
93 | #include <linux/nodemask.h> | 92 | #include <linux/nodemask.h> |
94 | #include <linux/mm.h> | 93 | #include <linux/mm.h> |
95 | #include <linux/workqueue.h> | 94 | #include <linux/workqueue.h> |
95 | #include <linux/crc32.h> | ||
96 | 96 | ||
97 | #include <asm/sections.h> | 97 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
@@ -108,7 +108,6 @@ | |||
108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | 108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ |
109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | 109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ |
110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | 110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ |
111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | 111 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ |
113 | 112 | ||
114 | #define BYTES_PER_POINTER sizeof(void *) | 113 | #define BYTES_PER_POINTER sizeof(void *) |
@@ -119,8 +118,8 @@ | |||
119 | /* scanning area inside a memory block */ | 118 | /* scanning area inside a memory block */ |
120 | struct kmemleak_scan_area { | 119 | struct kmemleak_scan_area { |
121 | struct hlist_node node; | 120 | struct hlist_node node; |
122 | unsigned long offset; | 121 | unsigned long start; |
123 | size_t length; | 122 | size_t size; |
124 | }; | 123 | }; |
125 | 124 | ||
126 | #define KMEMLEAK_GREY 0 | 125 | #define KMEMLEAK_GREY 0 |
@@ -149,6 +148,8 @@ struct kmemleak_object { | |||
149 | int min_count; | 148 | int min_count; |
150 | /* the total number of pointers found pointing to this object */ | 149 | /* the total number of pointers found pointing to this object */ |
151 | int count; | 150 | int count; |
151 | /* checksum for detecting modified objects */ | ||
152 | u32 checksum; | ||
152 | /* memory ranges to be scanned inside an object (empty for all) */ | 153 | /* memory ranges to be scanned inside an object (empty for all) */ |
153 | struct hlist_head area_list; | 154 | struct hlist_head area_list; |
154 | unsigned long trace[MAX_TRACE]; | 155 | unsigned long trace[MAX_TRACE]; |
@@ -164,8 +165,6 @@ struct kmemleak_object { | |||
164 | #define OBJECT_REPORTED (1 << 1) | 165 | #define OBJECT_REPORTED (1 << 1) |
165 | /* flag set to not scan the object */ | 166 | /* flag set to not scan the object */ |
166 | #define OBJECT_NO_SCAN (1 << 2) | 167 | #define OBJECT_NO_SCAN (1 << 2) |
167 | /* flag set on newly allocated objects */ | ||
168 | #define OBJECT_NEW (1 << 3) | ||
169 | 168 | ||
170 | /* number of bytes to print per line; must be 16 or 32 */ | 169 | /* number of bytes to print per line; must be 16 or 32 */ |
171 | #define HEX_ROW_SIZE 16 | 170 | #define HEX_ROW_SIZE 16 |
@@ -241,8 +240,6 @@ struct early_log { | |||
241 | const void *ptr; /* allocated/freed memory block */ | 240 | const void *ptr; /* allocated/freed memory block */ |
242 | size_t size; /* memory block size */ | 241 | size_t size; /* memory block size */ |
243 | int min_count; /* minimum reference count */ | 242 | int min_count; /* minimum reference count */ |
244 | unsigned long offset; /* scan area offset */ | ||
245 | size_t length; /* scan area length */ | ||
246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | 243 | unsigned long trace[MAX_TRACE]; /* stack trace */ |
247 | unsigned int trace_len; /* stack trace length */ | 244 | unsigned int trace_len; /* stack trace length */ |
248 | }; | 245 | }; |
@@ -323,11 +320,6 @@ static bool color_gray(const struct kmemleak_object *object) | |||
323 | object->count >= object->min_count; | 320 | object->count >= object->min_count; |
324 | } | 321 | } |
325 | 322 | ||
326 | static bool color_black(const struct kmemleak_object *object) | ||
327 | { | ||
328 | return object->min_count == KMEMLEAK_BLACK; | ||
329 | } | ||
330 | |||
331 | /* | 323 | /* |
332 | * Objects are considered unreferenced only if their color is white, they have | 324 | * Objects are considered unreferenced only if their color is white, they have |
333 | * not been deleted and have a minimum age to avoid false positives caused by | 325 | * not been deleted and have a minimum age to avoid false positives caused by |
@@ -335,7 +327,7 @@ static bool color_black(const struct kmemleak_object *object) | |||
335 | */ | 327 | */ |
336 | static bool unreferenced_object(struct kmemleak_object *object) | 328 | static bool unreferenced_object(struct kmemleak_object *object) |
337 | { | 329 | { |
338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | 330 | return (color_white(object) && object->flags & OBJECT_ALLOCATED) && |
339 | time_before_eq(object->jiffies + jiffies_min_age, | 331 | time_before_eq(object->jiffies + jiffies_min_age, |
340 | jiffies_last_scan); | 332 | jiffies_last_scan); |
341 | } | 333 | } |
@@ -348,11 +340,13 @@ static void print_unreferenced(struct seq_file *seq, | |||
348 | struct kmemleak_object *object) | 340 | struct kmemleak_object *object) |
349 | { | 341 | { |
350 | int i; | 342 | int i; |
343 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | ||
351 | 344 | ||
352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 345 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
353 | object->pointer, object->size); | 346 | object->pointer, object->size); |
354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | 347 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
355 | object->comm, object->pid, object->jiffies); | 348 | object->comm, object->pid, object->jiffies, |
349 | msecs_age / 1000, msecs_age % 1000); | ||
356 | hex_dump_object(seq, object); | 350 | hex_dump_object(seq, object); |
357 | seq_printf(seq, " backtrace:\n"); | 351 | seq_printf(seq, " backtrace:\n"); |
358 | 352 | ||
@@ -381,6 +375,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
381 | pr_notice(" min_count = %d\n", object->min_count); | 375 | pr_notice(" min_count = %d\n", object->min_count); |
382 | pr_notice(" count = %d\n", object->count); | 376 | pr_notice(" count = %d\n", object->count); |
383 | pr_notice(" flags = 0x%lx\n", object->flags); | 377 | pr_notice(" flags = 0x%lx\n", object->flags); |
378 | pr_notice(" checksum = %d\n", object->checksum); | ||
384 | pr_notice(" backtrace:\n"); | 379 | pr_notice(" backtrace:\n"); |
385 | print_stack_trace(&trace, 4); | 380 | print_stack_trace(&trace, 4); |
386 | } | 381 | } |
@@ -522,12 +517,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
522 | INIT_HLIST_HEAD(&object->area_list); | 517 | INIT_HLIST_HEAD(&object->area_list); |
523 | spin_lock_init(&object->lock); | 518 | spin_lock_init(&object->lock); |
524 | atomic_set(&object->use_count, 1); | 519 | atomic_set(&object->use_count, 1); |
525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | 520 | object->flags = OBJECT_ALLOCATED; |
526 | object->pointer = ptr; | 521 | object->pointer = ptr; |
527 | object->size = size; | 522 | object->size = size; |
528 | object->min_count = min_count; | 523 | object->min_count = min_count; |
529 | object->count = -1; /* no color initially */ | 524 | object->count = 0; /* white color initially */ |
530 | object->jiffies = jiffies; | 525 | object->jiffies = jiffies; |
526 | object->checksum = 0; | ||
531 | 527 | ||
532 | /* task information */ | 528 | /* task information */ |
533 | if (in_irq()) { | 529 | if (in_irq()) { |
@@ -720,14 +716,13 @@ static void make_black_object(unsigned long ptr) | |||
720 | * Add a scanning area to the object. If at least one such area is added, | 716 | * Add a scanning area to the object. If at least one such area is added, |
721 | * kmemleak will only scan these ranges rather than the whole memory block. | 717 | * kmemleak will only scan these ranges rather than the whole memory block. |
722 | */ | 718 | */ |
723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | 719 | static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) |
724 | size_t length, gfp_t gfp) | ||
725 | { | 720 | { |
726 | unsigned long flags; | 721 | unsigned long flags; |
727 | struct kmemleak_object *object; | 722 | struct kmemleak_object *object; |
728 | struct kmemleak_scan_area *area; | 723 | struct kmemleak_scan_area *area; |
729 | 724 | ||
730 | object = find_and_get_object(ptr, 0); | 725 | object = find_and_get_object(ptr, 1); |
731 | if (!object) { | 726 | if (!object) { |
732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | 727 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", |
733 | ptr); | 728 | ptr); |
@@ -741,7 +736,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
741 | } | 736 | } |
742 | 737 | ||
743 | spin_lock_irqsave(&object->lock, flags); | 738 | spin_lock_irqsave(&object->lock, flags); |
744 | if (offset + length > object->size) { | 739 | if (ptr + size > object->pointer + object->size) { |
745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 740 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
746 | dump_object_info(object); | 741 | dump_object_info(object); |
747 | kmem_cache_free(scan_area_cache, area); | 742 | kmem_cache_free(scan_area_cache, area); |
@@ -749,8 +744,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
749 | } | 744 | } |
750 | 745 | ||
751 | INIT_HLIST_NODE(&area->node); | 746 | INIT_HLIST_NODE(&area->node); |
752 | area->offset = offset; | 747 | area->start = ptr; |
753 | area->length = length; | 748 | area->size = size; |
754 | 749 | ||
755 | hlist_add_head(&area->node, &object->area_list); | 750 | hlist_add_head(&area->node, &object->area_list); |
756 | out_unlock: | 751 | out_unlock: |
@@ -786,7 +781,7 @@ static void object_no_scan(unsigned long ptr) | |||
786 | * processed later once kmemleak is fully initialized. | 781 | * processed later once kmemleak is fully initialized. |
787 | */ | 782 | */ |
788 | static void __init log_early(int op_type, const void *ptr, size_t size, | 783 | static void __init log_early(int op_type, const void *ptr, size_t size, |
789 | int min_count, unsigned long offset, size_t length) | 784 | int min_count) |
790 | { | 785 | { |
791 | unsigned long flags; | 786 | unsigned long flags; |
792 | struct early_log *log; | 787 | struct early_log *log; |
@@ -808,8 +803,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
808 | log->ptr = ptr; | 803 | log->ptr = ptr; |
809 | log->size = size; | 804 | log->size = size; |
810 | log->min_count = min_count; | 805 | log->min_count = min_count; |
811 | log->offset = offset; | ||
812 | log->length = length; | ||
813 | if (op_type == KMEMLEAK_ALLOC) | 806 | if (op_type == KMEMLEAK_ALLOC) |
814 | log->trace_len = __save_stack_trace(log->trace); | 807 | log->trace_len = __save_stack_trace(log->trace); |
815 | crt_early_log++; | 808 | crt_early_log++; |
@@ -858,7 +851,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
858 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 851 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
859 | create_object((unsigned long)ptr, size, min_count, gfp); | 852 | create_object((unsigned long)ptr, size, min_count, gfp); |
860 | else if (atomic_read(&kmemleak_early_log)) | 853 | else if (atomic_read(&kmemleak_early_log)) |
861 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | 854 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); |
862 | } | 855 | } |
863 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 856 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
864 | 857 | ||
@@ -873,7 +866,7 @@ void __ref kmemleak_free(const void *ptr) | |||
873 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 866 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
874 | delete_object_full((unsigned long)ptr); | 867 | delete_object_full((unsigned long)ptr); |
875 | else if (atomic_read(&kmemleak_early_log)) | 868 | else if (atomic_read(&kmemleak_early_log)) |
876 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 869 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
877 | } | 870 | } |
878 | EXPORT_SYMBOL_GPL(kmemleak_free); | 871 | EXPORT_SYMBOL_GPL(kmemleak_free); |
879 | 872 | ||
@@ -888,7 +881,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
888 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 881 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
889 | delete_object_part((unsigned long)ptr, size); | 882 | delete_object_part((unsigned long)ptr, size); |
890 | else if (atomic_read(&kmemleak_early_log)) | 883 | else if (atomic_read(&kmemleak_early_log)) |
891 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | 884 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); |
892 | } | 885 | } |
893 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 886 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
894 | 887 | ||
@@ -903,7 +896,7 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
903 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 896 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
904 | make_gray_object((unsigned long)ptr); | 897 | make_gray_object((unsigned long)ptr); |
905 | else if (atomic_read(&kmemleak_early_log)) | 898 | else if (atomic_read(&kmemleak_early_log)) |
906 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | 899 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); |
907 | } | 900 | } |
908 | EXPORT_SYMBOL(kmemleak_not_leak); | 901 | EXPORT_SYMBOL(kmemleak_not_leak); |
909 | 902 | ||
@@ -919,22 +912,21 @@ void __ref kmemleak_ignore(const void *ptr) | |||
919 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 912 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
920 | make_black_object((unsigned long)ptr); | 913 | make_black_object((unsigned long)ptr); |
921 | else if (atomic_read(&kmemleak_early_log)) | 914 | else if (atomic_read(&kmemleak_early_log)) |
922 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | 915 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); |
923 | } | 916 | } |
924 | EXPORT_SYMBOL(kmemleak_ignore); | 917 | EXPORT_SYMBOL(kmemleak_ignore); |
925 | 918 | ||
926 | /* | 919 | /* |
927 | * Limit the range to be scanned in an allocated memory block. | 920 | * Limit the range to be scanned in an allocated memory block. |
928 | */ | 921 | */ |
929 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | 922 | void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) |
930 | size_t length, gfp_t gfp) | ||
931 | { | 923 | { |
932 | pr_debug("%s(0x%p)\n", __func__, ptr); | 924 | pr_debug("%s(0x%p)\n", __func__, ptr); |
933 | 925 | ||
934 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 926 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
935 | add_scan_area((unsigned long)ptr, offset, length, gfp); | 927 | add_scan_area((unsigned long)ptr, size, gfp); |
936 | else if (atomic_read(&kmemleak_early_log)) | 928 | else if (atomic_read(&kmemleak_early_log)) |
937 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | 929 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
938 | } | 930 | } |
939 | EXPORT_SYMBOL(kmemleak_scan_area); | 931 | EXPORT_SYMBOL(kmemleak_scan_area); |
940 | 932 | ||
@@ -948,11 +940,25 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
948 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 940 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
949 | object_no_scan((unsigned long)ptr); | 941 | object_no_scan((unsigned long)ptr); |
950 | else if (atomic_read(&kmemleak_early_log)) | 942 | else if (atomic_read(&kmemleak_early_log)) |
951 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | 943 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); |
952 | } | 944 | } |
953 | EXPORT_SYMBOL(kmemleak_no_scan); | 945 | EXPORT_SYMBOL(kmemleak_no_scan); |
954 | 946 | ||
955 | /* | 947 | /* |
948 | * Update an object's checksum and return true if it was modified. | ||
949 | */ | ||
950 | static bool update_checksum(struct kmemleak_object *object) | ||
951 | { | ||
952 | u32 old_csum = object->checksum; | ||
953 | |||
954 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
955 | return false; | ||
956 | |||
957 | object->checksum = crc32(0, (void *)object->pointer, object->size); | ||
958 | return object->checksum != old_csum; | ||
959 | } | ||
960 | |||
961 | /* | ||
956 | * Memory scanning is a long process and it needs to be interruptible. This | 962 | * Memory scanning is a long process and it needs to be interruptible. This |
957 | * function checks whether such interrupt condition occurred. | 963 | * function checks whether such interrupt condition occurred. |
958 | */ | 964 | */ |
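update_checksum() lets kmemleak notice white (potentially leaked) objects whose contents changed since the last scan: a changed CRC32 means a pointer may have been written into the block, so the object deserves re-scanning. The detection itself is plain CRC comparison, sketched here in user space with zlib's crc32() (the kernel uses lib/crc32, and kmemcheck_is_obj_initialized() has no user-space analogue):

#include <stdio.h>
#include <zlib.h>			/* link with -lz */

int main(void)
{
	char obj[32] = "an allocated object payload";
	uLong old_csum = crc32(0L, (const Bytef *)obj, sizeof(obj));

	obj[5] = 'X';			/* a store into the object... */
	uLong new_csum = crc32(0L, (const Bytef *)obj, sizeof(obj));

	/* ...is what makes kmemleak re-gray the object */
	printf("modified: %d\n", new_csum != old_csum);	/* modified: 1 */
	return 0;
}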
@@ -1031,11 +1037,14 @@ static void scan_block(void *_start, void *_end, | |||
1031 | * added to the gray_list. | 1037 | * added to the gray_list. |
1032 | */ | 1038 | */ |
1033 | object->count++; | 1039 | object->count++; |
1034 | if (color_gray(object)) | 1040 | if (color_gray(object)) { |
1035 | list_add_tail(&object->gray_list, &gray_list); | 1041 | list_add_tail(&object->gray_list, &gray_list); |
1036 | else | 1042 | spin_unlock_irqrestore(&object->lock, flags); |
1037 | put_object(object); | 1043 | continue; |
1044 | } | ||
1045 | |||
1038 | spin_unlock_irqrestore(&object->lock, flags); | 1046 | spin_unlock_irqrestore(&object->lock, flags); |
1047 | put_object(object); | ||
1039 | } | 1048 | } |
1040 | } | 1049 | } |
1041 | 1050 | ||
@@ -1050,8 +1059,8 @@ static void scan_object(struct kmemleak_object *object) | |||
1050 | unsigned long flags; | 1059 | unsigned long flags; |
1051 | 1060 | ||
1052 | /* | 1061 | /* |
1053 | * Once the object->lock is aquired, the corresponding memory block | 1062 | * Once the object->lock is acquired, the corresponding memory block |
1054 | * cannot be freed (the same lock is aquired in delete_object). | 1063 | * cannot be freed (the same lock is acquired in delete_object). |
1055 | */ | 1064 | */ |
1056 | spin_lock_irqsave(&object->lock, flags); | 1065 | spin_lock_irqsave(&object->lock, flags); |
1057 | if (object->flags & OBJECT_NO_SCAN) | 1066 | if (object->flags & OBJECT_NO_SCAN) |
@@ -1075,14 +1084,47 @@ static void scan_object(struct kmemleak_object *object) | |||
1075 | } | 1084 | } |
1076 | } else | 1085 | } else |
1077 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1086 | hlist_for_each_entry(area, elem, &object->area_list, node) |
1078 | scan_block((void *)(object->pointer + area->offset), | 1087 | scan_block((void *)area->start, |
1079 | (void *)(object->pointer + area->offset | 1088 | (void *)(area->start + area->size), |
1080 | + area->length), object, 0); | 1089 | object, 0); |
1081 | out: | 1090 | out: |
1082 | spin_unlock_irqrestore(&object->lock, flags); | 1091 | spin_unlock_irqrestore(&object->lock, flags); |
1083 | } | 1092 | } |
1084 | 1093 | ||
1085 | /* | 1094 | /* |
1095 | * Scan the objects already referenced (gray objects). More objects will be | ||
1096 | * referenced and, if there are no memory leaks, all the objects are scanned. | ||
1097 | */ | ||
1098 | static void scan_gray_list(void) | ||
1099 | { | ||
1100 | struct kmemleak_object *object, *tmp; | ||
1101 | |||
1102 | /* | ||
1103 | * The list traversal is safe for both tail additions and removals | ||
1104 | * from inside the loop. The kmemleak objects cannot be freed from | ||
1105 | * outside the loop because their use_count was incremented. | ||
1106 | */ | ||
1107 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1108 | while (&object->gray_list != &gray_list) { | ||
1109 | cond_resched(); | ||
1110 | |||
1111 | /* may add new objects to the list */ | ||
1112 | if (!scan_should_stop()) | ||
1113 | scan_object(object); | ||
1114 | |||
1115 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1116 | gray_list); | ||
1117 | |||
1118 | /* remove the object from the list and release it */ | ||
1119 | list_del(&object->gray_list); | ||
1120 | put_object(object); | ||
1121 | |||
1122 | object = tmp; | ||
1123 | } | ||
1124 | WARN_ON(!list_empty(&gray_list)); | ||
1125 | } | ||
1126 | |||
1127 | /* | ||
1086 | * Scan data sections and all the referenced memory blocks allocated via the | 1128 | * Scan data sections and all the referenced memory blocks allocated via the |
1087 | * kernel's standard allocators. This function must be called with the | 1129 | * kernel's standard allocators. This function must be called with the |
1088 | * scan_mutex held. | 1130 | * scan_mutex held. |
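scan_gray_list() above is a worklist traversal: scanning one gray object may append newly referenced objects at the tail of the same list, and the loop runs until the list drains, which is why tail additions during iteration must be safe. A stand-alone sketch of the pattern (hypothetical node type; no locking or use counts):

#include <stdio.h>

struct node {
	struct node *next;	/* singly-linked worklist */
	struct node *ref;	/* an object this one references */
	int id, seen;
};

static struct node *head, **tail = &head;

static void push(struct node *n)
{
	n->next = NULL;
	*tail = n;
	tail = &n->next;
}

/* "scan" one object: anything it references joins the worklist */
static void scan(struct node *n)
{
	if (n->ref && !n->ref->seen) {
		n->ref->seen = 1;
		push(n->ref);	/* tail addition while draining */
	}
}

int main(void)
{
	struct node a = { .id = 1, .seen = 1 }, b = { .id = 2 };
	a.ref = &b;
	push(&a);
	while (head) {
		struct node *n = head;
		head = head->next;
		if (!head)
			tail = &head;	/* list emptied; reset tail */
		scan(n);
		printf("scanned %d\n", n->id);
	}
	return 0;
}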
@@ -1090,10 +1132,9 @@ out: | |||
1090 | static void kmemleak_scan(void) | 1132 | static void kmemleak_scan(void) |
1091 | { | 1133 | { |
1092 | unsigned long flags; | 1134 | unsigned long flags; |
1093 | struct kmemleak_object *object, *tmp; | 1135 | struct kmemleak_object *object; |
1094 | int i; | 1136 | int i; |
1095 | int new_leaks = 0; | 1137 | int new_leaks = 0; |
1096 | int gray_list_pass = 0; | ||
1097 | 1138 | ||
1098 | jiffies_last_scan = jiffies; | 1139 | jiffies_last_scan = jiffies; |
1099 | 1140 | ||
@@ -1114,7 +1155,6 @@ static void kmemleak_scan(void) | |||
1114 | #endif | 1155 | #endif |
1115 | /* reset the reference count (whiten the object) */ | 1156 | /* reset the reference count (whiten the object) */ |
1116 | object->count = 0; | 1157 | object->count = 0; |
1117 | object->flags &= ~OBJECT_NEW; | ||
1118 | if (color_gray(object) && get_object(object)) | 1158 | if (color_gray(object) && get_object(object)) |
1119 | list_add_tail(&object->gray_list, &gray_list); | 1159 | list_add_tail(&object->gray_list, &gray_list); |
1120 | 1160 | ||
@@ -1172,62 +1212,36 @@ static void kmemleak_scan(void) | |||
1172 | 1212 | ||
1173 | /* | 1213 | /* |
1174 | * Scan the objects already referenced from the sections scanned | 1214 | * Scan the objects already referenced from the sections scanned |
1175 | * above. More objects will be referenced and, if there are no memory | 1215 | * above. |
1176 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1177 | * for both tail additions and removals from inside the loop. The | ||
1178 | * kmemleak objects cannot be freed from outside the loop because their | ||
1179 | * use_count was increased. | ||
1180 | */ | 1216 | */ |
1181 | repeat: | 1217 | scan_gray_list(); |
1182 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1183 | while (&object->gray_list != &gray_list) { | ||
1184 | cond_resched(); | ||
1185 | |||
1186 | /* may add new objects to the list */ | ||
1187 | if (!scan_should_stop()) | ||
1188 | scan_object(object); | ||
1189 | |||
1190 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1191 | gray_list); | ||
1192 | |||
1193 | /* remove the object from the list and release it */ | ||
1194 | list_del(&object->gray_list); | ||
1195 | put_object(object); | ||
1196 | |||
1197 | object = tmp; | ||
1198 | } | ||
1199 | |||
1200 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
1201 | goto scan_end; | ||
1202 | 1218 | ||
1203 | /* | 1219 | /* |
1204 | * Check for new objects allocated during this scanning and add them | 1220 | * Check for new or unreferenced objects modified since the previous |
1205 | * to the gray list. | 1221 | * scan and color them gray until the next scan. |
1206 | */ | 1222 | */ |
1207 | rcu_read_lock(); | 1223 | rcu_read_lock(); |
1208 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1224 | list_for_each_entry_rcu(object, &object_list, object_list) { |
1209 | spin_lock_irqsave(&object->lock, flags); | 1225 | spin_lock_irqsave(&object->lock, flags); |
1210 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | 1226 | if (color_white(object) && (object->flags & OBJECT_ALLOCATED) |
1211 | get_object(object)) { | 1227 | && update_checksum(object) && get_object(object)) { |
1212 | object->flags &= ~OBJECT_NEW; | 1228 | /* color it gray temporarily */ |
1229 | object->count = object->min_count; | ||
1213 | list_add_tail(&object->gray_list, &gray_list); | 1230 | list_add_tail(&object->gray_list, &gray_list); |
1214 | } | 1231 | } |
1215 | spin_unlock_irqrestore(&object->lock, flags); | 1232 | spin_unlock_irqrestore(&object->lock, flags); |
1216 | } | 1233 | } |
1217 | rcu_read_unlock(); | 1234 | rcu_read_unlock(); |
1218 | 1235 | ||
1219 | if (!list_empty(&gray_list)) | 1236 | /* |
1220 | goto repeat; | 1237 | * Re-scan the gray list for modified unreferenced objects. |
1221 | 1238 | */ | |
1222 | scan_end: | 1239 | scan_gray_list(); |
1223 | WARN_ON(!list_empty(&gray_list)); | ||
1224 | 1240 | ||
1225 | /* | 1241 | /* |
1226 | * If scanning was stopped or new objects were being allocated at a | 1242 | * If scanning was stopped do not report any new unreferenced objects. |
1227 | * higher rate than gray list scanning, do not report any new | ||
1228 | * unreferenced objects. | ||
1229 | */ | 1243 | */ |
1230 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | 1244 | if (scan_should_stop()) |
1231 | return; | 1245 | return; |
1232 | 1246 | ||
1233 | /* | 1247 | /* |
@@ -1642,8 +1656,7 @@ void __init kmemleak_init(void) | |||
1642 | kmemleak_ignore(log->ptr); | 1656 | kmemleak_ignore(log->ptr); |
1643 | break; | 1657 | break; |
1644 | case KMEMLEAK_SCAN_AREA: | 1658 | case KMEMLEAK_SCAN_AREA: |
1645 | kmemleak_scan_area(log->ptr, log->offset, log->length, | 1659 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); |
1646 | GFP_KERNEL); | ||
1647 | break; | 1660 | break; |
1648 | case KMEMLEAK_NO_SCAN: | 1661 | case KMEMLEAK_NO_SCAN: |
1649 | kmemleak_no_scan(log->ptr); | 1662 | kmemleak_no_scan(log->ptr); |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -29,11 +29,13 @@ | |||
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
32 | #include <linux/memory.h> | ||
32 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
35 | 36 | ||
36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
38 | #include "internal.h" | ||
37 | 39 | ||
38 | /* | 40 | /* |
39 | * A few notes about the KSM scanning process, | 41 | * A few notes about the KSM scanning process, |
@@ -79,13 +81,13 @@ | |||
79 | * struct mm_slot - ksm information per mm that is being scanned | 81 | * struct mm_slot - ksm information per mm that is being scanned |
80 | * @link: link to the mm_slots hash list | 82 | * @link: link to the mm_slots hash list |
81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | 83 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head |
82 | * @rmap_list: head for this mm_slot's list of rmap_items | 84 | * @rmap_list: head for this mm_slot's singly-linked list of rmap_items |
83 | * @mm: the mm that this information is valid for | 85 | * @mm: the mm that this information is valid for |
84 | */ | 86 | */ |
85 | struct mm_slot { | 87 | struct mm_slot { |
86 | struct hlist_node link; | 88 | struct hlist_node link; |
87 | struct list_head mm_list; | 89 | struct list_head mm_list; |
88 | struct list_head rmap_list; | 90 | struct rmap_item *rmap_list; |
89 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
90 | }; | 92 | }; |
91 | 93 | ||
@@ -93,7 +95,7 @@ struct mm_slot { | |||
93 | * struct ksm_scan - cursor for scanning | 95 | * struct ksm_scan - cursor for scanning |
94 | * @mm_slot: the current mm_slot we are scanning | 96 | * @mm_slot: the current mm_slot we are scanning |
95 | * @address: the next address inside that to be scanned | 97 | * @address: the next address inside that to be scanned |
96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | 98 | * @rmap_list: link to the next rmap to be scanned in the rmap_list |
97 | * @seqnr: count of completed full scans (needed when removing unstable node) | 99 | * @seqnr: count of completed full scans (needed when removing unstable node) |
98 | * | 100 | * |
99 | * There is only the one ksm_scan instance of this cursor structure. | 101 | * There is only the one ksm_scan instance of this cursor structure. |
@@ -101,37 +103,51 @@ struct mm_slot { | |||
101 | struct ksm_scan { | 103 | struct ksm_scan { |
102 | struct mm_slot *mm_slot; | 104 | struct mm_slot *mm_slot; |
103 | unsigned long address; | 105 | unsigned long address; |
104 | struct rmap_item *rmap_item; | 106 | struct rmap_item **rmap_list; |
105 | unsigned long seqnr; | 107 | unsigned long seqnr; |
106 | }; | 108 | }; |
107 | 109 | ||
108 | /** | 110 | /** |
111 | * struct stable_node - node of the stable rbtree | ||
112 | * @node: rb node of this ksm page in the stable tree | ||
113 | * @hlist: hlist head of rmap_items using this ksm page | ||
114 | * @kpfn: page frame number of this ksm page | ||
115 | */ | ||
116 | struct stable_node { | ||
117 | struct rb_node node; | ||
118 | struct hlist_head hlist; | ||
119 | unsigned long kpfn; | ||
120 | }; | ||
121 | |||
122 | /** | ||
109 | * struct rmap_item - reverse mapping item for virtual addresses | 123 | * struct rmap_item - reverse mapping item for virtual addresses |
110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | 124 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
125 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | ||
111 | * @mm: the memory structure this rmap_item is pointing into | 126 | * @mm: the memory structure this rmap_item is pointing into |
112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 127 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
113 | * @oldchecksum: previous checksum of the page at that virtual address | 128 | * @oldchecksum: previous checksum of the page at that virtual address |
114 | * @node: rb_node of this rmap_item in either unstable or stable tree | 129 | * @node: rb node of this rmap_item in the unstable tree |
115 | * @next: next rmap_item hanging off the same node of the stable tree | 130 | * @head: pointer to stable_node heading this list in the stable tree |
116 | * @prev: previous rmap_item hanging off the same node of the stable tree | 131 | * @hlist: link into hlist of rmap_items hanging off that stable_node |
117 | */ | 132 | */ |
118 | struct rmap_item { | 133 | struct rmap_item { |
119 | struct list_head link; | 134 | struct rmap_item *rmap_list; |
135 | struct anon_vma *anon_vma; /* when stable */ | ||
120 | struct mm_struct *mm; | 136 | struct mm_struct *mm; |
121 | unsigned long address; /* + low bits used for flags below */ | 137 | unsigned long address; /* + low bits used for flags below */ |
138 | unsigned int oldchecksum; /* when unstable */ | ||
122 | union { | 139 | union { |
123 | unsigned int oldchecksum; /* when unstable */ | 140 | struct rb_node node; /* when node of unstable tree */ |
124 | struct rmap_item *next; /* when stable */ | 141 | struct { /* when listed from stable tree */ |
125 | }; | 142 | struct stable_node *head; |
126 | union { | 143 | struct hlist_node hlist; |
127 | struct rb_node node; /* when tree node */ | 144 | }; |
128 | struct rmap_item *prev; /* in stable list */ | ||
129 | }; | 145 | }; |
130 | }; | 146 | }; |
131 | 147 | ||
132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | 148 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ |
133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | 149 | #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ |
134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | 150 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
135 | 151 | ||
136 | /* The stable and unstable tree heads */ | 152 | /* The stable and unstable tree heads */ |
137 | static struct rb_root root_stable_tree = RB_ROOT; | 153 | static struct rb_root root_stable_tree = RB_ROOT; |
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { | |||
148 | }; | 164 | }; |
149 | 165 | ||
150 | static struct kmem_cache *rmap_item_cache; | 166 | static struct kmem_cache *rmap_item_cache; |
167 | static struct kmem_cache *stable_node_cache; | ||
151 | static struct kmem_cache *mm_slot_cache; | 168 | static struct kmem_cache *mm_slot_cache; |
152 | 169 | ||
153 | /* The number of nodes in the stable tree */ | 170 | /* The number of nodes in the stable tree */ |
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; | |||
162 | /* The number of rmap_items in use: to calculate pages_volatile */ | 179 | /* The number of rmap_items in use: to calculate pages_volatile */ |
163 | static unsigned long ksm_rmap_items; | 180 | static unsigned long ksm_rmap_items; |
164 | 181 | ||
165 | /* Limit on the number of unswappable pages used */ | ||
166 | static unsigned long ksm_max_kernel_pages; | ||
167 | |||
168 | /* Number of pages ksmd should scan in one batch */ | 182 | /* Number of pages ksmd should scan in one batch */ |
169 | static unsigned int ksm_thread_pages_to_scan = 100; | 183 | static unsigned int ksm_thread_pages_to_scan = 100; |
170 | 184 | ||
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) | |||
190 | if (!rmap_item_cache) | 204 | if (!rmap_item_cache) |
191 | goto out; | 205 | goto out; |
192 | 206 | ||
207 | stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); | ||
208 | if (!stable_node_cache) | ||
209 | goto out_free1; | ||
210 | |||
193 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | 211 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); |
194 | if (!mm_slot_cache) | 212 | if (!mm_slot_cache) |
195 | goto out_free; | 213 | goto out_free2; |
196 | 214 | ||
197 | return 0; | 215 | return 0; |
198 | 216 | ||
199 | out_free: | 217 | out_free2: |
218 | kmem_cache_destroy(stable_node_cache); | ||
219 | out_free1: | ||
200 | kmem_cache_destroy(rmap_item_cache); | 220 | kmem_cache_destroy(rmap_item_cache); |
201 | out: | 221 | out: |
202 | return -ENOMEM; | 222 | return -ENOMEM; |
@@ -205,6 +225,7 @@ out: | |||
205 | static void __init ksm_slab_free(void) | 225 | static void __init ksm_slab_free(void) |
206 | { | 226 | { |
207 | kmem_cache_destroy(mm_slot_cache); | 227 | kmem_cache_destroy(mm_slot_cache); |
228 | kmem_cache_destroy(stable_node_cache); | ||
208 | kmem_cache_destroy(rmap_item_cache); | 229 | kmem_cache_destroy(rmap_item_cache); |
209 | mm_slot_cache = NULL; | 230 | mm_slot_cache = NULL; |
210 | } | 231 | } |
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) | |||
226 | kmem_cache_free(rmap_item_cache, rmap_item); | 247 | kmem_cache_free(rmap_item_cache, rmap_item); |
227 | } | 248 | } |
228 | 249 | ||
250 | static inline struct stable_node *alloc_stable_node(void) | ||
251 | { | ||
252 | return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); | ||
253 | } | ||
254 | |||
255 | static inline void free_stable_node(struct stable_node *stable_node) | ||
256 | { | ||
257 | kmem_cache_free(stable_node_cache, stable_node); | ||
258 | } | ||
259 | |||
229 | static inline struct mm_slot *alloc_mm_slot(void) | 260 | static inline struct mm_slot *alloc_mm_slot(void) |
230 | { | 261 | { |
231 | if (!mm_slot_cache) /* initialization failed */ | 262 | if (!mm_slot_cache) /* initialization failed */ |
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
275 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 306 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
276 | % MM_SLOTS_HASH_HEADS]; | 307 | % MM_SLOTS_HASH_HEADS]; |
277 | mm_slot->mm = mm; | 308 | mm_slot->mm = mm; |
278 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
279 | hlist_add_head(&mm_slot->link, bucket); | 309 | hlist_add_head(&mm_slot->link, bucket); |
280 | } | 310 | } |
281 | 311 | ||
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
284 | return rmap_item->address & STABLE_FLAG; | 314 | return rmap_item->address & STABLE_FLAG; |
285 | } | 315 | } |
286 | 316 | ||
317 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
318 | struct anon_vma *anon_vma) | ||
319 | { | ||
320 | rmap_item->anon_vma = anon_vma; | ||
321 | atomic_inc(&anon_vma->ksm_refcount); | ||
322 | } | ||
323 | |||
324 | static void drop_anon_vma(struct rmap_item *rmap_item) | ||
325 | { | ||
326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
327 | |||
328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | ||
329 | int empty = list_empty(&anon_vma->head); | ||
330 | spin_unlock(&anon_vma->lock); | ||
331 | if (empty) | ||
332 | anon_vma_free(anon_vma); | ||
333 | } | ||
334 | } | ||
335 | |||
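drop_anon_vma() above relies on the atomic_dec_and_lock() idiom: every dropper but the last pays only an atomic decrement, and the last one takes the spinlock to decide whether the anon_vma can be freed. The pthreads sketch below only approximates the shape (the kernel primitive acquires the lock before the count can reach zero, closing a race this simplification ignores; all names hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <pthread.h>		/* compile with -pthread */

struct obj {
	atomic_int refcount;
	pthread_mutex_t lock;
	int empty;		/* stands in for list_empty(&anon_vma->head) */
};

static void obj_put(struct obj *o)
{
	/* fast path: not the last reference, no lock taken */
	if (atomic_fetch_sub(&o->refcount, 1) != 1)
		return;
	/* last dropper: take the lock and decide whether to free */
	pthread_mutex_lock(&o->lock);
	int empty = o->empty;
	pthread_mutex_unlock(&o->lock);
	if (empty)
		free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));
	pthread_mutex_init(&o->lock, NULL);
	atomic_store(&o->refcount, 2);
	o->empty = 1;
	obj_put(o);		/* fast path */
	obj_put(o);		/* slow path: frees o */
	return 0;
}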
287 | /* | 336 | /* |
288 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 337 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
289 | * page tables after it has passed through ksm_exit() - which, if necessary, | 338 | * page tables after it has passed through ksm_exit() - which, if necessary, |
@@ -316,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
316 | do { | 365 | do { |
317 | cond_resched(); | 366 | cond_resched(); |
318 | page = follow_page(vma, addr, FOLL_GET); | 367 | page = follow_page(vma, addr, FOLL_GET); |
319 | if (!page) | 368 | if (IS_ERR_OR_NULL(page)) |
320 | break; | 369 | break; |
321 | if (PageKsm(page)) | 370 | if (PageKsm(page)) |
322 | ret = handle_mm_fault(vma->vm_mm, vma, addr, | 371 | ret = handle_mm_fault(vma->vm_mm, vma, addr, |
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
356 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 405 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
357 | } | 406 | } |
358 | 407 | ||
359 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 408 | static void break_cow(struct rmap_item *rmap_item) |
360 | { | 409 | { |
410 | struct mm_struct *mm = rmap_item->mm; | ||
411 | unsigned long addr = rmap_item->address; | ||
361 | struct vm_area_struct *vma; | 412 | struct vm_area_struct *vma; |
362 | 413 | ||
414 | /* | ||
415 | * It is not an accident that whenever we want to break COW | ||
416 | * to undo, we also need to drop a reference to the anon_vma. | ||
417 | */ | ||
418 | drop_anon_vma(rmap_item); | ||
419 | |||
363 | down_read(&mm->mmap_sem); | 420 | down_read(&mm->mmap_sem); |
364 | if (ksm_test_exit(mm)) | 421 | if (ksm_test_exit(mm)) |
365 | goto out; | 422 | goto out; |
@@ -390,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
390 | goto out; | 447 | goto out; |
391 | 448 | ||
392 | page = follow_page(vma, addr, FOLL_GET); | 449 | page = follow_page(vma, addr, FOLL_GET); |
393 | if (!page) | 450 | if (IS_ERR_OR_NULL(page)) |
394 | goto out; | 451 | goto out; |
395 | if (PageAnon(page)) { | 452 | if (PageAnon(page)) { |
396 | flush_anon_page(vma, page, addr); | 453 | flush_anon_page(vma, page, addr); |
@@ -403,21 +460,77 @@ out: page = NULL; | |||
403 | return page; | 460 | return page; |
404 | } | 461 | } |
405 | 462 | ||
463 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | ||
464 | { | ||
465 | struct rmap_item *rmap_item; | ||
466 | struct hlist_node *hlist; | ||
467 | |||
468 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
469 | if (rmap_item->hlist.next) | ||
470 | ksm_pages_sharing--; | ||
471 | else | ||
472 | ksm_pages_shared--; | ||
473 | drop_anon_vma(rmap_item); | ||
474 | rmap_item->address &= PAGE_MASK; | ||
475 | cond_resched(); | ||
476 | } | ||
477 | |||
478 | rb_erase(&stable_node->node, &root_stable_tree); | ||
479 | free_stable_node(stable_node); | ||
480 | } | ||
481 | |||
406 | /* | 482 | /* |
407 | * get_ksm_page: checks if the page at the virtual address in rmap_item | 483 | * get_ksm_page: checks if the page indicated by the stable node |
408 | * is still PageKsm, in which case we can trust the content of the page, | 484 | * is still its ksm page, despite having held no reference to it. |
409 | * and it returns the gotten page; but NULL if the page has been zapped. | 485 | * In which case we can trust the content of the page, and it |
486 | * returns the gotten page; but if the page has now been zapped, | ||
487 | * remove the stale node from the stable tree and return NULL. | ||
488 | * | ||
489 | * You would expect the stable_node to hold a reference to the ksm page. | ||
490 | * But if it increments the page's count, swapping out has to wait for | ||
491 | * ksmd to come around again before it can free the page, which may take | ||
492 | * seconds or even minutes: much too unresponsive. So instead we use a | ||
493 | * "keyhole reference": access to the ksm page from the stable node peeps | ||
494 | * out through its keyhole to see if that page still holds the right key, | ||
495 | * pointing back to this stable node. This relies on freeing a PageAnon | ||
496 | * page to reset its page->mapping to NULL, and relies on no other use of | ||
497 | * a page to put something that might look like our key in page->mapping. | ||
498 | * | ||
499 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
500 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
501 | * interesting for assuming that no other use of the struct page could ever | ||
502 | * put our expected_mapping into page->mapping (or a field of the union which | ||
503 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
504 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
505 | * | ||
506 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
507 | * then page the next, if the page is in between page_freeze_refs() and | ||
508 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
509 | * is on its way to being freed; but it is an anomaly to bear in mind. | ||
410 | */ | 510 | */ |
411 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | 511 | static struct page *get_ksm_page(struct stable_node *stable_node) |
412 | { | 512 | { |
413 | struct page *page; | 513 | struct page *page; |
414 | 514 | void *expected_mapping; | |
415 | page = get_mergeable_page(rmap_item); | 515 | |
416 | if (page && !PageKsm(page)) { | 516 | page = pfn_to_page(stable_node->kpfn); |
517 | expected_mapping = (void *)stable_node + | ||
518 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
519 | rcu_read_lock(); | ||
520 | if (page->mapping != expected_mapping) | ||
521 | goto stale; | ||
522 | if (!get_page_unless_zero(page)) | ||
523 | goto stale; | ||
524 | if (page->mapping != expected_mapping) { | ||
417 | put_page(page); | 525 | put_page(page); |
418 | page = NULL; | 526 | goto stale; |
419 | } | 527 | } |
528 | rcu_read_unlock(); | ||
420 | return page; | 529 | return page; |
530 | stale: | ||
531 | rcu_read_unlock(); | ||
532 | remove_node_from_stable_tree(stable_node); | ||
533 | return NULL; | ||
421 | } | 534 | } |
422 | 535 | ||
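The "keyhole reference" described above is a speculative get: read the identifying key (page->mapping), take a reference only if the count is not already zero, then re-check the key to rule out a free-and-reuse in between. A lock-free user-space sketch of that three-step dance (hypothetical fakepage type; the kernel version additionally leans on RCU and the page allocator's reuse rules):

#include <stdio.h>
#include <stdatomic.h>

struct fakepage {
	_Atomic(void *) mapping;	/* the "key" we peep at */
	atomic_int count;
};

/* returns 1 with a reference held iff the page still belongs to key */
static int get_if_still_ours(struct fakepage *p, void *key)
{
	if (atomic_load(&p->mapping) != key)
		return 0;			/* stale before we tried */
	int c = atomic_load(&p->count);		/* get_page_unless_zero() */
	do {
		if (c == 0)
			return 0;		/* already being freed */
	} while (!atomic_compare_exchange_weak(&p->count, &c, c + 1));
	if (atomic_load(&p->mapping) != key) {
		atomic_fetch_sub(&p->count, 1);	/* freed and reused meanwhile */
		return 0;
	}
	return 1;
}

int main(void)
{
	int key;
	struct fakepage p = { .count = 1 };
	atomic_store(&p.mapping, (void *)&key);
	printf("%d\n", get_if_still_ours(&p, &key));	/* 1 */
	return 0;
}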
423 | /* | 536 | /* |
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) | |||
426 | */ | 539 | */ |
427 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | 540 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) |
428 | { | 541 | { |
429 | if (in_stable_tree(rmap_item)) { | 542 | if (rmap_item->address & STABLE_FLAG) { |
430 | struct rmap_item *next_item = rmap_item->next; | 543 | struct stable_node *stable_node; |
431 | 544 | struct page *page; | |
432 | if (rmap_item->address & NODE_FLAG) { | ||
433 | if (next_item) { | ||
434 | rb_replace_node(&rmap_item->node, | ||
435 | &next_item->node, | ||
436 | &root_stable_tree); | ||
437 | next_item->address |= NODE_FLAG; | ||
438 | ksm_pages_sharing--; | ||
439 | } else { | ||
440 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
441 | ksm_pages_shared--; | ||
442 | } | ||
443 | } else { | ||
444 | struct rmap_item *prev_item = rmap_item->prev; | ||
445 | 545 | ||
446 | BUG_ON(prev_item->next != rmap_item); | 546 | stable_node = rmap_item->head; |
447 | prev_item->next = next_item; | 547 | page = get_ksm_page(stable_node); |
448 | if (next_item) { | 548 | if (!page) |
449 | BUG_ON(next_item->prev != rmap_item); | 549 | goto out; |
450 | next_item->prev = rmap_item->prev; | 550 | |
451 | } | 551 | lock_page(page); |
552 | hlist_del(&rmap_item->hlist); | ||
553 | unlock_page(page); | ||
554 | put_page(page); | ||
555 | |||
556 | if (stable_node->hlist.first) | ||
452 | ksm_pages_sharing--; | 557 | ksm_pages_sharing--; |
453 | } | 558 | else |
559 | ksm_pages_shared--; | ||
454 | 560 | ||
455 | rmap_item->next = NULL; | 561 | drop_anon_vma(rmap_item); |
562 | rmap_item->address &= PAGE_MASK; | ||
456 | 563 | ||
457 | } else if (rmap_item->address & NODE_FLAG) { | 564 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
458 | unsigned char age; | 565 | unsigned char age; |
459 | /* | 566 | /* |
460 | * Usually ksmd can and must skip the rb_erase, because | 567 | * Usually ksmd can and must skip the rb_erase, because |
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
467 | BUG_ON(age > 1); | 574 | BUG_ON(age > 1); |
468 | if (!age) | 575 | if (!age) |
469 | rb_erase(&rmap_item->node, &root_unstable_tree); | 576 | rb_erase(&rmap_item->node, &root_unstable_tree); |
577 | |||
470 | ksm_pages_unshared--; | 578 | ksm_pages_unshared--; |
579 | rmap_item->address &= PAGE_MASK; | ||
471 | } | 580 | } |
472 | 581 | out: | |
473 | rmap_item->address &= PAGE_MASK; | ||
474 | |||
475 | cond_resched(); /* we're called from many long loops */ | 582 | cond_resched(); /* we're called from many long loops */ |
476 | } | 583 | } |
477 | 584 | ||
478 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | 585 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, |
479 | struct list_head *cur) | 586 | struct rmap_item **rmap_list) |
480 | { | 587 | { |
481 | struct rmap_item *rmap_item; | 588 | while (*rmap_list) { |
482 | 589 | struct rmap_item *rmap_item = *rmap_list; | |
483 | while (cur != &mm_slot->rmap_list) { | 590 | *rmap_list = rmap_item->rmap_list; |
484 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
485 | cur = cur->next; | ||
486 | remove_rmap_item_from_tree(rmap_item); | 591 | remove_rmap_item_from_tree(rmap_item); |
487 | list_del(&rmap_item->link); | ||
488 | free_rmap_item(rmap_item); | 592 | free_rmap_item(rmap_item); |
489 | } | 593 | } |
490 | } | 594 | } |
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
550 | goto error; | 654 | goto error; |
551 | } | 655 | } |
552 | 656 | ||
553 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 657 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
554 | 658 | ||
555 | spin_lock(&ksm_mmlist_lock); | 659 | spin_lock(&ksm_mmlist_lock); |
556 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 660 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
@@ -646,8 +750,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
646 | * Check that no O_DIRECT or similar I/O is in progress on the | 750 | * Check that no O_DIRECT or similar I/O is in progress on the |
647 | * page | 751 | * page |
648 | */ | 752 | */ |
649 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
650 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at(mm, addr, ptep, entry); |
651 | goto out_unlock; | 755 | goto out_unlock; |
652 | } | 756 | } |
653 | entry = pte_wrprotect(entry); | 757 | entry = pte_wrprotect(entry); |
@@ -664,15 +768,15 @@ out: | |||
664 | 768 | ||
665 | /** | 769 | /** |
666 | * replace_page - replace page in vma by new ksm page | 770 | * replace_page - replace page in vma by new ksm page |
667 | * @vma: vma that holds the pte pointing to oldpage | 771 | * @vma: vma that holds the pte pointing to page |
668 | * @oldpage: the page we are replacing by newpage | 772 | * @page: the page we are replacing by kpage |
669 | * @newpage: the ksm page we replace oldpage by | 773 | * @kpage: the ksm page we replace page by |
670 | * @orig_pte: the original value of the pte | 774 | * @orig_pte: the original value of the pte |
671 | * | 775 | * |
672 | * Returns 0 on success, -EFAULT on failure. | 776 | * Returns 0 on success, -EFAULT on failure. |
673 | */ | 777 | */ |
674 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | 778 | static int replace_page(struct vm_area_struct *vma, struct page *page, |
675 | struct page *newpage, pte_t orig_pte) | 779 | struct page *kpage, pte_t orig_pte) |
676 | { | 780 | { |
677 | struct mm_struct *mm = vma->vm_mm; | 781 | struct mm_struct *mm = vma->vm_mm; |
678 | pgd_t *pgd; | 782 | pgd_t *pgd; |
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
681 | pte_t *ptep; | 785 | pte_t *ptep; |
682 | spinlock_t *ptl; | 786 | spinlock_t *ptl; |
683 | unsigned long addr; | 787 | unsigned long addr; |
684 | pgprot_t prot; | ||
685 | int err = -EFAULT; | 788 | int err = -EFAULT; |
686 | 789 | ||
687 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | 790 | addr = page_address_in_vma(page, vma); |
688 | |||
689 | addr = page_address_in_vma(oldpage, vma); | ||
690 | if (addr == -EFAULT) | 791 | if (addr == -EFAULT) |
691 | goto out; | 792 | goto out; |
692 | 793 | ||
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
708 | goto out; | 809 | goto out; |
709 | } | 810 | } |
710 | 811 | ||
711 | get_page(newpage); | 812 | get_page(kpage); |
712 | page_add_ksm_rmap(newpage); | 813 | page_add_anon_rmap(kpage, vma, addr); |
713 | 814 | ||
714 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 815 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
715 | ptep_clear_flush(vma, addr, ptep); | 816 | ptep_clear_flush(vma, addr, ptep); |
716 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
717 | 818 | ||
718 | page_remove_rmap(oldpage); | 819 | page_remove_rmap(page); |
719 | put_page(oldpage); | 820 | put_page(page); |
720 | 821 | ||
721 | pte_unmap_unlock(ptep, ptl); | 822 | pte_unmap_unlock(ptep, ptl); |
722 | err = 0; | 823 | err = 0; |
@@ -726,32 +827,27 @@ out: | |||
726 | 827 | ||
727 | /* | 828 | /* |
728 | * try_to_merge_one_page - take two pages and merge them into one | 829 | * try_to_merge_one_page - take two pages and merge them into one |
729 | * @vma: the vma that hold the pte pointing into oldpage | 830 | * @vma: the vma that holds the pte pointing to page |
730 | * @oldpage: the page that we want to replace with newpage | 831 | * @page: the PageAnon page that we want to replace with kpage |
731 | * @newpage: the page that we want to map instead of oldpage | 832 | * @kpage: the PageKsm page that we want to map instead of page, |
732 | * | 833 | * or NULL the first time when we want to use page as kpage. |
733 | * Note: | ||
734 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
735 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
736 | * | 834 | * |
737 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 835 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
738 | */ | 836 | */ |
739 | static int try_to_merge_one_page(struct vm_area_struct *vma, | 837 | static int try_to_merge_one_page(struct vm_area_struct *vma, |
740 | struct page *oldpage, | 838 | struct page *page, struct page *kpage) |
741 | struct page *newpage) | ||
742 | { | 839 | { |
743 | pte_t orig_pte = __pte(0); | 840 | pte_t orig_pte = __pte(0); |
744 | int err = -EFAULT; | 841 | int err = -EFAULT; |
745 | 842 | ||
843 | if (page == kpage) /* ksm page forked */ | ||
844 | return 0; | ||
845 | |||
746 | if (!(vma->vm_flags & VM_MERGEABLE)) | 846 | if (!(vma->vm_flags & VM_MERGEABLE)) |
747 | goto out; | 847 | goto out; |
748 | 848 | if (!PageAnon(page)) | |
749 | if (!PageAnon(oldpage)) | ||
750 | goto out; | 849 | goto out; |
751 | 850 | ||
752 | get_page(newpage); | ||
753 | get_page(oldpage); | ||
754 | |||
755 | /* | 851 | /* |
756 | * We need the page lock to read a stable PageSwapCache in | 852 | * We need the page lock to read a stable PageSwapCache in |
757 | * write_protect_page(). We use trylock_page() instead of | 853 | * write_protect_page(). We use trylock_page() instead of |
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
759 | * prefer to continue scanning and merging different pages, | 855 | * prefer to continue scanning and merging different pages, |
760 | * then come back to this page when it is unlocked. | 856 | * then come back to this page when it is unlocked. |
761 | */ | 857 | */ |
762 | if (!trylock_page(oldpage)) | 858 | if (!trylock_page(page)) |
763 | goto out_putpage; | 859 | goto out; |
764 | /* | 860 | /* |
765 | * If this anonymous page is mapped only here, its pte may need | 861 | * If this anonymous page is mapped only here, its pte may need |
766 | * to be write-protected. If it's mapped elsewhere, all of its | 862 | * to be write-protected. If it's mapped elsewhere, all of its |
767 | * ptes are necessarily already write-protected. But in either | 863 | * ptes are necessarily already write-protected. But in either |
768 | * case, we need to lock and check page_count is not raised. | 864 | * case, we need to lock and check page_count is not raised. |
769 | */ | 865 | */ |
770 | if (write_protect_page(vma, oldpage, &orig_pte)) { | 866 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
771 | unlock_page(oldpage); | 867 | if (!kpage) { |
772 | goto out_putpage; | 868 | /* |
869 | * While we hold page lock, upgrade page from | ||
870 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
871 | * stable_tree_insert() will update stable_node. | ||
872 | */ | ||
873 | set_page_stable_node(page, NULL); | ||
874 | mark_page_accessed(page); | ||
875 | err = 0; | ||
876 | } else if (pages_identical(page, kpage)) | ||
877 | err = replace_page(vma, page, kpage, orig_pte); | ||
773 | } | 878 | } |
774 | unlock_page(oldpage); | ||
775 | 879 | ||
776 | if (pages_identical(oldpage, newpage)) | 880 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
777 | err = replace_page(vma, oldpage, newpage, orig_pte); | 881 | munlock_vma_page(page); |
882 | if (!PageMlocked(kpage)) { | ||
883 | unlock_page(page); | ||
884 | lock_page(kpage); | ||
885 | mlock_vma_page(kpage); | ||
886 | page = kpage; /* for final unlock */ | ||
887 | } | ||
888 | } | ||
778 | 889 | ||
779 | out_putpage: | 890 | unlock_page(page); |
780 | put_page(oldpage); | ||
781 | put_page(newpage); | ||
782 | out: | 891 | out: |
783 | return err; | 892 | return err; |
784 | } | 893 | } |
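Two details of the rewritten try_to_merge_one_page() carry the logic: trylock_page() lets ksmd skip a busy page and revisit it on a later pass instead of sleeping, and write_protect_page() must succeed before the contents comparison is trusted, because a still-writable pte would let the page change after pages_identical(). A compilable sketch of that control flow, using a pthread mutex and a flag as stand-ins for the page lock and the write-protected pte (struct mbuf and try_merge() are hypothetical names):

#include <pthread.h>
#include <string.h>

struct mbuf {
        pthread_mutex_t lock;   /* stands in for the page lock */
        int readonly;           /* stands in for a write-protected pte */
        char data[4096];
};

/*
 * Sketch of the try_to_merge_one_page() flow: skip busy pages rather
 * than blocking, freeze the contents before comparing, and treat a
 * NULL kpage as "this page becomes the ksm page" (the new first-time
 * case in the hunk above).
 */
static int try_merge(struct mbuf *page, struct mbuf *kpage)
{
        int err = -1;

        if (pthread_mutex_trylock(&page->lock) != 0)
                return -1;      /* busy: come back on a later scan */

        page->readonly = 1;     /* write_protect_page() */
        if (!kpage)
                err = 0;        /* first of its kind: upgrade in place */
        else if (memcmp(page->data, kpage->data, sizeof(page->data)) == 0)
                err = 0;        /* identical and frozen: safe to merge */

        pthread_mutex_unlock(&page->lock);
        return err;
}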
@@ -786,26 +895,31 @@ out: | |||
786 | /* | 895 | /* |
787 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | 896 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, |
788 | * but no new kernel page is allocated: kpage must already be a ksm page. | 897 | * but no new kernel page is allocated: kpage must already be a ksm page. |
898 | * | ||
899 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
789 | */ | 900 | */ |
790 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | 901 | static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, |
791 | unsigned long addr1, | 902 | struct page *page, struct page *kpage) |
792 | struct page *page1, | ||
793 | struct page *kpage) | ||
794 | { | 903 | { |
904 | struct mm_struct *mm = rmap_item->mm; | ||
795 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
796 | int err = -EFAULT; | 906 | int err = -EFAULT; |
797 | 907 | ||
798 | down_read(&mm1->mmap_sem); | 908 | down_read(&mm->mmap_sem); |
799 | if (ksm_test_exit(mm1)) | 909 | if (ksm_test_exit(mm)) |
910 | goto out; | ||
911 | vma = find_vma(mm, rmap_item->address); | ||
912 | if (!vma || vma->vm_start > rmap_item->address) | ||
800 | goto out; | 913 | goto out; |
801 | 914 | ||
802 | vma = find_vma(mm1, addr1); | 915 | err = try_to_merge_one_page(vma, page, kpage); |
803 | if (!vma || vma->vm_start > addr1) | 916 | if (err) |
804 | goto out; | 917 | goto out; |
805 | 918 | ||
806 | err = try_to_merge_one_page(vma, page1, kpage); | 919 | /* Must get reference to anon_vma while still holding mmap_sem */ |
920 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
807 | out: | 921 | out: |
808 | up_read(&mm1->mmap_sem); | 922 | up_read(&mm->mmap_sem); |
809 | return err; | 923 | return err; |
810 | } | 924 | } |
811 | 925 | ||
@@ -813,109 +927,73 @@ out: | |||
813 | * try_to_merge_two_pages - take two identical pages and prepare them | 927 | * try_to_merge_two_pages - take two identical pages and prepare them |
814 | * to be merged into one page. | 928 | * to be merged into one page. |
815 | * | 929 | * |
816 | * This function returns 0 if we successfully mapped two identical pages | 930 | * This function returns the kpage if we successfully merged two identical |
817 | * into one page, -EFAULT otherwise. | 931 | * pages into one ksm page, NULL otherwise. |
818 | * | 932 | * |
819 | * Note that this function allocates a new kernel page: if one of the pages | 933 | * Note that this function upgrades page to ksm page: if one of the pages |
820 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 934 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
821 | */ | 935 | */ |
822 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | 936 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
823 | struct page *page1, struct mm_struct *mm2, | 937 | struct page *page, |
824 | unsigned long addr2, struct page *page2) | 938 | struct rmap_item *tree_rmap_item, |
939 | struct page *tree_page) | ||
825 | { | 940 | { |
826 | struct vm_area_struct *vma; | 941 | int err; |
827 | struct page *kpage; | ||
828 | int err = -EFAULT; | ||
829 | |||
830 | /* | ||
831 | * The number of nodes in the stable tree | ||
832 | * is the number of kernel pages that we hold. | ||
833 | */ | ||
834 | if (ksm_max_kernel_pages && | ||
835 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
836 | return err; | ||
837 | |||
838 | kpage = alloc_page(GFP_HIGHUSER); | ||
839 | if (!kpage) | ||
840 | return err; | ||
841 | |||
842 | down_read(&mm1->mmap_sem); | ||
843 | if (ksm_test_exit(mm1)) { | ||
844 | up_read(&mm1->mmap_sem); | ||
845 | goto out; | ||
846 | } | ||
847 | vma = find_vma(mm1, addr1); | ||
848 | if (!vma || vma->vm_start > addr1) { | ||
849 | up_read(&mm1->mmap_sem); | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | copy_user_highpage(kpage, page1, addr1, vma); | ||
854 | err = try_to_merge_one_page(vma, page1, kpage); | ||
855 | up_read(&mm1->mmap_sem); | ||
856 | 942 | ||
943 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); | ||
857 | if (!err) { | 944 | if (!err) { |
858 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | 945 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
946 | tree_page, page); | ||
859 | /* | 947 | /* |
860 | * If that fails, we have a ksm page with only one pte | 948 | * If that fails, we have a ksm page with only one pte |
861 | * pointing to it: so break it. | 949 | * pointing to it: so break it. |
862 | */ | 950 | */ |
863 | if (err) | 951 | if (err) |
864 | break_cow(mm1, addr1); | 952 | break_cow(rmap_item); |
865 | } | 953 | } |
866 | out: | 954 | return err ? NULL : page; |
867 | put_page(kpage); | ||
868 | return err; | ||
869 | } | 955 | } |
870 | 956 | ||
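Note what disappeared from try_to_merge_two_pages(): the alloc_page(), the copy, and with them the ksm_max_kernel_pages accounting (its sysfs knob is removed further down). The function is now two dependent merge steps with a rollback, so a failure in the second step never leaves a ksm page mapped by a single pte. The shape in isolation, with stubs standing in for try_to_merge_with_ksm_page() and break_cow():

#include <stddef.h>

/* Stubs for the two kernel calls; 0 means success, as in the source. */
static int merge_into(void *p, void *kpage) { (void)p; (void)kpage; return 0; }
static void undo_merge(void *p) { (void)p; }

/*
 * Sketch of the rewritten try_to_merge_two_pages(): upgrade 'page'
 * into the ksm page (the kpage == NULL call), then map 'tree_page'
 * onto it; on failure of the second step, break COW on 'page' so no
 * ksm page survives with only one pte.
 */
static void *merge_pair(void *page, void *tree_page)
{
        if (merge_into(page, NULL))
                return NULL;
        if (merge_into(tree_page, page)) {
                undo_merge(page);       /* break_cow(rmap_item) */
                return NULL;
        }
        return page;                    /* page is now the shared kpage */
}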
871 | /* | 957 | /* |
872 | * stable_tree_search - search page inside the stable tree | 958 | * stable_tree_search - search for page inside the stable tree |
873 | * @page: the page that we are searching identical pages to. | ||
874 | * @page2: pointer into identical page that we are holding inside the stable | ||
875 | * tree that we have found. | ||
876 | * @rmap_item: the reverse mapping item | ||
877 | * | 959 | * |
878 | * This function checks if there is a page inside the stable tree | 960 | * This function checks if there is a page inside the stable tree |
879 | * with identical content to the page that we are scanning right now. | 961 | * with identical content to the page that we are scanning right now. |
880 | * | 962 | * |
881 | * This function return rmap_item pointer to the identical item if found, | 963 | * This function returns the stable node's page of identical content if found, |
882 | * NULL otherwise. | 964 | * NULL otherwise. |
883 | */ | 965 | */ |
884 | static struct rmap_item *stable_tree_search(struct page *page, | 966 | static struct page *stable_tree_search(struct page *page) |
885 | struct page **page2, | ||
886 | struct rmap_item *rmap_item) | ||
887 | { | 967 | { |
888 | struct rb_node *node = root_stable_tree.rb_node; | 968 | struct rb_node *node = root_stable_tree.rb_node; |
969 | struct stable_node *stable_node; | ||
970 | |||
971 | stable_node = page_stable_node(page); | ||
972 | if (stable_node) { /* ksm page forked */ | ||
973 | get_page(page); | ||
974 | return page; | ||
975 | } | ||
889 | 976 | ||
890 | while (node) { | 977 | while (node) { |
891 | struct rmap_item *tree_rmap_item, *next_rmap_item; | 978 | struct page *tree_page; |
892 | int ret; | 979 | int ret; |
893 | 980 | ||
894 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | 981 | cond_resched(); |
895 | while (tree_rmap_item) { | 982 | stable_node = rb_entry(node, struct stable_node, node); |
896 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 983 | tree_page = get_ksm_page(stable_node); |
897 | cond_resched(); | 984 | if (!tree_page) |
898 | page2[0] = get_ksm_page(tree_rmap_item); | ||
899 | if (page2[0]) | ||
900 | break; | ||
901 | next_rmap_item = tree_rmap_item->next; | ||
902 | remove_rmap_item_from_tree(tree_rmap_item); | ||
903 | tree_rmap_item = next_rmap_item; | ||
904 | } | ||
905 | if (!tree_rmap_item) | ||
906 | return NULL; | 985 | return NULL; |
907 | 986 | ||
908 | ret = memcmp_pages(page, page2[0]); | 987 | ret = memcmp_pages(page, tree_page); |
909 | 988 | ||
910 | if (ret < 0) { | 989 | if (ret < 0) { |
911 | put_page(page2[0]); | 990 | put_page(tree_page); |
912 | node = node->rb_left; | 991 | node = node->rb_left; |
913 | } else if (ret > 0) { | 992 | } else if (ret > 0) { |
914 | put_page(page2[0]); | 993 | put_page(tree_page); |
915 | node = node->rb_right; | 994 | node = node->rb_right; |
916 | } else { | 995 | } else |
917 | return tree_rmap_item; | 996 | return tree_page; |
918 | } | ||
919 | } | 997 | } |
920 | 998 | ||
921 | return NULL; | 999 | return NULL; |
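stable_tree_search() leans on memcmp() being a total order over page contents: any consistent comparator makes an ordered tree searchable, and "equal" is exactly the property KSM wants to find. A runnable userspace sketch with a plain, unbalanced binary search tree in place of the kernel's rbtree:

#include <string.h>

#define PAGE_SZ 4096

/* One stable node: a tree slot plus the (shared) page contents. */
struct snode {
        struct snode *left, *right;
        unsigned char data[PAGE_SZ];
};

/*
 * Sketch of stable_tree_search(): descend by memcmp(), returning the
 * node whose contents are byte-for-byte identical to the candidate
 * page, in O(height) whole-page comparisons.
 */
static struct snode *content_search(struct snode *root,
                                    const unsigned char *page)
{
        while (root) {
                int ret = memcmp(page, root->data, PAGE_SZ);

                if (ret < 0)
                        root = root->left;
                else if (ret > 0)
                        root = root->right;
                else
                        return root;    /* identical content found */
        }
        return NULL;
}

The kernel version additionally takes the forked-ksm-page fast path via page_stable_node() at the top of the hunk, and drops stale nodes whose page has gone away through get_ksm_page().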
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, | |||
925 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1003 | * stable_tree_insert - insert rmap_item pointing to new ksm page |
926 | * into the stable tree. | 1004 | * into the stable tree. |
927 | * | 1005 | * |
928 | * @page: the page that we are searching identical page to inside the stable | 1006 | * This function returns the stable tree node just allocated on success, |
929 | * tree. | 1007 | * NULL otherwise. |
930 | * @rmap_item: pointer to the reverse mapping item. | ||
931 | * | ||
932 | * This function returns rmap_item if success, NULL otherwise. | ||
933 | */ | 1008 | */ |
934 | static struct rmap_item *stable_tree_insert(struct page *page, | 1009 | static struct stable_node *stable_tree_insert(struct page *kpage) |
935 | struct rmap_item *rmap_item) | ||
936 | { | 1010 | { |
937 | struct rb_node **new = &root_stable_tree.rb_node; | 1011 | struct rb_node **new = &root_stable_tree.rb_node; |
938 | struct rb_node *parent = NULL; | 1012 | struct rb_node *parent = NULL; |
1013 | struct stable_node *stable_node; | ||
939 | 1014 | ||
940 | while (*new) { | 1015 | while (*new) { |
941 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
942 | struct page *tree_page; | 1016 | struct page *tree_page; |
943 | int ret; | 1017 | int ret; |
944 | 1018 | ||
945 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1019 | cond_resched(); |
946 | while (tree_rmap_item) { | 1020 | stable_node = rb_entry(*new, struct stable_node, node); |
947 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 1021 | tree_page = get_ksm_page(stable_node); |
948 | cond_resched(); | 1022 | if (!tree_page) |
949 | tree_page = get_ksm_page(tree_rmap_item); | ||
950 | if (tree_page) | ||
951 | break; | ||
952 | next_rmap_item = tree_rmap_item->next; | ||
953 | remove_rmap_item_from_tree(tree_rmap_item); | ||
954 | tree_rmap_item = next_rmap_item; | ||
955 | } | ||
956 | if (!tree_rmap_item) | ||
957 | return NULL; | 1023 | return NULL; |
958 | 1024 | ||
959 | ret = memcmp_pages(page, tree_page); | 1025 | ret = memcmp_pages(kpage, tree_page); |
960 | put_page(tree_page); | 1026 | put_page(tree_page); |
961 | 1027 | ||
962 | parent = *new; | 1028 | parent = *new; |
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
974 | } | 1040 | } |
975 | } | 1041 | } |
976 | 1042 | ||
977 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | 1043 | stable_node = alloc_stable_node(); |
978 | rmap_item->next = NULL; | 1044 | if (!stable_node) |
979 | rb_link_node(&rmap_item->node, parent, new); | 1045 | return NULL; |
980 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
981 | 1046 | ||
982 | ksm_pages_shared++; | 1047 | rb_link_node(&stable_node->node, parent, new); |
983 | return rmap_item; | 1048 | rb_insert_color(&stable_node->node, &root_stable_tree); |
1049 | |||
1050 | INIT_HLIST_HEAD(&stable_node->hlist); | ||
1051 | |||
1052 | stable_node->kpfn = page_to_pfn(kpage); | ||
1053 | set_page_stable_node(kpage, stable_node); | ||
1054 | |||
1055 | return stable_node; | ||
984 | } | 1056 | } |
985 | 1057 | ||
986 | /* | 1058 | /* |
987 | * unstable_tree_search_insert - search and insert items into the unstable tree. | 1059 | * unstable_tree_search_insert - search for identical page, |
988 | * | 1060 | * else insert rmap_item into the unstable tree. |
989 | * @page: the page that we are going to search for identical page or to insert | ||
990 | * into the unstable tree | ||
991 | * @page2: pointer into identical page that was found inside the unstable tree | ||
992 | * @rmap_item: the reverse mapping item of page | ||
993 | * | 1061 | * |
994 | * This function searches for a page in the unstable tree identical to the | 1062 | * This function searches for a page in the unstable tree identical to the |
995 | * page currently being scanned; and if no identical page is found in the | 1063 | * page currently being scanned; and if no identical page is found in the |
@@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
1001 | * This function does both searching and inserting, because they share | 1069 | * This function does both searching and inserting, because they share |
1002 | * the same walking algorithm in an rbtree. | 1070 | * the same walking algorithm in an rbtree. |
1003 | */ | 1071 | */ |
1004 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | 1072 | static |
1005 | struct page **page2, | 1073 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1006 | struct rmap_item *rmap_item) | 1074 | struct page *page, |
1075 | struct page **tree_pagep) | ||
1076 | |||
1007 | { | 1077 | { |
1008 | struct rb_node **new = &root_unstable_tree.rb_node; | 1078 | struct rb_node **new = &root_unstable_tree.rb_node; |
1009 | struct rb_node *parent = NULL; | 1079 | struct rb_node *parent = NULL; |
1010 | 1080 | ||
1011 | while (*new) { | 1081 | while (*new) { |
1012 | struct rmap_item *tree_rmap_item; | 1082 | struct rmap_item *tree_rmap_item; |
1083 | struct page *tree_page; | ||
1013 | int ret; | 1084 | int ret; |
1014 | 1085 | ||
1015 | cond_resched(); | 1086 | cond_resched(); |
1016 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1017 | page2[0] = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
1018 | if (!page2[0]) | 1089 | if (IS_ERR_OR_NULL(tree_page)) |
1019 | return NULL; | 1090 | return NULL; |
1020 | 1091 | ||
1021 | /* | 1092 | /* |
1022 | * Don't substitute an unswappable ksm page | 1093 | * Don't substitute a ksm page for a forked page. |
1023 | * just for one good swappable forked page. | ||
1024 | */ | 1094 | */ |
1025 | if (page == page2[0]) { | 1095 | if (page == tree_page) { |
1026 | put_page(page2[0]); | 1096 | put_page(tree_page); |
1027 | return NULL; | 1097 | return NULL; |
1028 | } | 1098 | } |
1029 | 1099 | ||
1030 | ret = memcmp_pages(page, page2[0]); | 1100 | ret = memcmp_pages(page, tree_page); |
1031 | 1101 | ||
1032 | parent = *new; | 1102 | parent = *new; |
1033 | if (ret < 0) { | 1103 | if (ret < 0) { |
1034 | put_page(page2[0]); | 1104 | put_page(tree_page); |
1035 | new = &parent->rb_left; | 1105 | new = &parent->rb_left; |
1036 | } else if (ret > 0) { | 1106 | } else if (ret > 0) { |
1037 | put_page(page2[0]); | 1107 | put_page(tree_page); |
1038 | new = &parent->rb_right; | 1108 | new = &parent->rb_right; |
1039 | } else { | 1109 | } else { |
1110 | *tree_pagep = tree_page; | ||
1040 | return tree_rmap_item; | 1111 | return tree_rmap_item; |
1041 | } | 1112 | } |
1042 | } | 1113 | } |
1043 | 1114 | ||
1044 | rmap_item->address |= NODE_FLAG; | 1115 | rmap_item->address |= UNSTABLE_FLAG; |
1045 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1116 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1046 | rb_link_node(&rmap_item->node, parent, new); | 1117 | rb_link_node(&rmap_item->node, parent, new); |
1047 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1118 | rb_insert_color(&rmap_item->node, &root_unstable_tree); |
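unstable_tree_search_insert() uses the classic rbtree idiom of descending through a link pointer while remembering the parent: one walk either finds an identical page or ends precisely at the empty link where the new node belongs, so search and insertion share a single traversal. The idiom in isolation, minus rebalancing (struct unode and search_insert() are illustrative names):

#include <string.h>

#define PAGE_SZ 4096

struct unode {
        struct unode *left, *right;
        unsigned char data[PAGE_SZ];
};

/*
 * Sketch of the search-or-insert walk: on a match return the existing
 * node; otherwise the loop exits with 'new' pointing at the exact
 * child slot to fill, the moral equivalent of rb_link_node().
 */
static struct unode *search_insert(struct unode **root, struct unode *item)
{
        struct unode **new = root;

        while (*new) {
                int ret = memcmp(item->data, (*new)->data, PAGE_SZ);

                if (ret < 0)
                        new = &(*new)->left;
                else if (ret > 0)
                        new = &(*new)->right;
                else
                        return *new;    /* identical page already queued */
        }
        *new = item;                    /* link at the slot we stopped on */
        return NULL;                    /* inserted; no match this scan */
}

In the kernel the inserted rmap_item is also tagged with UNSTABLE_FLAG and the current scan's seqnr, which is how entries from an earlier pass are recognized: the unstable tree is simply rebuilt every scan rather than kept accurate.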
@@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, | |||
1056 | * the same ksm page. | 1127 | * the same ksm page. |
1057 | */ | 1128 | */ |
1058 | static void stable_tree_append(struct rmap_item *rmap_item, | 1129 | static void stable_tree_append(struct rmap_item *rmap_item, |
1059 | struct rmap_item *tree_rmap_item) | 1130 | struct stable_node *stable_node) |
1060 | { | 1131 | { |
1061 | rmap_item->next = tree_rmap_item->next; | 1132 | rmap_item->head = stable_node; |
1062 | rmap_item->prev = tree_rmap_item; | ||
1063 | |||
1064 | if (tree_rmap_item->next) | ||
1065 | tree_rmap_item->next->prev = rmap_item; | ||
1066 | |||
1067 | tree_rmap_item->next = rmap_item; | ||
1068 | rmap_item->address |= STABLE_FLAG; | 1133 | rmap_item->address |= STABLE_FLAG; |
1134 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | ||
1069 | 1135 | ||
1070 | ksm_pages_sharing++; | 1136 | if (rmap_item->hlist.next) |
1137 | ksm_pages_sharing++; | ||
1138 | else | ||
1139 | ksm_pages_shared++; | ||
1071 | } | 1140 | } |
1072 | 1141 | ||
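The accounting in stable_tree_append() is worth a second look: after hlist_add_head() the new rmap_item sits at the head of the stable node's list, so its hlist.next is non-NULL exactly when the ksm page already had a mapper. That one test splits the statistics between pages_shared (ksm pages) and pages_sharing (extra ptes saved). The same trick on a hand-rolled singly linked list:

struct item {
        struct item *next;
};

static long pages_shared, pages_sharing;

/*
 * Sketch of stable_tree_append()'s bookkeeping: push at the head,
 * then let the new head's next pointer distinguish "first mapping of
 * this ksm page" from "one more sharer".
 */
static void append(struct item **head, struct item *it)
{
        it->next = *head;
        *head = it;

        if (it->next)
                pages_sharing++;        /* the node already had a mapper */
        else
                pages_shared++;         /* first rmap_item on this node */
}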
1073 | /* | 1142 | /* |
@@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, | |||
1081 | */ | 1150 | */ |
1082 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | 1151 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) |
1083 | { | 1152 | { |
1084 | struct page *page2[1]; | ||
1085 | struct rmap_item *tree_rmap_item; | 1153 | struct rmap_item *tree_rmap_item; |
1154 | struct page *tree_page = NULL; | ||
1155 | struct stable_node *stable_node; | ||
1156 | struct page *kpage; | ||
1086 | unsigned int checksum; | 1157 | unsigned int checksum; |
1087 | int err; | 1158 | int err; |
1088 | 1159 | ||
1089 | if (in_stable_tree(rmap_item)) | 1160 | remove_rmap_item_from_tree(rmap_item); |
1090 | remove_rmap_item_from_tree(rmap_item); | ||
1091 | 1161 | ||
1092 | /* We first start with searching the page inside the stable tree */ | 1162 | /* We first start with searching the page inside the stable tree */ |
1093 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | 1163 | kpage = stable_tree_search(page); |
1094 | if (tree_rmap_item) { | 1164 | if (kpage) { |
1095 | if (page == page2[0]) /* forked */ | 1165 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1096 | err = 0; | ||
1097 | else | ||
1098 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
1099 | rmap_item->address, | ||
1100 | page, page2[0]); | ||
1101 | put_page(page2[0]); | ||
1102 | |||
1103 | if (!err) { | 1166 | if (!err) { |
1104 | /* | 1167 | /* |
1105 | * The page was successfully merged: | 1168 | * The page was successfully merged: |
1106 | * add its rmap_item to the stable tree. | 1169 | * add its rmap_item to the stable tree. |
1107 | */ | 1170 | */ |
1108 | stable_tree_append(rmap_item, tree_rmap_item); | 1171 | lock_page(kpage); |
1172 | stable_tree_append(rmap_item, page_stable_node(kpage)); | ||
1173 | unlock_page(kpage); | ||
1109 | } | 1174 | } |
1175 | put_page(kpage); | ||
1110 | return; | 1176 | return; |
1111 | } | 1177 | } |
1112 | 1178 | ||
1113 | /* | 1179 | /* |
1114 | * A ksm page might have got here by fork, but its other | 1180 | * If the hash value of the page has changed from the last time |
1115 | * references have already been removed from the stable tree. | 1181 | * we calculated it, this page is changing frequently: therefore we |
1116 | * Or it might be left over from a break_ksm which failed | 1182 | * don't want to insert it in the unstable tree, and we don't want |
1117 | * when the mem_cgroup had reached its limit: try again now. | 1183 | * to waste our time searching for something identical to it there. |
1118 | */ | ||
1119 | if (PageKsm(page)) | ||
1120 | break_cow(rmap_item->mm, rmap_item->address); | ||
1121 | |||
1122 | /* | ||
1123 | * In case the hash value of the page was changed from the last time we | ||
1124 | * have calculated it, this page to be changed frequely, therefore we | ||
1125 | * don't want to insert it to the unstable tree, and we don't want to | ||
1126 | * waste our time to search if there is something identical to it there. | ||
1127 | */ | 1184 | */ |
1128 | checksum = calc_checksum(page); | 1185 | checksum = calc_checksum(page); |
1129 | if (rmap_item->oldchecksum != checksum) { | 1186 | if (rmap_item->oldchecksum != checksum) { |
@@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1131 | return; | 1188 | return; |
1132 | } | 1189 | } |
1133 | 1190 | ||
1134 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | 1191 | tree_rmap_item = |
1192 | unstable_tree_search_insert(rmap_item, page, &tree_page); | ||
1135 | if (tree_rmap_item) { | 1193 | if (tree_rmap_item) { |
1136 | err = try_to_merge_two_pages(rmap_item->mm, | 1194 | kpage = try_to_merge_two_pages(rmap_item, page, |
1137 | rmap_item->address, page, | 1195 | tree_rmap_item, tree_page); |
1138 | tree_rmap_item->mm, | 1196 | put_page(tree_page); |
1139 | tree_rmap_item->address, page2[0]); | ||
1140 | /* | 1197 | /* |
1141 | * As soon as we merge this page, we want to remove the | 1198 | * As soon as we merge this page, we want to remove the |
1142 | * rmap_item of the page we have merged with from the unstable | 1199 | * rmap_item of the page we have merged with from the unstable |
1143 | * tree, and insert it instead as new node in the stable tree. | 1200 | * tree, and insert it instead as new node in the stable tree. |
1144 | */ | 1201 | */ |
1145 | if (!err) { | 1202 | if (kpage) { |
1146 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | 1203 | remove_rmap_item_from_tree(tree_rmap_item); |
1147 | tree_rmap_item->address &= ~NODE_FLAG; | 1204 | |
1148 | ksm_pages_unshared--; | 1205 | lock_page(kpage); |
1206 | stable_node = stable_tree_insert(kpage); | ||
1207 | if (stable_node) { | ||
1208 | stable_tree_append(tree_rmap_item, stable_node); | ||
1209 | stable_tree_append(rmap_item, stable_node); | ||
1210 | } | ||
1211 | unlock_page(kpage); | ||
1149 | 1212 | ||
1150 | /* | 1213 | /* |
1151 | * If we fail to insert the page into the stable tree, | 1214 | * If we fail to insert the page into the stable tree, |
@@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1153 | * to a ksm page left outside the stable tree, | 1216 | * to a ksm page left outside the stable tree, |
1154 | * in which case we need to break_cow on both. | 1217 | * in which case we need to break_cow on both. |
1155 | */ | 1218 | */ |
1156 | if (stable_tree_insert(page2[0], tree_rmap_item)) | 1219 | if (!stable_node) { |
1157 | stable_tree_append(rmap_item, tree_rmap_item); | 1220 | break_cow(tree_rmap_item); |
1158 | else { | 1221 | break_cow(rmap_item); |
1159 | break_cow(tree_rmap_item->mm, | ||
1160 | tree_rmap_item->address); | ||
1161 | break_cow(rmap_item->mm, rmap_item->address); | ||
1162 | } | 1222 | } |
1163 | } | 1223 | } |
1164 | |||
1165 | put_page(page2[0]); | ||
1166 | } | 1224 | } |
1167 | } | 1225 | } |
1168 | 1226 | ||
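The checksum gate in cmp_and_merge_page() is the cheap filter in front of both trees: a page only earns a memcmp-ordered tree walk once its checksum has held still for a full scan, since a page under active writes would take a tree position that its next modification invalidates. A compilable sketch of the gate, with a toy byte hash standing in for the kernel's calc_checksum(), which hashes the whole page with jhash2:

#include <stddef.h>

/* Toy stand-in for calc_checksum(); only stability matters here. */
static unsigned int checksum(const unsigned char *p, size_t n)
{
        unsigned int h = 5381;

        while (n--)
                h = h * 33 + *p++;
        return h;
}

/*
 * Sketch of the gate: a changed checksum means "too volatile, record
 * the new value and skip"; an unchanged one admits the page to the
 * unstable tree.
 */
static int worth_merging(const unsigned char *page, size_t n,
                         unsigned int *oldchecksum)
{
        unsigned int sum = checksum(page, n);

        if (*oldchecksum != sum) {
                *oldchecksum = sum;
                return 0;
        }
        return 1;
}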
1169 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | 1227 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, |
1170 | struct list_head *cur, | 1228 | struct rmap_item **rmap_list, |
1171 | unsigned long addr) | 1229 | unsigned long addr) |
1172 | { | 1230 | { |
1173 | struct rmap_item *rmap_item; | 1231 | struct rmap_item *rmap_item; |
1174 | 1232 | ||
1175 | while (cur != &mm_slot->rmap_list) { | 1233 | while (*rmap_list) { |
1176 | rmap_item = list_entry(cur, struct rmap_item, link); | 1234 | rmap_item = *rmap_list; |
1177 | if ((rmap_item->address & PAGE_MASK) == addr) { | 1235 | if ((rmap_item->address & PAGE_MASK) == addr) |
1178 | if (!in_stable_tree(rmap_item)) | ||
1179 | remove_rmap_item_from_tree(rmap_item); | ||
1180 | return rmap_item; | 1236 | return rmap_item; |
1181 | } | ||
1182 | if (rmap_item->address > addr) | 1237 | if (rmap_item->address > addr) |
1183 | break; | 1238 | break; |
1184 | cur = cur->next; | 1239 | *rmap_list = rmap_item->rmap_list; |
1185 | remove_rmap_item_from_tree(rmap_item); | 1240 | remove_rmap_item_from_tree(rmap_item); |
1186 | list_del(&rmap_item->link); | ||
1187 | free_rmap_item(rmap_item); | 1241 | free_rmap_item(rmap_item); |
1188 | } | 1242 | } |
1189 | 1243 | ||
@@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | |||
1192 | /* It has already been zeroed */ | 1246 | /* It has already been zeroed */ |
1193 | rmap_item->mm = mm_slot->mm; | 1247 | rmap_item->mm = mm_slot->mm; |
1194 | rmap_item->address = addr; | 1248 | rmap_item->address = addr; |
1195 | list_add_tail(&rmap_item->link, cur); | 1249 | rmap_item->rmap_list = *rmap_list; |
1250 | *rmap_list = rmap_item; | ||
1196 | } | 1251 | } |
1197 | return rmap_item; | 1252 | return rmap_item; |
1198 | } | 1253 | } |
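get_next_rmap_item() shows why rmap_list was flattened from a list_head chain into a singly linked list threaded through the items and sorted by address: a struct rmap_item ** cursor can return a match, stop at the insertion point, or unlink a stale entry, all without back-pointers or a sentinel. The same pointer-to-link pattern, runnable in userspace (struct ritem and next_item() are illustrative):

#include <stdlib.h>

struct ritem {
        struct ritem *next;
        unsigned long addr;
};

/*
 * Sketch of the rmap_list walk: entries below 'addr' are stale (their
 * addresses were not matched this scan) and get unlinked in place; an
 * exact match is reused; otherwise a fresh item is spliced in before
 * the cursor, keeping the list sorted.
 */
static struct ritem *next_item(struct ritem **list, unsigned long addr)
{
        struct ritem *it;

        while ((it = *list) != NULL) {
                if (it->addr == addr)
                        return it;              /* reuse existing item */
                if (it->addr > addr)
                        break;                  /* insertion point found */
                *list = it->next;               /* stale: unlink and free */
                free(it);
        }

        it = calloc(1, sizeof(*it));
        if (it) {
                it->addr = addr;
                it->next = *list;               /* splice in, stays sorted */
                *list = it;
        }
        return it;
}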
@@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1217 | spin_unlock(&ksm_mmlist_lock); | 1272 | spin_unlock(&ksm_mmlist_lock); |
1218 | next_mm: | 1273 | next_mm: |
1219 | ksm_scan.address = 0; | 1274 | ksm_scan.address = 0; |
1220 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1275 | ksm_scan.rmap_list = &slot->rmap_list; |
1221 | struct rmap_item, link); | ||
1222 | } | 1276 | } |
1223 | 1277 | ||
1224 | mm = slot->mm; | 1278 | mm = slot->mm; |
@@ -1240,21 +1294,21 @@ next_mm: | |||
1240 | if (ksm_test_exit(mm)) | 1294 | if (ksm_test_exit(mm)) |
1241 | break; | 1295 | break; |
1242 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1296 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
1243 | if (*page && PageAnon(*page)) { | 1297 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { |
1244 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
1245 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
1246 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
1247 | ksm_scan.rmap_item->link.next, | 1301 | ksm_scan.rmap_list, ksm_scan.address); |
1248 | ksm_scan.address); | ||
1249 | if (rmap_item) { | 1302 | if (rmap_item) { |
1250 | ksm_scan.rmap_item = rmap_item; | 1303 | ksm_scan.rmap_list = |
1304 | &rmap_item->rmap_list; | ||
1251 | ksm_scan.address += PAGE_SIZE; | 1305 | ksm_scan.address += PAGE_SIZE; |
1252 | } else | 1306 | } else |
1253 | put_page(*page); | 1307 | put_page(*page); |
1254 | up_read(&mm->mmap_sem); | 1308 | up_read(&mm->mmap_sem); |
1255 | return rmap_item; | 1309 | return rmap_item; |
1256 | } | 1310 | } |
1257 | if (*page) | 1311 | if (!IS_ERR_OR_NULL(*page)) |
1258 | put_page(*page); | 1312 | put_page(*page); |
1259 | ksm_scan.address += PAGE_SIZE; | 1313 | ksm_scan.address += PAGE_SIZE; |
1260 | cond_resched(); | 1314 | cond_resched(); |
@@ -1263,14 +1317,13 @@ next_mm: | |||
1263 | 1317 | ||
1264 | if (ksm_test_exit(mm)) { | 1318 | if (ksm_test_exit(mm)) { |
1265 | ksm_scan.address = 0; | 1319 | ksm_scan.address = 0; |
1266 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1320 | ksm_scan.rmap_list = &slot->rmap_list; |
1267 | struct rmap_item, link); | ||
1268 | } | 1321 | } |
1269 | /* | 1322 | /* |
1270 | * Nuke all the rmap_items that are above this current rmap: | 1323 | * Nuke all the rmap_items that are above this current rmap: |
1271 | * because there were no VM_MERGEABLE vmas with such addresses. | 1324 | * because there were no VM_MERGEABLE vmas with such addresses. |
1272 | */ | 1325 | */ |
1273 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | 1326 | remove_trailing_rmap_items(slot, ksm_scan.rmap_list); |
1274 | 1327 | ||
1275 | spin_lock(&ksm_mmlist_lock); | 1328 | spin_lock(&ksm_mmlist_lock); |
1276 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | 1329 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, |
@@ -1314,7 +1367,7 @@ next_mm: | |||
1314 | static void ksm_do_scan(unsigned int scan_npages) | 1367 | static void ksm_do_scan(unsigned int scan_npages) |
1315 | { | 1368 | { |
1316 | struct rmap_item *rmap_item; | 1369 | struct rmap_item *rmap_item; |
1317 | struct page *page; | 1370 | struct page *uninitialized_var(page); |
1318 | 1371 | ||
1319 | while (scan_npages--) { | 1372 | while (scan_npages--) { |
1320 | cond_resched(); | 1373 | cond_resched(); |
@@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1323 | return; | 1376 | return; |
1324 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1377 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) |
1325 | cmp_and_merge_page(page, rmap_item); | 1378 | cmp_and_merge_page(page, rmap_item); |
1326 | else if (page_mapcount(page) == 1) { | ||
1327 | /* | ||
1328 | * Replace now-unshared ksm page by ordinary page. | ||
1329 | */ | ||
1330 | break_cow(rmap_item->mm, rmap_item->address); | ||
1331 | remove_rmap_item_from_tree(rmap_item); | ||
1332 | rmap_item->oldchecksum = calc_checksum(page); | ||
1333 | } | ||
1334 | put_page(page); | 1379 | put_page(page); |
1335 | } | 1380 | } |
1336 | } | 1381 | } |
@@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1375 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1420 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1376 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1377 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | |
1378 | VM_MIXEDMAP | VM_SAO)) | 1423 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) |
1379 | return 0; /* just ignore the advice */ | 1424 | return 0; /* just ignore the advice */ |
1380 | 1425 | ||
1381 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1426 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
@@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1452 | spin_lock(&ksm_mmlist_lock); | 1497 | spin_lock(&ksm_mmlist_lock); |
1453 | mm_slot = get_mm_slot(mm); | 1498 | mm_slot = get_mm_slot(mm); |
1454 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1499 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1455 | if (list_empty(&mm_slot->rmap_list)) { | 1500 | if (!mm_slot->rmap_list) { |
1456 | hlist_del(&mm_slot->link); | 1501 | hlist_del(&mm_slot->link); |
1457 | list_del(&mm_slot->mm_list); | 1502 | list_del(&mm_slot->mm_list); |
1458 | easy_to_free = 1; | 1503 | easy_to_free = 1; |
@@ -1473,6 +1518,255 @@ void __ksm_exit(struct mm_struct *mm) | |||
1473 | } | 1518 | } |
1474 | } | 1519 | } |
1475 | 1520 | ||
1521 | struct page *ksm_does_need_to_copy(struct page *page, | ||
1522 | struct vm_area_struct *vma, unsigned long address) | ||
1523 | { | ||
1524 | struct page *new_page; | ||
1525 | |||
1526 | unlock_page(page); /* any racers will COW it, not modify it */ | ||
1527 | |||
1528 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1529 | if (new_page) { | ||
1530 | copy_user_highpage(new_page, page, address, vma); | ||
1531 | |||
1532 | SetPageDirty(new_page); | ||
1533 | __SetPageUptodate(new_page); | ||
1534 | SetPageSwapBacked(new_page); | ||
1535 | __set_page_locked(new_page); | ||
1536 | |||
1537 | if (page_evictable(new_page, vma)) | ||
1538 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1539 | else | ||
1540 | add_page_to_unevictable_list(new_page); | ||
1541 | } | ||
1542 | |||
1543 | page_cache_release(page); | ||
1544 | return new_page; | ||
1545 | } | ||
1546 | |||
1547 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | ||
1548 | unsigned long *vm_flags) | ||
1549 | { | ||
1550 | struct stable_node *stable_node; | ||
1551 | struct rmap_item *rmap_item; | ||
1552 | struct hlist_node *hlist; | ||
1553 | unsigned int mapcount = page_mapcount(page); | ||
1554 | int referenced = 0; | ||
1555 | int search_new_forks = 0; | ||
1556 | |||
1557 | VM_BUG_ON(!PageKsm(page)); | ||
1558 | VM_BUG_ON(!PageLocked(page)); | ||
1559 | |||
1560 | stable_node = page_stable_node(page); | ||
1561 | if (!stable_node) | ||
1562 | return 0; | ||
1563 | again: | ||
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1566 | struct anon_vma_chain *vmac; | ||
1567 | struct vm_area_struct *vma; | ||
1568 | |||
1569 | spin_lock(&anon_vma->lock); | ||
1570 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
1571 | vma = vmac->vma; | ||
1572 | if (rmap_item->address < vma->vm_start || | ||
1573 | rmap_item->address >= vma->vm_end) | ||
1574 | continue; | ||
1575 | /* | ||
1576 | * Initially we examine only the vma which covers this | ||
1577 | * rmap_item; but later, if there is still work to do, | ||
1578 | * we examine covering vmas in other mms: in case they | ||
1579 | * were forked from the original since ksmd passed. | ||
1580 | */ | ||
1581 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1582 | continue; | ||
1583 | |||
1584 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
1585 | continue; | ||
1586 | |||
1587 | referenced += page_referenced_one(page, vma, | ||
1588 | rmap_item->address, &mapcount, vm_flags); | ||
1589 | if (!search_new_forks || !mapcount) | ||
1590 | break; | ||
1591 | } | ||
1592 | spin_unlock(&anon_vma->lock); | ||
1593 | if (!mapcount) | ||
1594 | goto out; | ||
1595 | } | ||
1596 | if (!search_new_forks++) | ||
1597 | goto again; | ||
1598 | out: | ||
1599 | return referenced; | ||
1600 | } | ||
1601 | |||
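All three walkers introduced here (page_referenced_ksm() above, try_to_unmap_ksm() and rmap_walk_ksm() below) share one subtle filter: (rmap_item->mm == vma->vm_mm) == search_new_forks. Pass 0 visits only the vma each rmap_item was created in; pass 1 runs only if work remains, and visits the complementary set, vmas in other mms that can exist only if the original was forked after ksmd last scanned it. The predicate in isolation:

#include <stdbool.h>

/*
 * same_mm stands for (rmap_item->mm == vma->vm_mm). Returns true when
 * the vma should be skipped on this pass:
 *
 *   pass 0: skip foreign vmas, visit the rmap_item's own;
 *   pass 1: skip the rmap_item's own, visit forked copies.
 */
static bool skip_vma(bool same_mm, int search_new_forks)
{
        return same_mm == (bool)search_new_forks;
}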
1602 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1603 | { | ||
1604 | struct stable_node *stable_node; | ||
1605 | struct hlist_node *hlist; | ||
1606 | struct rmap_item *rmap_item; | ||
1607 | int ret = SWAP_AGAIN; | ||
1608 | int search_new_forks = 0; | ||
1609 | |||
1610 | VM_BUG_ON(!PageKsm(page)); | ||
1611 | VM_BUG_ON(!PageLocked(page)); | ||
1612 | |||
1613 | stable_node = page_stable_node(page); | ||
1614 | if (!stable_node) | ||
1615 | return SWAP_FAIL; | ||
1616 | again: | ||
1617 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1618 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1619 | struct anon_vma_chain *vmac; | ||
1620 | struct vm_area_struct *vma; | ||
1621 | |||
1622 | spin_lock(&anon_vma->lock); | ||
1623 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
1624 | vma = vmac->vma; | ||
1625 | if (rmap_item->address < vma->vm_start || | ||
1626 | rmap_item->address >= vma->vm_end) | ||
1627 | continue; | ||
1628 | /* | ||
1629 | * Initially we examine only the vma which covers this | ||
1630 | * rmap_item; but later, if there is still work to do, | ||
1631 | * we examine covering vmas in other mms: in case they | ||
1632 | * were forked from the original since ksmd passed. | ||
1633 | */ | ||
1634 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1635 | continue; | ||
1636 | |||
1637 | ret = try_to_unmap_one(page, vma, | ||
1638 | rmap_item->address, flags); | ||
1639 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | ||
1640 | spin_unlock(&anon_vma->lock); | ||
1641 | goto out; | ||
1642 | } | ||
1643 | } | ||
1644 | spin_unlock(&anon_vma->lock); | ||
1645 | } | ||
1646 | if (!search_new_forks++) | ||
1647 | goto again; | ||
1648 | out: | ||
1649 | return ret; | ||
1650 | } | ||
1651 | |||
1652 | #ifdef CONFIG_MIGRATION | ||
1653 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
1654 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1655 | { | ||
1656 | struct stable_node *stable_node; | ||
1657 | struct hlist_node *hlist; | ||
1658 | struct rmap_item *rmap_item; | ||
1659 | int ret = SWAP_AGAIN; | ||
1660 | int search_new_forks = 0; | ||
1661 | |||
1662 | VM_BUG_ON(!PageKsm(page)); | ||
1663 | VM_BUG_ON(!PageLocked(page)); | ||
1664 | |||
1665 | stable_node = page_stable_node(page); | ||
1666 | if (!stable_node) | ||
1667 | return ret; | ||
1668 | again: | ||
1669 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1670 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1671 | struct anon_vma_chain *vmac; | ||
1672 | struct vm_area_struct *vma; | ||
1673 | |||
1674 | spin_lock(&anon_vma->lock); | ||
1675 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
1676 | vma = vmac->vma; | ||
1677 | if (rmap_item->address < vma->vm_start || | ||
1678 | rmap_item->address >= vma->vm_end) | ||
1679 | continue; | ||
1680 | /* | ||
1681 | * Initially we examine only the vma which covers this | ||
1682 | * rmap_item; but later, if there is still work to do, | ||
1683 | * we examine covering vmas in other mms: in case they | ||
1684 | * were forked from the original since ksmd passed. | ||
1685 | */ | ||
1686 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1687 | continue; | ||
1688 | |||
1689 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
1690 | if (ret != SWAP_AGAIN) { | ||
1691 | spin_unlock(&anon_vma->lock); | ||
1692 | goto out; | ||
1693 | } | ||
1694 | } | ||
1695 | spin_unlock(&anon_vma->lock); | ||
1696 | } | ||
1697 | if (!search_new_forks++) | ||
1698 | goto again; | ||
1699 | out: | ||
1700 | return ret; | ||
1701 | } | ||
1702 | |||
1703 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | ||
1704 | { | ||
1705 | struct stable_node *stable_node; | ||
1706 | |||
1707 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1708 | VM_BUG_ON(!PageLocked(newpage)); | ||
1709 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | ||
1710 | |||
1711 | stable_node = page_stable_node(newpage); | ||
1712 | if (stable_node) { | ||
1713 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | ||
1714 | stable_node->kpfn = page_to_pfn(newpage); | ||
1715 | } | ||
1716 | } | ||
1717 | #endif /* CONFIG_MIGRATION */ | ||
1718 | |||
1719 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1720 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | ||
1721 | unsigned long end_pfn) | ||
1722 | { | ||
1723 | struct rb_node *node; | ||
1724 | |||
1725 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | ||
1726 | struct stable_node *stable_node; | ||
1727 | |||
1728 | stable_node = rb_entry(node, struct stable_node, node); | ||
1729 | if (stable_node->kpfn >= start_pfn && | ||
1730 | stable_node->kpfn < end_pfn) | ||
1731 | return stable_node; | ||
1732 | } | ||
1733 | return NULL; | ||
1734 | } | ||
1735 | |||
1736 | static int ksm_memory_callback(struct notifier_block *self, | ||
1737 | unsigned long action, void *arg) | ||
1738 | { | ||
1739 | struct memory_notify *mn = arg; | ||
1740 | struct stable_node *stable_node; | ||
1741 | |||
1742 | switch (action) { | ||
1743 | case MEM_GOING_OFFLINE: | ||
1744 | /* | ||
1745 | * Keep it very simple for now: just lock out ksmd and | ||
1746 | * MADV_UNMERGEABLE while any memory is going offline. | ||
1747 | */ | ||
1748 | mutex_lock(&ksm_thread_mutex); | ||
1749 | break; | ||
1750 | |||
1751 | case MEM_OFFLINE: | ||
1752 | /* | ||
1753 | * Most of the work is done by page migration; but there might | ||
1754 | * be a few stable_nodes left over, still pointing to struct | ||
1755 | * pages which have been offlined: prune those from the tree. | ||
1756 | */ | ||
1757 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | ||
1758 | mn->start_pfn + mn->nr_pages)) != NULL) | ||
1759 | remove_node_from_stable_tree(stable_node); | ||
1760 | /* fallthrough */ | ||
1761 | |||
1762 | case MEM_CANCEL_OFFLINE: | ||
1763 | mutex_unlock(&ksm_thread_mutex); | ||
1764 | break; | ||
1765 | } | ||
1766 | return NOTIFY_OK; | ||
1767 | } | ||
1768 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1769 | |||
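The MEM_OFFLINE leg restarts ksm_check_stable_tree() from rb_first() after every removal, since removing a node invalidates the iteration in progress; that is quadratic in the worst case, but hot-remove is rare and page migration has already emptied most of the range. The rescan-after-removal shape on a singly linked list (prune_range() is an illustrative stand-in, not the kernel helper):

#include <stdlib.h>

struct pfn_node {
        struct pfn_node *next;          /* stand-in for rb_next() order */
        unsigned long kpfn;
};

/*
 * Sketch of the MEM_OFFLINE pruning: find a node in the offlined pfn
 * range, remove it, and restart the search from the top, mirroring
 * the kernel's "search the whole tree again after each removal".
 */
static void prune_range(struct pfn_node **head,
                        unsigned long start_pfn, unsigned long end_pfn)
{
        struct pfn_node **link;

restart:
        for (link = head; *link; link = &(*link)->next) {
                struct pfn_node *n = *link;

                if (n->kpfn >= start_pfn && n->kpfn < end_pfn) {
                        *link = n->next;
                        free(n);
                        goto restart;
                }
        }
}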
1476 | #ifdef CONFIG_SYSFS | 1770 | #ifdef CONFIG_SYSFS |
1477 | /* | 1771 | /* |
1478 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | 1772 | * This all compiles without CONFIG_SYSFS, but is a waste of space. |
@@ -1551,8 +1845,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1551 | /* | 1845 | /* |
1552 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | 1846 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. |
1553 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | 1847 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, |
1554 | * breaking COW to free the unswappable pages_shared (but leaves | 1848 | * breaking COW to free the pages_shared (but leaves mm_slots |
1555 | * mm_slots on the list for when ksmd may be set running again). | 1849 | * on the list for when ksmd may be set running again). |
1556 | */ | 1850 | */ |
1557 | 1851 | ||
1558 | mutex_lock(&ksm_thread_mutex); | 1852 | mutex_lock(&ksm_thread_mutex); |
@@ -1577,29 +1871,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1577 | } | 1871 | } |
1578 | KSM_ATTR(run); | 1872 | KSM_ATTR(run); |
1579 | 1873 | ||
1580 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
1581 | struct kobj_attribute *attr, | ||
1582 | const char *buf, size_t count) | ||
1583 | { | ||
1584 | int err; | ||
1585 | unsigned long nr_pages; | ||
1586 | |||
1587 | err = strict_strtoul(buf, 10, &nr_pages); | ||
1588 | if (err) | ||
1589 | return -EINVAL; | ||
1590 | |||
1591 | ksm_max_kernel_pages = nr_pages; | ||
1592 | |||
1593 | return count; | ||
1594 | } | ||
1595 | |||
1596 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
1597 | struct kobj_attribute *attr, char *buf) | ||
1598 | { | ||
1599 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
1600 | } | ||
1601 | KSM_ATTR(max_kernel_pages); | ||
1602 | |||
1603 | static ssize_t pages_shared_show(struct kobject *kobj, | 1874 | static ssize_t pages_shared_show(struct kobject *kobj, |
1604 | struct kobj_attribute *attr, char *buf) | 1875 | struct kobj_attribute *attr, char *buf) |
1605 | { | 1876 | { |
@@ -1649,7 +1920,6 @@ static struct attribute *ksm_attrs[] = { | |||
1649 | &sleep_millisecs_attr.attr, | 1920 | &sleep_millisecs_attr.attr, |
1650 | &pages_to_scan_attr.attr, | 1921 | &pages_to_scan_attr.attr, |
1651 | &run_attr.attr, | 1922 | &run_attr.attr, |
1652 | &max_kernel_pages_attr.attr, | ||
1653 | &pages_shared_attr.attr, | 1923 | &pages_shared_attr.attr, |
1654 | &pages_sharing_attr.attr, | 1924 | &pages_sharing_attr.attr, |
1655 | &pages_unshared_attr.attr, | 1925 | &pages_unshared_attr.attr, |
@@ -1669,8 +1939,6 @@ static int __init ksm_init(void) | |||
1669 | struct task_struct *ksm_thread; | 1939 | struct task_struct *ksm_thread; |
1670 | int err; | 1940 | int err; |
1671 | 1941 | ||
1672 | ksm_max_kernel_pages = totalram_pages / 4; | ||
1673 | |||
1674 | err = ksm_slab_init(); | 1942 | err = ksm_slab_init(); |
1675 | if (err) | 1943 | if (err) |
1676 | goto out; | 1944 | goto out; |
@@ -1698,6 +1966,13 @@ static int __init ksm_init(void) | |||
1698 | 1966 | ||
1699 | #endif /* CONFIG_SYSFS */ | 1967 | #endif /* CONFIG_SYSFS */ |
1700 | 1968 | ||
1969 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1970 | /* | ||
1971 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
1972 | * later callbacks could only be taking locks which nest within that. | ||
1973 | */ | ||
1974 | hotplug_memory_notifier(ksm_memory_callback, 100); | ||
1975 | #endif | ||
1701 | return 0; | 1976 | return 0; |
1702 | 1977 | ||
1703 | out_free2: | 1978 | out_free2: |
diff --git a/mm/maccess.c b/mm/maccess.c index 9073695ff25f..4e348dbaecd7 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
@@ -14,7 +14,11 @@ | |||
14 | * Safely read from address @src to the buffer at @dst. If a kernel fault | 14 | * Safely read from address @src to the buffer at @dst. If a kernel fault |
15 | * happens, handle that and return -EFAULT. | 15 | * happens, handle that and return -EFAULT. |
16 | */ | 16 | */ |
17 | long probe_kernel_read(void *dst, void *src, size_t size) | 17 | |
18 | long __weak probe_kernel_read(void *dst, void *src, size_t size) | ||
19 | __attribute__((alias("__probe_kernel_read"))); | ||
20 | |||
21 | long __probe_kernel_read(void *dst, void *src, size_t size) | ||
18 | { | 22 | { |
19 | long ret; | 23 | long ret; |
20 | mm_segment_t old_fs = get_fs(); | 24 | mm_segment_t old_fs = get_fs(); |
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 43 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
40 | * happens, handle that and return -EFAULT. | 44 | * happens, handle that and return -EFAULT. |
41 | */ | 45 | */ |
42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) | 46 | long __weak probe_kernel_write(void *dst, void *src, size_t size) |
47 | __attribute__((alias("__probe_kernel_write"))); | ||
48 | |||
49 | long __probe_kernel_write(void *dst, void *src, size_t size) | ||
43 | { | 50 | { |
44 | long ret; | 51 | long ret; |
45 | mm_segment_t old_fs = get_fs(); | 52 | mm_segment_t old_fs = get_fs(); |
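The maccess.c hunks rename the generic bodies to __probe_kernel_read()/__probe_kernel_write() and re-export the public names as weak aliases, so an architecture can supply a strong override yet still reach the generic code under the double-underscore name. The GCC weak-alias pattern as a standalone program (do_probe()/__do_probe() are illustrative names, not the kernel symbols):

#include <stdio.h>

/* Generic implementation, always reachable under this name. */
long __do_probe(int x)
{
        return x + 1;
}

/*
 * Public entry point: a weak alias of the generic body. Another
 * translation unit may define a strong do_probe() that overrides this
 * alias and falls back to __do_probe() for the cases it doesn't
 * handle specially.
 */
long do_probe(int x) __attribute__((weak, alias("__do_probe")));

int main(void)
{
        printf("%ld\n", do_probe(41)); /* prints 42 unless overridden */
        return 0;
}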
diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | ||
12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
222 | /* | 223 | /* |
223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
224 | */ | 225 | */ |
225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
226 | { | 227 | { |
227 | int ret = 0; | 228 | int ret = 0; |
228 | 229 | ||
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
230 | return -EPERM; | 231 | return -EPERM; |
231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
232 | struct page *p; | 233 | struct page *p; |
233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
234 | 0, 0, &p, NULL); | ||
235 | if (ret != 1) | 235 | if (ret != 1) |
236 | return ret; | 236 | return ret; |
237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
239 | page_to_pfn(p), start); | ||
240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
241 | if (ret) | ||
242 | break; | ||
243 | continue; | ||
244 | } | ||
237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
241 | put_page(p); | ||
242 | } | 249 | } |
243 | return ret; | 250 | return ret; |
244 | } | 251 | } |
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
335 | size_t len; | 342 | size_t len; |
336 | 343 | ||
337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
340 | #endif | 347 | #endif |
341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
342 | return error; | 349 | return error; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f99f5991d6bb..8a79a6f0f029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,12 +37,16 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 47 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 50 | #include "internal.h" |
42 | 51 | ||
43 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
@@ -54,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
55 | #endif | 64 | #endif |
56 | 65 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | 66 | /* |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 67 | * Per memcg event counter is incremented at every pagein/pageout. This counter |
68 | * is used to trigger some periodic events. This is straightforward and better | ||
69 | * than using jiffies etc. to handle periodic memcg events. | ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
59 | 75 | ||
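The !((event) & ((1 << thresh) - 1)) test in the comment above is true exactly when the low thresh bits of the counter are zero, i.e. once every 2^thresh increments: every 128 pagein/pageout events for the threshold notifiers, every 1024 for soft-limit tree updates. A small self-checking program:

#include <assert.h>

#define THRESHOLDS_EVENTS_THRESH 7      /* fires once in 128 */
#define SOFTLIMIT_EVENTS_THRESH 10      /* fires once in 1024 */

/* True when the low 'thresh' bits of the event counter are all zero. */
static int event_due(unsigned long event, unsigned int thresh)
{
        return !(event & ((1UL << thresh) - 1));
}

int main(void)
{
        unsigned long event, hits = 0;

        for (event = 1; event <= 1024; event++)
                if (event_due(event, THRESHOLDS_EVENTS_THRESH))
                        hits++;

        assert(hits == 1024 / 128);     /* 8 firings in 1024 events */
        assert(event_due(1024, SOFTLIMIT_EVENTS_THRESH));
        return 0;
}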
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
@@ -66,65 +82,19 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 82 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 83 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 84 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index pointing to the threshold just below the current usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -209,7 +195,7 @@ struct mem_cgroup { | |||
209 | int prev_priority; /* for recording reclaim priority */ | 195 | int prev_priority; /* for recording reclaim priority */ |
210 | 196 | ||
211 | /* | 197 | /* |
212 | * While reclaiming in a hiearchy, we cache the last child we | 198 | * While reclaiming in a hierarchy, we cache the last child we |
213 | * reclaimed from. | 199 | * reclaimed from. |
214 | */ | 200 | */ |
215 | int last_scanned_child; | 201 | int last_scanned_child; |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
223 | /* | ||
224 | * Should we move charges of a task when a task is moved into this | ||
225 | * mem_cgroup? And what type of charges should we move? | ||
226 | */ | ||
227 | unsigned long move_charge_at_immigrate; | ||
228 | |||
228 | /* | 229 | /* |
229 | * statistics. This must be placed at the end of memcg. | 230 | * percpu counter. |
230 | */ | 231 | */ |
231 | struct mem_cgroup_stat stat; | 232 | struct mem_cgroup_stat_cpu *stat; |
233 | }; | ||
234 | |||
235 | /* Stuff for moving charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
233 | 257 | ||
234 | /* | 258 | /* |
@@ -275,6 +299,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 299 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 300 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 301 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
302 | static void drain_all_stock_async(void); | ||
278 | 303 | ||
279 | static struct mem_cgroup_per_zone * | 304 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 305 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -282,6 +307,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 307 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
283 | } | 308 | } |
284 | 309 | ||
310 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
311 | { | ||
312 | return &mem->css; | ||
313 | } | ||
314 | |||
285 | static struct mem_cgroup_per_zone * | 315 | static struct mem_cgroup_per_zone * |
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 316 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
287 | { | 317 | { |
@@ -365,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
365 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
366 | } | 396 | } |
367 | 397 | ||
368 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
369 | { | ||
370 | bool ret = false; | ||
371 | int cpu; | ||
372 | s64 val; | ||
373 | struct mem_cgroup_stat_cpu *cpustat; | ||
374 | |||
375 | cpu = get_cpu(); | ||
376 | cpustat = &mem->stat.cpustat[cpu]; | ||
377 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
378 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
379 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
380 | ret = true; | ||
381 | } | ||
382 | put_cpu(); | ||
383 | return ret; | ||
384 | } | ||
385 | 398 | ||
386 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
387 | { | 400 | { |
@@ -475,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
475 | return mz; | 488 | return mz; |
476 | } | 489 | } |
477 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
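
For orientation, a toy user-space analogue of the two helpers above (a sketch only: a plain 2-D array stands in for the kernel's per-cpu counter area). Reading a statistic folds every possible cpu's slot into one sum, and local usage is simply RSS + CACHE:

    #include <stdio.h>

    #define NR_CPUS 4
    enum stat_index { STAT_CACHE, STAT_RSS, NR_STATS };

    /* Stand-in for the per-cpu mem_cgroup_stat_cpu area. */
    static long long pcpu_stat[NR_CPUS][NR_STATS];

    static long long read_stat(enum stat_index idx)
    {
            long long val = 0;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)  /* for_each_possible_cpu() */
                    val += pcpu_stat[cpu][idx];
            return val;
    }

    static long long local_usage(void)
    {
            return read_stat(STAT_RSS) + read_stat(STAT_CACHE);
    }

    int main(void)
    {
            pcpu_stat[0][STAT_RSS] = 10;
            pcpu_stat[2][STAT_RSS] = 5;
            pcpu_stat[1][STAT_CACHE] = 7;
            printf("usage = %lld pages\n", local_usage());  /* 22 */
            return 0;
    }
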
478 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
479 | bool charge) | 512 | bool charge) |
480 | { | 513 | { |
481 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
482 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
483 | struct mem_cgroup_stat_cpu *cpustat; | ||
484 | int cpu = get_cpu(); | ||
485 | |||
486 | cpustat = &stat->cpustat[cpu]; | ||
487 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
488 | put_cpu(); | ||
489 | } | 516 | } |
490 | 517 | ||
491 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -493,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
493 | bool charge) | 520 | bool charge) |
494 | { | 521 | { |
495 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
496 | struct mem_cgroup_stat *stat = &mem->stat; | ||
497 | struct mem_cgroup_stat_cpu *cpustat; | ||
498 | int cpu = get_cpu(); | ||
499 | 523 | ||
500 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
501 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
502 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
503 | else | 528 | else |
504 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
505 | 530 | ||
506 | if (charge) | 531 | if (charge) |
507 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
508 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
509 | else | 533 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
511 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
512 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
513 | put_cpu(); | 537 | preempt_enable(); |
514 | } | 538 | } |
515 | 539 | ||
516 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
@@ -528,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
528 | return total; | 552 | return total; |
529 | } | 553 | } |
530 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order: the threshold check fires in finer grain | ||
566 | * than the soft-limit check. | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
531 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
532 | { | 579 | { |
533 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -758,7 +805,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 805 | task_unlock(task); |
759 | if (!curr) | 806 | if (!curr) |
760 | return 0; | 807 | return 0; |
761 | if (curr->use_hierarchy) | 808 | /* |
809 | * We should check use_hierarchy of "mem", not "curr". Checking | ||
810 | * use_hierarchy of "curr" here would make this function return true if | ||
811 | * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the | ||
812 | * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem"). | ||
813 | */ | ||
814 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 815 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 816 | else |
764 | ret = (curr == mem); | 817 | ret = (curr == mem); |
@@ -988,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
988 | } | 1041 | } |
989 | 1042 | ||
990 | /** | 1043 | /** |
991 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
992 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
993 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
994 | * | 1047 | * |
@@ -1007,7 +1060,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1060 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1061 | int ret; |
1009 | 1062 | ||
1010 | if (!memcg) | 1063 | if (!memcg || !p) |
1011 | return; | 1064 | return; |
1012 | 1065 | ||
1013 | 1066 | ||
@@ -1137,6 +1190,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1190 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1191 | if (victim == root_mem) { |
1139 | loop++; | 1192 | loop++; |
1193 | if (loop >= 1) | ||
1194 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1195 | if (loop >= 2) { |
1141 | /* | 1196 | /* |
1142 | * If we have not been able to reclaim | 1197 | * If we have not been able to reclaim |
@@ -1160,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1160 | } | 1215 | } |
1161 | } | 1216 | } |
1162 | } | 1217 | } |
1163 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1164 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1165 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1166 | continue; | 1221 | continue; |
@@ -1191,90 +1246,284 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1191 | return total; | 1246 | return total; |
1192 | } | 1247 | } |
1193 | 1248 | ||
1194 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1195 | { | 1250 | { |
1196 | bool ret = false; | 1251 | int *val = (int *)data; |
1197 | struct mem_cgroup *mem; | 1252 | int x; |
1198 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But considering unlock ops and | ||
1256 | * creation/removal of memcg, scanning all is the simpler operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check whether the OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1199 | 1269 | ||
1200 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1201 | mm = task->mm; | 1271 | |
1202 | if (!mm) | 1272 | if (lock_count == 1) |
1203 | mm = &init_mm; | 1273 | return true; |
1204 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1205 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1206 | ret = true; | ||
1207 | rcu_read_unlock(); | ||
1208 | return ret; | ||
1209 | } | 1275 | } |
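
The locking trick above is worth a worked example: every memcg in the subtree has its oom_lock counter incremented, the walk reports the largest value seen, and the caller owns the hierarchy-wide lock only if that maximum is 1, i.e. nobody had already locked any node. A single-threaded user-space analogue (plain ints stand in for atomics, a flat array for the tree walk):

    #include <stdio.h>

    #define NR_GROUPS 3            /* a tiny "hierarchy": root plus two children */
    static int oom_lock[NR_GROUPS];

    /* Increment every node and report whether we were first everywhere. */
    static int oom_lock_tree(void)
    {
            int i, max = 0;

            for (i = 0; i < NR_GROUPS; i++) {
                    int x = ++oom_lock[i];     /* atomic_inc_return() analogue */
                    if (x > max)
                            max = x;
            }
            return max == 1;                   /* exclusive iff nobody overlapped */
    }

    static void oom_unlock_tree(void)
    {
            int i;

            for (i = 0; i < NR_GROUPS; i++)
                    if (oom_lock[i] > 0)       /* atomic_add_unless(.., -1, 0) */
                            oom_lock[i]--;
    }

    int main(void)
    {
            printf("first locker got it: %d\n", oom_lock_tree());   /* 1 */
            printf("second locker got it: %d\n", oom_lock_tree());  /* 0 */
            oom_unlock_tree();
            oom_unlock_tree();
            return 0;
    }
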
1210 | 1276 | ||
1211 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1212 | { | 1278 | { |
1213 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1214 | return 0; | 1285 | return 0; |
1215 | } | 1286 | } |
1216 | 1287 | ||
1217 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1289 | { | ||
1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | ||
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1218 | { | 1300 | { |
1219 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1301 | DEFINE_WAIT(wait); |
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcome, so use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use a global waitq ..... would a more fine-grained waitq be better? | ||
1326 | * Assume the following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * Assume OOM happens in both A and 01 at the same time. They are | ||
1331 | * mutually exclusive by the lock. (A kill in 01 helps A.) | ||
1332 | * If we used a per-memcg waitq, we would have to wake up waiters on A and 02 | ||
1333 | * in addition to the waiters on 01. We use a global waitq to avoid that mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1220 | } | 1345 | } |
1221 | 1346 | ||
1222 | /* | 1347 | /* |
1223 | * Currently used to update mapped file statistics, but the routine can be | 1348 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1349 | * generalized to update other statistics as well. |
1225 | */ | 1350 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1352 | { |
1228 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | ||
1230 | struct mem_cgroup_stat_cpu *cpustat; | ||
1231 | int cpu; | ||
1232 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1233 | 1355 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1357 | if (unlikely(!pc)) |
1239 | return; | 1358 | return; |
1240 | 1359 | ||
1241 | lock_page_cgroup(pc); | 1360 | lock_page_cgroup(pc); |
1242 | mem = pc->mem_cgroup; | 1361 | mem = pc->mem_cgroup; |
1243 | if (!mem) | 1362 | if (!mem || !PageCgroupUsed(pc)) |
1244 | goto done; | ||
1245 | |||
1246 | if (!PageCgroupUsed(pc)) | ||
1247 | goto done; | 1363 | goto done; |
1248 | 1364 | ||
1249 | /* | 1365 | /* |
1250 | * Preemption is already disabled, we don't need get_cpu() | 1366 | * Preemption is already disabled. We can use __this_cpu_xxx |
1251 | */ | 1367 | */ |
1252 | cpu = smp_processor_id(); | 1368 | if (val > 0) { |
1253 | stat = &mem->stat; | 1369 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1254 | cpustat = &stat->cpustat[cpu]; | 1370 | SetPageCgroupFileMapped(pc); |
1371 | } else { | ||
1372 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | ||
1373 | ClearPageCgroupFileMapped(pc); | ||
1374 | } | ||
1255 | 1375 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | ||
1257 | done: | 1376 | done: |
1258 | unlock_page_cgroup(pc); | 1377 | unlock_page_cgroup(pc); |
1259 | } | 1378 | } |
1260 | 1379 | ||
1261 | /* | 1380 | /* |
1381 | * Size of the first charge trial. "32" comes from vmscan.c's magic value. | ||
1382 | * TODO: bigger numbers may be necessary on big irons. | ||
1383 | */ | ||
1384 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1385 | struct memcg_stock_pcp { | ||
1386 | struct mem_cgroup *cached; /* this is never the root cgroup */ | ||
1387 | int charge; | ||
1388 | struct work_struct work; | ||
1389 | }; | ||
1390 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1391 | static atomic_t memcg_drain_count; | ||
1392 | |||
1393 | /* | ||
1394 | * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed | ||
1395 | * from the local stock and true is returned. If the stock is 0 or holds charges | ||
1396 | * from a cgroup which is not the current target, returns false. This stock will | ||
1397 | * be refilled. | ||
1398 | */ | ||
1399 | static bool consume_stock(struct mem_cgroup *mem) | ||
1400 | { | ||
1401 | struct memcg_stock_pcp *stock; | ||
1402 | bool ret = true; | ||
1403 | |||
1404 | stock = &get_cpu_var(memcg_stock); | ||
1405 | if (mem == stock->cached && stock->charge) | ||
1406 | stock->charge -= PAGE_SIZE; | ||
1407 | else /* need to call res_counter_charge */ | ||
1408 | ret = false; | ||
1409 | put_cpu_var(memcg_stock); | ||
1410 | return ret; | ||
1411 | } | ||
1412 | |||
1413 | /* | ||
1414 | * Returns stock cached in the percpu area to the res_counter and resets the cached information. | ||
1415 | */ | ||
1416 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1417 | { | ||
1418 | struct mem_cgroup *old = stock->cached; | ||
1419 | |||
1420 | if (stock->charge) { | ||
1421 | res_counter_uncharge(&old->res, stock->charge); | ||
1422 | if (do_swap_account) | ||
1423 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1424 | } | ||
1425 | stock->cached = NULL; | ||
1426 | stock->charge = 0; | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * This must be called with preemption disabled, or by a thread | ||
1431 | * which is pinned to the local cpu. | ||
1432 | */ | ||
1433 | static void drain_local_stock(struct work_struct *dummy) | ||
1434 | { | ||
1435 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1436 | drain_stock(stock); | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * Cache charges(val) which is from res_counter, to local per_cpu area. | ||
1441 | * This will be consumed by consumt_stock() function, later. | ||
1442 | */ | ||
1443 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1444 | { | ||
1445 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1446 | |||
1447 | if (stock->cached != mem) { /* reset if necessary */ | ||
1448 | drain_stock(stock); | ||
1449 | stock->cached = mem; | ||
1450 | } | ||
1451 | stock->charge += val; | ||
1452 | put_cpu_var(memcg_stock); | ||
1453 | } | ||
1454 | |||
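
Taken together, consume_stock()/drain_stock()/refill_stock() let try_charge() touch the shared res_counter once per CHARGE_SIZE (32 pages) instead of once per page. A user-space analogue of that flow, under simplifying assumptions (one cgroup, one cpu, a plain counter standing in for the res_counter; the real consume_stock() also checks that the stock is cached for the right memcg):

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define CHARGE_SIZE (32 * PAGE_SIZE)   /* batch size from this patch */

    static long long res_usage;            /* stand-in for the res_counter */
    static long long stock_charge;         /* stand-in for this cpu's stock */

    static int consume_stock(void)
    {
            if (stock_charge >= PAGE_SIZE) {
                    stock_charge -= PAGE_SIZE;
                    return 1;
            }
            return 0;
    }

    static void charge_one_page(void)
    {
            if (consume_stock())
                    return;                    /* fast path: no counter touched */
            res_usage += CHARGE_SIZE;          /* one batched res_counter_charge() */
            stock_charge += CHARGE_SIZE - PAGE_SIZE;   /* refill_stock() */
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100; i++)
                    charge_one_page();
            /* 100 pages charged, but only 4 (= ceil(100/32)) counter updates. */
            printf("usage=%lld bytes, stocked=%lld bytes\n",
                   res_usage, stock_charge);
            return 0;
    }
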
1455 | /* | ||
1456 | * Tries to drain stocked charges on other cpus. This function is asynchronous | ||
1457 | * and just queues a work item per cpu for draining locally on each cpu. The | ||
1458 | * caller can expect some charges to come back to the res_counter later, but | ||
1459 | * cannot wait for that. | ||
1460 | */ | ||
1461 | static void drain_all_stock_async(void) | ||
1462 | { | ||
1463 | int cpu; | ||
1464 | /* This function schedules "drain" in an asynchronous way. | ||
1465 | * The result of "drain" is not directly handled by callers. So, | ||
1466 | * if someone is already draining, we don't have to schedule more. | ||
1467 | * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it | ||
1468 | * if there is a race. We just do a loose check here. | ||
1469 | */ | ||
1470 | if (atomic_read(&memcg_drain_count)) | ||
1471 | return; | ||
1472 | /* Notify other cpus that system-wide "drain" is running */ | ||
1473 | atomic_inc(&memcg_drain_count); | ||
1474 | get_online_cpus(); | ||
1475 | for_each_online_cpu(cpu) { | ||
1476 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1477 | schedule_work_on(cpu, &stock->work); | ||
1478 | } | ||
1479 | put_online_cpus(); | ||
1480 | atomic_dec(&memcg_drain_count); | ||
1481 | /* We don't wait for flush_work */ | ||
1482 | } | ||
1483 | |||
1484 | /* This is a synchronous drain interface. */ | ||
1485 | static void drain_all_stock_sync(void) | ||
1486 | { | ||
1487 | /* called when force_empty is called */ | ||
1488 | atomic_inc(&memcg_drain_count); | ||
1489 | schedule_on_each_cpu(drain_local_stock); | ||
1490 | atomic_dec(&memcg_drain_count); | ||
1491 | } | ||
1492 | |||
1493 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1494 | unsigned long action, | ||
1495 | void *hcpu) | ||
1496 | { | ||
1497 | int cpu = (unsigned long)hcpu; | ||
1498 | struct memcg_stock_pcp *stock; | ||
1499 | |||
1500 | if (action != CPU_DEAD) | ||
1501 | return NOTIFY_OK; | ||
1502 | stock = &per_cpu(memcg_stock, cpu); | ||
1503 | drain_stock(stock); | ||
1504 | return NOTIFY_OK; | ||
1505 | } | ||
1506 | |||
1507 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1508 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1509 | * oom-killer can be invoked. |
1264 | */ | 1510 | */ |
1265 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1511 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1266 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1512 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1267 | bool oom, struct page *page) | ||
1268 | { | 1513 | { |
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1514 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1515 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1516 | struct res_counter *fail_res; |
1517 | int csize = CHARGE_SIZE; | ||
1272 | 1518 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1519 | /* |
1274 | /* Don't account this! */ | 1520 | * Unlike the global VM's OOM-kill, we're not under a memory shortage |
1275 | *memcg = NULL; | 1521 | * at the system level. So, allow a dying process to go ahead, in |
1276 | return 0; | 1522 | * addition to the MEMDIE process. |
1277 | } | 1523 | */ |
1524 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1525 | || fatal_signal_pending(current))) | ||
1526 | goto bypass; | ||
1278 | 1527 | ||
1279 | /* | 1528 | /* |
1280 | * We always charge the cgroup the mm_struct belongs to. | 1529 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1293,23 +1542,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1542 | return 0; |
1294 | 1543 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1544 | VM_BUG_ON(css_is_removed(&mem->css)); |
1545 | if (mem_cgroup_is_root(mem)) | ||
1546 | goto done; | ||
1296 | 1547 | ||
1297 | while (1) { | 1548 | while (1) { |
1298 | int ret = 0; | 1549 | int ret = 0; |
1299 | unsigned long flags = 0; | 1550 | unsigned long flags = 0; |
1300 | 1551 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1552 | if (consume_stock(mem)) |
1302 | goto done; | 1553 | goto done; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1554 | |
1555 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1556 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1557 | if (!do_swap_account) |
1306 | break; | 1558 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1559 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1560 | if (likely(!ret)) |
1310 | break; | 1561 | break; |
1311 | /* mem+swap counter fails */ | 1562 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1563 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1564 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1565 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1566 | memsw); |
@@ -1318,6 +1569,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1569 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1570 | res); |
1320 | 1571 | ||
1572 | /* reduce request size and retry */ | ||
1573 | if (csize > PAGE_SIZE) { | ||
1574 | csize = PAGE_SIZE; | ||
1575 | continue; | ||
1576 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1577 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1578 | goto nomem; |
1323 | 1579 | ||
@@ -1337,27 +1593,92 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1337 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1593 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1338 | continue; | 1594 | continue; |
1339 | 1595 | ||
1596 | /* try to avoid oom while someone is moving charge */ | ||
1597 | if (mc.moving_task && current != mc.moving_task) { | ||
1598 | struct mem_cgroup *from, *to; | ||
1599 | bool do_continue = false; | ||
1600 | /* | ||
1601 | * There is a small race that "from" or "to" can be | ||
1602 | * freed by rmdir, so we use css_tryget(). | ||
1603 | */ | ||
1604 | from = mc.from; | ||
1605 | to = mc.to; | ||
1606 | if (from && css_tryget(&from->css)) { | ||
1607 | if (mem_over_limit->use_hierarchy) | ||
1608 | do_continue = css_is_ancestor( | ||
1609 | &from->css, | ||
1610 | &mem_over_limit->css); | ||
1611 | else | ||
1612 | do_continue = (from == mem_over_limit); | ||
1613 | css_put(&from->css); | ||
1614 | } | ||
1615 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1616 | if (mem_over_limit->use_hierarchy) | ||
1617 | do_continue = css_is_ancestor( | ||
1618 | &to->css, | ||
1619 | &mem_over_limit->css); | ||
1620 | else | ||
1621 | do_continue = (to == mem_over_limit); | ||
1622 | css_put(&to->css); | ||
1623 | } | ||
1624 | if (do_continue) { | ||
1625 | DEFINE_WAIT(wait); | ||
1626 | prepare_to_wait(&mc.waitq, &wait, | ||
1627 | TASK_INTERRUPTIBLE); | ||
1628 | /* moving charge context might have finished. */ | ||
1629 | if (mc.moving_task) | ||
1630 | schedule(); | ||
1631 | finish_wait(&mc.waitq, &wait); | ||
1632 | continue; | ||
1633 | } | ||
1634 | } | ||
1635 | |||
1340 | if (!nr_retries--) { | 1636 | if (!nr_retries--) { |
1341 | if (oom) { | 1637 | if (!oom) |
1342 | mutex_lock(&memcg_tasklist); | 1638 | goto nomem; |
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1639 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1344 | mutex_unlock(&memcg_tasklist); | 1640 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1345 | record_last_oom(mem_over_limit); | 1641 | continue; |
1346 | } | 1642 | } |
1347 | goto nomem; | 1643 | /* When we reach here, the current task is dying. */ |
1644 | css_put(&mem->css); | ||
1645 | goto bypass; | ||
1348 | } | 1646 | } |
1349 | } | 1647 | } |
1350 | /* | 1648 | if (csize > PAGE_SIZE) |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1649 | refill_stock(mem, csize - PAGE_SIZE); |
1352 | * if they exceeds softlimit. | ||
1353 | */ | ||
1354 | if (mem_cgroup_soft_limit_check(mem)) | ||
1355 | mem_cgroup_update_tree(mem, page); | ||
1356 | done: | 1650 | done: |
1357 | return 0; | 1651 | return 0; |
1358 | nomem: | 1652 | nomem: |
1359 | css_put(&mem->css); | 1653 | css_put(&mem->css); |
1360 | return -ENOMEM; | 1654 | return -ENOMEM; |
1655 | bypass: | ||
1656 | *memcg = NULL; | ||
1657 | return 0; | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * Sometimes we have to undo a charge we got by try_charge(). | ||
1662 | * This function does that: it uncharges and puts the css refcnt | ||
1663 | * gotten by try_charge(). | ||
1664 | */ | ||
1665 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | ||
1666 | unsigned long count) | ||
1667 | { | ||
1668 | if (!mem_cgroup_is_root(mem)) { | ||
1669 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
1670 | if (do_swap_account) | ||
1671 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | ||
1672 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1673 | WARN_ON_ONCE(count > INT_MAX); | ||
1674 | __css_put(&mem->css, (int)count); | ||
1675 | } | ||
1676 | /* we don't need css_put for root */ | ||
1677 | } | ||
1678 | |||
1679 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1680 | { | ||
1681 | __mem_cgroup_cancel_charge(mem, 1); | ||
1361 | } | 1682 | } |
1362 | 1683 | ||
1363 | /* | 1684 | /* |
@@ -1379,25 +1700,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1379 | return container_of(css, struct mem_cgroup, css); | 1700 | return container_of(css, struct mem_cgroup, css); |
1380 | } | 1701 | } |
1381 | 1702 | ||
1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1703 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1383 | { | 1704 | { |
1384 | struct mem_cgroup *mem; | 1705 | struct mem_cgroup *mem = NULL; |
1385 | struct page_cgroup *pc; | 1706 | struct page_cgroup *pc; |
1386 | unsigned short id; | 1707 | unsigned short id; |
1387 | swp_entry_t ent; | 1708 | swp_entry_t ent; |
1388 | 1709 | ||
1389 | VM_BUG_ON(!PageLocked(page)); | 1710 | VM_BUG_ON(!PageLocked(page)); |
1390 | 1711 | ||
1391 | if (!PageSwapCache(page)) | ||
1392 | return NULL; | ||
1393 | |||
1394 | pc = lookup_page_cgroup(page); | 1712 | pc = lookup_page_cgroup(page); |
1395 | lock_page_cgroup(pc); | 1713 | lock_page_cgroup(pc); |
1396 | if (PageCgroupUsed(pc)) { | 1714 | if (PageCgroupUsed(pc)) { |
1397 | mem = pc->mem_cgroup; | 1715 | mem = pc->mem_cgroup; |
1398 | if (mem && !css_tryget(&mem->css)) | 1716 | if (mem && !css_tryget(&mem->css)) |
1399 | mem = NULL; | 1717 | mem = NULL; |
1400 | } else { | 1718 | } else if (PageSwapCache(page)) { |
1401 | ent.val = page_private(page); | 1719 | ent.val = page_private(page); |
1402 | id = lookup_swap_cgroup(ent); | 1720 | id = lookup_swap_cgroup(ent); |
1403 | rcu_read_lock(); | 1721 | rcu_read_lock(); |
@@ -1426,12 +1744,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1744 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1745 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1746 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1747 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1748 | return; |
1436 | } | 1749 | } |
1437 | 1750 | ||
@@ -1461,88 +1774,83 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1461 | mem_cgroup_charge_statistics(mem, pc, true); | 1774 | mem_cgroup_charge_statistics(mem, pc, true); |
1462 | 1775 | ||
1463 | unlock_page_cgroup(pc); | 1776 | unlock_page_cgroup(pc); |
1777 | /* | ||
1778 | * "charge_statistics" updated event counter. Then, check it. | ||
1779 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1780 | * if they exceeds softlimit. | ||
1781 | */ | ||
1782 | memcg_check_events(mem, pc->page); | ||
1464 | } | 1783 | } |
1465 | 1784 | ||
1466 | /** | 1785 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1786 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1787 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1788 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1789 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1790 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1471 | * | 1791 | * |
1472 | * The caller must confirm following. | 1792 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1793 | * - page is not on LRU (isolate_page() is useful.) |
1794 | * - the pc is locked, used, and ->mem_cgroup points to @from. | ||
1474 | * | 1795 | * |
1475 | * returns 0 at success, | 1796 | * This function doesn't do "charge" nor css_get to the new cgroup. That should |
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | 1797 | * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge |
1477 | * | 1798 | * is true, this function does "uncharge" from the old cgroup, but it doesn't if |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1799 | * @uncharge is false, so the caller should do the "uncharge". |
1479 | * new cgroup. It should be done by a caller. | ||
1480 | */ | 1800 | */ |
1481 | 1801 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1802 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1803 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1484 | { | 1804 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | ||
1489 | int cpu; | ||
1490 | struct mem_cgroup_stat *stat; | ||
1491 | struct mem_cgroup_stat_cpu *cpustat; | ||
1492 | |||
1493 | VM_BUG_ON(from == to); | 1805 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1806 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1807 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1808 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1809 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 1810 | |
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 1811 | if (PageCgroupFileMapped(pc)) { |
1500 | 1812 | /* Update mapped_file data for mem_cgroup */ | |
1501 | if (!trylock_page_cgroup(pc)) | 1813 | preempt_disable(); |
1502 | return ret; | 1814 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1503 | 1815 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | |
1504 | if (!PageCgroupUsed(pc)) | 1816 | preempt_enable(); |
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | |||
1510 | if (!mem_cgroup_is_root(from)) | ||
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1512 | mem_cgroup_charge_statistics(from, pc, false); | ||
1513 | |||
1514 | page = pc->page; | ||
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | ||
1516 | cpu = smp_processor_id(); | ||
1517 | /* Update mapped_file data for mem_cgroup "from" */ | ||
1518 | stat = &from->stat; | ||
1519 | cpustat = &stat->cpustat[cpu]; | ||
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1521 | -1); | ||
1522 | |||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1524 | stat = &to->stat; | ||
1525 | cpustat = &stat->cpustat[cpu]; | ||
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1527 | 1); | ||
1528 | } | 1817 | } |
1818 | mem_cgroup_charge_statistics(from, pc, false); | ||
1819 | if (uncharge) | ||
1820 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1821 | mem_cgroup_cancel_charge(from); | ||
1529 | 1822 | ||
1530 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1823 | /* caller should have done css_get */ |
1531 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1532 | css_put(&from->css); | ||
1533 | |||
1534 | css_get(&to->css); | ||
1535 | pc->mem_cgroup = to; | 1824 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1825 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1826 | /* |
1541 | * We charges against "to" which may not have any tasks. Then, "to" | 1827 | * We charge against "to", which may not have any tasks. Then "to" |
1542 | * can be under rmdir(). But in current implementation, caller of | 1828 | * can be under rmdir(). But in the current implementation, callers of |
1543 | * this function is just force_empty() and it's garanteed that | 1829 | * this function are just force_empty() and move charge, so it's |
1544 | * "to" is never removed. So, we don't check rmdir status here. | 1830 | * guaranteed that "to" is never removed. So, we don't check rmdir |
1831 | * status here. | ||
1832 | */ | ||
1833 | } | ||
1834 | |||
1835 | /* | ||
1836 | * check whether the @pc is valid for moving account and call | ||
1837 | * __mem_cgroup_move_account() | ||
1838 | */ | ||
1839 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1840 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | ||
1841 | { | ||
1842 | int ret = -EINVAL; | ||
1843 | lock_page_cgroup(pc); | ||
1844 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1845 | __mem_cgroup_move_account(pc, from, to, uncharge); | ||
1846 | ret = 0; | ||
1847 | } | ||
1848 | unlock_page_cgroup(pc); | ||
1849 | /* | ||
1850 | * check events | ||
1545 | */ | 1851 | */ |
1852 | memcg_check_events(to, pc->page); | ||
1853 | memcg_check_events(from, pc->page); | ||
1546 | return ret; | 1854 | return ret; |
1547 | } | 1855 | } |
1548 | 1856 | ||
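
For orientation: mem_cgroup_move_account() takes the page_cgroup lock and validates that the pc is in use and owned by @from, while __mem_cgroup_move_account() performs the unchecked move. A schematic user-space analogue of that check-then-move pattern (all names and fields here are simplified stand-ins, not the kernel's):

    #include <stdio.h>

    struct group { long long pages; };

    struct page_cg {
            int used;              /* PageCgroupUsed analogue */
            struct group *owner;   /* pc->mem_cgroup analogue */
    };

    /* Unchecked move: the caller must hold the lock and have validated 'from'. */
    static void __move_account(struct page_cg *pc, struct group *from,
                               struct group *to)
    {
            from->pages--;         /* charge_statistics(from, .., false) */
            pc->owner = to;        /* caller should have done css_get */
            to->pages++;           /* charge_statistics(to, .., true) */
    }

    static int move_account(struct page_cg *pc, struct group *from,
                            struct group *to)
    {
            int ret = -1;          /* -EINVAL in the kernel */

            /* lock_page_cgroup(pc) would be taken here */
            if (pc->used && pc->owner == from) {
                    __move_account(pc, from, to);
                    ret = 0;
            }
            /* unlock_page_cgroup(pc); then memcg_check_events() on both */
            return ret;
    }

    int main(void)
    {
            struct group a = { 1 }, b = { 0 };
            struct page_cg pc = { 1, &a };

            printf("ret=%d a=%lld b=%lld\n",
                   move_account(&pc, &a, &b), a.pages, b.pages);
            return 0;
    }
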
@@ -1564,45 +1872,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1872 | if (!pcg) |
1565 | return -EINVAL; | 1873 | return -EINVAL; |
1566 | 1874 | ||
1875 | ret = -EBUSY; | ||
1876 | if (!get_page_unless_zero(page)) | ||
1877 | goto out; | ||
1878 | if (isolate_lru_page(page)) | ||
1879 | goto put; | ||
1567 | 1880 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1881 | parent = mem_cgroup_from_cont(pcg); |
1569 | 1882 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | |
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | ||
1572 | if (ret || !parent) | 1883 | if (ret || !parent) |
1573 | return ret; | 1884 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | 1885 | ||
1886 | ret = mem_cgroup_move_account(pc, child, parent, true); | ||
1582 | if (ret) | 1887 | if (ret) |
1583 | goto cancel; | 1888 | mem_cgroup_cancel_charge(parent); |
1584 | 1889 | put_back: | |
1585 | ret = mem_cgroup_move_account(pc, child, parent); | ||
1586 | |||
1587 | putback_lru_page(page); | 1890 | putback_lru_page(page); |
1588 | if (!ret) { | 1891 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1892 | put_page(page); |
1597 | uncharge: | 1893 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1894 | return ret; |
1607 | } | 1895 | } |
1608 | 1896 | ||
@@ -1627,7 +1915,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1627 | prefetchw(pc); | 1915 | prefetchw(pc); |
1628 | 1916 | ||
1629 | mem = memcg; | 1917 | mem = memcg; |
1630 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1918 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1631 | if (ret || !mem) | 1919 | if (ret || !mem) |
1632 | return ret; | 1920 | return ret; |
1633 | 1921 | ||
@@ -1720,7 +2008,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
1720 | /* | 2008 | /* |
1721 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 2009 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
1722 | * And when try_charge() successfully returns, one refcnt to memcg without | 2010 | * And when try_charge() successfully returns, one refcnt to memcg without |
1723 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | 2011 | * struct page_cgroup is acquired. This refcnt will be consumed by |
1724 | * "commit()" or removed by "cancel()" | 2012 | * "commit()" or removed by "cancel()" |
1725 | */ | 2013 | */ |
1726 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2014 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
@@ -1737,23 +2025,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1737 | goto charge_cur_mm; | 2025 | goto charge_cur_mm; |
1738 | /* | 2026 | /* |
1739 | * A racing thread's fault, or swapoff, may have already updated | 2027 | * A racing thread's fault, or swapoff, may have already updated |
1740 | * the pte, and even removed page from swap cache: return success | 2028 | * the pte, and even removed page from swap cache: in those cases |
1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 2029 | * do_swap_page()'s pte_same() test will fail; but there's also a |
2030 | * KSM case which does need to charge the page. | ||
1742 | */ | 2031 | */ |
1743 | if (!PageSwapCache(page)) | 2032 | if (!PageSwapCache(page)) |
1744 | return 0; | 2033 | goto charge_cur_mm; |
1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 2034 | mem = try_get_mem_cgroup_from_page(page); |
1746 | if (!mem) | 2035 | if (!mem) |
1747 | goto charge_cur_mm; | 2036 | goto charge_cur_mm; |
1748 | *ptr = mem; | 2037 | *ptr = mem; |
1749 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2038 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1750 | /* drop extra refcnt from tryget */ | 2039 | /* drop extra refcnt from tryget */ |
1751 | css_put(&mem->css); | 2040 | css_put(&mem->css); |
1752 | return ret; | 2041 | return ret; |
1753 | charge_cur_mm: | 2042 | charge_cur_mm: |
1754 | if (unlikely(!mm)) | 2043 | if (unlikely(!mm)) |
1755 | mm = &init_mm; | 2044 | mm = &init_mm; |
1756 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2045 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1757 | } | 2046 | } |
1758 | 2047 | ||
1759 | static void | 2048 | static void |
@@ -1818,14 +2107,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1818 | return; | 2107 | return; |
1819 | if (!mem) | 2108 | if (!mem) |
1820 | return; | 2109 | return; |
1821 | if (!mem_cgroup_is_root(mem)) { | 2110 | mem_cgroup_cancel_charge(mem); |
1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1823 | if (do_swap_account) | ||
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1826 | css_put(&mem->css); | ||
1827 | } | 2111 | } |
1828 | 2112 | ||
2113 | static void | ||
2114 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
2115 | { | ||
2116 | struct memcg_batch_info *batch = NULL; | ||
2117 | bool uncharge_memsw = true; | ||
2118 | /* If swapout, usage of swap doesn't decrease */ | ||
2119 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
2120 | uncharge_memsw = false; | ||
2121 | /* | ||
2122 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2123 | * In those cases, all pages freed continuously can be expected to be in | ||
2124 | * the same cgroup, and we have a chance to coalesce uncharges. | ||
2125 | * But we do uncharge one by one if this is killed by OOM (TIF_MEMDIE) | ||
2126 | * because we want to do uncharge as soon as possible. | ||
2127 | */ | ||
2128 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2129 | goto direct_uncharge; | ||
2130 | |||
2131 | batch = ¤t->memcg_batch; | ||
2132 | /* | ||
2133 | * Usually, we do css_get() when we remember a memcg pointer. | ||
2134 | * But in this case, we keep res->usage until the end of a series of | ||
2135 | * uncharges. Then, it's ok to ignore the memcg's refcnt. | ||
2136 | */ | ||
2137 | if (!batch->memcg) | ||
2138 | batch->memcg = mem; | ||
2139 | /* | ||
2140 | * In the typical case, batch->memcg == mem. This means we can | ||
2141 | * merge a series of uncharges into one uncharge of the res_counter. | ||
2142 | * If not, we uncharge the res_counter one by one. | ||
2143 | */ | ||
2144 | if (batch->memcg != mem) | ||
2145 | goto direct_uncharge; | ||
2146 | /* remember freed charge and uncharge it later */ | ||
2147 | batch->bytes += PAGE_SIZE; | ||
2148 | if (uncharge_memsw) | ||
2149 | batch->memsw_bytes += PAGE_SIZE; | ||
2150 | return; | ||
2151 | direct_uncharge: | ||
2152 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
2153 | if (uncharge_memsw) | ||
2154 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
2155 | return; | ||
2156 | } | ||
1829 | 2157 | ||
1830 | /* | 2158 | /* |
1831 | * uncharge if !page_mapped(page) | 2159 | * uncharge if !page_mapped(page) |
@@ -1874,12 +2202,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1874 | break; | 2202 | break; |
1875 | } | 2203 | } |
1876 | 2204 | ||
1877 | if (!mem_cgroup_is_root(mem)) { | 2205 | if (!mem_cgroup_is_root(mem)) |
1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2206 | __do_uncharge(mem, ctype); |
1879 | if (do_swap_account && | ||
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2207 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1884 | mem_cgroup_swap_statistics(mem, true); | 2208 | mem_cgroup_swap_statistics(mem, true); |
1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2209 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1895,8 +2219,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1895 | mz = page_cgroup_zoneinfo(pc); | 2219 | mz = page_cgroup_zoneinfo(pc); |
1896 | unlock_page_cgroup(pc); | 2220 | unlock_page_cgroup(pc); |
1897 | 2221 | ||
1898 | if (mem_cgroup_soft_limit_check(mem)) | 2222 | memcg_check_events(mem, page); |
1899 | mem_cgroup_update_tree(mem, page); | ||
1900 | /* at swapout, this memcg will be accessed to record to swap */ | 2223 | /* at swapout, this memcg will be accessed to record to swap */ |
1901 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2224 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1902 | css_put(&mem->css); | 2225 | css_put(&mem->css); |
@@ -1925,6 +2248,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2248 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1926 | } | 2249 | } |
1927 | 2250 | ||
2251 | /* | ||
2252 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. | ||
2253 | * In those cases, pages are freed continuously and we can expect the pages | ||
2254 | * to be in the same memcg. Each of these calls itself limits the number of | ||
2255 | * pages freed at once, so uncharge_start/end() is called properly. | ||
2256 | * This may be called more than once (typically twice) in one context. | ||
2257 | */ | ||
2258 | |||
2259 | void mem_cgroup_uncharge_start(void) | ||
2260 | { | ||
2261 | current->memcg_batch.do_batch++; | ||
2262 | /* We can nest these calls. */ | ||
2263 | if (current->memcg_batch.do_batch == 1) { | ||
2264 | current->memcg_batch.memcg = NULL; | ||
2265 | current->memcg_batch.bytes = 0; | ||
2266 | current->memcg_batch.memsw_bytes = 0; | ||
2267 | } | ||
2268 | } | ||
2269 | |||
2270 | void mem_cgroup_uncharge_end(void) | ||
2271 | { | ||
2272 | struct memcg_batch_info *batch = ¤t->memcg_batch; | ||
2273 | |||
2274 | if (!batch->do_batch) | ||
2275 | return; | ||
2276 | |||
2277 | batch->do_batch--; | ||
2278 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2279 | return; | ||
2280 | |||
2281 | if (!batch->memcg) | ||
2282 | return; | ||
2283 | /* | ||
2284 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2285 | * because we hide the charges behind us. | ||
2286 | */ | ||
2287 | if (batch->bytes) | ||
2288 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2289 | if (batch->memsw_bytes) | ||
2290 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2291 | /* forget this pointer (for sanity check) */ | ||
2292 | batch->memcg = NULL; | ||
2293 | } | ||
2294 | |||
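
A usage-shaped sketch of the batching API above (an illustrative user-space analogue, not code from the patch): callers bracket a burst of frees with start/end, nesting only bumps the depth, and the outermost end flushes the coalesced total with a single uncharge:

    #include <stdio.h>

    static struct {
            int do_batch;
            long long bytes;           /* charges hidden from the counter */
    } batch;

    static long long res_usage = 100 * 4096;

    static void uncharge_start(void)
    {
            if (++batch.do_batch == 1)
                    batch.bytes = 0;
    }

    static void uncharge_page(void)
    {
            if (batch.do_batch)
                    batch.bytes += 4096;   /* coalesce */
            else
                    res_usage -= 4096;     /* direct_uncharge path */
    }

    static void uncharge_end(void)
    {
            if (--batch.do_batch)
                    return;                /* stacked: defer to outermost end */
            res_usage -= batch.bytes;      /* one res_counter_uncharge() */
            batch.bytes = 0;
    }

    int main(void)
    {
            int i;

            uncharge_start();
            for (i = 0; i < 50; i++)
                    uncharge_page();       /* e.g. truncate freeing 50 pages */
            uncharge_end();
            printf("usage now %lld bytes\n", res_usage);  /* 50 * 4096 */
            return 0;
    }
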
1928 | #ifdef CONFIG_SWAP | 2295 | #ifdef CONFIG_SWAP |
1929 | /* | 2296 | /* |
1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2297 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -1979,6 +2346,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1979 | } | 2346 | } |
1980 | rcu_read_unlock(); | 2347 | rcu_read_unlock(); |
1981 | } | 2348 | } |
2349 | |||
2350 | /** | ||
2351 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2352 | * @entry: swap entry to be moved | ||
2353 | * @from: mem_cgroup which the entry is moved from | ||
2354 | * @to: mem_cgroup which the entry is moved to | ||
2355 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2356 | * | ||
2357 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2358 | * as the mem_cgroup's id of @from. | ||
2359 | * | ||
2360 | * Returns 0 on success, -EINVAL on failure. | ||
2361 | * | ||
2362 | * The caller must have charged to @to, IOW, called res_counter_charge() about | ||
2363 | * both res and memsw, and called css_get(). | ||
2364 | */ | ||
2365 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2366 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2367 | { | ||
2368 | unsigned short old_id, new_id; | ||
2369 | |||
2370 | old_id = css_id(&from->css); | ||
2371 | new_id = css_id(&to->css); | ||
2372 | |||
2373 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2374 | mem_cgroup_swap_statistics(from, false); | ||
2375 | mem_cgroup_swap_statistics(to, true); | ||
2376 | /* | ||
2377 | * This function is only called from task migration context now. | ||
2378 | * It postpones res_counter and refcount handling till the end | ||
2379 | * of task migration (mem_cgroup_clear_mc()) for performance | ||
2380 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2381 | * because if the process that has been moved to @to does | ||
2382 | * swap-in, the refcount of @to might be decreased to 0. | ||
2383 | */ | ||
2384 | mem_cgroup_get(to); | ||
2385 | if (need_fixup) { | ||
2386 | if (!mem_cgroup_is_root(from)) | ||
2387 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2388 | mem_cgroup_put(from); | ||
2389 | /* | ||
2390 | * we charged both to->res and to->memsw, so we should | ||
2391 | * uncharge to->res. | ||
2392 | */ | ||
2393 | if (!mem_cgroup_is_root(to)) | ||
2394 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2395 | css_put(&to->css); | ||
2396 | } | ||
2397 | return 0; | ||
2398 | } | ||
2399 | return -EINVAL; | ||
2400 | } | ||
2401 | #else | ||
2402 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2403 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2404 | { | ||
2405 | return -EINVAL; | ||
2406 | } | ||
1982 | #endif | 2407 | #endif |
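
The heart of the move is the compare-and-exchange on the swap_cgroup record, which stores the css id of the owning memcg: the transfer succeeds only if the record still names @from, so a racing swap-in or swapoff loses cleanly. Schematically, in a single-threaded user-space analogue (the kernel's swap_cgroup_cmpxchg() is atomic; this sketch is not):

    #include <stdio.h>

    static unsigned short swap_record;   /* stand-in for one swap_cgroup entry */

    /* Store new_id only if the entry still holds old_id; return what was there. */
    static unsigned short record_cmpxchg(unsigned short old_id,
                                         unsigned short new_id)
    {
            unsigned short cur = swap_record;

            if (cur == old_id)
                    swap_record = new_id;
            return cur;
    }

    int main(void)
    {
            swap_record = 3;                        /* entry owned by css id 3 */
            if (record_cmpxchg(3, 5) == 3)
                    printf("moved: record now %u\n", swap_record);
            if (record_cmpxchg(3, 7) != 3)
                    printf("lost the race: record stays %u\n", swap_record);
            return 0;
    }
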
1983 | 2408 | ||
1984 | /* | 2409 | /* |
@@ -2002,12 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2002 | } | 2427 | } |
2003 | unlock_page_cgroup(pc); | 2428 | unlock_page_cgroup(pc); |
2004 | 2429 | ||
2430 | *ptr = mem; | ||
2005 | if (mem) { | 2431 | if (mem) { |
2006 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2007 | page); | ||
2008 | css_put(&mem->css); | 2433 | css_put(&mem->css); |
2009 | } | 2434 | } |
2010 | *ptr = mem; | ||
2011 | return ret; | 2435 | return ret; |
2012 | } | 2436 | } |
2013 | 2437 | ||
@@ -2100,7 +2524,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2100 | unsigned long long val) | 2524 | unsigned long long val) |
2101 | { | 2525 | { |
2102 | int retry_count; | 2526 | int retry_count; |
2103 | int progress; | ||
2104 | u64 memswlimit; | 2527 | u64 memswlimit; |
2105 | int ret = 0; | 2528 | int ret = 0; |
2106 | int children = mem_cgroup_count_children(memcg); | 2529 | int children = mem_cgroup_count_children(memcg); |
@@ -2144,8 +2567,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2144 | if (!ret) | 2567 | if (!ret) |
2145 | break; | 2568 | break; |
2146 | 2569 | ||
2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2570 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2148 | GFP_KERNEL, | ||
2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2571 | MEM_CGROUP_RECLAIM_SHRINK); |
2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2572 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2151 | /* Usage is reduced ? */ | 2573 | /* Usage is reduced ? */ |
@@ -2334,7 +2756,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
2334 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2756 | pc = list_entry(list->prev, struct page_cgroup, lru); |
2335 | if (busy == pc) { | 2757 | if (busy == pc) { |
2336 | list_move(&pc->lru, list); | 2758 | list_move(&pc->lru, list); |
2337 | busy = 0; | 2759 | busy = NULL; |
2338 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2760 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2339 | continue; | 2761 | continue; |
2340 | } | 2762 | } |
@@ -2375,7 +2797,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | |||
2375 | if (free_all) | 2797 | if (free_all) |
2376 | goto try_to_free; | 2798 | goto try_to_free; |
2377 | move_account: | 2799 | move_account: |
2378 | while (mem->res.usage > 0) { | 2800 | do { |
2379 | ret = -EBUSY; | 2801 | ret = -EBUSY; |
2380 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 2802 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
2381 | goto out; | 2803 | goto out; |
@@ -2384,6 +2806,7 @@ move_account: | |||
2384 | goto out; | 2806 | goto out; |
2385 | /* This is for making all *used* pages to be on LRU. */ | 2807 | /* This is for making all *used* pages to be on LRU. */ |
2386 | lru_add_drain_all(); | 2808 | lru_add_drain_all(); |
2809 | drain_all_stock_sync(); | ||
2387 | ret = 0; | 2810 | ret = 0; |
2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2811 | for_each_node_state(node, N_HIGH_MEMORY) { |
2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2812 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2402,8 +2825,8 @@ move_account: | |||
2402 | if (ret == -ENOMEM) | 2825 | if (ret == -ENOMEM) |
2403 | goto try_to_free; | 2826 | goto try_to_free; |
2404 | cond_resched(); | 2827 | cond_resched(); |
2405 | } | 2828 | /* "ret" should also be checked to ensure all lists are empty. */ |
2406 | ret = 0; | 2829 | } while (mem->res.usage > 0 || ret); |
2407 | out: | 2830 | out: |
2408 | css_put(&mem->css); | 2831 | css_put(&mem->css); |
2409 | return ret; | 2832 | return ret; |
@@ -2436,10 +2859,7 @@ try_to_free: | |||
2436 | } | 2859 | } |
2437 | lru_add_drain(); | 2860 | lru_add_drain(); |
2438 | /* try move_account...there may be some *locked* pages. */ | 2861 | /* try move_account...there may be some *locked* pages. */ |
2439 | if (mem->res.usage) | 2862 | goto move_account; |
2440 | goto move_account; | ||
2441 | ret = 0; | ||
2442 | goto out; | ||
2443 | } | 2863 | } |
2444 | 2864 | ||
2445 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 2865 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
@@ -2466,7 +2886,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2466 | 2886 | ||
2467 | cgroup_lock(); | 2887 | cgroup_lock(); |
2468 | /* | 2888 | /* |
2469 | * If parent's use_hiearchy is set, we can't make any modifications | 2889 | * If parent's use_hierarchy is set, we can't make any modifications |
2470 | * in the child subtrees. If it is unset, then the change can | 2890 | * in the child subtrees. If it is unset, then the change can |
2471 | * occur, provided the current cgroup has no children. | 2891 | * occur, provided the current cgroup has no children. |
2472 | * | 2892 | * |
@@ -2495,7 +2915,7 @@ static int | |||
2495 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2915 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2496 | { | 2916 | { |
2497 | struct mem_cgroup_idx_data *d = data; | 2917 | struct mem_cgroup_idx_data *d = data; |
2498 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2918 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2499 | return 0; | 2919 | return 0; |
2500 | } | 2920 | } |
2501 | 2921 | ||
@@ -2510,39 +2930,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2510 | *val = d.val; | 2930 | *val = d.val; |
2511 | } | 2931 | } |
2512 | 2932 | ||
2933 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2934 | { | ||
2935 | u64 idx_val, val; | ||
2936 | |||
2937 | if (!mem_cgroup_is_root(mem)) { | ||
2938 | if (!swap) | ||
2939 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2940 | else | ||
2941 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2942 | } | ||
2943 | |||
2944 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2945 | val = idx_val; | ||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2947 | val += idx_val; | ||
2948 | |||
2949 | if (swap) { | ||
2950 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2951 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2952 | val += idx_val; | ||
2953 | } | ||
2954 | |||
2955 | return val << PAGE_SHIFT; | ||
2956 | } | ||
2957 | |||
2513 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2958 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2514 | { | 2959 | { |
2515 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2960 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2516 | u64 idx_val, val; | 2961 | u64 val; |
2517 | int type, name; | 2962 | int type, name; |
2518 | 2963 | ||
2519 | type = MEMFILE_TYPE(cft->private); | 2964 | type = MEMFILE_TYPE(cft->private); |
2520 | name = MEMFILE_ATTR(cft->private); | 2965 | name = MEMFILE_ATTR(cft->private); |
2521 | switch (type) { | 2966 | switch (type) { |
2522 | case _MEM: | 2967 | case _MEM: |
2523 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2968 | if (name == RES_USAGE) |
2524 | mem_cgroup_get_recursive_idx_stat(mem, | 2969 | val = mem_cgroup_usage(mem, false); |
2525 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2970 | else |
2526 | val = idx_val; | ||
2527 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2528 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2529 | val += idx_val; | ||
2530 | val <<= PAGE_SHIFT; | ||
2531 | } else | ||
2532 | val = res_counter_read_u64(&mem->res, name); | 2971 | val = res_counter_read_u64(&mem->res, name); |
2533 | break; | 2972 | break; |
2534 | case _MEMSWAP: | 2973 | case _MEMSWAP: |
2535 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2974 | if (name == RES_USAGE) |
2536 | mem_cgroup_get_recursive_idx_stat(mem, | 2975 | val = mem_cgroup_usage(mem, true); |
2537 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2976 | else |
2538 | val = idx_val; | ||
2539 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2540 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2541 | val += idx_val; | ||
2542 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2544 | val <<= PAGE_SHIFT; | ||
2545 | } else | ||
2546 | val = res_counter_read_u64(&mem->memsw, name); | 2977 | val = res_counter_read_u64(&mem->memsw, name); |
2547 | break; | 2978 | break; |
2548 | default: | 2979 | default: |
@@ -2655,12 +3086,45 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2655 | return 0; | 3086 | return 0; |
2656 | } | 3087 | } |
2657 | 3088 | ||
3089 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3090 | struct cftype *cft) | ||
3091 | { | ||
3092 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3093 | } | ||
3094 | |||
3095 | #ifdef CONFIG_MMU | ||
3096 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3097 | struct cftype *cft, u64 val) | ||
3098 | { | ||
3099 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3100 | |||
3101 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3102 | return -EINVAL; | ||
3103 | /* | ||
3104 | * We check this value several times in both in can_attach() and | ||
3105 | * attach(), so we need cgroup lock to prevent this value from being | ||
3106 | * inconsistent. | ||
3107 | */ | ||
3108 | cgroup_lock(); | ||
3109 | mem->move_charge_at_immigrate = val; | ||
3110 | cgroup_unlock(); | ||
3111 | |||
3112 | return 0; | ||
3113 | } | ||
3114 | #else | ||
3115 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3116 | struct cftype *cft, u64 val) | ||
3117 | { | ||
3118 | return -ENOSYS; | ||
3119 | } | ||
3120 | #endif | ||
3121 | |||
2658 | 3122 | ||
2659 | /* For read statistics */ | 3123 | /* For read statistics */ |
2660 | enum { | 3124 | enum { |
2661 | MCS_CACHE, | 3125 | MCS_CACHE, |
2662 | MCS_RSS, | 3126 | MCS_RSS, |
2663 | MCS_MAPPED_FILE, | 3127 | MCS_FILE_MAPPED, |
2664 | MCS_PGPGIN, | 3128 | MCS_PGPGIN, |
2665 | MCS_PGPGOUT, | 3129 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | 3130 | MCS_SWAP, |
@@ -2700,18 +3164,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2700 | s64 val; | 3164 | s64 val; |
2701 | 3165 | ||
2702 | /* per cpu stat */ | 3166 | /* per cpu stat */ |
2703 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3167 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3168 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3170 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 3172 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2710 | s->stat[MCS_PGPGIN] += val; | 3174 | s->stat[MCS_PGPGIN] += val; |
2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2712 | s->stat[MCS_PGPGOUT] += val; | 3176 | s->stat[MCS_PGPGOUT] += val; |
2713 | if (do_swap_account) { | 3177 | if (do_swap_account) { |
2714 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3178 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2715 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3179 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2716 | } | 3180 | } |
2717 | 3181 | ||
@@ -2839,12 +3303,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
2839 | return 0; | 3303 | return 0; |
2840 | } | 3304 | } |
2841 | 3305 | ||
3306 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3307 | { | ||
3308 | struct mem_cgroup_threshold_ary *t; | ||
3309 | u64 usage; | ||
3310 | int i; | ||
3311 | |||
3312 | rcu_read_lock(); | ||
3313 | if (!swap) | ||
3314 | t = rcu_dereference(memcg->thresholds); | ||
3315 | else | ||
3316 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3317 | |||
3318 | if (!t) | ||
3319 | goto unlock; | ||
3320 | |||
3321 | usage = mem_cgroup_usage(memcg, swap); | ||
3322 | |||
3323 | /* | ||
3324 | * current_threshold points to the threshold just below usage. | ||
3325 | * If that is no longer true, a threshold was crossed after the last | ||
3326 | * call of __mem_cgroup_threshold(). | ||
3327 | */ | ||
3328 | i = atomic_read(&t->current_threshold); | ||
3329 | |||
3330 | /* | ||
3331 | * Iterate backward over array of thresholds starting from | ||
3332 | * current_threshold and check if a threshold is crossed. | ||
3333 | * If none of the thresholds below usage is crossed, we read | ||
3334 | * only one element of the array here. | ||
3335 | */ | ||
3336 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3337 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3338 | |||
3339 | /* i = current_threshold + 1 */ | ||
3340 | i++; | ||
3341 | |||
3342 | /* | ||
3343 | * Iterate forward over array of thresholds starting from | ||
3344 | * current_threshold+1 and check if a threshold is crossed. | ||
3345 | * If none of thresholds above usage is crossed, we read | ||
3346 | * only one element of the array here. | ||
3347 | */ | ||
3348 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3349 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3350 | |||
3351 | /* Update current_threshold */ | ||
3352 | atomic_set(&t->current_threshold, i - 1); | ||
3353 | unlock: | ||
3354 | rcu_read_unlock(); | ||
3355 | } | ||
3356 | |||
3357 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3358 | { | ||
3359 | __mem_cgroup_threshold(memcg, false); | ||
3360 | if (do_swap_account) | ||
3361 | __mem_cgroup_threshold(memcg, true); | ||
3362 | } | ||
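
A compact userspace rendering of the two scans may help: the thresholds live in a sorted array, current_threshold indexes the threshold just below the last observed usage, and each call signals every threshold crossed in either direction since then. In this sketch eventfd_signal() is replaced by printf and all names are illustrative:

#include <stdio.h>

static unsigned long long thr[] = { 10, 20, 30, 40 };
static int nthr = 4;
static int cur = -1;            /* index of threshold just below usage */

/* Signal every threshold crossed since the last call, then re-anchor cur;
 * mirrors the backward and forward scans in __mem_cgroup_threshold(). */
static void check(unsigned long long usage)
{
        int i = cur;

        for (; i >= 0 && thr[i] > usage; i--)        /* usage dropped */
                printf("crossed down through %llu\n", thr[i]);
        for (i++; i < nthr && thr[i] <= usage; i++)  /* usage grew */
                printf("crossed up through %llu\n", thr[i]);
        cur = i - 1;
}

int main(void)
{
        check(25);      /* crosses 10 and 20 upward */
        check(5);       /* crosses 20 and 10 downward */
        return 0;
}
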
3363 | |||
3364 | static int compare_thresholds(const void *a, const void *b) | ||
3365 | { | ||
3366 | const struct mem_cgroup_threshold *_a = a; | ||
3367 | const struct mem_cgroup_threshold *_b = b; | ||
3368 | |||
3369 | return _a->threshold - _b->threshold; | ||
3370 | } | ||
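
One caveat: the thresholds are u64 values, so returning their difference as an int can truncate or wrap for spans above INT_MAX and misorder the sort. A defensive comparator (a sketch of the obvious hardening, not something this patch carries) compares explicitly:

/* Safer variant: avoids narrowing the u64 difference into an int. */
static int compare_thresholds_safe(const void *a, const void *b)
{
        const struct mem_cgroup_threshold *_a = a;
        const struct mem_cgroup_threshold *_b = b;

        if (_a->threshold > _b->threshold)
                return 1;
        if (_a->threshold < _b->threshold)
                return -1;
        return 0;
}
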
3371 | |||
3372 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3373 | struct eventfd_ctx *eventfd, const char *args) | ||
3374 | { | ||
3375 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3376 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3377 | int type = MEMFILE_TYPE(cft->private); | ||
3378 | u64 threshold, usage; | ||
3379 | int size; | ||
3380 | int i, ret; | ||
3381 | |||
3382 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3383 | if (ret) | ||
3384 | return ret; | ||
3385 | |||
3386 | mutex_lock(&memcg->thresholds_lock); | ||
3387 | if (type == _MEM) | ||
3388 | thresholds = memcg->thresholds; | ||
3389 | else if (type == _MEMSWAP) | ||
3390 | thresholds = memcg->memsw_thresholds; | ||
3391 | else | ||
3392 | BUG(); | ||
3393 | |||
3394 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3395 | |||
3396 | /* Check if a threshold crossed before adding a new one */ | ||
3397 | if (thresholds) | ||
3398 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3399 | |||
3400 | if (thresholds) | ||
3401 | size = thresholds->size + 1; | ||
3402 | else | ||
3403 | size = 1; | ||
3404 | |||
3405 | /* Allocate memory for new array of thresholds */ | ||
3406 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3407 | size * sizeof(struct mem_cgroup_threshold), | ||
3408 | GFP_KERNEL); | ||
3409 | if (!thresholds_new) { | ||
3410 | ret = -ENOMEM; | ||
3411 | goto unlock; | ||
3412 | } | ||
3413 | thresholds_new->size = size; | ||
3414 | |||
3415 | /* Copy thresholds (if any) to new array */ | ||
3416 | if (thresholds) | ||
3417 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3418 | thresholds->size * | ||
3419 | sizeof(struct mem_cgroup_threshold)); | ||
3420 | /* Add new threshold */ | ||
3421 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3422 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3423 | |||
3424 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | ||
3425 | sort(thresholds_new->entries, size, | ||
3426 | sizeof(struct mem_cgroup_threshold), | ||
3427 | compare_thresholds, NULL); | ||
3428 | |||
3429 | /* Find current threshold */ | ||
3430 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3431 | for (i = 0; i < size; i++) { | ||
3432 | if (thresholds_new->entries[i].threshold < usage) { | ||
3433 | /* | ||
3434 | * thresholds_new->current_threshold will not be used | ||
3435 | * until rcu_assign_pointer(), so it's safe to increment | ||
3436 | * it here. | ||
3437 | */ | ||
3438 | atomic_inc(&thresholds_new->current_threshold); | ||
3439 | } | ||
3440 | } | ||
3441 | |||
3442 | if (type == _MEM) | ||
3443 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3444 | else | ||
3445 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3446 | |||
3447 | /* To be sure that nobody uses the old thresholds array before freeing it */ | ||
3448 | synchronize_rcu(); | ||
3449 | |||
3450 | kfree(thresholds); | ||
3451 | unlock: | ||
3452 | mutex_unlock(&memcg->thresholds_lock); | ||
3453 | |||
3454 | return ret; | ||
3455 | } | ||
3456 | |||
3457 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3458 | struct eventfd_ctx *eventfd) | ||
3459 | { | ||
3460 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3461 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3462 | int type = MEMFILE_TYPE(cft->private); | ||
3463 | u64 usage; | ||
3464 | int size = 0; | ||
3465 | int i, j, ret; | ||
3466 | |||
3467 | mutex_lock(&memcg->thresholds_lock); | ||
3468 | if (type == _MEM) | ||
3469 | thresholds = memcg->thresholds; | ||
3470 | else if (type == _MEMSWAP) | ||
3471 | thresholds = memcg->memsw_thresholds; | ||
3472 | else | ||
3473 | BUG(); | ||
3474 | |||
3475 | /* | ||
3476 | * Something went wrong if we are trying to unregister a threshold | ||
3477 | * when we don't have any thresholds | ||
3478 | */ | ||
3479 | BUG_ON(!thresholds); | ||
3480 | |||
3481 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3482 | |||
3483 | /* Check if a threshold crossed before removing */ | ||
3484 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3485 | |||
3486 | /* Calculate the new number of thresholds */ | ||
3487 | for (i = 0; i < thresholds->size; i++) { | ||
3488 | if (thresholds->entries[i].eventfd != eventfd) | ||
3489 | size++; | ||
3490 | } | ||
3491 | |||
3492 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3493 | if (!size) { | ||
3494 | thresholds_new = NULL; | ||
3495 | goto assign; | ||
3496 | } | ||
3497 | |||
3498 | /* Allocate memory for new array of thresholds */ | ||
3499 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3500 | size * sizeof(struct mem_cgroup_threshold), | ||
3501 | GFP_KERNEL); | ||
3502 | if (!thresholds_new) { | ||
3503 | ret = -ENOMEM; | ||
3504 | goto unlock; | ||
3505 | } | ||
3506 | thresholds_new->size = size; | ||
3507 | |||
3508 | /* Copy thresholds and find current threshold */ | ||
3509 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3510 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3511 | if (thresholds->entries[i].eventfd == eventfd) | ||
3512 | continue; | ||
3513 | |||
3514 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3515 | if (thresholds_new->entries[j].threshold < usage) { | ||
3516 | /* | ||
3517 | * thresholds_new->current_threshold will not be used | ||
3518 | * until rcu_assign_pointer(), so it's safe to increment | ||
3519 | * it here. | ||
3520 | */ | ||
3521 | atomic_inc(&thresholds_new->current_threshold); | ||
3522 | } | ||
3523 | j++; | ||
3524 | } | ||
3525 | |||
3526 | assign: | ||
3527 | if (type == _MEM) | ||
3528 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3529 | else | ||
3530 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3531 | |||
3532 | /* To be sure that nobody uses the old thresholds array before freeing it */ | ||
3533 | synchronize_rcu(); | ||
3534 | |||
3535 | kfree(thresholds); | ||
3536 | unlock: | ||
3537 | mutex_unlock(&memcg->thresholds_lock); | ||
3538 | |||
3539 | return ret; | ||
3540 | } | ||
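
Together, register/unregister back the cgroup v1 eventfd notification interface: userspace opens an eventfd, opens the usage file, and writes "<event_fd> <usage_fd> <threshold>" to cgroup.event_control. A minimal consumer, assuming the memory controller is mounted at /cgroup/memory and a group named "grp" (the mount point and group name are assumptions):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        uint64_t ticks;
        char buf[64];
        int efd = eventfd(0, 0);
        int ufd = open("/cgroup/memory/grp/memory.usage_in_bytes", O_RDONLY);
        int cfd = open("/cgroup/memory/grp/cgroup.event_control", O_WRONLY);

        if (efd < 0 || ufd < 0 || cfd < 0) {
                perror("setup");
                return 1;
        }
        /* Register: "<event_fd> <usage_fd> <threshold in bytes>" */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd,
                 (unsigned long long)(64 << 20));
        if (write(cfd, buf, strlen(buf)) < 0) {
                perror("register threshold");
                return 1;
        }
        /* Blocks until usage crosses 64MB in either direction. */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}
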
2842 | 3541 | ||
2843 | static struct cftype mem_cgroup_files[] = { | 3542 | static struct cftype mem_cgroup_files[] = { |
2844 | { | 3543 | { |
2845 | .name = "usage_in_bytes", | 3544 | .name = "usage_in_bytes", |
2846 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3545 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
2847 | .read_u64 = mem_cgroup_read, | 3546 | .read_u64 = mem_cgroup_read, |
3547 | .register_event = mem_cgroup_register_event, | ||
3548 | .unregister_event = mem_cgroup_unregister_event, | ||
2848 | }, | 3549 | }, |
2849 | { | 3550 | { |
2850 | .name = "max_usage_in_bytes", | 3551 | .name = "max_usage_in_bytes", |
@@ -2888,6 +3589,11 @@ static struct cftype mem_cgroup_files[] = { | |||
2888 | .read_u64 = mem_cgroup_swappiness_read, | 3589 | .read_u64 = mem_cgroup_swappiness_read, |
2889 | .write_u64 = mem_cgroup_swappiness_write, | 3590 | .write_u64 = mem_cgroup_swappiness_write, |
2890 | }, | 3591 | }, |
3592 | { | ||
3593 | .name = "move_charge_at_immigrate", | ||
3594 | .read_u64 = mem_cgroup_move_charge_read, | ||
3595 | .write_u64 = mem_cgroup_move_charge_write, | ||
3596 | }, | ||
2891 | }; | 3597 | }; |
2892 | 3598 | ||
2893 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3599 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -2896,6 +3602,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
2896 | .name = "memsw.usage_in_bytes", | 3602 | .name = "memsw.usage_in_bytes", |
2897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3603 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
2898 | .read_u64 = mem_cgroup_read, | 3604 | .read_u64 = mem_cgroup_read, |
3605 | .register_event = mem_cgroup_register_event, | ||
3606 | .unregister_event = mem_cgroup_unregister_event, | ||
2899 | }, | 3607 | }, |
2900 | { | 3608 | { |
2901 | .name = "memsw.max_usage_in_bytes", | 3609 | .name = "memsw.max_usage_in_bytes", |
@@ -2970,24 +3678,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2970 | kfree(mem->info.nodeinfo[node]); | 3678 | kfree(mem->info.nodeinfo[node]); |
2971 | } | 3679 | } |
2972 | 3680 | ||
2973 | static int mem_cgroup_size(void) | ||
2974 | { | ||
2975 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
2976 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
2977 | } | ||
2978 | |||
2979 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3681 | static struct mem_cgroup *mem_cgroup_alloc(void) |
2980 | { | 3682 | { |
2981 | struct mem_cgroup *mem; | 3683 | struct mem_cgroup *mem; |
2982 | int size = mem_cgroup_size(); | 3684 | int size = sizeof(struct mem_cgroup); |
2983 | 3685 | ||
3686 | /* Can be very big if MAX_NUMNODES is very big */ | ||
2984 | if (size < PAGE_SIZE) | 3687 | if (size < PAGE_SIZE) |
2985 | mem = kmalloc(size, GFP_KERNEL); | 3688 | mem = kmalloc(size, GFP_KERNEL); |
2986 | else | 3689 | else |
2987 | mem = vmalloc(size); | 3690 | mem = vmalloc(size); |
2988 | 3691 | ||
2989 | if (mem) | 3692 | if (!mem) |
2990 | memset(mem, 0, size); | 3693 | return NULL; |
3694 | |||
3695 | memset(mem, 0, size); | ||
3696 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3697 | if (!mem->stat) { | ||
3698 | if (size < PAGE_SIZE) | ||
3699 | kfree(mem); | ||
3700 | else | ||
3701 | vfree(mem); | ||
3702 | mem = NULL; | ||
3703 | } | ||
2991 | return mem; | 3704 | return mem; |
2992 | } | 3705 | } |
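
mem_cgroup_alloc() keeps the old size-dependent kmalloc-vs-vmalloc split for the struct itself and moves the statistics to alloc_percpu(), so every free must take the same branch as the matching allocation. The pairing rule can be factored as a small helper pair (kernel-style sketch; the helper names are illustrative):

/* Illustrative pairing: allocations below a page go through the slab
 * allocator, larger ones through vmalloc, and the free must match. */
static void *memcg_big_alloc(size_t size)
{
        return size < PAGE_SIZE ? kmalloc(size, GFP_KERNEL) : vmalloc(size);
}

static void memcg_big_free(void *p, size_t size)
{
        if (size < PAGE_SIZE)
                kfree(p);
        else
                vfree(p);
}
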
2993 | 3706 | ||
@@ -3012,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3012 | for_each_node_state(node, N_POSSIBLE) | 3725 | for_each_node_state(node, N_POSSIBLE) |
3013 | free_mem_cgroup_per_zone_info(mem, node); | 3726 | free_mem_cgroup_per_zone_info(mem, node); |
3014 | 3727 | ||
3015 | if (mem_cgroup_size() < PAGE_SIZE) | 3728 | free_percpu(mem->stat); |
3729 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3016 | kfree(mem); | 3730 | kfree(mem); |
3017 | else | 3731 | else |
3018 | vfree(mem); | 3732 | vfree(mem); |
@@ -3023,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3023 | atomic_inc(&mem->refcnt); | 3737 | atomic_inc(&mem->refcnt); |
3024 | } | 3738 | } |
3025 | 3739 | ||
3026 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3740 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3027 | { | 3741 | { |
3028 | if (atomic_dec_and_test(&mem->refcnt)) { | 3742 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3029 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3743 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3030 | __mem_cgroup_free(mem); | 3744 | __mem_cgroup_free(mem); |
3031 | if (parent) | 3745 | if (parent) |
@@ -3033,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3033 | } | 3747 | } |
3034 | } | 3748 | } |
3035 | 3749 | ||
3750 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3751 | { | ||
3752 | __mem_cgroup_put(mem, 1); | ||
3753 | } | ||
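
__mem_cgroup_put() generalizes the usual put to drop count references at once; mem_cgroup_clear_mc() below uses it to return all moved-swap references in one step rather than looping. The semantics in portable C11, as a sketch:

#include <stdatomic.h>
#include <stdbool.h>

/* True exactly when this call dropped the last reference; mirrors
 * atomic_sub_and_test(count, &mem->refcnt) in __mem_cgroup_put(). */
static bool put_many(atomic_int *refcnt, int count)
{
        return atomic_fetch_sub(refcnt, count) == count;
}
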
3754 | |||
3036 | /* | 3755 | /* |
3037 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3756 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3038 | */ | 3757 | */ |
@@ -3097,12 +3816,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3097 | 3816 | ||
3098 | /* root ? */ | 3817 | /* root ? */ |
3099 | if (cont->parent == NULL) { | 3818 | if (cont->parent == NULL) { |
3819 | int cpu; | ||
3100 | enable_swap_cgroup(); | 3820 | enable_swap_cgroup(); |
3101 | parent = NULL; | 3821 | parent = NULL; |
3102 | root_mem_cgroup = mem; | 3822 | root_mem_cgroup = mem; |
3103 | if (mem_cgroup_soft_limit_tree_init()) | 3823 | if (mem_cgroup_soft_limit_tree_init()) |
3104 | goto free_out; | 3824 | goto free_out; |
3105 | 3825 | for_each_possible_cpu(cpu) { | |
3826 | struct memcg_stock_pcp *stock = | ||
3827 | &per_cpu(memcg_stock, cpu); | ||
3828 | INIT_WORK(&stock->work, drain_local_stock); | ||
3829 | } | ||
3830 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3106 | } else { | 3831 | } else { |
3107 | parent = mem_cgroup_from_cont(cont->parent); | 3832 | parent = mem_cgroup_from_cont(cont->parent); |
3108 | mem->use_hierarchy = parent->use_hierarchy; | 3833 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3128,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3128 | if (parent) | 3853 | if (parent) |
3129 | mem->swappiness = get_swappiness(parent); | 3854 | mem->swappiness = get_swappiness(parent); |
3130 | atomic_set(&mem->refcnt, 1); | 3855 | atomic_set(&mem->refcnt, 1); |
3856 | mem->move_charge_at_immigrate = 0; | ||
3857 | mutex_init(&mem->thresholds_lock); | ||
3131 | return &mem->css; | 3858 | return &mem->css; |
3132 | free_out: | 3859 | free_out: |
3133 | __mem_cgroup_free(mem); | 3860 | __mem_cgroup_free(mem); |
@@ -3164,19 +3891,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3164 | return ret; | 3891 | return ret; |
3165 | } | 3892 | } |
3166 | 3893 | ||
3894 | #ifdef CONFIG_MMU | ||
3895 | /* Handlers for move charge at task migration. */ | ||
3896 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3897 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3898 | { | ||
3899 | int ret = 0; | ||
3900 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3901 | struct mem_cgroup *mem = mc.to; | ||
3902 | |||
3903 | if (mem_cgroup_is_root(mem)) { | ||
3904 | mc.precharge += count; | ||
3905 | /* we don't need css_get for root */ | ||
3906 | return ret; | ||
3907 | } | ||
3908 | /* try to charge at once */ | ||
3909 | if (count > 1) { | ||
3910 | struct res_counter *dummy; | ||
3911 | /* | ||
3912 | * "mem" cannot be under rmdir() because we've already checked | ||
3913 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3914 | * are still under the same cgroup_mutex. So we can postpone | ||
3915 | * css_get(). | ||
3916 | */ | ||
3917 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3918 | goto one_by_one; | ||
3919 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3920 | PAGE_SIZE * count, &dummy)) { | ||
3921 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3922 | goto one_by_one; | ||
3923 | } | ||
3924 | mc.precharge += count; | ||
3925 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3926 | WARN_ON_ONCE(count > INT_MAX); | ||
3927 | __css_get(&mem->css, (int)count); | ||
3928 | return ret; | ||
3929 | } | ||
3930 | one_by_one: | ||
3931 | /* fall back to one by one charge */ | ||
3932 | while (count--) { | ||
3933 | if (signal_pending(current)) { | ||
3934 | ret = -EINTR; | ||
3935 | break; | ||
3936 | } | ||
3937 | if (!batch_count--) { | ||
3938 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3939 | cond_resched(); | ||
3940 | } | ||
3941 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3942 | if (ret || !mem) | ||
3943 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3944 | return -ENOMEM; | ||
3945 | mc.precharge++; | ||
3946 | } | ||
3947 | return ret; | ||
3948 | } | ||
3949 | |||
3950 | /** | ||
3951 | * is_target_pte_for_mc - check whether a pte is valid for move charge | ||
3952 | * @vma: the vma to which the pte to be checked belongs | ||
3953 | * @addr: the address corresponding to the pte to be checked | ||
3954 | * @ptent: the pte to be checked | ||
3955 | * @target: the pointer where the target page or swap entry is stored (can be NULL) | ||
3956 | * | ||
3957 | * Returns | ||
3958 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3959 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3960 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3961 | * with an extra refcount taken (callers should handle it). | ||
3962 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3963 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3964 | * in target->ent. | ||
3965 | * | ||
3966 | * Called with pte lock held. | ||
3967 | */ | ||
3968 | union mc_target { | ||
3969 | struct page *page; | ||
3970 | swp_entry_t ent; | ||
3971 | }; | ||
3972 | |||
3973 | enum mc_target_type { | ||
3974 | MC_TARGET_NONE, /* not used */ | ||
3975 | MC_TARGET_PAGE, | ||
3976 | MC_TARGET_SWAP, | ||
3977 | }; | ||
3978 | |||
3979 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
3980 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
3981 | { | ||
3982 | struct page *page = NULL; | ||
3983 | struct page_cgroup *pc; | ||
3984 | int ret = 0; | ||
3985 | swp_entry_t ent = { .val = 0 }; | ||
3986 | int usage_count = 0; | ||
3987 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3988 | &mc.to->move_charge_at_immigrate); | ||
3989 | |||
3990 | if (!pte_present(ptent)) { | ||
3991 | /* TODO: handle swap of shmem/tmpfs */ | ||
3992 | if (pte_none(ptent) || pte_file(ptent)) | ||
3993 | return 0; | ||
3994 | else if (is_swap_pte(ptent)) { | ||
3995 | ent = pte_to_swp_entry(ptent); | ||
3996 | if (!move_anon || non_swap_entry(ent)) | ||
3997 | return 0; | ||
3998 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
3999 | } | ||
4000 | } else { | ||
4001 | page = vm_normal_page(vma, addr, ptent); | ||
4002 | if (!page || !page_mapped(page)) | ||
4003 | return 0; | ||
4004 | /* | ||
4005 | * TODO: We don't move charges of file (including shmem/tmpfs) | ||
4006 | * pages for now. | ||
4007 | */ | ||
4008 | if (!move_anon || !PageAnon(page)) | ||
4009 | return 0; | ||
4010 | if (!get_page_unless_zero(page)) | ||
4011 | return 0; | ||
4012 | usage_count = page_mapcount(page); | ||
4013 | } | ||
4014 | if (usage_count > 1) { | ||
4015 | /* | ||
4016 | * TODO: We don't move charges of shared(used by multiple | ||
4017 | * processes) pages for now. | ||
4018 | */ | ||
4019 | if (page) | ||
4020 | put_page(page); | ||
4021 | return 0; | ||
4022 | } | ||
4023 | if (page) { | ||
4024 | pc = lookup_page_cgroup(page); | ||
4025 | /* | ||
4026 | * Do only loose check w/o page_cgroup lock. | ||
4027 | * mem_cgroup_move_account() checks the pc is valid or not under | ||
4028 | * the lock. | ||
4029 | */ | ||
4030 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4031 | ret = MC_TARGET_PAGE; | ||
4032 | if (target) | ||
4033 | target->page = page; | ||
4034 | } | ||
4035 | if (!ret || !target) | ||
4036 | put_page(page); | ||
4037 | } | ||
4038 | /* fall through */ | ||
4039 | if (ent.val && do_swap_account && !ret && | ||
4040 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4041 | ret = MC_TARGET_SWAP; | ||
4042 | if (target) | ||
4043 | target->ent = ent; | ||
4044 | } | ||
4045 | return ret; | ||
4046 | } | ||
4047 | |||
4048 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4049 | unsigned long addr, unsigned long end, | ||
4050 | struct mm_walk *walk) | ||
4051 | { | ||
4052 | struct vm_area_struct *vma = walk->private; | ||
4053 | pte_t *pte; | ||
4054 | spinlock_t *ptl; | ||
4055 | |||
4056 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4057 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4058 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4059 | mc.precharge++; /* increment precharge temporarily */ | ||
4060 | pte_unmap_unlock(pte - 1, ptl); | ||
4061 | cond_resched(); | ||
4062 | |||
4063 | return 0; | ||
4064 | } | ||
4065 | |||
4066 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4067 | { | ||
4068 | unsigned long precharge; | ||
4069 | struct vm_area_struct *vma; | ||
4070 | |||
4071 | down_read(&mm->mmap_sem); | ||
4072 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4073 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4074 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4075 | .mm = mm, | ||
4076 | .private = vma, | ||
4077 | }; | ||
4078 | if (is_vm_hugetlb_page(vma)) | ||
4079 | continue; | ||
4080 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4081 | if (vma->vm_flags & VM_SHARED) | ||
4082 | continue; | ||
4083 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4084 | &mem_cgroup_count_precharge_walk); | ||
4085 | } | ||
4086 | up_read(&mm->mmap_sem); | ||
4087 | |||
4088 | precharge = mc.precharge; | ||
4089 | mc.precharge = 0; | ||
4090 | |||
4091 | return precharge; | ||
4092 | } | ||
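
Both this precharge pass and the move pass below use the same mm_walk idiom: a pmd_entry callback that maps and locks the pte range, scans it, and unlocks before returning. Boiled down to its skeleton (kernel-style sketch; the callback merely counts present ptes):

static int count_present(pmd_t *pmd, unsigned long addr,
                         unsigned long end, struct mm_walk *walk)
{
        unsigned long *n = walk->private;
        spinlock_t *ptl;
        pte_t *pte;

        /* Map and lock the pte page, scan, then unlock the last pte. */
        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (pte_present(*pte))
                        (*n)++;
        pte_unmap_unlock(pte - 1, ptl);
        return 0;
}
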
4093 | |||
4094 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4095 | { | ||
4096 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4097 | } | ||
4098 | |||
4099 | static void mem_cgroup_clear_mc(void) | ||
4100 | { | ||
4101 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4102 | if (mc.precharge) { | ||
4103 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4104 | mc.precharge = 0; | ||
4105 | } | ||
4106 | /* | ||
4107 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | ||
4108 | * we must uncharge here. | ||
4109 | */ | ||
4110 | if (mc.moved_charge) { | ||
4111 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4112 | mc.moved_charge = 0; | ||
4113 | } | ||
4114 | /* we must fixup refcnts and charges */ | ||
4115 | if (mc.moved_swap) { | ||
4116 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4117 | /* uncharge swap account from the old cgroup */ | ||
4118 | if (!mem_cgroup_is_root(mc.from)) | ||
4119 | res_counter_uncharge(&mc.from->memsw, | ||
4120 | PAGE_SIZE * mc.moved_swap); | ||
4121 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4122 | |||
4123 | if (!mem_cgroup_is_root(mc.to)) { | ||
4124 | /* | ||
4125 | * we charged both to->res and to->memsw, so we should | ||
4126 | * uncharge to->res. | ||
4127 | */ | ||
4128 | res_counter_uncharge(&mc.to->res, | ||
4129 | PAGE_SIZE * mc.moved_swap); | ||
4130 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4131 | __css_put(&mc.to->css, mc.moved_swap); | ||
4132 | } | ||
4133 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4134 | |||
4135 | mc.moved_swap = 0; | ||
4136 | } | ||
4137 | mc.from = NULL; | ||
4138 | mc.to = NULL; | ||
4139 | mc.moving_task = NULL; | ||
4140 | wake_up_all(&mc.waitq); | ||
4141 | } | ||
4142 | |||
4143 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4144 | struct cgroup *cgroup, | ||
4145 | struct task_struct *p, | ||
4146 | bool threadgroup) | ||
4147 | { | ||
4148 | int ret = 0; | ||
4149 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4150 | |||
4151 | if (mem->move_charge_at_immigrate) { | ||
4152 | struct mm_struct *mm; | ||
4153 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4154 | |||
4155 | VM_BUG_ON(from == mem); | ||
4156 | |||
4157 | mm = get_task_mm(p); | ||
4158 | if (!mm) | ||
4159 | return 0; | ||
4160 | /* We move charges only when we move the owner of the mm */ | ||
4161 | if (mm->owner == p) { | ||
4162 | VM_BUG_ON(mc.from); | ||
4163 | VM_BUG_ON(mc.to); | ||
4164 | VM_BUG_ON(mc.precharge); | ||
4165 | VM_BUG_ON(mc.moved_charge); | ||
4166 | VM_BUG_ON(mc.moved_swap); | ||
4167 | VM_BUG_ON(mc.moving_task); | ||
4168 | mc.from = from; | ||
4169 | mc.to = mem; | ||
4170 | mc.precharge = 0; | ||
4171 | mc.moved_charge = 0; | ||
4172 | mc.moved_swap = 0; | ||
4173 | mc.moving_task = current; | ||
4174 | |||
4175 | ret = mem_cgroup_precharge_mc(mm); | ||
4176 | if (ret) | ||
4177 | mem_cgroup_clear_mc(); | ||
4178 | } | ||
4179 | mmput(mm); | ||
4180 | } | ||
4181 | return ret; | ||
4182 | } | ||
4183 | |||
4184 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4185 | struct cgroup *cgroup, | ||
4186 | struct task_struct *p, | ||
4187 | bool threadgroup) | ||
4188 | { | ||
4189 | mem_cgroup_clear_mc(); | ||
4190 | } | ||
4191 | |||
4192 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4193 | unsigned long addr, unsigned long end, | ||
4194 | struct mm_walk *walk) | ||
4195 | { | ||
4196 | int ret = 0; | ||
4197 | struct vm_area_struct *vma = walk->private; | ||
4198 | pte_t *pte; | ||
4199 | spinlock_t *ptl; | ||
4200 | |||
4201 | retry: | ||
4202 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4203 | for (; addr != end; addr += PAGE_SIZE) { | ||
4204 | pte_t ptent = *(pte++); | ||
4205 | union mc_target target; | ||
4206 | int type; | ||
4207 | struct page *page; | ||
4208 | struct page_cgroup *pc; | ||
4209 | swp_entry_t ent; | ||
4210 | |||
4211 | if (!mc.precharge) | ||
4212 | break; | ||
4213 | |||
4214 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4215 | switch (type) { | ||
4216 | case MC_TARGET_PAGE: | ||
4217 | page = target.page; | ||
4218 | if (isolate_lru_page(page)) | ||
4219 | goto put; | ||
4220 | pc = lookup_page_cgroup(page); | ||
4221 | if (!mem_cgroup_move_account(pc, | ||
4222 | mc.from, mc.to, false)) { | ||
4223 | mc.precharge--; | ||
4224 | /* we uncharge from mc.from later. */ | ||
4225 | mc.moved_charge++; | ||
4226 | } | ||
4227 | putback_lru_page(page); | ||
4228 | put: /* is_target_pte_for_mc() gets the page */ | ||
4229 | put_page(page); | ||
4230 | break; | ||
4231 | case MC_TARGET_SWAP: | ||
4232 | ent = target.ent; | ||
4233 | if (!mem_cgroup_move_swap_account(ent, | ||
4234 | mc.from, mc.to, false)) { | ||
4235 | mc.precharge--; | ||
4236 | /* we fixup refcnts and charges later. */ | ||
4237 | mc.moved_swap++; | ||
4238 | } | ||
4239 | break; | ||
4240 | default: | ||
4241 | break; | ||
4242 | } | ||
4243 | } | ||
4244 | pte_unmap_unlock(pte - 1, ptl); | ||
4245 | cond_resched(); | ||
4246 | |||
4247 | if (addr != end) { | ||
4248 | /* | ||
4249 | * We have consumed all precharges we got in can_attach(). | ||
4250 | * We try to charge one by one, but don't do any additional | ||
4251 | * charges to mc.to if we have failed to charge once in the attach() | ||
4252 | * phase. | ||
4253 | */ | ||
4254 | ret = mem_cgroup_do_precharge(1); | ||
4255 | if (!ret) | ||
4256 | goto retry; | ||
4257 | } | ||
4258 | |||
4259 | return ret; | ||
4260 | } | ||
4261 | |||
4262 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4263 | { | ||
4264 | struct vm_area_struct *vma; | ||
4265 | |||
4266 | lru_add_drain_all(); | ||
4267 | down_read(&mm->mmap_sem); | ||
4268 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4269 | int ret; | ||
4270 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4271 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4272 | .mm = mm, | ||
4273 | .private = vma, | ||
4274 | }; | ||
4275 | if (is_vm_hugetlb_page(vma)) | ||
4276 | continue; | ||
4277 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4278 | if (vma->vm_flags & VM_SHARED) | ||
4279 | continue; | ||
4280 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4281 | &mem_cgroup_move_charge_walk); | ||
4282 | if (ret) | ||
4283 | /* | ||
4284 | * means we have consumed all precharges and failed to | ||
4285 | * make an additional charge. Just abandon here. | ||
4286 | */ | ||
4287 | break; | ||
4288 | } | ||
4289 | up_read(&mm->mmap_sem); | ||
4290 | } | ||
4291 | |||
3167 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4292 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3168 | struct cgroup *cont, | 4293 | struct cgroup *cont, |
3169 | struct cgroup *old_cont, | 4294 | struct cgroup *old_cont, |
3170 | struct task_struct *p, | 4295 | struct task_struct *p, |
3171 | bool threadgroup) | 4296 | bool threadgroup) |
3172 | { | 4297 | { |
3173 | mutex_lock(&memcg_tasklist); | 4298 | struct mm_struct *mm; |
3174 | /* | 4299 | |
3175 | * FIXME: It's better to move charges of this process from old | 4300 | if (!mc.to) |
3176 | * memcg to new memcg. But it's just on TODO-List now. | 4301 | /* no need to move charge */ |
3177 | */ | 4302 | return; |
3178 | mutex_unlock(&memcg_tasklist); | 4303 | |
4304 | mm = get_task_mm(p); | ||
4305 | if (mm) { | ||
4306 | mem_cgroup_move_charge(mm); | ||
4307 | mmput(mm); | ||
4308 | } | ||
4309 | mem_cgroup_clear_mc(); | ||
4310 | } | ||
4311 | #else /* !CONFIG_MMU */ | ||
4312 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4313 | struct cgroup *cgroup, | ||
4314 | struct task_struct *p, | ||
4315 | bool threadgroup) | ||
4316 | { | ||
4317 | return 0; | ||
3179 | } | 4318 | } |
4319 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4320 | struct cgroup *cgroup, | ||
4321 | struct task_struct *p, | ||
4322 | bool threadgroup) | ||
4323 | { | ||
4324 | } | ||
4325 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4326 | struct cgroup *cont, | ||
4327 | struct cgroup *old_cont, | ||
4328 | struct task_struct *p, | ||
4329 | bool threadgroup) | ||
4330 | { | ||
4331 | } | ||
4332 | #endif | ||
3180 | 4333 | ||
3181 | struct cgroup_subsys mem_cgroup_subsys = { | 4334 | struct cgroup_subsys mem_cgroup_subsys = { |
3182 | .name = "memory", | 4335 | .name = "memory", |
@@ -3185,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3185 | .pre_destroy = mem_cgroup_pre_destroy, | 4338 | .pre_destroy = mem_cgroup_pre_destroy, |
3186 | .destroy = mem_cgroup_destroy, | 4339 | .destroy = mem_cgroup_destroy, |
3187 | .populate = mem_cgroup_populate, | 4340 | .populate = mem_cgroup_populate, |
4341 | .can_attach = mem_cgroup_can_attach, | ||
4342 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3188 | .attach = mem_cgroup_move_task, | 4343 | .attach = mem_cgroup_move_task, |
3189 | .early_init = 0, | 4344 | .early_init = 0, |
3190 | .use_id = 1, | 4345 | .use_id = 1, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dacc64183874..620b0b461593 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -34,12 +34,17 @@ | |||
34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
37 | #include <linux/kernel-page-flags.h> | ||
37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
38 | #include <linux/ksm.h> | 39 | #include <linux/ksm.h> |
39 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
40 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
41 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/migrate.h> | ||
45 | #include <linux/page-isolation.h> | ||
46 | #include <linux/suspend.h> | ||
47 | #include <linux/slab.h> | ||
43 | #include "internal.h" | 48 | #include "internal.h" |
44 | 49 | ||
45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 50 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -48,6 +53,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
48 | 53 | ||
49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 54 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
50 | 55 | ||
56 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | ||
57 | |||
58 | u32 hwpoison_filter_enable = 0; | ||
59 | u32 hwpoison_filter_dev_major = ~0U; | ||
60 | u32 hwpoison_filter_dev_minor = ~0U; | ||
61 | u64 hwpoison_filter_flags_mask; | ||
62 | u64 hwpoison_filter_flags_value; | ||
63 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
64 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
65 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
66 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
67 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
68 | |||
69 | static int hwpoison_filter_dev(struct page *p) | ||
70 | { | ||
71 | struct address_space *mapping; | ||
72 | dev_t dev; | ||
73 | |||
74 | if (hwpoison_filter_dev_major == ~0U && | ||
75 | hwpoison_filter_dev_minor == ~0U) | ||
76 | return 0; | ||
77 | |||
78 | /* | ||
79 | * page_mapping() does not accept slab page | ||
80 | */ | ||
81 | if (PageSlab(p)) | ||
82 | return -EINVAL; | ||
83 | |||
84 | mapping = page_mapping(p); | ||
85 | if (mapping == NULL || mapping->host == NULL) | ||
86 | return -EINVAL; | ||
87 | |||
88 | dev = mapping->host->i_sb->s_dev; | ||
89 | if (hwpoison_filter_dev_major != ~0U && | ||
90 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
91 | return -EINVAL; | ||
92 | if (hwpoison_filter_dev_minor != ~0U && | ||
93 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
94 | return -EINVAL; | ||
95 | |||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static int hwpoison_filter_flags(struct page *p) | ||
100 | { | ||
101 | if (!hwpoison_filter_flags_mask) | ||
102 | return 0; | ||
103 | |||
104 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
105 | hwpoison_filter_flags_value) | ||
106 | return 0; | ||
107 | else | ||
108 | return -EINVAL; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * This allows stress tests to limit test scope to a collection of tasks | ||
113 | * by putting them under some memcg. This prevents killing unrelated/important | ||
114 | * processes such as /sbin/init. Note that the target task may share clean | ||
115 | * pages with init (e.g. libc text), which is harmless. If the target task | ||
116 | * shares _dirty_ pages with another task B, the test scheme must make sure B | ||
117 | * is also included in the memcg. Finally, due to race conditions this filter | ||
118 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
119 | * a freed page. | ||
120 | */ | ||
121 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
122 | u64 hwpoison_filter_memcg; | ||
123 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
124 | static int hwpoison_filter_task(struct page *p) | ||
125 | { | ||
126 | struct mem_cgroup *mem; | ||
127 | struct cgroup_subsys_state *css; | ||
128 | unsigned long ino; | ||
129 | |||
130 | if (!hwpoison_filter_memcg) | ||
131 | return 0; | ||
132 | |||
133 | mem = try_get_mem_cgroup_from_page(p); | ||
134 | if (!mem) | ||
135 | return -EINVAL; | ||
136 | |||
137 | css = mem_cgroup_css(mem); | ||
138 | /* root_mem_cgroup has NULL dentries */ | ||
139 | if (!css->cgroup->dentry) | ||
140 | return -EINVAL; | ||
141 | |||
142 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
143 | css_put(css); | ||
144 | |||
145 | if (ino != hwpoison_filter_memcg) | ||
146 | return -EINVAL; | ||
147 | |||
148 | return 0; | ||
149 | } | ||
150 | #else | ||
151 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
152 | #endif | ||
153 | |||
154 | int hwpoison_filter(struct page *p) | ||
155 | { | ||
156 | if (!hwpoison_filter_enable) | ||
157 | return 0; | ||
158 | |||
159 | if (hwpoison_filter_dev(p)) | ||
160 | return -EINVAL; | ||
161 | |||
162 | if (hwpoison_filter_flags(p)) | ||
163 | return -EINVAL; | ||
164 | |||
165 | if (hwpoison_filter_task(p)) | ||
166 | return -EINVAL; | ||
167 | |||
168 | return 0; | ||
169 | } | ||
170 | #else | ||
171 | int hwpoison_filter(struct page *p) | ||
172 | { | ||
173 | return 0; | ||
174 | } | ||
175 | #endif | ||
176 | |||
177 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
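
With CONFIG_HWPOISON_INJECT, the hwpoison-inject module exposes these knobs through debugfs so a stress harness can scope injections to one device or memcg. A sketch of driving them from userspace; the /sys/kernel/debug/hwpoison/* file names are assumptions based on that module, not guaranteed by this diff:

#include <stdio.h>

/* Write one value to a hwpoison debugfs control file (assumed paths). */
static int hwpoison_set(const char *name, const char *val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/debug/hwpoison/%s", name);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        hwpoison_set("corrupt-filter-enable", "1");
        hwpoison_set("corrupt-filter-dev-major", "8");  /* e.g. sda */
        hwpoison_set("corrupt-filter-dev-minor", "1");
        hwpoison_set("corrupt-pfn", "74565");           /* inject at this pfn */
        return 0;
}
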
178 | |||
51 | /* | 179 | /* |
52 | * Send all the processes who have the page mapped an ``action optional'' | 180 | * Send all the processes who have the page mapped an ``action optional'' |
53 | * signal. | 181 | * signal. |
@@ -83,6 +211,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
83 | } | 211 | } |
84 | 212 | ||
85 | /* | 213 | /* |
214 | * When an unknown page type is encountered, drain as many buffers as possible | ||
215 | * in the hope of turning the page into an LRU or free page, which we can handle. | ||
216 | */ | ||
217 | void shake_page(struct page *p, int access) | ||
218 | { | ||
219 | if (!PageSlab(p)) { | ||
220 | lru_add_drain_all(); | ||
221 | if (PageLRU(p)) | ||
222 | return; | ||
223 | drain_all_pages(); | ||
224 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Only call shrink_slab here (which would also | ||
230 | * shrink other caches) if access is not potentially fatal. | ||
231 | */ | ||
232 | if (access) { | ||
233 | int nr; | ||
234 | do { | ||
235 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
236 | if (page_count(p) == 0) | ||
237 | break; | ||
238 | } while (nr > 10); | ||
239 | } | ||
240 | } | ||
241 | EXPORT_SYMBOL_GPL(shake_page); | ||
242 | |||
243 | /* | ||
86 | * Kill all processes that have a poisoned page mapped and then isolate | 244 | * Kill all processes that have a poisoned page mapped and then isolate |
87 | * the page. | 245 | * the page. |
88 | * | 246 | * |
@@ -174,10 +332,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
174 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 332 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
175 | if (doit) { | 333 | if (doit) { |
176 | /* | 334 | /* |
177 | * In case something went wrong with munmaping | 335 | * In case something went wrong with munmapping |
178 | * make sure the process doesn't catch the | 336 | * make sure the process doesn't catch the |
179 | * signal and then access the memory. Just kill it. | 337 | * signal and then access the memory. Just kill it. |
180 | * the signal handlers | ||
181 | */ | 338 | */ |
182 | if (fail || tk->addr_valid == 0) { | 339 | if (fail || tk->addr_valid == 0) { |
183 | printk(KERN_ERR | 340 | printk(KERN_ERR |
@@ -227,9 +384,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
227 | if (av == NULL) /* Not actually mapped anymore */ | 384 | if (av == NULL) /* Not actually mapped anymore */ |
228 | goto out; | 385 | goto out; |
229 | for_each_process (tsk) { | 386 | for_each_process (tsk) { |
387 | struct anon_vma_chain *vmac; | ||
388 | |||
230 | if (!task_early_kill(tsk)) | 389 | if (!task_early_kill(tsk)) |
231 | continue; | 390 | continue; |
232 | list_for_each_entry (vma, &av->head, anon_vma_node) { | 391 | list_for_each_entry(vmac, &av->head, same_anon_vma) { |
392 | vma = vmac->vma; | ||
233 | if (!page_mapped_in_vma(page, vma)) | 393 | if (!page_mapped_in_vma(page, vma)) |
234 | continue; | 394 | continue; |
235 | if (vma->vm_mm == tsk->mm) | 395 | if (vma->vm_mm == tsk->mm) |
@@ -314,33 +474,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
314 | */ | 474 | */ |
315 | 475 | ||
316 | enum outcome { | 476 | enum outcome { |
317 | FAILED, /* Error handling failed */ | 477 | IGNORED, /* Error: cannot be handled */ |
478 | FAILED, /* Error: handling failed */ | ||
318 | DELAYED, /* Will be handled later */ | 479 | DELAYED, /* Will be handled later */ |
319 | IGNORED, /* Error safely ignored */ | ||
320 | RECOVERED, /* Successfully recovered */ | 480 | RECOVERED, /* Successfully recovered */ |
321 | }; | 481 | }; |
322 | 482 | ||
323 | static const char *action_name[] = { | 483 | static const char *action_name[] = { |
484 | [IGNORED] = "Ignored", | ||
324 | [FAILED] = "Failed", | 485 | [FAILED] = "Failed", |
325 | [DELAYED] = "Delayed", | 486 | [DELAYED] = "Delayed", |
326 | [IGNORED] = "Ignored", | ||
327 | [RECOVERED] = "Recovered", | 487 | [RECOVERED] = "Recovered", |
328 | }; | 488 | }; |
329 | 489 | ||
330 | /* | 490 | /* |
331 | * Error hit kernel page. | 491 | * XXX: It is possible that a page is isolated from LRU cache, |
332 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 492 | * and then kept in swap cache or failed to remove from page cache. |
333 | * could be more sophisticated. | 493 | * The page count will stop it from being freed by unpoison. |
494 | * Stress tests should be aware of this memory leak problem. | ||
334 | */ | 495 | */ |
335 | static int me_kernel(struct page *p, unsigned long pfn) | 496 | static int delete_from_lru_cache(struct page *p) |
336 | { | 497 | { |
337 | return DELAYED; | 498 | if (!isolate_lru_page(p)) { |
499 | /* | ||
500 | * Clear sensible page flags, so that the buddy system won't | ||
501 | * complain when the page is unpoison-and-freed. | ||
502 | */ | ||
503 | ClearPageActive(p); | ||
504 | ClearPageUnevictable(p); | ||
505 | /* | ||
506 | * drop the page count elevated by isolate_lru_page() | ||
507 | */ | ||
508 | page_cache_release(p); | ||
509 | return 0; | ||
510 | } | ||
511 | return -EIO; | ||
338 | } | 512 | } |
339 | 513 | ||
340 | /* | 514 | /* |
341 | * Already poisoned page. | 515 | * Error hit kernel page. |
516 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
517 | * could be more sophisticated. | ||
342 | */ | 518 | */ |
343 | static int me_ignore(struct page *p, unsigned long pfn) | 519 | static int me_kernel(struct page *p, unsigned long pfn) |
344 | { | 520 | { |
345 | return IGNORED; | 521 | return IGNORED; |
346 | } | 522 | } |
@@ -355,14 +531,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
355 | } | 531 | } |
356 | 532 | ||
357 | /* | 533 | /* |
358 | * Free memory | ||
359 | */ | ||
360 | static int me_free(struct page *p, unsigned long pfn) | ||
361 | { | ||
362 | return DELAYED; | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * Clean (or cleaned) page cache page. | 534 | * Clean (or cleaned) page cache page. |
367 | */ | 535 | */ |
368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 536 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
@@ -371,6 +539,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
371 | int ret = FAILED; | 539 | int ret = FAILED; |
372 | struct address_space *mapping; | 540 | struct address_space *mapping; |
373 | 541 | ||
542 | delete_from_lru_cache(p); | ||
543 | |||
374 | /* | 544 | /* |
375 | * For anonymous pages we're done the only reference left | 545 | * For anonymous pages we're done the only reference left |
376 | * should be the one m_f() holds. | 546 | * should be the one m_f() holds. |
@@ -500,14 +670,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
500 | /* Trigger EIO in shmem: */ | 670 | /* Trigger EIO in shmem: */ |
501 | ClearPageUptodate(p); | 671 | ClearPageUptodate(p); |
502 | 672 | ||
503 | return DELAYED; | 673 | if (!delete_from_lru_cache(p)) |
674 | return DELAYED; | ||
675 | else | ||
676 | return FAILED; | ||
504 | } | 677 | } |
505 | 678 | ||
506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 679 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
507 | { | 680 | { |
508 | delete_from_swap_cache(p); | 681 | delete_from_swap_cache(p); |
509 | 682 | ||
510 | return RECOVERED; | 683 | if (!delete_from_lru_cache(p)) |
684 | return RECOVERED; | ||
685 | else | ||
686 | return FAILED; | ||
511 | } | 687 | } |
512 | 688 | ||
513 | /* | 689 | /* |
@@ -550,7 +726,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
550 | #define tail (1UL << PG_tail) | 726 | #define tail (1UL << PG_tail) |
551 | #define compound (1UL << PG_compound) | 727 | #define compound (1UL << PG_compound) |
552 | #define slab (1UL << PG_slab) | 728 | #define slab (1UL << PG_slab) |
553 | #define buddy (1UL << PG_buddy) | ||
554 | #define reserved (1UL << PG_reserved) | 729 | #define reserved (1UL << PG_reserved) |
555 | 730 | ||
556 | static struct page_state { | 731 | static struct page_state { |
@@ -559,8 +734,11 @@ static struct page_state { | |||
559 | char *msg; | 734 | char *msg; |
560 | int (*action)(struct page *p, unsigned long pfn); | 735 | int (*action)(struct page *p, unsigned long pfn); |
561 | } error_states[] = { | 736 | } error_states[] = { |
562 | { reserved, reserved, "reserved kernel", me_ignore }, | 737 | { reserved, reserved, "reserved kernel", me_kernel }, |
563 | { buddy, buddy, "free kernel", me_free }, | 738 | /* |
739 | * free pages are specially detected outside this table: | ||
740 | * PG_buddy pages make up only a small fraction of all free pages. | ||
741 | */ | ||
564 | 742 | ||
565 | /* | 743 | /* |
566 | * Could in theory check if slab page is free or if we can drop | 744 | * Could in theory check if slab page is free or if we can drop |
@@ -582,14 +760,11 @@ static struct page_state { | |||
582 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 760 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, |
583 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 761 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, |
584 | 762 | ||
585 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
586 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 763 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, |
587 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 764 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, |
588 | #endif | ||
589 | 765 | ||
590 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 766 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
591 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 767 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
592 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
593 | 768 | ||
594 | /* | 769 | /* |
595 | * Catchall entry: must be at end. | 770 | * Catchall entry: must be at end. |
@@ -597,20 +772,31 @@ static struct page_state { | |||
597 | { 0, 0, "unknown page state", me_unknown }, | 772 | { 0, 0, "unknown page state", me_unknown }, |
598 | }; | 773 | }; |
599 | 774 | ||
775 | #undef dirty | ||
776 | #undef sc | ||
777 | #undef unevict | ||
778 | #undef mlock | ||
779 | #undef writeback | ||
780 | #undef lru | ||
781 | #undef swapbacked | ||
782 | #undef head | ||
783 | #undef tail | ||
784 | #undef compound | ||
785 | #undef slab | ||
786 | #undef reserved | ||
787 | |||
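
The error_states[] table above is matched by masked comparison: ps->mask selects the bits an entry cares about and ps->res gives the value those bits must have, so { lru|dirty, lru, "clean LRU", ... } means "LRU set and dirty clear" (the matching loop appears later, in __memory_failure()). A runnable sketch of that rule, using stand-in bit values rather than the real PG_* layout:

    #include <assert.h>

    #define LRU   (1UL << 0)   /* stand-in bits, not the kernel's PG_* values */
    #define DIRTY (1UL << 1)

    int main(void)
    {
        unsigned long mask = LRU | DIRTY; /* bits this entry looks at */
        unsigned long res  = LRU;         /* required: LRU set, dirty clear */

        unsigned long clean_lru = LRU;
        unsigned long dirty_lru = LRU | DIRTY;

        assert((clean_lru & mask) == res); /* matches the "clean LRU" entry */
        assert((dirty_lru & mask) != res); /* caught by the dirty "LRU" entry, listed first */
        return 0;
    }
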
600 | static void action_result(unsigned long pfn, char *msg, int result) | 788 | static void action_result(unsigned long pfn, char *msg, int result) |
601 | { | 789 | { |
602 | struct page *page = NULL; | 790 | struct page *page = pfn_to_page(pfn); |
603 | if (pfn_valid(pfn)) | ||
604 | page = pfn_to_page(pfn); | ||
605 | 791 | ||
606 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 792 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
607 | pfn, | 793 | pfn, |
608 | page && PageDirty(page) ? "dirty " : "", | 794 | PageDirty(page) ? "dirty " : "", |
609 | msg, action_name[result]); | 795 | msg, action_name[result]); |
610 | } | 796 | } |
611 | 797 | ||
612 | static int page_action(struct page_state *ps, struct page *p, | 798 | static int page_action(struct page_state *ps, struct page *p, |
613 | unsigned long pfn, int ref) | 799 | unsigned long pfn) |
614 | { | 800 | { |
615 | int result; | 801 | int result; |
616 | int count; | 802 | int count; |
@@ -618,18 +804,22 @@ static int page_action(struct page_state *ps, struct page *p, | |||
618 | result = ps->action(p, pfn); | 804 | result = ps->action(p, pfn); |
619 | action_result(pfn, ps->msg, result); | 805 | action_result(pfn, ps->msg, result); |
620 | 806 | ||
621 | count = page_count(p) - 1 - ref; | 807 | count = page_count(p) - 1; |
622 | if (count != 0) | 808 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
809 | count--; | ||
810 | if (count != 0) { | ||
623 | printk(KERN_ERR | 811 | printk(KERN_ERR |
624 | "MCE %#lx: %s page still referenced by %d users\n", | 812 | "MCE %#lx: %s page still referenced by %d users\n", |
625 | pfn, ps->msg, count); | 813 | pfn, ps->msg, count); |
814 | result = FAILED; | ||
815 | } | ||
626 | 816 | ||
627 | /* Could do more checks here if page looks ok */ | 817 | /* Could do more checks here if page looks ok */ |
628 | /* | 818 | /* |
629 | * Could adjust zone counters here to correct for the missing page. | 819 | * Could adjust zone counters here to correct for the missing page. |
630 | */ | 820 | */ |
631 | 821 | ||
632 | return result == RECOVERED ? 0 : -EBUSY; | 822 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
633 | } | 823 | } |
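
The leftover-reference check in page_action() is easier to follow with concrete numbers: memory_failure() itself holds one reference, and a dirty swapcache page whose handler returned DELAYED legitimately keeps the swap cache's reference as well. A toy version of that arithmetic, with stand-in values rather than real struct page state:

    #include <assert.h>

    int main(void)
    {
        int page_count = 2; /* hypothetical: m_f()'s ref + the swap cache's ref */
        int delayed_dirty_swapcache = 1;

        int count = page_count - 1; /* discount the ref memory_failure() holds */
        if (delayed_dirty_swapcache)
            count--;                /* discount the swap cache's own ref */

        assert(count == 0); /* any remainder means other users still hold the page */
        return 0;
    }
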
634 | 824 | ||
635 | #define N_UNMAP_TRIES 5 | 825 | #define N_UNMAP_TRIES 5 |
@@ -638,7 +828,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
638 | * Do all that is necessary to remove user space mappings. Unmap | 828 | * Do all that is necessary to remove user space mappings. Unmap |
639 | * the pages and send SIGBUS to the processes if the data was dirty. | 829 | * the pages and send SIGBUS to the processes if the data was dirty. |
640 | */ | 830 | */ |
641 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 831 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
642 | int trapno) | 832 | int trapno) |
643 | { | 833 | { |
644 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 834 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
@@ -648,15 +838,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
648 | int i; | 838 | int i; |
649 | int kill = 1; | 839 | int kill = 1; |
650 | 840 | ||
651 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | 841 | if (PageReserved(p) || PageSlab(p)) |
652 | return; | 842 | return SWAP_SUCCESS; |
653 | 843 | ||
654 | /* | 844 | /* |
655 | * This check implies we don't kill processes if their pages | 845 | * This check implies we don't kill processes if their pages |
656 | * are in the swap cache early. Those are always late kills. | 846 | * are in the swap cache early. Those are always late kills. |
657 | */ | 847 | */ |
658 | if (!page_mapped(p)) | 848 | if (!page_mapped(p)) |
659 | return; | 849 | return SWAP_SUCCESS; |
850 | |||
851 | if (PageCompound(p) || PageKsm(p)) | ||
852 | return SWAP_FAIL; | ||
660 | 853 | ||
661 | if (PageSwapCache(p)) { | 854 | if (PageSwapCache(p)) { |
662 | printk(KERN_ERR | 855 | printk(KERN_ERR |
@@ -667,6 +860,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
667 | /* | 860 | /* |
668 | * Propagate the dirty bit from PTEs to struct page first, because we | 861 | * Propagate the dirty bit from PTEs to struct page first, because we |
669 | * need this to decide if we should kill or just drop the page. | 862 | * need this to decide if we should kill or just drop the page. |
863 | * XXX: the dirty test could be racy; set_page_dirty() may not always | ||
864 | * be called inside the page lock (it's recommended but not enforced). | ||
670 | */ | 865 | */ |
671 | mapping = page_mapping(p); | 866 | mapping = page_mapping(p); |
672 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 867 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
@@ -718,11 +913,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
718 | */ | 913 | */ |
719 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 914 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
720 | ret != SWAP_SUCCESS, pfn); | 915 | ret != SWAP_SUCCESS, pfn); |
916 | |||
917 | return ret; | ||
721 | } | 918 | } |
722 | 919 | ||
723 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 920 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
724 | { | 921 | { |
725 | unsigned long lru_flag; | ||
726 | struct page_state *ps; | 922 | struct page_state *ps; |
727 | struct page *p; | 923 | struct page *p; |
728 | int res; | 924 | int res; |
@@ -731,13 +927,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
731 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 927 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
732 | 928 | ||
733 | if (!pfn_valid(pfn)) { | 929 | if (!pfn_valid(pfn)) { |
734 | action_result(pfn, "memory outside kernel control", IGNORED); | 930 | printk(KERN_ERR |
735 | return -EIO; | 931 | "MCE %#lx: memory outside kernel control\n", |
932 | pfn); | ||
933 | return -ENXIO; | ||
736 | } | 934 | } |
737 | 935 | ||
738 | p = pfn_to_page(pfn); | 936 | p = pfn_to_page(pfn); |
739 | if (TestSetPageHWPoison(p)) { | 937 | if (TestSetPageHWPoison(p)) { |
740 | action_result(pfn, "already hardware poisoned", IGNORED); | 938 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
741 | return 0; | 939 | return 0; |
742 | } | 940 | } |
743 | 941 | ||
@@ -754,9 +952,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
754 | * In fact it's dangerous to directly bump up page count from 0, | 952 | * In fact it's dangerous to directly bump up page count from 0, |
755 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 953 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
756 | */ | 954 | */ |
757 | if (!get_page_unless_zero(compound_head(p))) { | 955 | if (!(flags & MF_COUNT_INCREASED) && |
758 | action_result(pfn, "free or high order kernel", IGNORED); | 956 | !get_page_unless_zero(compound_head(p))) { |
759 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 957 | if (is_free_buddy_page(p)) { |
958 | action_result(pfn, "free buddy", DELAYED); | ||
959 | return 0; | ||
960 | } else { | ||
961 | action_result(pfn, "high order kernel", IGNORED); | ||
962 | return -EBUSY; | ||
963 | } | ||
760 | } | 964 | } |
761 | 965 | ||
762 | /* | 966 | /* |
@@ -768,14 +972,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
768 | * walked by the page reclaim code, however that's not a big loss. | 972 | * walked by the page reclaim code, however that's not a big loss. |
769 | */ | 973 | */ |
770 | if (!PageLRU(p)) | 974 | if (!PageLRU(p)) |
771 | lru_add_drain_all(); | 975 | shake_page(p, 0); |
772 | lru_flag = p->flags & lru; | 976 | if (!PageLRU(p)) { |
773 | if (isolate_lru_page(p)) { | 977 | /* |
978 | * shake_page could have turned it free. | ||
979 | */ | ||
980 | if (is_free_buddy_page(p)) { | ||
981 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
982 | return 0; | ||
983 | } | ||
774 | action_result(pfn, "non LRU", IGNORED); | 984 | action_result(pfn, "non LRU", IGNORED); |
775 | put_page(p); | 985 | put_page(p); |
776 | return -EBUSY; | 986 | return -EBUSY; |
777 | } | 987 | } |
778 | page_cache_release(p); | ||
779 | 988 | ||
780 | /* | 989 | /* |
781 | * Lock the page and wait for writeback to finish. | 990 | * Lock the page and wait for writeback to finish. |
@@ -783,26 +992,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
783 | * and in many cases impossible, so we just avoid it here. | 992 | * and in many cases impossible, so we just avoid it here. |
784 | */ | 993 | */ |
785 | lock_page_nosync(p); | 994 | lock_page_nosync(p); |
995 | |||
996 | /* | ||
997 | * unpoison always clears PG_hwpoison inside the page lock | ||
998 | */ | ||
999 | if (!PageHWPoison(p)) { | ||
1000 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
1001 | res = 0; | ||
1002 | goto out; | ||
1003 | } | ||
1004 | if (hwpoison_filter(p)) { | ||
1005 | if (TestClearPageHWPoison(p)) | ||
1006 | atomic_long_dec(&mce_bad_pages); | ||
1007 | unlock_page(p); | ||
1008 | put_page(p); | ||
1009 | return 0; | ||
1010 | } | ||
1011 | |||
786 | wait_on_page_writeback(p); | 1012 | wait_on_page_writeback(p); |
787 | 1013 | ||
788 | /* | 1014 | /* |
789 | * Now take care of user space mappings. | 1015 | * Now take care of user space mappings. |
1016 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | ||
790 | */ | 1017 | */ |
791 | hwpoison_user_mappings(p, pfn, trapno); | 1018 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
1019 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
1020 | res = -EBUSY; | ||
1021 | goto out; | ||
1022 | } | ||
792 | 1023 | ||
793 | /* | 1024 | /* |
794 | * Torn down by someone else? | 1025 | * Torn down by someone else? |
795 | */ | 1026 | */ |
796 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | 1027 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
797 | action_result(pfn, "already truncated LRU", IGNORED); | 1028 | action_result(pfn, "already truncated LRU", IGNORED); |
798 | res = 0; | 1029 | res = -EBUSY; |
799 | goto out; | 1030 | goto out; |
800 | } | 1031 | } |
801 | 1032 | ||
802 | res = -EBUSY; | 1033 | res = -EBUSY; |
803 | for (ps = error_states;; ps++) { | 1034 | for (ps = error_states;; ps++) { |
804 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | 1035 | if ((p->flags & ps->mask) == ps->res) { |
805 | res = page_action(ps, p, pfn, ref); | 1036 | res = page_action(ps, p, pfn); |
806 | break; | 1037 | break; |
807 | } | 1038 | } |
808 | } | 1039 | } |
@@ -833,3 +1064,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
833 | { | 1064 | { |
834 | __memory_failure(pfn, trapno, 0); | 1065 | __memory_failure(pfn, trapno, 0); |
835 | } | 1066 | } |
1067 | |||
1068 | /** | ||
1069 | * unpoison_memory - Unpoison a previously poisoned page | ||
1070 | * @pfn: Page number of the to be unpoisoned page | ||
1071 | * | ||
1072 | * Software-unpoison a page that has been poisoned by | ||
1073 | * memory_failure() earlier. | ||
1074 | * | ||
1075 | * This is done only at the software level, so it only works | ||
1076 | * for Linux-injected failures, not for real hardware failures. | ||
1077 | * | ||
1078 | * Returns 0 for success, otherwise -errno. | ||
1079 | */ | ||
1080 | int unpoison_memory(unsigned long pfn) | ||
1081 | { | ||
1082 | struct page *page; | ||
1083 | struct page *p; | ||
1084 | int freeit = 0; | ||
1085 | |||
1086 | if (!pfn_valid(pfn)) | ||
1087 | return -ENXIO; | ||
1088 | |||
1089 | p = pfn_to_page(pfn); | ||
1090 | page = compound_head(p); | ||
1091 | |||
1092 | if (!PageHWPoison(p)) { | ||
1093 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
1094 | return 0; | ||
1095 | } | ||
1096 | |||
1097 | if (!get_page_unless_zero(page)) { | ||
1098 | if (TestClearPageHWPoison(p)) | ||
1099 | atomic_long_dec(&mce_bad_pages); | ||
1100 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
1101 | return 0; | ||
1102 | } | ||
1103 | |||
1104 | lock_page_nosync(page); | ||
1105 | /* | ||
1106 | * This test is racy because PG_hwpoison is set outside of the page lock. | ||
1107 | * That's acceptable because it won't trigger a kernel panic. Instead, | ||
1108 | * the PG_hwpoison page will be caught and isolated on its way into | ||
1109 | * the free buddy page pool. | ||
1110 | */ | ||
1111 | if (TestClearPageHWPoison(p)) { | ||
1112 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
1113 | atomic_long_dec(&mce_bad_pages); | ||
1114 | freeit = 1; | ||
1115 | } | ||
1116 | unlock_page(page); | ||
1117 | |||
1118 | put_page(page); | ||
1119 | if (freeit) | ||
1120 | put_page(page); | ||
1121 | |||
1122 | return 0; | ||
1123 | } | ||
1124 | EXPORT_SYMBOL(unpoison_memory); | ||
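
For testing, unpoison_memory() pairs with the injector in mm/hwpoison-inject.c (see the diffstat). A user-space sketch, assuming that module's debugfs files under the usual debugfs mount; the file paths and the pfn value are assumptions for illustration, not part of this diff:

    #include <stdio.h>

    static int write_pfn(const char *path, unsigned long pfn)
    {
        FILE *f = fopen(path, "w");

        if (!f)
            return -1;
        fprintf(f, "%lu\n", pfn); /* the debugfs file takes one pfn per write */
        return fclose(f);
    }

    int main(void)
    {
        unsigned long pfn = 0x1234; /* hypothetical page frame number */

        /* mark the frame as if a memory error had been reported ... */
        write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
        /* ... then clear it again; only works for software-injected poison */
        write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
        return 0;
    }
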
1125 | |||
1126 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
1127 | { | ||
1128 | int nid = page_to_nid(p); | ||
1129 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1130 | } | ||
1131 | |||
1132 | /* | ||
1133 | * Safely get reference count of an arbitrary page. | ||
1134 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
1135 | * that is not free, and 1 for any other page type. | ||
1136 | * When 1 is returned, the page comes back with its refcount raised; otherwise not. | ||
1137 | */ | ||
1138 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
1139 | { | ||
1140 | int ret; | ||
1141 | |||
1142 | if (flags & MF_COUNT_INCREASED) | ||
1143 | return 1; | ||
1144 | |||
1145 | /* | ||
1146 | * The lock_system_sleep prevents a race with memory hotplug, | ||
1147 | * because the isolation assumes there's only a single user. | ||
1148 | * This is a big hammer; something finer-grained would be nicer. | ||
1149 | */ | ||
1150 | lock_system_sleep(); | ||
1151 | |||
1152 | /* | ||
1153 | * Isolate the page, so that it doesn't get reallocated if it | ||
1154 | * was free. | ||
1155 | */ | ||
1156 | set_migratetype_isolate(p); | ||
1157 | if (!get_page_unless_zero(compound_head(p))) { | ||
1158 | if (is_free_buddy_page(p)) { | ||
1159 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
1160 | /* Set hwpoison bit while page is still isolated */ | ||
1161 | SetPageHWPoison(p); | ||
1162 | ret = 0; | ||
1163 | } else { | ||
1164 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
1165 | pfn, p->flags); | ||
1166 | ret = -EIO; | ||
1167 | } | ||
1168 | } else { | ||
1169 | /* Not a free page */ | ||
1170 | ret = 1; | ||
1171 | } | ||
1172 | unset_migratetype_isolate(p); | ||
1173 | unlock_system_sleep(); | ||
1174 | return ret; | ||
1175 | } | ||
1176 | |||
1177 | /** | ||
1178 | * soft_offline_page - Soft offline a page. | ||
1179 | * @page: page to offline | ||
1180 | * @flags: flags. Same as memory_failure(). | ||
1181 | * | ||
1182 | * Returns 0 on success, otherwise negated errno. | ||
1183 | * | ||
1184 | * Soft offline a page, by migration or invalidation, | ||
1185 | * without killing anything. This is for the case when | ||
1186 | * a page is not corrupted yet (so it's still valid to access), | ||
1187 | * but has had a number of corrected errors and is better taken | ||
1188 | * out. | ||
1189 | * | ||
1190 | * The actual policy on when to do that is maintained by | ||
1191 | * user space. | ||
1192 | * | ||
1193 | * This should never impact any application or cause data loss; | ||
1194 | * however, it might take some time. | ||
1195 | * | ||
1196 | * This is not a 100% solution for all memory, but tries to be | ||
1197 | * ``good enough'' for the majority of memory. | ||
1198 | */ | ||
1199 | int soft_offline_page(struct page *page, int flags) | ||
1200 | { | ||
1201 | int ret; | ||
1202 | unsigned long pfn = page_to_pfn(page); | ||
1203 | |||
1204 | ret = get_any_page(page, pfn, flags); | ||
1205 | if (ret < 0) | ||
1206 | return ret; | ||
1207 | if (ret == 0) | ||
1208 | goto done; | ||
1209 | |||
1210 | /* | ||
1211 | * Page cache page we can handle? | ||
1212 | */ | ||
1213 | if (!PageLRU(page)) { | ||
1214 | /* | ||
1215 | * Try to free it. | ||
1216 | */ | ||
1217 | put_page(page); | ||
1218 | shake_page(page, 1); | ||
1219 | |||
1220 | /* | ||
1221 | * Did it turn free? | ||
1222 | */ | ||
1223 | ret = get_any_page(page, pfn, 0); | ||
1224 | if (ret < 0) | ||
1225 | return ret; | ||
1226 | if (ret == 0) | ||
1227 | goto done; | ||
1228 | } | ||
1229 | if (!PageLRU(page)) { | ||
1230 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1231 | pfn, page->flags); | ||
1232 | return -EIO; | ||
1233 | } | ||
1234 | |||
1235 | lock_page(page); | ||
1236 | wait_on_page_writeback(page); | ||
1237 | |||
1238 | /* | ||
1239 | * Synchronized using the page lock with memory_failure() | ||
1240 | */ | ||
1241 | if (PageHWPoison(page)) { | ||
1242 | unlock_page(page); | ||
1243 | put_page(page); | ||
1244 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
1245 | return -EBUSY; | ||
1246 | } | ||
1247 | |||
1248 | /* | ||
1249 | * Try to invalidate first. This should work for | ||
1250 | * non-dirty, unmapped page cache pages. | ||
1251 | */ | ||
1252 | ret = invalidate_inode_page(page); | ||
1253 | unlock_page(page); | ||
1254 | |||
1255 | /* | ||
1256 | * Drop count because page migration doesn't like raised | ||
1257 | * counts. The page could get re-allocated, but if it becomes | ||
1258 | * LRU the isolation will just fail. | ||
1259 | * RED-PEN: it would be better to keep the page isolated here, but we | ||
1260 | * would need to fix isolation locking first. | ||
1261 | */ | ||
1262 | put_page(page); | ||
1263 | if (ret == 1) { | ||
1264 | ret = 0; | ||
1265 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
1266 | goto done; | ||
1267 | } | ||
1268 | |||
1269 | /* | ||
1270 | * Simple invalidation didn't work. | ||
1271 | * Try to migrate to a new page instead. migrate.c | ||
1272 | * handles a large number of cases for us. | ||
1273 | */ | ||
1274 | ret = isolate_lru_page(page); | ||
1275 | if (!ret) { | ||
1276 | LIST_HEAD(pagelist); | ||
1277 | |||
1278 | list_add(&page->lru, &pagelist); | ||
1279 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1280 | if (ret) { | ||
1281 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1282 | pfn, ret, page->flags); | ||
1283 | if (ret > 0) | ||
1284 | ret = -EIO; | ||
1285 | } | ||
1286 | } else { | ||
1287 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
1288 | pfn, ret, page_count(page), page->flags); | ||
1289 | } | ||
1290 | if (ret) | ||
1291 | return ret; | ||
1292 | |||
1293 | done: | ||
1294 | atomic_long_add(1, &mce_bad_pages); | ||
1295 | SetPageHWPoison(page); | ||
1296 | /* keep elevated page count for bad page */ | ||
1297 | return ret; | ||
1298 | } | ||
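
A sketch of reaching soft_offline_page() from user space, assuming the madvise(MADV_SOFT_OFFLINE) hook this series wires up in mm/madvise.c; the constant's numeric value and the privilege requirement are assumptions:

    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE 101 /* assumed value from this series */
    #endif

    int main(void)
    {
        long pagesize = sysconf(_SC_PAGESIZE);
        char *p;

        if (posix_memalign((void **)&p, pagesize, pagesize))
            return 1;
        p[0] = 1; /* touch the page so it is faulted in and on the LRU */

        /* ask the kernel to migrate the contents away and retire the frame */
        return madvise(p, pagesize, MADV_SOFT_OFFLINE);
    }
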
diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..833952d8b74d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/kallsyms.h> | 56 | #include <linux/kallsyms.h> |
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | ||
59 | 60 | ||
60 | #include <asm/io.h> | 61 | #include <asm/io.h> |
61 | #include <asm/pgalloc.h> | 62 | #include <asm/pgalloc.h> |
@@ -121,6 +122,77 @@ static int __init init_zero_pfn(void) | |||
121 | } | 122 | } |
122 | core_initcall(init_zero_pfn); | 123 | core_initcall(init_zero_pfn); |
123 | 124 | ||
125 | |||
126 | #if defined(SPLIT_RSS_COUNTING) | ||
127 | |||
128 | static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
129 | { | ||
130 | int i; | ||
131 | |||
132 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
133 | if (task->rss_stat.count[i]) { | ||
134 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
135 | task->rss_stat.count[i] = 0; | ||
136 | } | ||
137 | } | ||
138 | task->rss_stat.events = 0; | ||
139 | } | ||
140 | |||
141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
142 | { | ||
143 | struct task_struct *task = current; | ||
144 | |||
145 | if (likely(task->mm == mm)) | ||
146 | task->rss_stat.count[member] += val; | ||
147 | else | ||
148 | add_mm_counter(mm, member, val); | ||
149 | } | ||
150 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
151 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
152 | |||
153 | /* sync counter once per 64 page faults */ | ||
154 | #define TASK_RSS_EVENTS_THRESH (64) | ||
155 | static void check_sync_rss_stat(struct task_struct *task) | ||
156 | { | ||
157 | if (unlikely(task != current)) | ||
158 | return; | ||
159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
160 | __sync_task_rss_stat(task, task->mm); | ||
161 | } | ||
162 | |||
163 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
164 | { | ||
165 | long val = 0; | ||
166 | |||
167 | /* | ||
168 | * Don't use task->mm here, to avoid having to use task_get_mm(). | ||
169 | * The caller must guarantee that task->mm is valid. | ||
170 | */ | ||
171 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
172 | /* | ||
173 | * The counter is updated asynchronously and may temporarily go negative. | ||
174 | * But a negative value is never what callers expect, so clamp it to zero. | ||
175 | */ | ||
176 | if (val < 0) | ||
177 | return 0; | ||
178 | return (unsigned long)val; | ||
179 | } | ||
180 | |||
181 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
182 | { | ||
183 | __sync_task_rss_stat(task, mm); | ||
184 | } | ||
185 | #else | ||
186 | |||
187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
189 | |||
190 | static void check_sync_rss_stat(struct task_struct *task) | ||
191 | { | ||
192 | } | ||
193 | |||
194 | #endif | ||
195 | |||
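
The SPLIT_RSS_COUNTING block above is a batching pattern: each task accumulates counter deltas privately and folds them into the shared mm counters only about once per TASK_RSS_EVENTS_THRESH faults, so a reader can briefly see a stale, even negative, total. A standalone user-space analogue with illustrative names (not kernel API):

    #include <stdatomic.h>

    #define THRESH 64

    static atomic_long shared_counter;      /* stands in for mm->rss_stat.count[] */
    static _Thread_local long local_delta;  /* stands in for task->rss_stat.count[] */
    static _Thread_local int local_events;  /* stands in for task->rss_stat.events */

    static void counter_add(long val)
    {
        local_delta += val;
        if (++local_events > THRESH) { /* sync roughly once per 64 updates */
            atomic_fetch_add(&shared_counter, local_delta);
            local_delta = 0;
            local_events = 0;
        }
    }

    static long counter_read(void)
    {
        long v = atomic_load(&shared_counter);

        /* unflushed deltas can make this lag or dip below zero;
         * clamp, exactly as get_mm_counter() does above */
        return v < 0 ? 0 : v;
    }

    int main(void)
    {
        for (int i = 0; i < 200; i++)
            counter_add(1);
        /* only 195 of the 200 adds have been flushed at this point */
        return counter_read() == 195 ? 0 : 1;
    }
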
124 | /* | 196 | /* |
125 | * If a p?d_bad entry is found while walking page tables, report | 197 | * If a p?d_bad entry is found while walking page tables, report |
126 | * the error, before resetting entry to p?d_none. Usually (but | 198 | * the error, before resetting entry to p?d_none. Usually (but |
@@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
300 | * Hide vma from rmap and truncate_pagecache before freeing | 372 | * Hide vma from rmap and truncate_pagecache before freeing |
301 | * pgtables | 373 | * pgtables |
302 | */ | 374 | */ |
303 | anon_vma_unlink(vma); | 375 | unlink_anon_vmas(vma); |
304 | unlink_file_vma(vma); | 376 | unlink_file_vma(vma); |
305 | 377 | ||
306 | if (is_vm_hugetlb_page(vma)) { | 378 | if (is_vm_hugetlb_page(vma)) { |
@@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
314 | && !is_vm_hugetlb_page(next)) { | 386 | && !is_vm_hugetlb_page(next)) { |
315 | vma = next; | 387 | vma = next; |
316 | next = vma->vm_next; | 388 | next = vma->vm_next; |
317 | anon_vma_unlink(vma); | 389 | unlink_anon_vmas(vma); |
318 | unlink_file_vma(vma); | 390 | unlink_file_vma(vma); |
319 | } | 391 | } |
320 | free_pgd_range(tlb, addr, vma->vm_end, | 392 | free_pgd_range(tlb, addr, vma->vm_end, |
@@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
376 | return 0; | 448 | return 0; |
377 | } | 449 | } |
378 | 450 | ||
379 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 451 | static inline void init_rss_vec(int *rss) |
380 | { | 452 | { |
381 | if (file_rss) | 453 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
382 | add_mm_counter(mm, file_rss, file_rss); | 454 | } |
383 | if (anon_rss) | 455 | |
384 | add_mm_counter(mm, anon_rss, anon_rss); | 456 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
457 | { | ||
458 | int i; | ||
459 | |||
460 | if (current->mm == mm) | ||
461 | sync_mm_rss(current, mm); | ||
462 | for (i = 0; i < NR_MM_COUNTERS; i++) | ||
463 | if (rss[i]) | ||
464 | add_mm_counter(mm, i, rss[i]); | ||
385 | } | 465 | } |
386 | 466 | ||
387 | /* | 467 | /* |
@@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
430 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | 510 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", |
431 | current->comm, | 511 | current->comm, |
432 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 512 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
433 | if (page) { | 513 | if (page) |
434 | printk(KERN_ALERT | 514 | dump_page(page); |
435 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
436 | page, (void *)page->flags, page_count(page), | ||
437 | page_mapcount(page), page->mapping, page->index); | ||
438 | } | ||
439 | printk(KERN_ALERT | 515 | printk(KERN_ALERT |
440 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 516 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
441 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 517 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
@@ -572,7 +648,7 @@ out: | |||
572 | * covered by this vma. | 648 | * covered by this vma. |
573 | */ | 649 | */ |
574 | 650 | ||
575 | static inline void | 651 | static inline unsigned long |
576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 652 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 653 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
578 | unsigned long addr, int *rss) | 654 | unsigned long addr, int *rss) |
@@ -586,7 +662,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
586 | if (!pte_file(pte)) { | 662 | if (!pte_file(pte)) { |
587 | swp_entry_t entry = pte_to_swp_entry(pte); | 663 | swp_entry_t entry = pte_to_swp_entry(pte); |
588 | 664 | ||
589 | swap_duplicate(entry); | 665 | if (swap_duplicate(entry) < 0) |
666 | return entry.val; | ||
667 | |||
590 | /* make sure dst_mm is on swapoff's mmlist. */ | 668 | /* make sure dst_mm is on swapoff's mmlist. */ |
591 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 669 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
592 | spin_lock(&mmlist_lock); | 670 | spin_lock(&mmlist_lock); |
@@ -595,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
595 | &src_mm->mmlist); | 673 | &src_mm->mmlist); |
596 | spin_unlock(&mmlist_lock); | 674 | spin_unlock(&mmlist_lock); |
597 | } | 675 | } |
598 | if (is_write_migration_entry(entry) && | 676 | if (likely(!non_swap_entry(entry))) |
677 | rss[MM_SWAPENTS]++; | ||
678 | else if (is_write_migration_entry(entry) && | ||
599 | is_cow_mapping(vm_flags)) { | 679 | is_cow_mapping(vm_flags)) { |
600 | /* | 680 | /* |
601 | * COW mappings require pages in both parent | 681 | * COW mappings require pages in both parent |
@@ -630,11 +710,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
630 | if (page) { | 710 | if (page) { |
631 | get_page(page); | 711 | get_page(page); |
632 | page_dup_rmap(page); | 712 | page_dup_rmap(page); |
633 | rss[PageAnon(page)]++; | 713 | if (PageAnon(page)) |
714 | rss[MM_ANONPAGES]++; | ||
715 | else | ||
716 | rss[MM_FILEPAGES]++; | ||
634 | } | 717 | } |
635 | 718 | ||
636 | out_set_pte: | 719 | out_set_pte: |
637 | set_pte_at(dst_mm, addr, dst_pte, pte); | 720 | set_pte_at(dst_mm, addr, dst_pte, pte); |
721 | return 0; | ||
638 | } | 722 | } |
639 | 723 | ||
640 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 724 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
@@ -645,10 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
645 | pte_t *src_pte, *dst_pte; | 729 | pte_t *src_pte, *dst_pte; |
646 | spinlock_t *src_ptl, *dst_ptl; | 730 | spinlock_t *src_ptl, *dst_ptl; |
647 | int progress = 0; | 731 | int progress = 0; |
648 | int rss[2]; | 732 | int rss[NR_MM_COUNTERS]; |
733 | swp_entry_t entry = (swp_entry_t){0}; | ||
649 | 734 | ||
650 | again: | 735 | again: |
651 | rss[1] = rss[0] = 0; | 736 | init_rss_vec(rss); |
737 | |||
652 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 738 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
653 | if (!dst_pte) | 739 | if (!dst_pte) |
654 | return -ENOMEM; | 740 | return -ENOMEM; |
@@ -674,16 +760,25 @@ again: | |||
674 | progress++; | 760 | progress++; |
675 | continue; | 761 | continue; |
676 | } | 762 | } |
677 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); | 763 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, |
764 | vma, addr, rss); | ||
765 | if (entry.val) | ||
766 | break; | ||
678 | progress += 8; | 767 | progress += 8; |
679 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 768 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
680 | 769 | ||
681 | arch_leave_lazy_mmu_mode(); | 770 | arch_leave_lazy_mmu_mode(); |
682 | spin_unlock(src_ptl); | 771 | spin_unlock(src_ptl); |
683 | pte_unmap_nested(orig_src_pte); | 772 | pte_unmap_nested(orig_src_pte); |
684 | add_mm_rss(dst_mm, rss[0], rss[1]); | 773 | add_mm_rss_vec(dst_mm, rss); |
685 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 774 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
686 | cond_resched(); | 775 | cond_resched(); |
776 | |||
777 | if (entry.val) { | ||
778 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | ||
779 | return -ENOMEM; | ||
780 | progress = 0; | ||
781 | } | ||
687 | if (addr != end) | 782 | if (addr != end) |
688 | goto again; | 783 | goto again; |
689 | return 0; | 784 | return 0; |
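
The entry.val handling above is a "fail under the lock, allocate outside it, retry" pattern: swap_duplicate() cannot allocate a swap-count continuation while the page-table spinlocks are held, so copy_pte_range() backs out, calls add_swap_count_continuation() with GFP_KERNEL, and resumes where it stopped. A user-space analogue of the same pattern, names illustrative:

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int *overflow; /* stands in for the swap-count continuation page */

    static int try_update_locked(void)
    {
        /* under the lock we must not allocate: report failure instead,
         * the way swap_duplicate() does when the count would overflow */
        return overflow ? (++*overflow, 0) : -1;
    }

    int update(void)
    {
        for (;;) {
            pthread_mutex_lock(&lock);
            int ret = try_update_locked();
            pthread_mutex_unlock(&lock);
            if (ret == 0)
                return 0;

            /* slow path: allocate outside the lock, then retry */
            int *n = calloc(1, sizeof(*n));
            if (!n)
                return -1; /* a real out-of-memory: give up */
            pthread_mutex_lock(&lock);
            if (!overflow)
                overflow = n; /* someone may have raced us */
            else
                free(n);
            pthread_mutex_unlock(&lock);
        }
    }

    int main(void)
    {
        return update(); /* first try fails, the allocation path succeeds */
    }
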
@@ -803,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
803 | struct mm_struct *mm = tlb->mm; | 898 | struct mm_struct *mm = tlb->mm; |
804 | pte_t *pte; | 899 | pte_t *pte; |
805 | spinlock_t *ptl; | 900 | spinlock_t *ptl; |
806 | int file_rss = 0; | 901 | int rss[NR_MM_COUNTERS]; |
807 | int anon_rss = 0; | 902 | |
903 | init_rss_vec(rss); | ||
808 | 904 | ||
809 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 905 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
810 | arch_enter_lazy_mmu_mode(); | 906 | arch_enter_lazy_mmu_mode(); |
@@ -850,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
850 | set_pte_at(mm, addr, pte, | 946 | set_pte_at(mm, addr, pte, |
851 | pgoff_to_pte(page->index)); | 947 | pgoff_to_pte(page->index)); |
852 | if (PageAnon(page)) | 948 | if (PageAnon(page)) |
853 | anon_rss--; | 949 | rss[MM_ANONPAGES]--; |
854 | else { | 950 | else { |
855 | if (pte_dirty(ptent)) | 951 | if (pte_dirty(ptent)) |
856 | set_page_dirty(page); | 952 | set_page_dirty(page); |
857 | if (pte_young(ptent) && | 953 | if (pte_young(ptent) && |
858 | likely(!VM_SequentialReadHint(vma))) | 954 | likely(!VM_SequentialReadHint(vma))) |
859 | mark_page_accessed(page); | 955 | mark_page_accessed(page); |
860 | file_rss--; | 956 | rss[MM_FILEPAGES]--; |
861 | } | 957 | } |
862 | page_remove_rmap(page); | 958 | page_remove_rmap(page); |
863 | if (unlikely(page_mapcount(page) < 0)) | 959 | if (unlikely(page_mapcount(page) < 0)) |
@@ -874,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
874 | if (pte_file(ptent)) { | 970 | if (pte_file(ptent)) { |
875 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 971 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
876 | print_bad_pte(vma, addr, ptent, NULL); | 972 | print_bad_pte(vma, addr, ptent, NULL); |
877 | } else if | 973 | } else { |
878 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | 974 | swp_entry_t entry = pte_to_swp_entry(ptent); |
879 | print_bad_pte(vma, addr, ptent, NULL); | 975 | |
976 | if (!non_swap_entry(entry)) | ||
977 | rss[MM_SWAPENTS]--; | ||
978 | if (unlikely(!free_swap_and_cache(entry))) | ||
979 | print_bad_pte(vma, addr, ptent, NULL); | ||
980 | } | ||
880 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 981 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
881 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 982 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
882 | 983 | ||
883 | add_mm_rss(mm, file_rss, anon_rss); | 984 | add_mm_rss_vec(mm, rss); |
884 | arch_leave_lazy_mmu_mode(); | 985 | arch_leave_lazy_mmu_mode(); |
885 | pte_unmap_unlock(pte - 1, ptl); | 986 | pte_unmap_unlock(pte - 1, ptl); |
886 | 987 | ||
@@ -943,6 +1044,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
943 | details = NULL; | 1044 | details = NULL; |
944 | 1045 | ||
945 | BUG_ON(addr >= end); | 1046 | BUG_ON(addr >= end); |
1047 | mem_cgroup_uncharge_start(); | ||
946 | tlb_start_vma(tlb, vma); | 1048 | tlb_start_vma(tlb, vma); |
947 | pgd = pgd_offset(vma->vm_mm, addr); | 1049 | pgd = pgd_offset(vma->vm_mm, addr); |
948 | do { | 1050 | do { |
@@ -955,6 +1057,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
955 | zap_work, details); | 1057 | zap_work, details); |
956 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 1058 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
957 | tlb_end_vma(tlb, vma); | 1059 | tlb_end_vma(tlb, vma); |
1060 | mem_cgroup_uncharge_end(); | ||
958 | 1061 | ||
959 | return addr; | 1062 | return addr; |
960 | } | 1063 | } |
@@ -1512,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1512 | 1615 | ||
1513 | /* Ok, finally just insert the thing.. */ | 1616 | /* Ok, finally just insert the thing.. */ |
1514 | get_page(page); | 1617 | get_page(page); |
1515 | inc_mm_counter(mm, file_rss); | 1618 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
1516 | page_add_file_rmap(page); | 1619 | page_add_file_rmap(page); |
1517 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1620 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1518 | 1621 | ||
@@ -1578,7 +1681,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1578 | /* Ok, finally just insert the thing.. */ | 1681 | /* Ok, finally just insert the thing.. */ |
1579 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1682 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
1580 | set_pte_at(mm, addr, pte, entry); | 1683 | set_pte_at(mm, addr, pte, entry); |
1581 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ | 1684 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1582 | 1685 | ||
1583 | retval = 0; | 1686 | retval = 0; |
1584 | out_unlock: | 1687 | out_unlock: |
@@ -2029,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2029 | page_cache_release(old_page); | 2132 | page_cache_release(old_page); |
2030 | } | 2133 | } |
2031 | reuse = reuse_swap_page(old_page); | 2134 | reuse = reuse_swap_page(old_page); |
2135 | if (reuse) | ||
2136 | /* | ||
2137 | * The page is all ours. Move it to our anon_vma so | ||
2138 | * the rmap code will not search our parent or siblings. | ||
2139 | * Protected against the rmap code by the page lock. | ||
2140 | */ | ||
2141 | page_move_anon_rmap(old_page, vma, address); | ||
2032 | unlock_page(old_page); | 2142 | unlock_page(old_page); |
2033 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2143 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2034 | (VM_WRITE|VM_SHARED))) { | 2144 | (VM_WRITE|VM_SHARED))) { |
@@ -2101,7 +2211,7 @@ reuse: | |||
2101 | entry = pte_mkyoung(orig_pte); | 2211 | entry = pte_mkyoung(orig_pte); |
2102 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2212 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2103 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2213 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2104 | update_mmu_cache(vma, address, entry); | 2214 | update_mmu_cache(vma, address, page_table); |
2105 | ret |= VM_FAULT_WRITE; | 2215 | ret |= VM_FAULT_WRITE; |
2106 | goto unlock; | 2216 | goto unlock; |
2107 | } | 2217 | } |
@@ -2148,11 +2258,11 @@ gotten: | |||
2148 | if (likely(pte_same(*page_table, orig_pte))) { | 2258 | if (likely(pte_same(*page_table, orig_pte))) { |
2149 | if (old_page) { | 2259 | if (old_page) { |
2150 | if (!PageAnon(old_page)) { | 2260 | if (!PageAnon(old_page)) { |
2151 | dec_mm_counter(mm, file_rss); | 2261 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
2152 | inc_mm_counter(mm, anon_rss); | 2262 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2153 | } | 2263 | } |
2154 | } else | 2264 | } else |
2155 | inc_mm_counter(mm, anon_rss); | 2265 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2156 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2266 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2157 | entry = mk_pte(new_page, vma->vm_page_prot); | 2267 | entry = mk_pte(new_page, vma->vm_page_prot); |
2158 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2268 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2170,7 +2280,7 @@ gotten: | |||
2170 | * new page to be mapped directly into the secondary page table. | 2280 | * new page to be mapped directly into the secondary page table. |
2171 | */ | 2281 | */ |
2172 | set_pte_at_notify(mm, address, page_table, entry); | 2282 | set_pte_at_notify(mm, address, page_table, entry); |
2173 | update_mmu_cache(vma, address, entry); | 2283 | update_mmu_cache(vma, address, page_table); |
2174 | if (old_page) { | 2284 | if (old_page) { |
2175 | /* | 2285 | /* |
2176 | * Only after switching the pte to the new page may | 2286 | * Only after switching the pte to the new page may |
@@ -2514,7 +2624,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2514 | ret = VM_FAULT_HWPOISON; | 2624 | ret = VM_FAULT_HWPOISON; |
2515 | } else { | 2625 | } else { |
2516 | print_bad_pte(vma, address, orig_pte, NULL); | 2626 | print_bad_pte(vma, address, orig_pte, NULL); |
2517 | ret = VM_FAULT_OOM; | 2627 | ret = VM_FAULT_SIGBUS; |
2518 | } | 2628 | } |
2519 | goto out; | 2629 | goto out; |
2520 | } | 2630 | } |
@@ -2540,6 +2650,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2540 | ret = VM_FAULT_MAJOR; | 2650 | ret = VM_FAULT_MAJOR; |
2541 | count_vm_event(PGMAJFAULT); | 2651 | count_vm_event(PGMAJFAULT); |
2542 | } else if (PageHWPoison(page)) { | 2652 | } else if (PageHWPoison(page)) { |
2653 | /* | ||
2654 | * hwpoisoned dirty swapcache pages are kept for killing | ||
2655 | * owner processes (which may be unknown at hwpoison time) | ||
2656 | */ | ||
2543 | ret = VM_FAULT_HWPOISON; | 2657 | ret = VM_FAULT_HWPOISON; |
2544 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2658 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2545 | goto out_release; | 2659 | goto out_release; |
@@ -2548,6 +2662,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2548 | lock_page(page); | 2662 | lock_page(page); |
2549 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2663 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2550 | 2664 | ||
2665 | page = ksm_might_need_to_copy(page, vma, address); | ||
2666 | if (!page) { | ||
2667 | ret = VM_FAULT_OOM; | ||
2668 | goto out; | ||
2669 | } | ||
2670 | |||
2551 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 2671 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
2552 | ret = VM_FAULT_OOM; | 2672 | ret = VM_FAULT_OOM; |
2553 | goto out_page; | 2673 | goto out_page; |
@@ -2579,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2579 | * discarded at swap_free(). | 2699 | * discarded at swap_free(). |
2580 | */ | 2700 | */ |
2581 | 2701 | ||
2582 | inc_mm_counter(mm, anon_rss); | 2702 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2703 | dec_mm_counter_fast(mm, MM_SWAPENTS); | ||
2583 | pte = mk_pte(page, vma->vm_page_prot); | 2704 | pte = mk_pte(page, vma->vm_page_prot); |
2584 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2705 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2585 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2706 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
@@ -2604,7 +2725,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2604 | } | 2725 | } |
2605 | 2726 | ||
2606 | /* No need to invalidate - it was non-present before */ | 2727 | /* No need to invalidate - it was non-present before */ |
2607 | update_mmu_cache(vma, address, pte); | 2728 | update_mmu_cache(vma, address, page_table); |
2608 | unlock: | 2729 | unlock: |
2609 | pte_unmap_unlock(page_table, ptl); | 2730 | pte_unmap_unlock(page_table, ptl); |
2610 | out: | 2731 | out: |
@@ -2663,13 +2784,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2663 | if (!pte_none(*page_table)) | 2784 | if (!pte_none(*page_table)) |
2664 | goto release; | 2785 | goto release; |
2665 | 2786 | ||
2666 | inc_mm_counter(mm, anon_rss); | 2787 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2667 | page_add_new_anon_rmap(page, vma, address); | 2788 | page_add_new_anon_rmap(page, vma, address); |
2668 | setpte: | 2789 | setpte: |
2669 | set_pte_at(mm, address, page_table, entry); | 2790 | set_pte_at(mm, address, page_table, entry); |
2670 | 2791 | ||
2671 | /* No need to invalidate - it was non-present before */ | 2792 | /* No need to invalidate - it was non-present before */ |
2672 | update_mmu_cache(vma, address, entry); | 2793 | update_mmu_cache(vma, address, page_table); |
2673 | unlock: | 2794 | unlock: |
2674 | pte_unmap_unlock(page_table, ptl); | 2795 | pte_unmap_unlock(page_table, ptl); |
2675 | return 0; | 2796 | return 0; |
@@ -2817,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2817 | if (flags & FAULT_FLAG_WRITE) | 2938 | if (flags & FAULT_FLAG_WRITE) |
2818 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2939 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2819 | if (anon) { | 2940 | if (anon) { |
2820 | inc_mm_counter(mm, anon_rss); | 2941 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2821 | page_add_new_anon_rmap(page, vma, address); | 2942 | page_add_new_anon_rmap(page, vma, address); |
2822 | } else { | 2943 | } else { |
2823 | inc_mm_counter(mm, file_rss); | 2944 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
2824 | page_add_file_rmap(page); | 2945 | page_add_file_rmap(page); |
2825 | if (flags & FAULT_FLAG_WRITE) { | 2946 | if (flags & FAULT_FLAG_WRITE) { |
2826 | dirty_page = page; | 2947 | dirty_page = page; |
@@ -2830,7 +2951,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2830 | set_pte_at(mm, address, page_table, entry); | 2951 | set_pte_at(mm, address, page_table, entry); |
2831 | 2952 | ||
2832 | /* no need to invalidate: a not-present page won't be cached */ | 2953 | /* no need to invalidate: a not-present page won't be cached */ |
2833 | update_mmu_cache(vma, address, entry); | 2954 | update_mmu_cache(vma, address, page_table); |
2834 | } else { | 2955 | } else { |
2835 | if (charged) | 2956 | if (charged) |
2836 | mem_cgroup_uncharge_page(page); | 2957 | mem_cgroup_uncharge_page(page); |
@@ -2910,7 +3031,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2910 | * Page table corrupted: show pte and kill process. | 3031 | * Page table corrupted: show pte and kill process. |
2911 | */ | 3032 | */ |
2912 | print_bad_pte(vma, address, orig_pte, NULL); | 3033 | print_bad_pte(vma, address, orig_pte, NULL); |
2913 | return VM_FAULT_OOM; | 3034 | return VM_FAULT_SIGBUS; |
2914 | } | 3035 | } |
2915 | 3036 | ||
2916 | pgoff = pte_to_pgoff(orig_pte); | 3037 | pgoff = pte_to_pgoff(orig_pte); |
@@ -2967,7 +3088,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2967 | } | 3088 | } |
2968 | entry = pte_mkyoung(entry); | 3089 | entry = pte_mkyoung(entry); |
2969 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3090 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
2970 | update_mmu_cache(vma, address, entry); | 3091 | update_mmu_cache(vma, address, pte); |
2971 | } else { | 3092 | } else { |
2972 | /* | 3093 | /* |
2973 | * This is needed only for protection faults but the arch code | 3094 | * This is needed only for protection faults but the arch code |
@@ -2998,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2998 | 3119 | ||
2999 | count_vm_event(PGFAULT); | 3120 | count_vm_event(PGFAULT); |
3000 | 3121 | ||
3122 | /* do counter updates before entering the really critical section. */ | ||
3123 | check_sync_rss_stat(current); | ||
3124 | |||
3001 | if (unlikely(is_vm_hugetlb_page(vma))) | 3125 | if (unlikely(is_vm_hugetlb_page(vma))) |
3002 | return hugetlb_fault(mm, vma, address, flags); | 3126 | return hugetlb_fault(mm, vma, address, flags); |
3003 | 3127 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465cd27c..be211a582930 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/page-isolation.h> | 27 | #include <linux/page-isolation.h> |
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | ||
31 | #include <linux/firmware-map.h> | ||
30 | 32 | ||
31 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
32 | 34 | ||
@@ -71,7 +73,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
71 | atomic_inc(&page->_count); | 73 | atomic_inc(&page->_count); |
72 | } | 74 | } |
73 | 75 | ||
74 | void put_page_bootmem(struct page *page) | 76 | /* reference to __meminit __free_pages_bootmem is valid |
77 | * so use __ref to tell modpost not to generate a warning */ | ||
78 | void __ref put_page_bootmem(struct page *page) | ||
75 | { | 79 | { |
76 | int type; | 80 | int type; |
77 | 81 | ||
@@ -520,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
520 | BUG_ON(ret); | 524 | BUG_ON(ret); |
521 | } | 525 | } |
522 | 526 | ||
527 | /* create new memmap entry */ | ||
528 | firmware_map_add_hotplug(start, start + size, "System RAM"); | ||
529 | |||
523 | goto out; | 530 | goto out; |
524 | 531 | ||
525 | error: | 532 | error: |
@@ -672,15 +679,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
672 | if (!ret) { /* Success */ | 679 | if (!ret) { /* Success */ |
673 | list_add_tail(&page->lru, &source); | 680 | list_add_tail(&page->lru, &source); |
674 | move_pages--; | 681 | move_pages--; |
682 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
683 | page_is_file_cache(page)); | ||
684 | |||
675 | } else { | 685 | } else { |
676 | /* Because we don't hold the big zone->lock, we should | 686 | /* Because we don't hold the big zone->lock, we should |
677 | check this again here. */ | 687 | check this again here. */ |
678 | if (page_count(page)) | 688 | if (page_count(page)) |
679 | not_managed++; | 689 | not_managed++; |
680 | #ifdef CONFIG_DEBUG_VM | 690 | #ifdef CONFIG_DEBUG_VM |
681 | printk(KERN_INFO "removing from LRU failed" | 691 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
682 | " %lx/%d/%lx\n", | 692 | pfn); |
683 | pfn, page_count(page), page->flags); | 693 | dump_page(page); |
684 | #endif | 694 | #endif |
685 | } | 695 | } |
686 | } | 696 | } |
@@ -694,7 +704,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
694 | if (list_empty(&source)) | 704 | if (list_empty(&source)) |
695 | goto out; | 705 | goto out; |
696 | /* this function returns # of failed pages */ | 706 | /* this function returns # of failed pages */ |
697 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | 707 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); |
698 | 708 | ||
699 | out: | 709 | out: |
700 | return ret; | 710 | return ret; |
@@ -747,7 +757,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
747 | return offlined; | 757 | return offlined; |
748 | } | 758 | } |
749 | 759 | ||
750 | int offline_pages(unsigned long start_pfn, | 760 | static int offline_pages(unsigned long start_pfn, |
751 | unsigned long end_pfn, unsigned long timeout) | 761 | unsigned long end_pfn, unsigned long timeout) |
752 | { | 762 | { |
753 | unsigned long pfn, nr_pages, expire; | 763 | unsigned long pfn, nr_pages, expire; |
@@ -849,6 +859,10 @@ repeat: | |||
849 | 859 | ||
850 | setup_per_zone_wmarks(); | 860 | setup_per_zone_wmarks(); |
851 | calculate_zone_inactive_ratio(zone); | 861 | calculate_zone_inactive_ratio(zone); |
862 | if (!node_present_pages(node)) { | ||
863 | node_clear_state(node, N_HIGH_MEMORY); | ||
864 | kswapd_stop(node); | ||
865 | } | ||
852 | 866 | ||
853 | vm_total_pages = nr_free_pagecache_pages(); | 867 | vm_total_pages = nr_free_pagecache_pages(); |
854 | writeback_set_ratelimit(); | 868 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d5944243..08f40a2f3fe0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -73,7 +73,6 @@ | |||
73 | #include <linux/sched.h> | 73 | #include <linux/sched.h> |
74 | #include <linux/nodemask.h> | 74 | #include <linux/nodemask.h> |
75 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
76 | #include <linux/gfp.h> | ||
77 | #include <linux/slab.h> | 76 | #include <linux/slab.h> |
78 | #include <linux/string.h> | 77 | #include <linux/string.h> |
79 | #include <linux/module.h> | 78 | #include <linux/module.h> |
@@ -85,10 +84,12 @@ | |||
85 | #include <linux/seq_file.h> | 84 | #include <linux/seq_file.h> |
86 | #include <linux/proc_fs.h> | 85 | #include <linux/proc_fs.h> |
87 | #include <linux/migrate.h> | 86 | #include <linux/migrate.h> |
87 | #include <linux/ksm.h> | ||
88 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
89 | #include <linux/security.h> | 89 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | ||
92 | 93 | ||
93 | #include <asm/tlbflush.h> | 94 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 95 | #include <asm/uaccess.h> |
@@ -412,17 +413,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
412 | if (!page) | 413 | if (!page) |
413 | continue; | 414 | continue; |
414 | /* | 415 | /* |
415 | * The check for PageReserved here is important to avoid | 416 | * vm_normal_page() filters out zero pages, but there might |
416 | * handling zero pages and other pages that may have been | 417 | * still be PageReserved pages to skip, perhaps in a VDSO. |
417 | * marked special by the system. | 418 | * And we cannot move PageKsm pages sensibly or safely yet. |
418 | * | ||
419 | * If the PageReserved would not be checked here then f.e. | ||
420 | * the location of the zero page could have an influence | ||
421 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
422 | * the per node stats, and there would be useless attempts | ||
423 | * to put zero pages on the migration list. | ||
424 | */ | 419 | */ |
425 | if (PageReserved(page)) | 420 | if (PageReserved(page) || PageKsm(page)) |
426 | continue; | 421 | continue; |
427 | nid = page_to_nid(page); | 422 | nid = page_to_nid(page); |
428 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 423 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -567,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
567 | } | 562 | } |
568 | 563 | ||
569 | /* Step 2: apply policy to a range and do splits. */ | 564 | /* Step 2: apply policy to a range and do splits. */ |
570 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | 565 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
571 | unsigned long end, struct mempolicy *new) | 566 | unsigned long end, struct mempolicy *new_pol) |
572 | { | 567 | { |
573 | struct vm_area_struct *next; | 568 | struct vm_area_struct *next; |
574 | int err; | 569 | struct vm_area_struct *prev; |
570 | struct vm_area_struct *vma; | ||
571 | int err = 0; | ||
572 | pgoff_t pgoff; | ||
573 | unsigned long vmstart; | ||
574 | unsigned long vmend; | ||
575 | 575 | ||
576 | err = 0; | 576 | vma = find_vma_prev(mm, start, &prev); |
577 | for (; vma && vma->vm_start < end; vma = next) { | 577 | if (!vma || vma->vm_start > start) |
578 | return -EFAULT; | ||
579 | |||
580 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | ||
578 | next = vma->vm_next; | 581 | next = vma->vm_next; |
579 | if (vma->vm_start < start) | 582 | vmstart = max(start, vma->vm_start); |
580 | err = split_vma(vma->vm_mm, vma, start, 1); | 583 | vmend = min(end, vma->vm_end); |
581 | if (!err && vma->vm_end > end) | 584 | |
582 | err = split_vma(vma->vm_mm, vma, end, 0); | 585 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
583 | if (!err) | 586 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
584 | err = policy_vma(vma, new); | 587 | vma->anon_vma, vma->vm_file, pgoff, new_pol); |
588 | if (prev) { | ||
589 | vma = prev; | ||
590 | next = vma->vm_next; | ||
591 | continue; | ||
592 | } | ||
593 | if (vma->vm_start != vmstart) { | ||
594 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | ||
595 | if (err) | ||
596 | goto out; | ||
597 | } | ||
598 | if (vma->vm_end != vmend) { | ||
599 | err = split_vma(vma->vm_mm, vma, vmend, 0); | ||
600 | if (err) | ||
601 | goto out; | ||
602 | } | ||
603 | err = policy_vma(vma, new_pol); | ||
585 | if (err) | 604 | if (err) |
586 | break; | 605 | goto out; |
587 | } | 606 | } |
607 | |||
608 | out: | ||
588 | return err; | 609 | return err; |
589 | } | 610 | } |
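
The practical effect of the rewritten mbind_range() is visible from user space: it now offers the range to vma_merge() before splitting, so applying and then reverting a policy need not leave the address space permanently fragmented (whether the pieces actually re-merge still depends on the other VMA attributes matching). A sketch, assuming libnuma's <numaif.h> declaration of mbind(); link with -lnuma:

    #include <numaif.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long ps = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * ps, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long nodes = 1; /* nodemask with only node 0 set */

        if (p == MAP_FAILED)
            return 1;
        /* policy cuts the VMA at p+ps and p+2*ps: split_vma() runs twice */
        mbind(p + ps, ps, MPOL_BIND, &nodes, 2, 0);
        /* reverting to the default lets vma_merge() coalesce all three again */
        mbind(p + ps, ps, MPOL_DEFAULT, NULL, 0, 0);
        return 0;
    }
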
590 | 611 | ||
@@ -784,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
784 | 805 | ||
785 | err = 0; | 806 | err = 0; |
786 | if (nmask) { | 807 | if (nmask) { |
787 | task_lock(current); | 808 | if (mpol_store_user_nodemask(pol)) { |
788 | get_policy_nodemask(pol, nmask); | 809 | *nmask = pol->w.user_nodemask; |
789 | task_unlock(current); | 810 | } else { |
811 | task_lock(current); | ||
812 | get_policy_nodemask(pol, nmask); | ||
813 | task_unlock(current); | ||
814 | } | ||
790 | } | 815 | } |
791 | 816 | ||
792 | out: | 817 | out: |
@@ -809,6 +834,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
809 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 834 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
810 | if (!isolate_lru_page(page)) { | 835 | if (!isolate_lru_page(page)) { |
811 | list_add_tail(&page->lru, pagelist); | 836 | list_add_tail(&page->lru, pagelist); |
837 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
838 | page_is_file_cache(page)); | ||
812 | } | 839 | } |
813 | } | 840 | } |
814 | } | 841 | } |
@@ -836,7 +863,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
836 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 863 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
837 | 864 | ||
838 | if (!list_empty(&pagelist)) | 865 | if (!list_empty(&pagelist)) |
839 | err = migrate_pages(&pagelist, new_node_page, dest); | 866 | err = migrate_pages(&pagelist, new_node_page, dest, 0); |
840 | 867 | ||
841 | return err; | 868 | return err; |
842 | } | 869 | } |
@@ -864,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm, | |||
864 | if (err) | 891 | if (err) |
865 | goto out; | 892 | goto out; |
866 | 893 | ||
867 | /* | 894 | /* |
868 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 895 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
869 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 896 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
870 | * bit in 'tmp', and return that <source, dest> pair for migration. | 897 | * bit in 'tmp', and return that <source, dest> pair for migration. |
871 | * The pair of nodemasks 'to' and 'from' define the map. | 898 | * The pair of nodemasks 'to' and 'from' define the map. |
872 | * | 899 | * |
873 | * If no pair of bits is found that way, fallback to picking some | 900 | * If no pair of bits is found that way, fallback to picking some |
874 | * pair of 'source' and 'dest' bits that are not the same. If the | 901 | * pair of 'source' and 'dest' bits that are not the same. If the |
875 | * 'source' and 'dest' bits are the same, this represents a node | 902 | * 'source' and 'dest' bits are the same, this represents a node |
876 | * that will be migrating to itself, so no pages need move. | 903 | * that will be migrating to itself, so no pages need move. |
877 | * | 904 | * |
878 | * If no bits are left in 'tmp', or if all remaining bits left | 905 | * If no bits are left in 'tmp', or if all remaining bits left |
879 | * in 'tmp' correspond to the same bit in 'to', return false | 906 | * in 'tmp' correspond to the same bit in 'to', return false |
880 | * (nothing left to migrate). | 907 | * (nothing left to migrate). |
881 | * | 908 | * |
882 | * This lets us pick a pair of nodes to migrate between, such that | 909 | * This lets us pick a pair of nodes to migrate between, such that |
883 | * if possible the dest node is not already occupied by some other | 910 | * if possible the dest node is not already occupied by some other |
884 | * source node, minimizing the risk of overloading the memory on a | 911 | * source node, minimizing the risk of overloading the memory on a |
885 | * node that would happen if we migrated incoming memory to a node | 912 | * node that would happen if we migrated incoming memory to a node |
886 | * before migrating outgoing memory source that same node. | 913 | * before migrating outgoing memory source that same node. |
887 | * | 914 | * |
888 | * A single scan of tmp is sufficient. As we go, we remember the | 915 | * A single scan of tmp is sufficient. As we go, we remember the |
889 | * most recent <s, d> pair that moved (s != d). If we find a pair | 916 | * most recent <s, d> pair that moved (s != d). If we find a pair |
890 | * that not only moved, but what's better, moved to an empty slot | 917 | * that not only moved, but what's better, moved to an empty slot |
891 | * (d is not set in tmp), then we break out then, with that pair. | 918 | * (d is not set in tmp), then we break out then, with that pair. |
892 | * Otherwise when we finish scanning from_tmp, we at least have the | 919 | * Otherwise when we finish scanning from_tmp, we at least have the |
893 | * most recent <s, d> pair that moved. If we get all the way through | 920 | * most recent <s, d> pair that moved. If we get all the way through |
894 | * the scan of tmp without finding any node that moved, much less | 921 | * the scan of tmp without finding any node that moved, much less |
895 | * moved to an empty node, then there is nothing left worth migrating. | 922 | * moved to an empty node, then there is nothing left worth migrating. |
896 | */ | 923 | */ |
897 | 924 | ||
898 | tmp = *from_nodes; | 925 | tmp = *from_nodes; |
899 | while (!nodes_empty(tmp)) { | 926 | while (!nodes_empty(tmp)) { |
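A concrete run of this selection (using the <s, d> remapping the comment describes): with from_nodes = {0,1} and to_nodes = {1,2}, the first scan pairs s = 0 with d = 1, but node 1 is still set in tmp (it has memory of its own to move out), so the pair is only remembered; the scan then finds s = 1, d = 2, where node 2 is not in tmp, and breaks out to migrate <1,2> first, clearing bit 1. The next pass over tmp = {0} pairs s = 0 with the now-drained node 1, so <0,1> migrates last; outgoing memory leaves each node before incoming memory arrives.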
@@ -1049,11 +1076,11 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1049 | if (!IS_ERR(vma)) { | 1076 | if (!IS_ERR(vma)) { |
1050 | int nr_failed = 0; | 1077 | int nr_failed = 0; |
1051 | 1078 | ||
1052 | err = mbind_range(vma, start, end, new); | 1079 | err = mbind_range(mm, start, end, new); |
1053 | 1080 | ||
1054 | if (!list_empty(&pagelist)) | 1081 | if (!list_empty(&pagelist)) |
1055 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1082 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1056 | (unsigned long)vma); | 1083 | (unsigned long)vma, 0); |
1057 | 1084 | ||
1058 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1085 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
1059 | err = -EIO; | 1086 | err = -EIO; |
@@ -1565,6 +1592,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1565 | } | 1592 | } |
1566 | return zl; | 1593 | return zl; |
1567 | } | 1594 | } |
1595 | |||
1596 | /* | ||
1597 | * init_nodemask_of_mempolicy | ||
1598 | * | ||
1599 | * If the current task's mempolicy is "default" [NULL], return 'false' | ||
1600 | * to indicate default policy. Otherwise, extract the policy nodemask | ||
1601 | * for 'bind' or 'interleave' policy into the argument nodemask, or | ||
1602 | * initialize the argument nodemask to contain the single node for | ||
1603 | * 'preferred' or 'local' policy and return 'true' to indicate presence | ||
1604 | * of non-default mempolicy. | ||
1605 | * | ||
1606 | * We don't bother with reference counting the mempolicy [mpol_get/put] | ||
1607 | * because the current task is examining its own mempolicy and a task's | ||
1608 | * mempolicy is only ever changed by the task itself. | ||
1609 | * | ||
1610 | * N.B., it is the caller's responsibility to free a returned nodemask. | ||
1611 | */ | ||
1612 | bool init_nodemask_of_mempolicy(nodemask_t *mask) | ||
1613 | { | ||
1614 | struct mempolicy *mempolicy; | ||
1615 | int nid; | ||
1616 | |||
1617 | if (!(mask && current->mempolicy)) | ||
1618 | return false; | ||
1619 | |||
1620 | mempolicy = current->mempolicy; | ||
1621 | switch (mempolicy->mode) { | ||
1622 | case MPOL_PREFERRED: | ||
1623 | if (mempolicy->flags & MPOL_F_LOCAL) | ||
1624 | nid = numa_node_id(); | ||
1625 | else | ||
1626 | nid = mempolicy->v.preferred_node; | ||
1627 | init_nodemask_of_node(mask, nid); | ||
1628 | break; | ||
1629 | |||
1630 | case MPOL_BIND: | ||
1631 | /* Fall through */ | ||
1632 | case MPOL_INTERLEAVE: | ||
1633 | *mask = mempolicy->v.nodes; | ||
1634 | break; | ||
1635 | |||
1636 | default: | ||
1637 | BUG(); | ||
1638 | } | ||
1639 | |||
1640 | return true; | ||
1641 | } | ||
1568 | #endif | 1642 | #endif |
1569 | 1643 | ||
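A sketch of the intended caller pattern for init_nodemask_of_mempolicy() (the hugetlb sysctl handlers are the expected users; this assumes the NODEMASK_ALLOC(type, name, gfp)/NODEMASK_FREE stack-friendly nodemask helpers):

    NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);

    if (!nodes_allowed || !init_nodemask_of_mempolicy(nodes_allowed)) {
            /* default policy (or allocation failure): use all memory nodes */
            NODEMASK_FREE(nodes_allowed);
            nodes_allowed = &node_states[N_HIGH_MEMORY];
    }
    /* ... adjust the huge page pool only on nodes in *nodes_allowed ... */
    if (nodes_allowed != &node_states[N_HIGH_MEMORY])
            NODEMASK_FREE(nodes_allowed);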
1570 | /* Allocate a page in interleaved policy. | 1644 | /* Allocate a page in interleaved policy. |
@@ -1685,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
1685 | 1759 | ||
1686 | if (!new) | 1760 | if (!new) |
1687 | return ERR_PTR(-ENOMEM); | 1761 | return ERR_PTR(-ENOMEM); |
1762 | rcu_read_lock(); | ||
1688 | if (current_cpuset_is_being_rebound()) { | 1763 | if (current_cpuset_is_being_rebound()) { |
1689 | nodemask_t mems = cpuset_mems_allowed(current); | 1764 | nodemask_t mems = cpuset_mems_allowed(current); |
1690 | mpol_rebind_policy(old, &mems); | 1765 | mpol_rebind_policy(old, &mems); |
1691 | } | 1766 | } |
1767 | rcu_read_unlock(); | ||
1692 | *new = *old; | 1768 | *new = *old; |
1693 | atomic_set(&new->refcnt, 1); | 1769 | atomic_set(&new->refcnt, 1); |
1694 | return new; | 1770 | return new; |
@@ -2122,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2122 | char *rest = nodelist; | 2198 | char *rest = nodelist; |
2123 | while (isdigit(*rest)) | 2199 | while (isdigit(*rest)) |
2124 | rest++; | 2200 | rest++; |
2125 | if (!*rest) | 2201 | if (*rest) |
2126 | err = 0; | 2202 | goto out; |
2127 | } | 2203 | } |
2128 | break; | 2204 | break; |
2129 | case MPOL_INTERLEAVE: | 2205 | case MPOL_INTERLEAVE: |
@@ -2132,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2132 | */ | 2208 | */ |
2133 | if (!nodelist) | 2209 | if (!nodelist) |
2134 | nodes = node_states[N_HIGH_MEMORY]; | 2210 | nodes = node_states[N_HIGH_MEMORY]; |
2135 | err = 0; | ||
2136 | break; | 2211 | break; |
2137 | case MPOL_LOCAL: | 2212 | case MPOL_LOCAL: |
2138 | /* | 2213 | /* |
@@ -2142,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2142 | goto out; | 2217 | goto out; |
2143 | mode = MPOL_PREFERRED; | 2218 | mode = MPOL_PREFERRED; |
2144 | break; | 2219 | break; |
2145 | 2220 | case MPOL_DEFAULT: | |
2146 | /* | 2221 | /* |
2147 | * case MPOL_BIND: mpol_new() enforces non-empty nodemask. | 2222 | * Insist on an empty nodelist |
2148 | * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. | 2223 | */ |
2149 | */ | 2224 | if (!nodelist) |
2225 | err = 0; | ||
2226 | goto out; | ||
2227 | case MPOL_BIND: | ||
2228 | /* | ||
2229 | * Insist on a nodelist | ||
2230 | */ | ||
2231 | if (!nodelist) | ||
2232 | goto out; | ||
2150 | } | 2233 | } |
2151 | 2234 | ||
2152 | mode_flags = 0; | 2235 | mode_flags = 0; |
@@ -2160,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2160 | else if (!strcmp(flags, "relative")) | 2243 | else if (!strcmp(flags, "relative")) |
2161 | mode_flags |= MPOL_F_RELATIVE_NODES; | 2244 | mode_flags |= MPOL_F_RELATIVE_NODES; |
2162 | else | 2245 | else |
2163 | err = 1; | 2246 | goto out; |
2164 | } | 2247 | } |
2165 | 2248 | ||
2166 | new = mpol_new(mode, mode_flags, &nodes); | 2249 | new = mpol_new(mode, mode_flags, &nodes); |
2167 | if (IS_ERR(new)) | 2250 | if (IS_ERR(new)) |
2168 | err = 1; | 2251 | goto out; |
2169 | else { | 2252 | |
2253 | { | ||
2170 | int ret; | 2254 | int ret; |
2171 | NODEMASK_SCRATCH(scratch); | 2255 | NODEMASK_SCRATCH(scratch); |
2172 | if (scratch) { | 2256 | if (scratch) { |
@@ -2177,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2177 | ret = -ENOMEM; | 2261 | ret = -ENOMEM; |
2178 | NODEMASK_SCRATCH_FREE(scratch); | 2262 | NODEMASK_SCRATCH_FREE(scratch); |
2179 | if (ret) { | 2263 | if (ret) { |
2180 | err = 1; | ||
2181 | mpol_put(new); | 2264 | mpol_put(new); |
2182 | } else if (no_context) { | 2265 | goto out; |
2183 | /* save for contextualization */ | ||
2184 | new->w.user_nodemask = nodes; | ||
2185 | } | 2266 | } |
2186 | } | 2267 | } |
2268 | err = 0; | ||
2269 | if (no_context) { | ||
2270 | /* save for contextualization */ | ||
2271 | new->w.user_nodemask = nodes; | ||
2272 | } | ||
2187 | 2273 | ||
2188 | out: | 2274 | out: |
2189 | /* Restore string for error message */ | 2275 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c index 7dbcb22316d2..d3f3f7f81075 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
22 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
23 | #include <linux/pagevec.h> | 23 | #include <linux/pagevec.h> |
24 | #include <linux/ksm.h> | ||
24 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
25 | #include <linux/topology.h> | 26 | #include <linux/topology.h> |
26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
@@ -31,6 +32,7 @@ | |||
31 | #include <linux/security.h> | 32 | #include <linux/security.h> |
32 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
33 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/gfp.h> | ||
34 | 36 | ||
35 | #include "internal.h" | 37 | #include "internal.h" |
36 | 38 | ||
@@ -78,8 +80,8 @@ int putback_lru_pages(struct list_head *l) | |||
78 | /* | 80 | /* |
79 | * Restore a potential migration pte to a working pte entry | 81 | * Restore a potential migration pte to a working pte entry |
80 | */ | 82 | */ |
81 | static void remove_migration_pte(struct vm_area_struct *vma, | 83 | static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, |
82 | struct page *old, struct page *new) | 84 | unsigned long addr, void *old) |
83 | { | 85 | { |
84 | struct mm_struct *mm = vma->vm_mm; | 86 | struct mm_struct *mm = vma->vm_mm; |
85 | swp_entry_t entry; | 87 | swp_entry_t entry; |
@@ -88,40 +90,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
88 | pmd_t *pmd; | 90 | pmd_t *pmd; |
89 | pte_t *ptep, pte; | 91 | pte_t *ptep, pte; |
90 | spinlock_t *ptl; | 92 | spinlock_t *ptl; |
91 | unsigned long addr = page_address_in_vma(new, vma); | ||
92 | |||
93 | if (addr == -EFAULT) | ||
94 | return; | ||
95 | 93 | ||
96 | pgd = pgd_offset(mm, addr); | 94 | pgd = pgd_offset(mm, addr); |
97 | if (!pgd_present(*pgd)) | 95 | if (!pgd_present(*pgd)) |
98 | return; | 96 | goto out; |
99 | 97 | ||
100 | pud = pud_offset(pgd, addr); | 98 | pud = pud_offset(pgd, addr); |
101 | if (!pud_present(*pud)) | 99 | if (!pud_present(*pud)) |
102 | return; | 100 | goto out; |
103 | 101 | ||
104 | pmd = pmd_offset(pud, addr); | 102 | pmd = pmd_offset(pud, addr); |
105 | if (!pmd_present(*pmd)) | 103 | if (!pmd_present(*pmd)) |
106 | return; | 104 | goto out; |
107 | 105 | ||
108 | ptep = pte_offset_map(pmd, addr); | 106 | ptep = pte_offset_map(pmd, addr); |
109 | 107 | ||
110 | if (!is_swap_pte(*ptep)) { | 108 | if (!is_swap_pte(*ptep)) { |
111 | pte_unmap(ptep); | 109 | pte_unmap(ptep); |
112 | return; | 110 | goto out; |
113 | } | 111 | } |
114 | 112 | ||
115 | ptl = pte_lockptr(mm, pmd); | 113 | ptl = pte_lockptr(mm, pmd); |
116 | spin_lock(ptl); | 114 | spin_lock(ptl); |
117 | pte = *ptep; | 115 | pte = *ptep; |
118 | if (!is_swap_pte(pte)) | 116 | if (!is_swap_pte(pte)) |
119 | goto out; | 117 | goto unlock; |
120 | 118 | ||
121 | entry = pte_to_swp_entry(pte); | 119 | entry = pte_to_swp_entry(pte); |
122 | 120 | ||
123 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 121 | if (!is_migration_entry(entry) || |
124 | goto out; | 122 | migration_entry_to_page(entry) != old) |
123 | goto unlock; | ||
125 | 124 | ||
126 | get_page(new); | 125 | get_page(new); |
127 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 126 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
@@ -136,59 +135,11 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
136 | page_add_file_rmap(new); | 135 | page_add_file_rmap(new); |
137 | 136 | ||
138 | /* No need to invalidate - it was non-present before */ | 137 | /* No need to invalidate - it was non-present before */ |
139 | update_mmu_cache(vma, addr, pte); | 138 | update_mmu_cache(vma, addr, ptep); |
140 | 139 | unlock: | |
141 | out: | ||
142 | pte_unmap_unlock(ptep, ptl); | 140 | pte_unmap_unlock(ptep, ptl); |
143 | } | 141 | out: |
144 | 142 | return SWAP_AGAIN; | |
145 | /* | ||
146 | * Note that remove_file_migration_ptes will only work on regular mappings, | ||
147 | * Nonlinear mappings do not use migration entries. | ||
148 | */ | ||
149 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
150 | { | ||
151 | struct vm_area_struct *vma; | ||
152 | struct address_space *mapping = new->mapping; | ||
153 | struct prio_tree_iter iter; | ||
154 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
155 | |||
156 | if (!mapping) | ||
157 | return; | ||
158 | |||
159 | spin_lock(&mapping->i_mmap_lock); | ||
160 | |||
161 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
162 | remove_migration_pte(vma, old, new); | ||
163 | |||
164 | spin_unlock(&mapping->i_mmap_lock); | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Must hold mmap_sem lock on at least one of the vmas containing | ||
169 | * the page so that the anon_vma cannot vanish. | ||
170 | */ | ||
171 | static void remove_anon_migration_ptes(struct page *old, struct page *new) | ||
172 | { | ||
173 | struct anon_vma *anon_vma; | ||
174 | struct vm_area_struct *vma; | ||
175 | unsigned long mapping; | ||
176 | |||
177 | mapping = (unsigned long)new->mapping; | ||
178 | |||
179 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
180 | return; | ||
181 | |||
182 | /* | ||
183 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
184 | */ | ||
185 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
186 | spin_lock(&anon_vma->lock); | ||
187 | |||
188 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
189 | remove_migration_pte(vma, old, new); | ||
190 | |||
191 | spin_unlock(&anon_vma->lock); | ||
192 | } | 143 | } |
193 | 144 | ||
194 | /* | 145 | /* |
@@ -197,10 +148,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) | |||
197 | */ | 148 | */ |
198 | static void remove_migration_ptes(struct page *old, struct page *new) | 149 | static void remove_migration_ptes(struct page *old, struct page *new) |
199 | { | 150 | { |
200 | if (PageAnon(new)) | 151 | rmap_walk(new, remove_migration_pte, old); |
201 | remove_anon_migration_ptes(old, new); | ||
202 | else | ||
203 | remove_file_migration_ptes(old, new); | ||
204 | } | 152 | } |
205 | 153 | ||
206 | /* | 154 | /* |
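remove_migration_ptes() now leans on rmap_walk(), which handles the anon/KSM/file cases itself and calls the supplied function once for each VMA mapping the page. A minimal sketch of the callback contract (illustrative callback, not from this patch):

    /* invoked for every vma that maps 'page'; 'arg' is passed through.
     * Return SWAP_AGAIN to keep walking the reverse mappings. */
    static int show_one_mapping(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, void *arg)
    {
            printk(KERN_DEBUG "page %p mapped at %#lx in mm %p\n",
                   page, addr, vma->vm_mm);
            return SWAP_AGAIN;
    }

    /* ... */
    rmap_walk(page, show_one_mapping, NULL);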
@@ -328,8 +276,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
328 | */ | 276 | */ |
329 | static void migrate_page_copy(struct page *newpage, struct page *page) | 277 | static void migrate_page_copy(struct page *newpage, struct page *page) |
330 | { | 278 | { |
331 | int anon; | ||
332 | |||
333 | copy_highpage(newpage, page); | 279 | copy_highpage(newpage, page); |
334 | 280 | ||
335 | if (PageError(page)) | 281 | if (PageError(page)) |
@@ -341,8 +287,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
341 | if (TestClearPageActive(page)) { | 287 | if (TestClearPageActive(page)) { |
342 | VM_BUG_ON(PageUnevictable(page)); | 288 | VM_BUG_ON(PageUnevictable(page)); |
343 | SetPageActive(newpage); | 289 | SetPageActive(newpage); |
344 | } else | 290 | } else if (TestClearPageUnevictable(page)) |
345 | unevictable_migrate_page(newpage, page); | 291 | SetPageUnevictable(newpage); |
346 | if (PageChecked(page)) | 292 | if (PageChecked(page)) |
347 | SetPageChecked(newpage); | 293 | SetPageChecked(newpage); |
348 | if (PageMappedToDisk(page)) | 294 | if (PageMappedToDisk(page)) |
@@ -361,12 +307,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
361 | } | 307 | } |
362 | 308 | ||
363 | mlock_migrate_page(newpage, page); | 309 | mlock_migrate_page(newpage, page); |
310 | ksm_migrate_page(newpage, page); | ||
364 | 311 | ||
365 | ClearPageSwapCache(page); | 312 | ClearPageSwapCache(page); |
366 | ClearPagePrivate(page); | 313 | ClearPagePrivate(page); |
367 | set_page_private(page, 0); | 314 | set_page_private(page, 0); |
368 | /* page->mapping contains a flag for PageAnon() */ | ||
369 | anon = PageAnon(page); | ||
370 | page->mapping = NULL; | 315 | page->mapping = NULL; |
371 | 316 | ||
372 | /* | 317 | /* |
@@ -580,9 +525,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
580 | else | 525 | else |
581 | rc = fallback_migrate_page(mapping, newpage, page); | 526 | rc = fallback_migrate_page(mapping, newpage, page); |
582 | 527 | ||
583 | if (!rc) { | 528 | if (!rc) |
584 | remove_migration_ptes(page, newpage); | 529 | remove_migration_ptes(page, newpage); |
585 | } else | 530 | else |
586 | newpage->mapping = NULL; | 531 | newpage->mapping = NULL; |
587 | 532 | ||
588 | unlock_page(newpage); | 533 | unlock_page(newpage); |
@@ -595,7 +540,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
595 | * to the newly allocated page in newpage. | 540 | * to the newly allocated page in newpage. |
596 | */ | 541 | */ |
597 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 542 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
598 | struct page *page, int force) | 543 | struct page *page, int force, int offlining) |
599 | { | 544 | { |
600 | int rc = 0; | 545 | int rc = 0; |
601 | int *result = NULL; | 546 | int *result = NULL; |
@@ -621,6 +566,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
621 | lock_page(page); | 566 | lock_page(page); |
622 | } | 567 | } |
623 | 568 | ||
569 | /* | ||
570 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
571 | * and can safely migrate a KSM page. The other cases have skipped | ||
572 | * PageKsm along with PageReserved - but it is only now when we have | ||
573 | * the page lock that we can be certain it will not go KSM beneath us | ||
574 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
575 | * its pagecount raised, but only here do we take the page lock which | ||
576 | * serializes that). | ||
577 | */ | ||
578 | if (PageKsm(page) && !offlining) { | ||
579 | rc = -EBUSY; | ||
580 | goto unlock; | ||
581 | } | ||
582 | |||
624 | /* charge against new page */ | 583 | /* charge against new page */ |
625 | charge = mem_cgroup_prepare_migration(page, &mem); | 584 | charge = mem_cgroup_prepare_migration(page, &mem); |
626 | if (charge == -ENOMEM) { | 585 | if (charge == -ENOMEM) { |
@@ -737,7 +696,7 @@ move_newpage: | |||
737 | * Return: Number of pages not migrated or error code. | 696 | * Return: Number of pages not migrated or error code. |
738 | */ | 697 | */ |
739 | int migrate_pages(struct list_head *from, | 698 | int migrate_pages(struct list_head *from, |
740 | new_page_t get_new_page, unsigned long private) | 699 | new_page_t get_new_page, unsigned long private, int offlining) |
741 | { | 700 | { |
742 | int retry = 1; | 701 | int retry = 1; |
743 | int nr_failed = 0; | 702 | int nr_failed = 0; |
@@ -746,13 +705,6 @@ int migrate_pages(struct list_head *from, | |||
746 | struct page *page2; | 705 | struct page *page2; |
747 | int swapwrite = current->flags & PF_SWAPWRITE; | 706 | int swapwrite = current->flags & PF_SWAPWRITE; |
748 | int rc; | 707 | int rc; |
749 | unsigned long flags; | ||
750 | |||
751 | local_irq_save(flags); | ||
752 | list_for_each_entry(page, from, lru) | ||
753 | __inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
754 | page_is_file_cache(page)); | ||
755 | local_irq_restore(flags); | ||
756 | 708 | ||
757 | if (!swapwrite) | 709 | if (!swapwrite) |
758 | current->flags |= PF_SWAPWRITE; | 710 | current->flags |= PF_SWAPWRITE; |
@@ -764,7 +716,7 @@ int migrate_pages(struct list_head *from, | |||
764 | cond_resched(); | 716 | cond_resched(); |
765 | 717 | ||
766 | rc = unmap_and_move(get_new_page, private, | 718 | rc = unmap_and_move(get_new_page, private, |
767 | page, pass > 2); | 719 | page, pass > 2, offlining); |
768 | 720 | ||
769 | switch(rc) { | 721 | switch(rc) { |
770 | case -ENOMEM: | 722 | case -ENOMEM: |
@@ -860,7 +812,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
860 | if (!page) | 812 | if (!page) |
861 | goto set_status; | 813 | goto set_status; |
862 | 814 | ||
863 | if (PageReserved(page)) /* Check for zero page */ | 815 | /* Use PageReserved to check for zero page */ |
816 | if (PageReserved(page) || PageKsm(page)) | ||
864 | goto put_and_set; | 817 | goto put_and_set; |
865 | 818 | ||
866 | pp->page = page; | 819 | pp->page = page; |
@@ -878,8 +831,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
878 | goto put_and_set; | 831 | goto put_and_set; |
879 | 832 | ||
880 | err = isolate_lru_page(page); | 833 | err = isolate_lru_page(page); |
881 | if (!err) | 834 | if (!err) { |
882 | list_add_tail(&page->lru, &pagelist); | 835 | list_add_tail(&page->lru, &pagelist); |
836 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
837 | page_is_file_cache(page)); | ||
838 | } | ||
883 | put_and_set: | 839 | put_and_set: |
884 | /* | 840 | /* |
885 | * Either remove the duplicate refcount from | 841 | * Either remove the duplicate refcount from |
@@ -894,7 +850,7 @@ set_status: | |||
894 | err = 0; | 850 | err = 0; |
895 | if (!list_empty(&pagelist)) | 851 | if (!list_empty(&pagelist)) |
896 | err = migrate_pages(&pagelist, new_page_node, | 852 | err = migrate_pages(&pagelist, new_page_node, |
897 | (unsigned long)pm); | 853 | (unsigned long)pm, 0); |
898 | 854 | ||
899 | up_read(&mm->mmap_sem); | 855 | up_read(&mm->mmap_sem); |
900 | return err; | 856 | return err; |
@@ -953,6 +909,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
953 | goto out_pm; | 909 | goto out_pm; |
954 | 910 | ||
955 | err = -ENODEV; | 911 | err = -ENODEV; |
912 | if (node < 0 || node >= MAX_NUMNODES) | ||
913 | goto out_pm; | ||
914 | |||
956 | if (!node_state(node, N_HIGH_MEMORY)) | 915 | if (!node_state(node, N_HIGH_MEMORY)) |
957 | goto out_pm; | 916 | goto out_pm; |
958 | 917 | ||
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1015 | 974 | ||
1016 | err = -ENOENT; | 975 | err = -ENOENT; |
1017 | /* Use PageReserved to check for zero page */ | 976 | /* Use PageReserved to check for zero page */ |
1018 | if (!page || PageReserved(page)) | 977 | if (!page || PageReserved(page) || PageKsm(page)) |
1019 | goto set_status; | 978 | goto set_status; |
1020 | 979 | ||
1021 | err = page_to_nid(page); | 980 | err = page_to_nid(page); |
@@ -1040,33 +999,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | |||
1040 | #define DO_PAGES_STAT_CHUNK_NR 16 | 999 | #define DO_PAGES_STAT_CHUNK_NR 16 |
1041 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; | 1000 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; |
1042 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; | 1001 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; |
1043 | unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; | ||
1044 | int err; | ||
1045 | 1002 | ||
1046 | for (i = 0; i < nr_pages; i += chunk_nr) { | 1003 | while (nr_pages) { |
1047 | if (chunk_nr + i > nr_pages) | 1004 | unsigned long chunk_nr; |
1048 | chunk_nr = nr_pages - i; | ||
1049 | 1005 | ||
1050 | err = copy_from_user(chunk_pages, &pages[i], | 1006 | chunk_nr = nr_pages; |
1051 | chunk_nr * sizeof(*chunk_pages)); | 1007 | if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) |
1052 | if (err) { | 1008 | chunk_nr = DO_PAGES_STAT_CHUNK_NR; |
1053 | err = -EFAULT; | 1009 | |
1054 | goto out; | 1010 | if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) |
1055 | } | 1011 | break; |
1056 | 1012 | ||
1057 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); | 1013 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); |
1058 | 1014 | ||
1059 | err = copy_to_user(&status[i], chunk_status, | 1015 | if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) |
1060 | chunk_nr * sizeof(*chunk_status)); | 1016 | break; |
1061 | if (err) { | ||
1062 | err = -EFAULT; | ||
1063 | goto out; | ||
1064 | } | ||
1065 | } | ||
1066 | err = 0; | ||
1067 | 1017 | ||
1068 | out: | 1018 | pages += chunk_nr; |
1069 | return err; | 1019 | status += chunk_nr; |
1020 | nr_pages -= chunk_nr; | ||
1021 | } | ||
1022 | return nr_pages ? -EFAULT : 0; | ||
1070 | } | 1023 | } |
1071 | 1024 | ||
1072 | /* | 1025 | /* |
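do_pages_stat() implements the query mode of move_pages(2): when the caller passes a NULL 'nodes' array, only the current node of each page is reported, and the loop above bounds each copy_from_user/copy_to_user round trip to 16 entries of on-stack scratch. A userspace sketch (assuming the libnuma move_pages wrapper; 'buf' is a page-aligned mapping):

    #include <numaif.h>
    #include <stdio.h>

    void *pages[2] = { buf, buf + 4096 };
    int status[2];

    /* nodes == NULL: just ask which node each page currently lives on */
    if (move_pages(0 /* current process */, 2, pages, NULL, status, 0) == 0)
            printf("page 0 on node %d, page 1 on node %d\n",
                   status[0], status[1]);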
diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..f77433c20279 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -7,13 +7,14 @@ | |||
7 | /* | 7 | /* |
8 | * The mincore() system call. | 8 | * The mincore() system call. |
9 | */ | 9 | */ |
10 | #include <linux/slab.h> | ||
11 | #include <linux/pagemap.h> | 10 | #include <linux/pagemap.h> |
11 | #include <linux/gfp.h> | ||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/swapops.h> | 16 | #include <linux/swapops.h> |
17 | #include <linux/hugetlb.h> | ||
17 | 18 | ||
18 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag | |||
72 | if (!vma || addr < vma->vm_start) | 73 | if (!vma || addr < vma->vm_start) |
73 | return -ENOMEM; | 74 | return -ENOMEM; |
74 | 75 | ||
76 | #ifdef CONFIG_HUGETLB_PAGE | ||
77 | if (is_vm_hugetlb_page(vma)) { | ||
78 | struct hstate *h; | ||
79 | unsigned long nr_huge; | ||
80 | unsigned char present; | ||
81 | |||
82 | i = 0; | ||
83 | nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); | ||
84 | h = hstate_vma(vma); | ||
85 | nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) | ||
86 | - (addr >> huge_page_shift(h)) + 1; | ||
87 | nr_huge = min(nr_huge, | ||
88 | (vma->vm_end - addr) >> huge_page_shift(h)); | ||
89 | while (1) { | ||
90 | /* hugepage always in RAM for now, | ||
91 | * but generally it needs to be checked */ | ||
92 | ptep = huge_pte_offset(current->mm, | ||
93 | addr & huge_page_mask(h)); | ||
94 | present = !!(ptep && | ||
95 | !huge_pte_none(huge_ptep_get(ptep))); | ||
96 | while (1) { | ||
97 | vec[i++] = present; | ||
98 | addr += PAGE_SIZE; | ||
99 | /* reach buffer limit */ | ||
100 | if (i == nr) | ||
101 | return nr; | ||
102 | /* check hugepage border */ | ||
103 | if (!((addr & ~huge_page_mask(h)) | ||
104 | >> PAGE_SHIFT)) | ||
105 | break; | ||
106 | } | ||
107 | } | ||
108 | return nr; | ||
109 | } | ||
110 | #endif | ||
111 | |||
75 | /* | 112 | /* |
76 | * Calculate how many pages there are left in the last level of the | 113 | * Calculate how many pages there are left in the last level of the |
77 | * PTE array for our address. | 114 | * PTE array for our address. |
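With the hugetlb branch above, mincore() reports residency at small-page granularity even inside a hugetlb VMA, repeating each huge page's present bit across all of its PAGE_SIZE slots and stopping correctly at huge page borders. A userspace sketch (addr is assumed to point into a 2 MiB hugetlb mapping):

    #include <sys/mman.h>
    #include <stdio.h>

    unsigned char vec[512];                     /* one byte per 4 KiB page */

    if (mincore(addr, 512 * 4096, vec) == 0)
            /* all 512 bytes mirror the single huge page's presence */
            printf("huge page %s resident\n", vec[0] ? "is" : "is not");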
diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -25,7 +25,7 @@ int can_do_mlock(void) | |||
25 | { | 25 | { |
26 | if (capable(CAP_IPC_LOCK)) | 26 | if (capable(CAP_IPC_LOCK)) |
27 | return 1; | 27 | return 1; |
28 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | 28 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
29 | return 1; | 29 | return 1; |
30 | return 0; | 30 | return 0; |
31 | } | 31 | } |
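The rlimit() helper these hunks switch to is, as far as this diff shows, just shorthand for the open-coded soft-limit read it replaces; something equivalent to:

    /* sketch of the helper (the real one lives in linux/sched.h) */
    static inline unsigned long rlimit(unsigned int limit)
    {
            return ACCESS_ONCE(current->signal->rlim[limit].rlim_cur);
    }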
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) | |||
88 | } | 88 | } |
89 | } | 89 | } |
90 | 90 | ||
91 | /* | 91 | /** |
92 | * called from munlock()/munmap() path with page supposedly on the LRU. | 92 | * munlock_vma_page - munlock a vma page |
93 | * @page - page to be unlocked | ||
93 | * | 94 | * |
94 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | 95 | * called from munlock()/munmap() path with page supposedly on the LRU. |
95 | * [in try_to_munlock()] and then attempt to isolate the page. We must | 96 | * When we munlock a page, because the vma where we found the page is being |
96 | * isolate the page to keep others from messing with its unevictable | 97 | * munlock()ed or munmap()ed, we want to check whether other vmas hold the |
97 | * and mlocked state while trying to munlock. However, we pre-clear the | 98 | * page locked so that we can leave it on the unevictable lru list and not |
98 | * mlocked state anyway as we might lose the isolation race and we might | 99 | * bother vmscan with it. However, to walk the page's rmap list in |
99 | * not get another chance to clear PageMlocked. If we successfully | 100 | * try_to_munlock() we must isolate the page from the LRU. If some other |
100 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | 101 | * task has removed the page from the LRU, we won't be able to do that. |
101 | * mapping the page, it will restore the PageMlocked state, unless the page | 102 | * So we clear the PageMlocked as we might not get another chance. If we |
102 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | 103 | * can't isolate the page, we leave it for putback_lru_page() and vmscan |
103 | * perhaps redundantly. | 104 | * [page_referenced()/try_to_unmap()] to deal with. |
104 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
105 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
106 | * either of which will restore the PageMlocked state by calling | ||
107 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
108 | */ | 105 | */ |
109 | static void munlock_vma_page(struct page *page) | 106 | void munlock_vma_page(struct page *page) |
110 | { | 107 | { |
111 | BUG_ON(!PageLocked(page)); | 108 | BUG_ON(!PageLocked(page)); |
112 | 109 | ||
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) | |||
117 | /* | 114 | /* |
118 | * did try_to_unlock() succeed or punt? | 115 | * did try_to_unlock() succeed or punt? |
119 | */ | 116 | */ |
120 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | 117 | if (ret != SWAP_MLOCK) |
121 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 118 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
122 | 119 | ||
123 | putback_lru_page(page); | 120 | putback_lru_page(page); |
124 | } else { | 121 | } else { |
125 | /* | 122 | /* |
126 | * We lost the race. let try_to_unmap() deal | 123 | * Some other task has removed the page from the LRU. |
127 | * with it. At least we get the page state and | 124 | * putback_lru_page() will take care of removing the |
128 | * mlock stats right. However, page is still on | 125 | * page from the unevictable list, if necessary. |
129 | * the noreclaim list. We'll fix that up when | 126 | * vmscan [page_referenced()] will move the page back |
130 | * the page is eventually freed or we scan the | 127 | * to the unevictable list if some other vma has it |
131 | * noreclaim list. | 128 | * mlocked. |
132 | */ | 129 | */ |
133 | if (PageUnevictable(page)) | 130 | if (PageUnevictable(page)) |
134 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 131 | count_vm_event(UNEVICTABLE_PGSTRANDED); |
@@ -490,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
490 | locked = len >> PAGE_SHIFT; | 487 | locked = len >> PAGE_SHIFT; |
491 | locked += current->mm->locked_vm; | 488 | locked += current->mm->locked_vm; |
492 | 489 | ||
493 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 490 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
494 | lock_limit >>= PAGE_SHIFT; | 491 | lock_limit >>= PAGE_SHIFT; |
495 | 492 | ||
496 | /* check against resource limits */ | 493 | /* check against resource limits */ |
@@ -553,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
553 | 550 | ||
554 | down_write(¤t->mm->mmap_sem); | 551 | down_write(¤t->mm->mmap_sem); |
555 | 552 | ||
556 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 553 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
557 | lock_limit >>= PAGE_SHIFT; | 554 | lock_limit >>= PAGE_SHIFT; |
558 | 555 | ||
559 | ret = -ENOMEM; | 556 | ret = -ENOMEM; |
@@ -587,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
587 | int allowed = 0; | 584 | int allowed = 0; |
588 | 585 | ||
589 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
590 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 587 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
591 | if (lock_limit == RLIM_INFINITY) | 588 | if (lock_limit == RLIM_INFINITY) |
592 | allowed = 1; | 589 | allowed = 1; |
593 | lock_limit >>= PAGE_SHIFT; | 590 | lock_limit >>= PAGE_SHIFT; |
@@ -621,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
621 | 618 | ||
622 | down_write(&mm->mmap_sem); | 619 | down_write(&mm->mmap_sem); |
623 | 620 | ||
624 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; |
625 | vm = mm->total_vm + pgsz; | 622 | vm = mm->total_vm + pgsz; |
626 | if (lim < vm) | 623 | if (lim < vm) |
627 | goto out; | 624 | goto out; |
628 | 625 | ||
629 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; |
630 | vm = mm->locked_vm + pgsz; | 627 | vm = mm->locked_vm + pgsz; |
631 | if (lim < vm) | 628 | if (lim < vm) |
632 | goto out; | 629 | goto out; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/ima.h> | ||
24 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
25 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
26 | #include <linux/module.h> | 25 | #include <linux/module.h> |
@@ -266,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
266 | * segment grow beyond its set limit in the case where the limit is | 265 | * segment grow beyond its set limit in the case where the limit is |
267 | * not page aligned -Ram Gupta | 266 | * not page aligned -Ram Gupta |
268 | */ | 267 | */ |
269 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 268 | rlim = rlimit(RLIMIT_DATA); |
270 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
271 | (mm->end_data - mm->start_data) > rlim) | 270 | (mm->end_data - mm->start_data) > rlim) |
272 | goto out; | 271 | goto out; |
@@ -438,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
438 | { | 437 | { |
439 | __vma_link_list(mm, vma, prev, rb_parent); | 438 | __vma_link_list(mm, vma, prev, rb_parent); |
440 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
441 | __anon_vma_link(vma); | ||
442 | } | 440 | } |
443 | 441 | ||
444 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 442 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -500,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
500 | * are necessary. The "insert" vma (if any) is to be inserted | 498 | * are necessary. The "insert" vma (if any) is to be inserted |
501 | * before we drop the necessary locks. | 499 | * before we drop the necessary locks. |
502 | */ | 500 | */ |
503 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 501 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
504 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 502 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
505 | { | 503 | { |
506 | struct mm_struct *mm = vma->vm_mm; | 504 | struct mm_struct *mm = vma->vm_mm; |
@@ -509,11 +507,12 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
509 | struct address_space *mapping = NULL; | 507 | struct address_space *mapping = NULL; |
510 | struct prio_tree_root *root = NULL; | 508 | struct prio_tree_root *root = NULL; |
511 | struct file *file = vma->vm_file; | 509 | struct file *file = vma->vm_file; |
512 | struct anon_vma *anon_vma = NULL; | ||
513 | long adjust_next = 0; | 510 | long adjust_next = 0; |
514 | int remove_next = 0; | 511 | int remove_next = 0; |
515 | 512 | ||
516 | if (next && !insert) { | 513 | if (next && !insert) { |
514 | struct vm_area_struct *exporter = NULL; | ||
515 | |||
517 | if (end >= next->vm_end) { | 516 | if (end >= next->vm_end) { |
518 | /* | 517 | /* |
519 | * vma expands, overlapping all the next, and | 518 | * vma expands, overlapping all the next, and |
@@ -521,7 +520,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
521 | */ | 520 | */ |
522 | again: remove_next = 1 + (end > next->vm_end); | 521 | again: remove_next = 1 + (end > next->vm_end); |
523 | end = next->vm_end; | 522 | end = next->vm_end; |
524 | anon_vma = next->anon_vma; | 523 | exporter = next; |
525 | importer = vma; | 524 | importer = vma; |
526 | } else if (end > next->vm_start) { | 525 | } else if (end > next->vm_start) { |
527 | /* | 526 | /* |
@@ -529,7 +528,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
529 | * mprotect case 5 shifting the boundary up. | 528 | * mprotect case 5 shifting the boundary up. |
530 | */ | 529 | */ |
531 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; | 530 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
532 | anon_vma = next->anon_vma; | 531 | exporter = next; |
533 | importer = vma; | 532 | importer = vma; |
534 | } else if (end < vma->vm_end) { | 533 | } else if (end < vma->vm_end) { |
535 | /* | 534 | /* |
@@ -538,9 +537,20 @@ again: remove_next = 1 + (end > next->vm_end); | |||
538 | * mprotect case 4 shifting the boundary down. | 537 | * mprotect case 4 shifting the boundary down. |
539 | */ | 538 | */ |
540 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 539 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); |
541 | anon_vma = next->anon_vma; | 540 | exporter = vma; |
542 | importer = next; | 541 | importer = next; |
543 | } | 542 | } |
543 | |||
544 | /* | ||
545 | * Easily overlooked: when mprotect shifts the boundary, | ||
546 | * make sure the expanding vma has anon_vma set if the | ||
547 | * shrinking vma had, to cover any anon pages imported. | ||
548 | */ | ||
549 | if (exporter && exporter->anon_vma && !importer->anon_vma) { | ||
550 | if (anon_vma_clone(importer, exporter)) | ||
551 | return -ENOMEM; | ||
552 | importer->anon_vma = exporter->anon_vma; | ||
553 | } | ||
544 | } | 554 | } |
545 | 555 | ||
546 | if (file) { | 556 | if (file) { |
@@ -568,25 +578,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
568 | } | 578 | } |
569 | } | 579 | } |
570 | 580 | ||
571 | /* | ||
572 | * When changing only vma->vm_end, we don't really need | ||
573 | * anon_vma lock. | ||
574 | */ | ||
575 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
576 | anon_vma = vma->anon_vma; | ||
577 | if (anon_vma) { | ||
578 | spin_lock(&anon_vma->lock); | ||
579 | /* | ||
580 | * Easily overlooked: when mprotect shifts the boundary, | ||
581 | * make sure the expanding vma has anon_vma set if the | ||
582 | * shrinking vma had, to cover any anon pages imported. | ||
583 | */ | ||
584 | if (importer && !importer->anon_vma) { | ||
585 | importer->anon_vma = anon_vma; | ||
586 | __anon_vma_link(importer); | ||
587 | } | ||
588 | } | ||
589 | |||
590 | if (root) { | 581 | if (root) { |
591 | flush_dcache_mmap_lock(mapping); | 582 | flush_dcache_mmap_lock(mapping); |
592 | vma_prio_tree_remove(vma, root); | 583 | vma_prio_tree_remove(vma, root); |
@@ -617,8 +608,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
617 | __vma_unlink(mm, next, vma); | 608 | __vma_unlink(mm, next, vma); |
618 | if (file) | 609 | if (file) |
619 | __remove_shared_vm_struct(next, file, mapping); | 610 | __remove_shared_vm_struct(next, file, mapping); |
620 | if (next->anon_vma) | ||
621 | __anon_vma_merge(vma, next); | ||
622 | } else if (insert) { | 611 | } else if (insert) { |
623 | /* | 612 | /* |
624 | * split_vma has split insert from vma, and needs | 613 | * split_vma has split insert from vma, and needs |
@@ -628,8 +617,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
628 | __insert_vm_struct(mm, insert); | 617 | __insert_vm_struct(mm, insert); |
629 | } | 618 | } |
630 | 619 | ||
631 | if (anon_vma) | ||
632 | spin_unlock(&anon_vma->lock); | ||
633 | if (mapping) | 620 | if (mapping) |
634 | spin_unlock(&mapping->i_mmap_lock); | 621 | spin_unlock(&mapping->i_mmap_lock); |
635 | 622 | ||
@@ -639,6 +626,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
639 | if (next->vm_flags & VM_EXECUTABLE) | 626 | if (next->vm_flags & VM_EXECUTABLE) |
640 | removed_exe_file_vma(mm); | 627 | removed_exe_file_vma(mm); |
641 | } | 628 | } |
629 | if (next->anon_vma) | ||
630 | anon_vma_merge(vma, next); | ||
642 | mm->map_count--; | 631 | mm->map_count--; |
643 | mpol_put(vma_policy(next)); | 632 | mpol_put(vma_policy(next)); |
644 | kmem_cache_free(vm_area_cachep, next); | 633 | kmem_cache_free(vm_area_cachep, next); |
@@ -654,6 +643,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
654 | } | 643 | } |
655 | 644 | ||
656 | validate_mm(mm); | 645 | validate_mm(mm); |
646 | |||
647 | return 0; | ||
657 | } | 648 | } |
658 | 649 | ||
659 | /* | 650 | /* |
@@ -760,6 +751,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
760 | { | 751 | { |
761 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 752 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
762 | struct vm_area_struct *area, *next; | 753 | struct vm_area_struct *area, *next; |
754 | int err; | ||
763 | 755 | ||
764 | /* | 756 | /* |
765 | * We later require that vma->vm_flags == vm_flags, | 757 | * We later require that vma->vm_flags == vm_flags, |
@@ -793,11 +785,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
793 | is_mergeable_anon_vma(prev->anon_vma, | 785 | is_mergeable_anon_vma(prev->anon_vma, |
794 | next->anon_vma)) { | 786 | next->anon_vma)) { |
795 | /* cases 1, 6 */ | 787 | /* cases 1, 6 */ |
796 | vma_adjust(prev, prev->vm_start, | 788 | err = vma_adjust(prev, prev->vm_start, |
797 | next->vm_end, prev->vm_pgoff, NULL); | 789 | next->vm_end, prev->vm_pgoff, NULL); |
798 | } else /* cases 2, 5, 7 */ | 790 | } else /* cases 2, 5, 7 */ |
799 | vma_adjust(prev, prev->vm_start, | 791 | err = vma_adjust(prev, prev->vm_start, |
800 | end, prev->vm_pgoff, NULL); | 792 | end, prev->vm_pgoff, NULL); |
793 | if (err) | ||
794 | return NULL; | ||
801 | return prev; | 795 | return prev; |
802 | } | 796 | } |
803 | 797 | ||
@@ -809,11 +803,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
809 | can_vma_merge_before(next, vm_flags, | 803 | can_vma_merge_before(next, vm_flags, |
810 | anon_vma, file, pgoff+pglen)) { | 804 | anon_vma, file, pgoff+pglen)) { |
811 | if (prev && addr < prev->vm_end) /* case 4 */ | 805 | if (prev && addr < prev->vm_end) /* case 4 */ |
812 | vma_adjust(prev, prev->vm_start, | 806 | err = vma_adjust(prev, prev->vm_start, |
813 | addr, prev->vm_pgoff, NULL); | 807 | addr, prev->vm_pgoff, NULL); |
814 | else /* cases 3, 8 */ | 808 | else /* cases 3, 8 */ |
815 | vma_adjust(area, addr, next->vm_end, | 809 | err = vma_adjust(area, addr, next->vm_end, |
816 | next->vm_pgoff - pglen, NULL); | 810 | next->vm_pgoff - pglen, NULL); |
811 | if (err) | ||
812 | return NULL; | ||
817 | return area; | 813 | return area; |
818 | } | 814 | } |
819 | 815 | ||
@@ -821,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
821 | } | 817 | } |
822 | 818 | ||
823 | /* | 819 | /* |
820 | * Rough compatibility check to quickly see if it's even worth looking | ||
821 | * at sharing an anon_vma. | ||
822 | * | ||
823 | * They need to have the same vm_file, and the flags can only differ | ||
824 | * in things that mprotect may change. | ||
825 | * | ||
826 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that | ||
827 | * we can merge the two vma's. For example, we refuse to merge a vma if | ||
828 | * there is a vm_ops->close() function, because that indicates that the | ||
829 | * driver is doing some kind of reference counting. But that doesn't | ||
830 | * really matter for the anon_vma sharing case. | ||
831 | */ | ||
832 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) | ||
833 | { | ||
834 | return a->vm_end == b->vm_start && | ||
835 | mpol_equal(vma_policy(a), vma_policy(b)) && | ||
836 | a->vm_file == b->vm_file && | ||
837 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && | ||
838 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | ||
839 | } | ||
840 | |||
841 | /* | ||
842 | * Do some basic sanity checking to see if we can re-use the anon_vma | ||
843 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be | ||
844 | * the same as 'old', the other will be the new one that is trying | ||
845 | * to share the anon_vma. | ||
846 | * | ||
847 | * NOTE! This runs with mm_sem held for reading, so it is possible that | ||
848 | * the anon_vma of 'old' is concurrently in the process of being set up | ||
849 | * by another page fault trying to merge _that_. But that's ok: if it | ||
850 | * is being set up, that automatically means that it will be a singleton | ||
851 | * acceptable for merging, so we can do all of this optimistically. But | ||
852 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | ||
853 | * | ||
854 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | ||
855 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | ||
856 | * is to return an anon_vma that is "complex" due to having gone through | ||
857 | * a fork). | ||
858 | * | ||
859 | * We also make sure that the two vma's are compatible (adjacent, | ||
860 | * and with the same memory policies). That's all stable, even with just | ||
861 | * a read lock on the mm_sem. | ||
862 | */ | ||
863 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | ||
864 | { | ||
865 | if (anon_vma_compatible(a, b)) { | ||
866 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | ||
867 | |||
868 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | ||
869 | return anon_vma; | ||
870 | } | ||
871 | return NULL; | ||
872 | } | ||
873 | |||
874 | /* | ||
824 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | 875 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check |
825 | * neighbouring vmas for a suitable anon_vma, before it goes off | 876 | * neighbouring vmas for a suitable anon_vma, before it goes off |
826 | * to allocate a new anon_vma. It checks because a repetitive | 877 | * to allocate a new anon_vma. It checks because a repetitive |
@@ -830,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
830 | */ | 881 | */ |
831 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | 882 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
832 | { | 883 | { |
884 | struct anon_vma *anon_vma; | ||
833 | struct vm_area_struct *near; | 885 | struct vm_area_struct *near; |
834 | unsigned long vm_flags; | ||
835 | 886 | ||
836 | near = vma->vm_next; | 887 | near = vma->vm_next; |
837 | if (!near) | 888 | if (!near) |
838 | goto try_prev; | 889 | goto try_prev; |
839 | 890 | ||
840 | /* | 891 | anon_vma = reusable_anon_vma(near, vma, near); |
841 | * Since only mprotect tries to remerge vmas, match flags | 892 | if (anon_vma) |
842 | * which might be mprotected into each other later on. | 893 | return anon_vma; |
843 | * Neither mlock nor madvise tries to remerge at present, | ||
844 | * so leave their flags as obstructing a merge. | ||
845 | */ | ||
846 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | ||
847 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | ||
848 | |||
849 | if (near->anon_vma && vma->vm_end == near->vm_start && | ||
850 | mpol_equal(vma_policy(vma), vma_policy(near)) && | ||
851 | can_vma_merge_before(near, vm_flags, | ||
852 | NULL, vma->vm_file, vma->vm_pgoff + | ||
853 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) | ||
854 | return near->anon_vma; | ||
855 | try_prev: | 894 | try_prev: |
856 | /* | 895 | /* |
857 | * It is potentially slow to have to call find_vma_prev here. | 896 | * It is potentially slow to have to call find_vma_prev here. |
@@ -864,14 +903,9 @@ try_prev: | |||
864 | if (!near) | 903 | if (!near) |
865 | goto none; | 904 | goto none; |
866 | 905 | ||
867 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | 906 | anon_vma = reusable_anon_vma(near, near, vma); |
868 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | 907 | if (anon_vma) |
869 | 908 | return anon_vma; | |
870 | if (near->anon_vma && near->vm_end == vma->vm_start && | ||
871 | mpol_equal(vma_policy(near), vma_policy(vma)) && | ||
872 | can_vma_merge_after(near, vm_flags, | ||
873 | NULL, vma->vm_file, vma->vm_pgoff)) | ||
874 | return near->anon_vma; | ||
875 | none: | 909 | none: |
876 | /* | 910 | /* |
877 | * There's no absolute need to look only at touching neighbours: | 911 | * There's no absolute need to look only at touching neighbours: |
@@ -932,13 +966,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
932 | if (!(flags & MAP_FIXED)) | 966 | if (!(flags & MAP_FIXED)) |
933 | addr = round_hint_to_min(addr); | 967 | addr = round_hint_to_min(addr); |
934 | 968 | ||
935 | error = arch_mmap_check(addr, len, flags); | ||
936 | if (error) | ||
937 | return error; | ||
938 | |||
939 | /* Careful about overflows.. */ | 969 | /* Careful about overflows.. */ |
940 | len = PAGE_ALIGN(len); | 970 | len = PAGE_ALIGN(len); |
941 | if (!len || len > TASK_SIZE) | 971 | if (!len) |
942 | return -ENOMEM; | 972 | return -ENOMEM; |
943 | 973 | ||
944 | /* offset overflow? */ | 974 | /* offset overflow? */ |
@@ -949,24 +979,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
949 | if (mm->map_count > sysctl_max_map_count) | 979 | if (mm->map_count > sysctl_max_map_count) |
950 | return -ENOMEM; | 980 | return -ENOMEM; |
951 | 981 | ||
952 | if (flags & MAP_HUGETLB) { | ||
953 | struct user_struct *user = NULL; | ||
954 | if (file) | ||
955 | return -EINVAL; | ||
956 | |||
957 | /* | ||
958 | * VM_NORESERVE is used because the reservations will be | ||
959 | * taken when vm_ops->mmap() is called | ||
960 | * A dummy user value is used because we are not locking | ||
961 | * memory so no accounting is necessary | ||
962 | */ | ||
963 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
964 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
965 | &user, HUGETLB_ANONHUGE_INODE); | ||
966 | if (IS_ERR(file)) | ||
967 | return PTR_ERR(file); | ||
968 | } | ||
969 | |||
970 | /* Obtain the address to map to. we verify (or select) it and ensure | 982 | /* Obtain the address to map to. we verify (or select) it and ensure |
971 | * that it represents a valid section of the address space. | 983 | * that it represents a valid section of the address space. |
972 | */ | 984 | */ |
@@ -990,7 +1002,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
990 | unsigned long locked, lock_limit; | 1002 | unsigned long locked, lock_limit; |
991 | locked = len >> PAGE_SHIFT; | 1003 | locked = len >> PAGE_SHIFT; |
992 | locked += mm->locked_vm; | 1004 | locked += mm->locked_vm; |
993 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 1005 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
994 | lock_limit >>= PAGE_SHIFT; | 1006 | lock_limit >>= PAGE_SHIFT; |
995 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1007 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
996 | return -EAGAIN; | 1008 | return -EAGAIN; |
@@ -1061,14 +1073,75 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1061 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1073 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
1062 | if (error) | 1074 | if (error) |
1063 | return error; | 1075 | return error; |
1064 | error = ima_file_mmap(file, prot); | ||
1065 | if (error) | ||
1066 | return error; | ||
1067 | 1076 | ||
1068 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1077 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
1069 | } | 1078 | } |
1070 | EXPORT_SYMBOL(do_mmap_pgoff); | 1079 | EXPORT_SYMBOL(do_mmap_pgoff); |
1071 | 1080 | ||
1081 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1082 | unsigned long, prot, unsigned long, flags, | ||
1083 | unsigned long, fd, unsigned long, pgoff) | ||
1084 | { | ||
1085 | struct file *file = NULL; | ||
1086 | unsigned long retval = -EBADF; | ||
1087 | |||
1088 | if (!(flags & MAP_ANONYMOUS)) { | ||
1089 | if (unlikely(flags & MAP_HUGETLB)) | ||
1090 | return -EINVAL; | ||
1091 | file = fget(fd); | ||
1092 | if (!file) | ||
1093 | goto out; | ||
1094 | } else if (flags & MAP_HUGETLB) { | ||
1095 | struct user_struct *user = NULL; | ||
1096 | /* | ||
1097 | * VM_NORESERVE is used because the reservations will be | ||
1098 | * taken when vm_ops->mmap() is called | ||
1099 | * A dummy user value is used because we are not locking | ||
1100 | * memory so no accounting is necessary | ||
1101 | */ | ||
1102 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
1103 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
1104 | &user, HUGETLB_ANONHUGE_INODE); | ||
1105 | if (IS_ERR(file)) | ||
1106 | return PTR_ERR(file); | ||
1107 | } | ||
1108 | |||
1109 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1110 | |||
1111 | down_write(&current->mm->mmap_sem); | ||
1112 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1113 | up_write(&current->mm->mmap_sem); | ||
1114 | |||
1115 | if (file) | ||
1116 | fput(file); | ||
1117 | out: | ||
1118 | return retval; | ||
1119 | } | ||
1120 | |||
1121 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1122 | struct mmap_arg_struct { | ||
1123 | unsigned long addr; | ||
1124 | unsigned long len; | ||
1125 | unsigned long prot; | ||
1126 | unsigned long flags; | ||
1127 | unsigned long fd; | ||
1128 | unsigned long offset; | ||
1129 | }; | ||
1130 | |||
1131 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1132 | { | ||
1133 | struct mmap_arg_struct a; | ||
1134 | |||
1135 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1136 | return -EFAULT; | ||
1137 | if (a.offset & ~PAGE_MASK) | ||
1138 | return -EINVAL; | ||
1139 | |||
1140 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1141 | a.offset >> PAGE_SHIFT); | ||
1142 | } | ||
1143 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1144 | |||
1072 | /* | 1145 | /* |
1073 | * Some shared mappings will want the pages marked read-only | 1146 | * Some shared mappings will want the pages marked read-only |
1074 | * to track write events. If so, we'll downgrade vm_page_prot | 1147 | * to track write events. If so, we'll downgrade vm_page_prot |
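With this restructuring the MAP_HUGETLB setup runs only in the sys_mmap_pgoff() wrapper above, and only for anonymous mappings: a real file descriptor combined with MAP_HUGETLB now fails with -EINVAL. A hedged userspace sketch of the path that survives, assuming an x86 machine with hugepages reserved:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000   /* x86 value; flag is arch-specific */
    #endif

    int main(void)
    {
            size_t len = 2 * 1024 * 1024;   /* one 2 MiB hugepage on x86 */

            /* fd is -1: the wrapper builds the hugetlbfs file itself */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap(MAP_HUGETLB)");
                    return 1;
            }
            munmap(p, len);
            return 0;
    }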
@@ -1191,6 +1264,7 @@ munmap_back: | |||
1191 | vma->vm_flags = vm_flags; | 1264 | vma->vm_flags = vm_flags; |
1192 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1265 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
1193 | vma->vm_pgoff = pgoff; | 1266 | vma->vm_pgoff = pgoff; |
1267 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
1194 | 1268 | ||
1195 | if (file) { | 1269 | if (file) { |
1196 | error = -EINVAL; | 1270 | error = -EINVAL; |
@@ -1224,8 +1298,20 @@ munmap_back: | |||
1224 | goto free_vma; | 1298 | goto free_vma; |
1225 | } | 1299 | } |
1226 | 1300 | ||
1227 | if (vma_wants_writenotify(vma)) | 1301 | if (vma_wants_writenotify(vma)) { |
1302 | pgprot_t pprot = vma->vm_page_prot; | ||
1303 | |||
1304 | /* Can vma->vm_page_prot have changed?? | ||
1305 | * | ||
1306 | * Answer: Yes, drivers may have changed it in their | ||
1307 | * f_op->mmap method. | ||
1308 | * | ||
1309 | * Ensures that vmas marked as uncached stay that way. | ||
1310 | */ | ||
1228 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1311 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1312 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) | ||
1313 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
1314 | } | ||
1229 | 1315 | ||
1230 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1316 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1231 | file = vma->vm_file; | 1317 | file = vma->vm_file; |
@@ -1239,13 +1325,8 @@ out: | |||
1239 | mm->total_vm += len >> PAGE_SHIFT; | 1325 | mm->total_vm += len >> PAGE_SHIFT; |
1240 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1326 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1241 | if (vm_flags & VM_LOCKED) { | 1327 | if (vm_flags & VM_LOCKED) { |
1242 | /* | 1328 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
1243 | * makes pages present; downgrades, drops, reacquires mmap_sem | 1329 | mm->locked_vm += (len >> PAGE_SHIFT); |
1244 | */ | ||
1245 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); | ||
1246 | if (nr_pages < 0) | ||
1247 | return nr_pages; /* vma gone! */ | ||
1248 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
1249 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1330 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1250 | make_pages_present(addr, addr + len); | 1331 | make_pages_present(addr, addr + len); |
1251 | return addr; | 1332 | return addr; |
@@ -1459,6 +1540,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1459 | unsigned long (*get_area)(struct file *, unsigned long, | 1540 | unsigned long (*get_area)(struct file *, unsigned long, |
1460 | unsigned long, unsigned long, unsigned long); | 1541 | unsigned long, unsigned long, unsigned long); |
1461 | 1542 | ||
1543 | unsigned long error = arch_mmap_check(addr, len, flags); | ||
1544 | if (error) | ||
1545 | return error; | ||
1546 | |||
1547 | /* Careful about overflows.. */ | ||
1548 | if (len > TASK_SIZE) | ||
1549 | return -ENOMEM; | ||
1550 | |||
1462 | get_area = current->mm->get_unmapped_area; | 1551 | get_area = current->mm->get_unmapped_area; |
1463 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1552 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1464 | get_area = file->f_op->get_unmapped_area; | 1553 | get_area = file->f_op->get_unmapped_area; |
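arch_mmap_check() and the TASK_SIZE test now live in get_unmapped_area(), which returns either a page-aligned address or a negative errno in the same unsigned long. Callers tell the two apart with one mask test, because errnos are small negative values whose low PAGE_SHIFT bits are never all zero; a sketch of the idiom:

    /* Kernel definitions, repeated for clarity:
     *   PAGE_MASK  == ~(PAGE_SIZE - 1)
     *   ~PAGE_MASK == PAGE_SIZE - 1 (the low PAGE_SHIFT bits)
     * e.g. -ENOMEM == 0x...fff4, whose low bits survive the mask,
     * while any valid mapping address is page aligned.
     */
    static inline int is_mmap_error(unsigned long ret)
    {
            return (ret & ~PAGE_MASK) != 0;
    }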
@@ -1565,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1565 | return -ENOMEM; | 1654 | return -ENOMEM; |
1566 | 1655 | ||
1567 | /* Stack limit test */ | 1656 | /* Stack limit test */ |
1568 | if (size > rlim[RLIMIT_STACK].rlim_cur) | 1657 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
1569 | return -ENOMEM; | 1658 | return -ENOMEM; |
1570 | 1659 | ||
1571 | /* mlock limit tests */ | 1660 | /* mlock limit tests */ |
@@ -1573,7 +1662,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1573 | unsigned long locked; | 1662 | unsigned long locked; |
1574 | unsigned long limit; | 1663 | unsigned long limit; |
1575 | locked = mm->locked_vm + grow; | 1664 | locked = mm->locked_vm + grow; |
1576 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 1665 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
1666 | limit >>= PAGE_SHIFT; | ||
1577 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 1667 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
1578 | return -ENOMEM; | 1668 | return -ENOMEM; |
1579 | } | 1669 | } |
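Both limit reads gain ACCESS_ONCE() because another thread can update the rlimits concurrently; the macro pins the value to a single untorn load so the comparison and any later use of the value agree. Its classic definition, as found in <linux/compiler.h> of this era:

    /* Force exactly one access through a volatile-qualified lvalue, so
     * the compiler can neither re-read nor cache the rlimit mid-check.
     */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))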
@@ -1720,8 +1810,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
1720 | if (!prev || expand_stack(prev, addr)) | 1810 | if (!prev || expand_stack(prev, addr)) |
1721 | return NULL; | 1811 | return NULL; |
1722 | if (prev->vm_flags & VM_LOCKED) { | 1812 | if (prev->vm_flags & VM_LOCKED) { |
1723 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) | 1813 | mlock_vma_pages_range(prev, addr, prev->vm_end); |
1724 | return NULL; /* vma gone! */ | ||
1725 | } | 1814 | } |
1726 | return prev; | 1815 | return prev; |
1727 | } | 1816 | } |
@@ -1749,8 +1838,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1749 | if (expand_stack(vma, addr)) | 1838 | if (expand_stack(vma, addr)) |
1750 | return NULL; | 1839 | return NULL; |
1751 | if (vma->vm_flags & VM_LOCKED) { | 1840 | if (vma->vm_flags & VM_LOCKED) { |
1752 | if (mlock_vma_pages_range(vma, addr, start) < 0) | 1841 | mlock_vma_pages_range(vma, addr, start); |
1753 | return NULL; /* vma gone! */ | ||
1754 | } | 1842 | } |
1755 | return vma; | 1843 | return vma; |
1756 | } | 1844 | } |
@@ -1829,29 +1917,29 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1829 | } | 1917 | } |
1830 | 1918 | ||
1831 | /* | 1919 | /* |
1832 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1920 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
1833 | * either for the first part or the tail. | 1921 | * munmap path where it doesn't make sense to fail. |
1834 | */ | 1922 | */ |
1835 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1923 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
1836 | unsigned long addr, int new_below) | 1924 | unsigned long addr, int new_below) |
1837 | { | 1925 | { |
1838 | struct mempolicy *pol; | 1926 | struct mempolicy *pol; |
1839 | struct vm_area_struct *new; | 1927 | struct vm_area_struct *new; |
1928 | int err = -ENOMEM; | ||
1840 | 1929 | ||
1841 | if (is_vm_hugetlb_page(vma) && (addr & | 1930 | if (is_vm_hugetlb_page(vma) && (addr & |
1842 | ~(huge_page_mask(hstate_vma(vma))))) | 1931 | ~(huge_page_mask(hstate_vma(vma))))) |
1843 | return -EINVAL; | 1932 | return -EINVAL; |
1844 | 1933 | ||
1845 | if (mm->map_count >= sysctl_max_map_count) | ||
1846 | return -ENOMEM; | ||
1847 | |||
1848 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1934 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1849 | if (!new) | 1935 | if (!new) |
1850 | return -ENOMEM; | 1936 | goto out_err; |
1851 | 1937 | ||
1852 | /* most fields are the same, copy all, and then fixup */ | 1938 | /* most fields are the same, copy all, and then fixup */ |
1853 | *new = *vma; | 1939 | *new = *vma; |
1854 | 1940 | ||
1941 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
1942 | |||
1855 | if (new_below) | 1943 | if (new_below) |
1856 | new->vm_end = addr; | 1944 | new->vm_end = addr; |
1857 | else { | 1945 | else { |
@@ -1861,11 +1949,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1861 | 1949 | ||
1862 | pol = mpol_dup(vma_policy(vma)); | 1950 | pol = mpol_dup(vma_policy(vma)); |
1863 | if (IS_ERR(pol)) { | 1951 | if (IS_ERR(pol)) { |
1864 | kmem_cache_free(vm_area_cachep, new); | 1952 | err = PTR_ERR(pol); |
1865 | return PTR_ERR(pol); | 1953 | goto out_free_vma; |
1866 | } | 1954 | } |
1867 | vma_set_policy(new, pol); | 1955 | vma_set_policy(new, pol); |
1868 | 1956 | ||
1957 | if (anon_vma_clone(new, vma)) | ||
1958 | goto out_free_mpol; | ||
1959 | |||
1869 | if (new->vm_file) { | 1960 | if (new->vm_file) { |
1870 | get_file(new->vm_file); | 1961 | get_file(new->vm_file); |
1871 | if (vma->vm_flags & VM_EXECUTABLE) | 1962 | if (vma->vm_flags & VM_EXECUTABLE) |
@@ -1876,12 +1967,42 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1876 | new->vm_ops->open(new); | 1967 | new->vm_ops->open(new); |
1877 | 1968 | ||
1878 | if (new_below) | 1969 | if (new_below) |
1879 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 1970 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
1880 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 1971 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
1881 | else | 1972 | else |
1882 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 1973 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
1883 | 1974 | ||
1884 | return 0; | 1975 | /* Success. */ |
1976 | if (!err) | ||
1977 | return 0; | ||
1978 | |||
1979 | /* Clean everything up if vma_adjust failed. */ | ||
1980 | if (new->vm_ops && new->vm_ops->close) | ||
1981 | new->vm_ops->close(new); | ||
1982 | if (new->vm_file) { | ||
1983 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1984 | removed_exe_file_vma(mm); | ||
1985 | fput(new->vm_file); | ||
1986 | } | ||
1987 | out_free_mpol: | ||
1988 | mpol_put(pol); | ||
1989 | out_free_vma: | ||
1990 | kmem_cache_free(vm_area_cachep, new); | ||
1991 | out_err: | ||
1992 | return err; | ||
1993 | } | ||
1994 | |||
1995 | /* | ||
1996 | * Split a vma into two pieces at address 'addr', a new vma is allocated | ||
1997 | * either for the first part or the tail. | ||
1998 | */ | ||
1999 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | ||
2000 | unsigned long addr, int new_below) | ||
2001 | { | ||
2002 | if (mm->map_count >= sysctl_max_map_count) | ||
2003 | return -ENOMEM; | ||
2004 | |||
2005 | return __split_vma(mm, vma, addr, new_below); | ||
1885 | } | 2006 | } |
1886 | 2007 | ||
1887 | /* Munmap is split into 2 main parts -- this part which finds | 2008 | /* Munmap is split into 2 main parts -- this part which finds |
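Now that vma_adjust() can fail, __split_vma() above unwinds in strict reverse order of construction: close the vm_ops, drop the file reference (and exe-file accounting), put the mempolicy, free the vma. The same layered-goto shape in miniature; the resource types and alloc_a/alloc_b/do_work helpers are hypothetical stand-ins:

    struct res_a; struct res_b;                 /* hypothetical resources */
    extern struct res_a *alloc_a(void);
    extern struct res_b *alloc_b(void);
    extern void free_a(struct res_a *);
    extern void free_b(struct res_b *);
    extern int do_work(struct res_a *, struct res_b *);

    static int split_like_setup(void)
    {
            int err = -ENOMEM;
            struct res_a *a;
            struct res_b *b;

            a = alloc_a();
            if (!a)
                    goto out_err;
            b = alloc_b();
            if (!b)
                    goto out_free_a;

            err = do_work(a, b);
            if (!err)
                    return 0;        /* success leaves everything in place */

            free_b(b);               /* failure: undo in reverse order */
    out_free_a:
            free_a(a);
    out_err:
            return err;
    }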
@@ -1919,7 +2040,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1919 | * places tmp vma above, and higher split_vma places tmp vma below. | 2040 | * places tmp vma above, and higher split_vma places tmp vma below. |
1920 | */ | 2041 | */ |
1921 | if (start > vma->vm_start) { | 2042 | if (start > vma->vm_start) { |
1922 | int error = split_vma(mm, vma, start, 0); | 2043 | int error; |
2044 | |||
2045 | /* | ||
2046 | * Make sure that map_count on return from munmap() will | ||
2047 | * not exceed its limit; but let map_count go just above | ||
2048 | * its limit temporarily, to help free resources as expected. | ||
2049 | */ | ||
2050 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) | ||
2051 | return -ENOMEM; | ||
2052 | |||
2053 | error = __split_vma(mm, vma, start, 0); | ||
1923 | if (error) | 2054 | if (error) |
1924 | return error; | 2055 | return error; |
1925 | prev = vma; | 2056 | prev = vma; |
@@ -1928,7 +2059,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1928 | /* Does it split the last one? */ | 2059 | /* Does it split the last one? */ |
1929 | last = find_vma(mm, end); | 2060 | last = find_vma(mm, end); |
1930 | if (last && end > last->vm_start) { | 2061 | if (last && end > last->vm_start) { |
1931 | int error = split_vma(mm, last, end, 1); | 2062 | int error = __split_vma(mm, last, end, 1); |
1932 | if (error) | 2063 | if (error) |
1933 | return error; | 2064 | return error; |
1934 | } | 2065 | } |
@@ -2003,20 +2134,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2003 | if (!len) | 2134 | if (!len) |
2004 | return addr; | 2135 | return addr; |
2005 | 2136 | ||
2006 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | ||
2007 | return -EINVAL; | ||
2008 | |||
2009 | if (is_hugepage_only_range(mm, addr, len)) | ||
2010 | return -EINVAL; | ||
2011 | |||
2012 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | 2137 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); |
2013 | if (error) | 2138 | if (error) |
2014 | return error; | 2139 | return error; |
2015 | 2140 | ||
2016 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2141 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2017 | 2142 | ||
2018 | error = arch_mmap_check(addr, len, flags); | 2143 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
2019 | if (error) | 2144 | if (error & ~PAGE_MASK) |
2020 | return error; | 2145 | return error; |
2021 | 2146 | ||
2022 | /* | 2147 | /* |
@@ -2026,7 +2151,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2026 | unsigned long locked, lock_limit; | 2151 | unsigned long locked, lock_limit; |
2027 | locked = len >> PAGE_SHIFT; | 2152 | locked = len >> PAGE_SHIFT; |
2028 | locked += mm->locked_vm; | 2153 | locked += mm->locked_vm; |
2029 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2154 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2030 | lock_limit >>= PAGE_SHIFT; | 2155 | lock_limit >>= PAGE_SHIFT; |
2031 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2156 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
2032 | return -EAGAIN; | 2157 | return -EAGAIN; |
@@ -2074,6 +2199,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2074 | return -ENOMEM; | 2199 | return -ENOMEM; |
2075 | } | 2200 | } |
2076 | 2201 | ||
2202 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2077 | vma->vm_mm = mm; | 2203 | vma->vm_mm = mm; |
2078 | vma->vm_start = addr; | 2204 | vma->vm_start = addr; |
2079 | vma->vm_end = addr + len; | 2205 | vma->vm_end = addr + len; |
@@ -2210,10 +2336,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2210 | if (new_vma) { | 2336 | if (new_vma) { |
2211 | *new_vma = *vma; | 2337 | *new_vma = *vma; |
2212 | pol = mpol_dup(vma_policy(vma)); | 2338 | pol = mpol_dup(vma_policy(vma)); |
2213 | if (IS_ERR(pol)) { | 2339 | if (IS_ERR(pol)) |
2214 | kmem_cache_free(vm_area_cachep, new_vma); | 2340 | goto out_free_vma; |
2215 | return NULL; | 2341 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2216 | } | 2342 | if (anon_vma_clone(new_vma, vma)) |
2343 | goto out_free_mempol; | ||
2217 | vma_set_policy(new_vma, pol); | 2344 | vma_set_policy(new_vma, pol); |
2218 | new_vma->vm_start = addr; | 2345 | new_vma->vm_start = addr; |
2219 | new_vma->vm_end = addr + len; | 2346 | new_vma->vm_end = addr + len; |
@@ -2229,6 +2356,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2229 | } | 2356 | } |
2230 | } | 2357 | } |
2231 | return new_vma; | 2358 | return new_vma; |
2359 | |||
2360 | out_free_mempol: | ||
2361 | mpol_put(pol); | ||
2362 | out_free_vma: | ||
2363 | kmem_cache_free(vm_area_cachep, new_vma); | ||
2364 | return NULL; | ||
2232 | } | 2365 | } |
2233 | 2366 | ||
2234 | /* | 2367 | /* |
@@ -2240,7 +2373,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
2240 | unsigned long cur = mm->total_vm; /* pages */ | 2373 | unsigned long cur = mm->total_vm; /* pages */ |
2241 | unsigned long lim; | 2374 | unsigned long lim; |
2242 | 2375 | ||
2243 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2376 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
2244 | 2377 | ||
2245 | if (cur + npages > lim) | 2378 | if (cur + npages > lim) |
2246 | return 0; | 2379 | return 0; |
@@ -2306,6 +2439,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2306 | if (unlikely(vma == NULL)) | 2439 | if (unlikely(vma == NULL)) |
2307 | return -ENOMEM; | 2440 | return -ENOMEM; |
2308 | 2441 | ||
2442 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2309 | vma->vm_mm = mm; | 2443 | vma->vm_mm = mm; |
2310 | vma->vm_start = addr; | 2444 | vma->vm_start = addr; |
2311 | vma->vm_end = addr + len; | 2445 | vma->vm_end = addr + len; |
@@ -2406,6 +2540,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2406 | int mm_take_all_locks(struct mm_struct *mm) | 2540 | int mm_take_all_locks(struct mm_struct *mm) |
2407 | { | 2541 | { |
2408 | struct vm_area_struct *vma; | 2542 | struct vm_area_struct *vma; |
2543 | struct anon_vma_chain *avc; | ||
2409 | int ret = -EINTR; | 2544 | int ret = -EINTR; |
2410 | 2545 | ||
2411 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2546 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
@@ -2423,7 +2558,8 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2423 | if (signal_pending(current)) | 2558 | if (signal_pending(current)) |
2424 | goto out_unlock; | 2559 | goto out_unlock; |
2425 | if (vma->anon_vma) | 2560 | if (vma->anon_vma) |
2426 | vm_lock_anon_vma(mm, vma->anon_vma); | 2561 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2562 | vm_lock_anon_vma(mm, avc->anon_vma); | ||
2427 | } | 2563 | } |
2428 | 2564 | ||
2429 | ret = 0; | 2565 | ret = 0; |
@@ -2478,13 +2614,15 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
2478 | void mm_drop_all_locks(struct mm_struct *mm) | 2614 | void mm_drop_all_locks(struct mm_struct *mm) |
2479 | { | 2615 | { |
2480 | struct vm_area_struct *vma; | 2616 | struct vm_area_struct *vma; |
2617 | struct anon_vma_chain *avc; | ||
2481 | 2618 | ||
2482 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2619 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2483 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2620 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
2484 | 2621 | ||
2485 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2622 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
2486 | if (vma->anon_vma) | 2623 | if (vma->anon_vma) |
2487 | vm_unlock_anon_vma(vma->anon_vma); | 2624 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2625 | vm_unlock_anon_vma(avc->anon_vma); | ||
2488 | if (vma->vm_file && vma->vm_file->f_mapping) | 2626 | if (vma->vm_file && vma->vm_file->f_mapping) |
2489 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2627 | vm_unlock_mapping(vma->vm_file->f_mapping); |
2490 | } | 2628 | } |
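Both lock-all loops now walk vma->anon_vma_chain rather than the single vma->anon_vma pointer, since after the anon_vma rework merged in this series one vma can be linked to several anon_vmas (its own plus those inherited across fork). A sketch of the link object, with field names as I read them in that series' rmap.h:

    struct anon_vma_chain {
            struct vm_area_struct *vma;      /* owning vma */
            struct anon_vma *anon_vma;       /* anon_vma this link attaches */
            struct list_head same_vma;       /* all avcs of one vma */
            struct list_head same_anon_vma;  /* all avcs of one anon_vma */
    };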
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..9e82e937000e 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
8 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
9 | 10 | ||
10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) | |||
37 | if (active_mm != mm) | 38 | if (active_mm != mm) |
38 | mmdrop(active_mm); | 39 | mmdrop(active_mm); |
39 | } | 40 | } |
41 | EXPORT_SYMBOL_GPL(use_mm); | ||
40 | 42 | ||
41 | /* | 43 | /* |
42 | * unuse_mm | 44 | * unuse_mm |
@@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm) | |||
51 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
52 | 54 | ||
53 | task_lock(tsk); | 55 | task_lock(tsk); |
56 | sync_mm_rss(tsk, mm); | ||
54 | tsk->mm = NULL; | 57 | tsk->mm = NULL; |
55 | /* active_mm is still 'mm' */ | 58 | /* active_mm is still 'mm' */ |
56 | enter_lazy_tlb(mm, tsk); | 59 | enter_lazy_tlb(mm, tsk); |
57 | task_unlock(tsk); | 60 | task_unlock(tsk); |
58 | } | 61 | } |
62 | EXPORT_SYMBOL_GPL(unuse_mm); | ||
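The new GPL exports let modules temporarily adopt a user address space from a kernel thread (the vhost driver appears to be the motivating user around this release). A hedged sketch of the pattern, assuming 'mm' is pinned via get_task_mm() or an equivalent mm_users reference:

    /* Borrow mm so copy_from_user() resolves against it, then return it;
     * note unuse_mm() now also folds per-task rss counters back into mm.
     */
    static int read_user_buf(struct mm_struct *mm, void *dst,
                             const void __user *src, size_t n)
    {
            int ret;

            use_mm(mm);                      /* adopt mm as current->mm */
            ret = copy_from_user(dst, src, n) ? -EFAULT : 0;
            unuse_mm(mm);                    /* drop it again */
            return ret;
    }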
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 7e33f2cb3c77..438951d366f2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/rcupdate.h> | 17 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * This function can't run concurrently against mmu_notifier_register | 22 | * This function can't run concurrently against mmu_notifier_register |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 8bc969d8112d..2d1bf7cf8851 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -10,7 +10,6 @@ | |||
10 | 10 | ||
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
13 | #include <linux/slab.h> | ||
14 | #include <linux/shm.h> | 13 | #include <linux/shm.h> |
15 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
16 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..cde56ee51ef7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -9,7 +9,6 @@ | |||
9 | 9 | ||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
12 | #include <linux/slab.h> | ||
13 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
14 | #include <linux/ksm.h> | 13 | #include <linux/ksm.h> |
15 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
@@ -261,6 +260,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
261 | return new_addr; | 260 | return new_addr; |
262 | } | 261 | } |
263 | 262 | ||
263 | static struct vm_area_struct *vma_to_resize(unsigned long addr, | ||
264 | unsigned long old_len, unsigned long new_len, unsigned long *p) | ||
265 | { | ||
266 | struct mm_struct *mm = current->mm; | ||
267 | struct vm_area_struct *vma = find_vma(mm, addr); | ||
268 | |||
269 | if (!vma || vma->vm_start > addr) | ||
270 | goto Efault; | ||
271 | |||
272 | if (is_vm_hugetlb_page(vma)) | ||
273 | goto Einval; | ||
274 | |||
275 | /* We can't remap across vm area boundaries */ | ||
276 | if (old_len > vma->vm_end - addr) | ||
277 | goto Efault; | ||
278 | |||
279 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
280 | if (new_len > old_len) | ||
281 | goto Efault; | ||
282 | } | ||
283 | |||
284 | if (vma->vm_flags & VM_LOCKED) { | ||
285 | unsigned long locked, lock_limit; | ||
286 | locked = mm->locked_vm << PAGE_SHIFT; | ||
287 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
288 | locked += new_len - old_len; | ||
289 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
290 | goto Eagain; | ||
291 | } | ||
292 | |||
293 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | ||
294 | goto Enomem; | ||
295 | |||
296 | if (vma->vm_flags & VM_ACCOUNT) { | ||
297 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | ||
298 | if (security_vm_enough_memory(charged)) | ||
299 | goto Efault; | ||
300 | *p = charged; | ||
301 | } | ||
302 | |||
303 | return vma; | ||
304 | |||
305 | Efault: /* very odd choice for most of the cases, but... */ | ||
306 | return ERR_PTR(-EFAULT); | ||
307 | Einval: | ||
308 | return ERR_PTR(-EINVAL); | ||
309 | Enomem: | ||
310 | return ERR_PTR(-ENOMEM); | ||
311 | Eagain: | ||
312 | return ERR_PTR(-EAGAIN); | ||
313 | } | ||
314 | |||
315 | static unsigned long mremap_to(unsigned long addr, | ||
316 | unsigned long old_len, unsigned long new_addr, | ||
317 | unsigned long new_len) | ||
318 | { | ||
319 | struct mm_struct *mm = current->mm; | ||
320 | struct vm_area_struct *vma; | ||
321 | unsigned long ret = -EINVAL; | ||
322 | unsigned long charged = 0; | ||
323 | unsigned long map_flags; | ||
324 | |||
325 | if (new_addr & ~PAGE_MASK) | ||
326 | goto out; | ||
327 | |||
328 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
329 | goto out; | ||
330 | |||
331 | /* Check if the location we're moving into overlaps the | ||
332 | * old location at all, and fail if it does. | ||
333 | */ | ||
334 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
335 | goto out; | ||
336 | |||
337 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
338 | goto out; | ||
339 | |||
340 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
341 | if (ret) | ||
342 | goto out; | ||
343 | |||
344 | ret = do_munmap(mm, new_addr, new_len); | ||
345 | if (ret) | ||
346 | goto out; | ||
347 | |||
348 | if (old_len >= new_len) { | ||
349 | ret = do_munmap(mm, addr+new_len, old_len - new_len); | ||
350 | if (ret && old_len != new_len) | ||
351 | goto out; | ||
352 | old_len = new_len; | ||
353 | } | ||
354 | |||
355 | vma = vma_to_resize(addr, old_len, new_len, &charged); | ||
356 | if (IS_ERR(vma)) { | ||
357 | ret = PTR_ERR(vma); | ||
358 | goto out; | ||
359 | } | ||
360 | |||
361 | map_flags = MAP_FIXED; | ||
362 | if (vma->vm_flags & VM_MAYSHARE) | ||
363 | map_flags |= MAP_SHARED; | ||
364 | |||
365 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + | ||
366 | ((addr - vma->vm_start) >> PAGE_SHIFT), | ||
367 | map_flags); | ||
368 | if (ret & ~PAGE_MASK) | ||
369 | goto out1; | ||
370 | |||
371 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | ||
372 | if (!(ret & ~PAGE_MASK)) | ||
373 | goto out; | ||
374 | out1: | ||
375 | vm_unacct_memory(charged); | ||
376 | |||
377 | out: | ||
378 | return ret; | ||
379 | } | ||
380 | |||
381 | static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | ||
382 | { | ||
383 | unsigned long end = vma->vm_end + delta; | ||
384 | if (end < vma->vm_end) /* overflow */ | ||
385 | return 0; | ||
386 | if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ | ||
387 | return 0; | ||
388 | if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, | ||
389 | 0, MAP_FIXED) & ~PAGE_MASK) | ||
390 | return 0; | ||
391 | return 1; | ||
392 | } | ||
393 | |||
264 | /* | 394 | /* |
265 | * Expand (or shrink) an existing mapping, potentially moving it at the | 395 | * Expand (or shrink) an existing mapping, potentially moving it at the |
266 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 396 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
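The new vma_to_resize() folds its four failure modes (-EFAULT, -EINVAL, -ENOMEM, -EAGAIN) into the returned pointer using the ERR_PTR convention, so mremap_to() and do_mremap() each need only a single IS_ERR() test. The convention, paraphrased from <linux/err.h>: error codes occupy the top, never-mappable page of the address space, so one pointer can carry either a real object or a small negative errno:

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error)
    {
            return (void *)error;            /* error in -1 .. -MAX_ERRNO */
    }
    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;                /* recover the errno */
    }
    static inline long IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }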
@@ -294,32 +424,10 @@ unsigned long do_mremap(unsigned long addr, | |||
294 | if (!new_len) | 424 | if (!new_len) |
295 | goto out; | 425 | goto out; |
296 | 426 | ||
297 | /* new_addr is only valid if MREMAP_FIXED is specified */ | ||
298 | if (flags & MREMAP_FIXED) { | 427 | if (flags & MREMAP_FIXED) { |
299 | if (new_addr & ~PAGE_MASK) | 428 | if (flags & MREMAP_MAYMOVE) |
300 | goto out; | 429 | ret = mremap_to(addr, old_len, new_addr, new_len); |
301 | if (!(flags & MREMAP_MAYMOVE)) | 430 | goto out; |
302 | goto out; | ||
303 | |||
304 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
305 | goto out; | ||
306 | |||
307 | /* Check if the location we're moving into overlaps the | ||
308 | * old location at all, and fail if it does. | ||
309 | */ | ||
310 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
311 | goto out; | ||
312 | |||
313 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
314 | goto out; | ||
315 | |||
316 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
317 | if (ret) | ||
318 | goto out; | ||
319 | |||
320 | ret = do_munmap(mm, new_addr, new_len); | ||
321 | if (ret) | ||
322 | goto out; | ||
323 | } | 431 | } |
324 | 432 | ||
325 | /* | 433 | /* |
@@ -332,64 +440,30 @@ unsigned long do_mremap(unsigned long addr, | |||
332 | if (ret && old_len != new_len) | 440 | if (ret && old_len != new_len) |
333 | goto out; | 441 | goto out; |
334 | ret = addr; | 442 | ret = addr; |
335 | if (!(flags & MREMAP_FIXED) || (new_addr == addr)) | 443 | goto out; |
336 | goto out; | ||
337 | old_len = new_len; | ||
338 | } | 444 | } |
339 | 445 | ||
340 | /* | 446 | /* |
341 | * Ok, we need to grow.. or relocate. | 447 | * Ok, we need to grow.. |
342 | */ | 448 | */ |
343 | ret = -EFAULT; | 449 | vma = vma_to_resize(addr, old_len, new_len, &charged); |
344 | vma = find_vma(mm, addr); | 450 | if (IS_ERR(vma)) { |
345 | if (!vma || vma->vm_start > addr) | 451 | ret = PTR_ERR(vma); |
346 | goto out; | ||
347 | if (is_vm_hugetlb_page(vma)) { | ||
348 | ret = -EINVAL; | ||
349 | goto out; | ||
350 | } | ||
351 | /* We can't remap across vm area boundaries */ | ||
352 | if (old_len > vma->vm_end - addr) | ||
353 | goto out; | ||
354 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
355 | if (new_len > old_len) | ||
356 | goto out; | ||
357 | } | ||
358 | if (vma->vm_flags & VM_LOCKED) { | ||
359 | unsigned long locked, lock_limit; | ||
360 | locked = mm->locked_vm << PAGE_SHIFT; | ||
361 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
362 | locked += new_len - old_len; | ||
363 | ret = -EAGAIN; | ||
364 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
365 | goto out; | ||
366 | } | ||
367 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { | ||
368 | ret = -ENOMEM; | ||
369 | goto out; | 452 | goto out; |
370 | } | 453 | } |
371 | 454 | ||
372 | if (vma->vm_flags & VM_ACCOUNT) { | ||
373 | charged = (new_len - old_len) >> PAGE_SHIFT; | ||
374 | if (security_vm_enough_memory(charged)) | ||
375 | goto out_nc; | ||
376 | } | ||
377 | |||
378 | /* old_len exactly to the end of the area.. | 455 | /* old_len exactly to the end of the area.. |
379 | * And we're not relocating the area. | ||
380 | */ | 456 | */ |
381 | if (old_len == vma->vm_end - addr && | 457 | if (old_len == vma->vm_end - addr) { |
382 | !((flags & MREMAP_FIXED) && (addr != new_addr)) && | ||
383 | (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { | ||
384 | unsigned long max_addr = TASK_SIZE; | ||
385 | if (vma->vm_next) | ||
386 | max_addr = vma->vm_next->vm_start; | ||
387 | /* can we just expand the current mapping? */ | 458 | /* can we just expand the current mapping? */ |
388 | if (max_addr - addr >= new_len) { | 459 | if (vma_expandable(vma, new_len - old_len)) { |
389 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 460 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
390 | 461 | ||
391 | vma_adjust(vma, vma->vm_start, | 462 | if (vma_adjust(vma, vma->vm_start, addr + new_len, |
392 | addr + new_len, vma->vm_pgoff, NULL); | 463 | vma->vm_pgoff, NULL)) { |
464 | ret = -ENOMEM; | ||
465 | goto out; | ||
466 | } | ||
393 | 467 | ||
394 | mm->total_vm += pages; | 468 | mm->total_vm += pages; |
395 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 469 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
@@ -409,28 +483,27 @@ unsigned long do_mremap(unsigned long addr, | |||
409 | */ | 483 | */ |
410 | ret = -ENOMEM; | 484 | ret = -ENOMEM; |
411 | if (flags & MREMAP_MAYMOVE) { | 485 | if (flags & MREMAP_MAYMOVE) { |
412 | if (!(flags & MREMAP_FIXED)) { | 486 | unsigned long map_flags = 0; |
413 | unsigned long map_flags = 0; | 487 | if (vma->vm_flags & VM_MAYSHARE) |
414 | if (vma->vm_flags & VM_MAYSHARE) | 488 | map_flags |= MAP_SHARED; |
415 | map_flags |= MAP_SHARED; | 489 | |
416 | 490 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | |
417 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | 491 | vma->vm_pgoff + |
418 | vma->vm_pgoff, map_flags); | 492 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
419 | if (new_addr & ~PAGE_MASK) { | 493 | map_flags); |
420 | ret = new_addr; | 494 | if (new_addr & ~PAGE_MASK) { |
421 | goto out; | 495 | ret = new_addr; |
422 | } | 496 | goto out; |
423 | |||
424 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
425 | if (ret) | ||
426 | goto out; | ||
427 | } | 497 | } |
498 | |||
499 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
500 | if (ret) | ||
501 | goto out; | ||
428 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 502 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
429 | } | 503 | } |
430 | out: | 504 | out: |
431 | if (ret & ~PAGE_MASK) | 505 | if (ret & ~PAGE_MASK) |
432 | vm_unacct_memory(charged); | 506 | vm_unacct_memory(charged); |
433 | out_nc: | ||
434 | return ret; | 507 | return ret; |
435 | } | 508 | } |
436 | 509 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index 9876fa0c3ad3..63fa17d121f0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
162 | } | 162 | } |
163 | if (vmas) | 163 | if (vmas) |
164 | vmas[i] = vma; | 164 | vmas[i] = vma; |
165 | start += PAGE_SIZE; | 165 | start = (start + PAGE_SIZE) & PAGE_MASK; |
166 | } | 166 | } |
167 | 167 | ||
168 | return i; | 168 | return i; |
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
432 | /* | 432 | /* |
433 | * Ok, looks good - let it rip. | 433 | * Ok, looks good - let it rip. |
434 | */ | 434 | */ |
435 | flush_icache_range(mm->brk, brk); | ||
435 | return mm->brk = brk; | 436 | return mm->brk = brk; |
436 | } | 437 | } |
437 | 438 | ||
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
551 | static void __put_nommu_region(struct vm_region *region) | 552 | static void __put_nommu_region(struct vm_region *region) |
552 | __releases(nommu_region_sem) | 553 | __releases(nommu_region_sem) |
553 | { | 554 | { |
554 | kenter("%p{%d}", region, atomic_read(&region->vm_usage)); | 555 | kenter("%p{%d}", region, region->vm_usage); |
555 | 556 | ||
556 | BUG_ON(!nommu_region_tree.rb_node); | 557 | BUG_ON(!nommu_region_tree.rb_node); |
557 | 558 | ||
558 | if (atomic_dec_and_test(&region->vm_usage)) { | 559 | if (--region->vm_usage == 0) { |
559 | if (region->vm_top > region->vm_start) | 560 | if (region->vm_top > region->vm_start) |
560 | delete_nommu_region(region); | 561 | delete_nommu_region(region); |
561 | up_write(&nommu_region_sem); | 562 | up_write(&nommu_region_sem); |
@@ -1039,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
1039 | if (ret != -ENOSYS) | 1040 | if (ret != -ENOSYS) |
1040 | return ret; | 1041 | return ret; |
1041 | 1042 | ||
1042 | /* getting an ENOSYS error indicates that direct mmap isn't | 1043 | /* getting -ENOSYS indicates that direct mmap isn't possible (as |
1043 | * possible (as opposed to tried but failed) so we'll fall | 1044 | * opposed to tried but failed) so we can only give a suitable error as |
1044 | * through to making a private copy of the data and mapping | 1045 | * it's not possible to make a private copy if MAP_SHARED was given */ |
1045 | * that if we can */ | ||
1046 | return -ENODEV; | 1046 | return -ENODEV; |
1047 | } | 1047 | } |
1048 | 1048 | ||
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1143 | if (ret < rlen) | 1143 | if (ret < rlen) |
1144 | memset(base + ret, 0, rlen - ret); | 1144 | memset(base + ret, 0, rlen - ret); |
1145 | 1145 | ||
1146 | } else { | ||
1147 | /* if it's an anonymous mapping, then just clear it */ | ||
1148 | memset(base, 0, rlen); | ||
1149 | } | 1146 | } |
1150 | 1147 | ||
1151 | return 0; | 1148 | return 0; |
@@ -1207,11 +1204,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1207 | if (!vma) | 1204 | if (!vma) |
1208 | goto error_getting_vma; | 1205 | goto error_getting_vma; |
1209 | 1206 | ||
1210 | atomic_set(&region->vm_usage, 1); | 1207 | region->vm_usage = 1; |
1211 | region->vm_flags = vm_flags; | 1208 | region->vm_flags = vm_flags; |
1212 | region->vm_pgoff = pgoff; | 1209 | region->vm_pgoff = pgoff; |
1213 | 1210 | ||
1214 | INIT_LIST_HEAD(&vma->anon_vma_node); | 1211 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1215 | vma->vm_flags = vm_flags; | 1212 | vma->vm_flags = vm_flags; |
1216 | vma->vm_pgoff = pgoff; | 1213 | vma->vm_pgoff = pgoff; |
1217 | 1214 | ||
@@ -1274,7 +1271,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1274 | } | 1271 | } |
1275 | 1272 | ||
1276 | /* we've found a region we can share */ | 1273 | /* we've found a region we can share */ |
1277 | atomic_inc(&pregion->vm_usage); | 1274 | pregion->vm_usage++; |
1278 | vma->vm_region = pregion; | 1275 | vma->vm_region = pregion; |
1279 | start = pregion->vm_start; | 1276 | start = pregion->vm_start; |
1280 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; | 1277 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; |
@@ -1291,7 +1288,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1291 | vma->vm_region = NULL; | 1288 | vma->vm_region = NULL; |
1292 | vma->vm_start = 0; | 1289 | vma->vm_start = 0; |
1293 | vma->vm_end = 0; | 1290 | vma->vm_end = 0; |
1294 | atomic_dec(&pregion->vm_usage); | 1291 | pregion->vm_usage--; |
1295 | pregion = NULL; | 1292 | pregion = NULL; |
1296 | goto error_just_free; | 1293 | goto error_just_free; |
1297 | } | 1294 | } |
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1343 | goto error_just_free; | 1340 | goto error_just_free; |
1344 | add_nommu_region(region); | 1341 | add_nommu_region(region); |
1345 | 1342 | ||
1343 | /* clear anonymous mappings that don't ask for uninitialized data */ | ||
1344 | if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) | ||
1345 | memset((void *)region->vm_start, 0, | ||
1346 | region->vm_end - region->vm_start); | ||
1347 | |||
1346 | /* okay... we have a mapping; now we have to register it */ | 1348 | /* okay... we have a mapping; now we have to register it */ |
1347 | result = vma->vm_start; | 1349 | result = vma->vm_start; |
1348 | 1350 | ||
@@ -1351,10 +1353,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1351 | share: | 1353 | share: |
1352 | add_vma_to_mm(current->mm, vma); | 1354 | add_vma_to_mm(current->mm, vma); |
1353 | 1355 | ||
1354 | up_write(&nommu_region_sem); | 1356 | /* we flush the region from the icache only when the first executable |
1357 | * mapping of it is made */ | ||
1358 | if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { | ||
1359 | flush_icache_range(region->vm_start, region->vm_end); | ||
1360 | region->vm_icache_flushed = true; | ||
1361 | } | ||
1355 | 1362 | ||
1356 | if (prot & PROT_EXEC) | 1363 | up_write(&nommu_region_sem); |
1357 | flush_icache_range(result, result + len); | ||
1358 | 1364 | ||
1359 | kleave(" = %lx", result); | 1365 | kleave(" = %lx", result); |
1360 | return result; | 1366 | return result; |
@@ -1396,6 +1402,55 @@ error_getting_region: | |||
1396 | } | 1402 | } |
1397 | EXPORT_SYMBOL(do_mmap_pgoff); | 1403 | EXPORT_SYMBOL(do_mmap_pgoff); |
1398 | 1404 | ||
1405 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1406 | unsigned long, prot, unsigned long, flags, | ||
1407 | unsigned long, fd, unsigned long, pgoff) | ||
1408 | { | ||
1409 | struct file *file = NULL; | ||
1410 | unsigned long retval = -EBADF; | ||
1411 | |||
1412 | if (!(flags & MAP_ANONYMOUS)) { | ||
1413 | file = fget(fd); | ||
1414 | if (!file) | ||
1415 | goto out; | ||
1416 | } | ||
1417 | |||
1418 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1419 | |||
1420 | down_write(&current->mm->mmap_sem); | ||
1421 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1422 | up_write(&current->mm->mmap_sem); | ||
1423 | |||
1424 | if (file) | ||
1425 | fput(file); | ||
1426 | out: | ||
1427 | return retval; | ||
1428 | } | ||
1429 | |||
1430 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1431 | struct mmap_arg_struct { | ||
1432 | unsigned long addr; | ||
1433 | unsigned long len; | ||
1434 | unsigned long prot; | ||
1435 | unsigned long flags; | ||
1436 | unsigned long fd; | ||
1437 | unsigned long offset; | ||
1438 | }; | ||
1439 | |||
1440 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1441 | { | ||
1442 | struct mmap_arg_struct a; | ||
1443 | |||
1444 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1445 | return -EFAULT; | ||
1446 | if (a.offset & ~PAGE_MASK) | ||
1447 | return -EINVAL; | ||
1448 | |||
1449 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1450 | a.offset >> PAGE_SHIFT); | ||
1451 | } | ||
1452 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1453 | |||
1399 | /* | 1454 | /* |
1400 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1455 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1401 | * for the first part or the tail. | 1456 | * for the first part or the tail. |
@@ -1409,10 +1464,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1409 | 1464 | ||
1410 | kenter(""); | 1465 | kenter(""); |
1411 | 1466 | ||
1412 | /* we're only permitted to split anonymous regions that have a single | 1467 | /* we're only permitted to split anonymous regions (these should have |
1413 | * owner */ | 1468 | * only a single usage on the region) */ |
1414 | if (vma->vm_file || | 1469 | if (vma->vm_file) |
1415 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
1416 | return -ENOMEM; | 1470 | return -ENOMEM; |
1417 | 1471 | ||
1418 | if (mm->map_count >= sysctl_max_map_count) | 1472 | if (mm->map_count >= sysctl_max_map_count) |
@@ -1486,7 +1540,7 @@ static int shrink_vma(struct mm_struct *mm, | |||
1486 | 1540 | ||
1487 | /* cut the backing region down to size */ | 1541 | /* cut the backing region down to size */ |
1488 | region = vma->vm_region; | 1542 | region = vma->vm_region; |
1489 | BUG_ON(atomic_read(&region->vm_usage) != 1); | 1543 | BUG_ON(region->vm_usage != 1); |
1490 | 1544 | ||
1491 | down_write(&nommu_region_sem); | 1545 | down_write(&nommu_region_sem); |
1492 | delete_nommu_region(region); | 1546 | delete_nommu_region(region); |
@@ -1730,27 +1784,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
1730 | EXPORT_SYMBOL(unmap_mapping_range); | 1784 | EXPORT_SYMBOL(unmap_mapping_range); |
1731 | 1785 | ||
1732 | /* | 1786 | /* |
1733 | * ask for an unmapped area at which to create a mapping on a file | ||
1734 | */ | ||
1735 | unsigned long get_unmapped_area(struct file *file, unsigned long addr, | ||
1736 | unsigned long len, unsigned long pgoff, | ||
1737 | unsigned long flags) | ||
1738 | { | ||
1739 | unsigned long (*get_area)(struct file *, unsigned long, unsigned long, | ||
1740 | unsigned long, unsigned long); | ||
1741 | |||
1742 | get_area = current->mm->get_unmapped_area; | ||
1743 | if (file && file->f_op && file->f_op->get_unmapped_area) | ||
1744 | get_area = file->f_op->get_unmapped_area; | ||
1745 | |||
1746 | if (!get_area) | ||
1747 | return -ENOSYS; | ||
1748 | |||
1749 | return get_area(file, addr, len, pgoff, flags); | ||
1750 | } | ||
1751 | EXPORT_SYMBOL(get_unmapped_area); | ||
1752 | |||
1753 | /* | ||
1754 | * Check that a process has enough memory to allocate a new virtual | 1787 | * Check that a process has enough memory to allocate a new virtual |
1755 | * mapping. 0 means there is enough memory for the allocation to | 1788 | * mapping. 0 means there is enough memory for the allocation to |
1756 | * succeed and -ENOMEM implies there is not. | 1789 | * succeed and -ENOMEM implies there is not. |
@@ -1889,9 +1922,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
1889 | 1922 | ||
1890 | /* only read or write mappings where it is permitted */ | 1923 | /* only read or write mappings where it is permitted */ |
1891 | if (write && vma->vm_flags & VM_MAYWRITE) | 1924 | if (write && vma->vm_flags & VM_MAYWRITE) |
1892 | len -= copy_to_user((void *) addr, buf, len); | 1925 | copy_to_user_page(vma, NULL, addr, |
1926 | (void *) addr, buf, len); | ||
1893 | else if (!write && vma->vm_flags & VM_MAYREAD) | 1927 | else if (!write && vma->vm_flags & VM_MAYREAD) |
1894 | len -= copy_from_user(buf, (void *) addr, len); | 1928 | copy_from_user_page(vma, NULL, addr, |
1929 | buf, (void *) addr, len); | ||
1895 | else | 1930 | else |
1896 | len = 0; | 1931 | len = 0; |
1897 | } else { | 1932 | } else { |
@@ -1902,3 +1937,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
1902 | mmput(mm); | 1937 | mmput(mm); |
1903 | return len; | 1938 | return len; |
1904 | } | 1939 | } |
1940 | |||
1941 | /** | ||
1942 | * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode | ||
1943 | * @inode: The inode to check | ||
1944 | * @size: The current filesize of the inode | ||
1945 | * @newsize: The proposed filesize of the inode | ||
1946 | * | ||
1947 | * Check the shared mappings on an inode on behalf of a shrinking truncate to | ||
1948 | * make sure that any outstanding VMAs aren't broken, and then shrink the | ||
1949 | * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't | ||
1950 | * automatically grant mappings that are too large. | ||
1951 | */ | ||
1952 | int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | ||
1953 | size_t newsize) | ||
1954 | { | ||
1955 | struct vm_area_struct *vma; | ||
1956 | struct prio_tree_iter iter; | ||
1957 | struct vm_region *region; | ||
1958 | pgoff_t low, high; | ||
1959 | size_t r_size, r_top; | ||
1960 | |||
1961 | low = newsize >> PAGE_SHIFT; | ||
1962 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1963 | |||
1964 | down_write(&nommu_region_sem); | ||
1965 | |||
1966 | /* search for VMAs that fall within the dead zone */ | ||
1967 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
1968 | low, high) { | ||
1969 | /* found one - only interested if it's shared out of the page | ||
1970 | * cache */ | ||
1971 | if (vma->vm_flags & VM_SHARED) { | ||
1972 | up_write(&nommu_region_sem); | ||
1973 | return -ETXTBSY; /* not quite true, but near enough */ | ||
1974 | } | ||
1975 | } | ||
1976 | |||
1977 | /* reduce any regions that overlap the dead zone - if in existence, | ||
1978 | * these will be pointed to by VMAs that don't overlap the dead zone | ||
1979 | * | ||
1980 | * we don't check for any regions that start beyond the EOF as there | ||
1981 | * shouldn't be any | ||
1982 | */ | ||
1983 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
1984 | 0, ULONG_MAX) { | ||
1985 | if (!(vma->vm_flags & VM_SHARED)) | ||
1986 | continue; | ||
1987 | |||
1988 | region = vma->vm_region; | ||
1989 | r_size = region->vm_top - region->vm_start; | ||
1990 | r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; | ||
1991 | |||
1992 | if (r_top > newsize) { | ||
1993 | region->vm_top -= r_top - newsize; | ||
1994 | if (region->vm_end > region->vm_top) | ||
1995 | region->vm_end = region->vm_top; | ||
1996 | } | ||
1997 | } | ||
1998 | |||
1999 | up_write(&nommu_region_sem); | ||
2000 | return 0; | ||
2001 | } | ||
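nommu_shrink_inode_mappings() first converts the byte range being cut away into page indices for the prio-tree walk. Worked through with 4 KiB pages, shrinking a 10000-byte file to 5000 bytes (values chosen only for illustration):

    #define PAGE_SHIFT 12                    /* assumed: 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* low: page holding the new EOF; high: pages covering the old size */
    unsigned long low  = 5000 >> PAGE_SHIFT;                    /* == 1 */
    unsigned long high = (10000 + PAGE_SIZE - 1) >> PAGE_SHIFT; /* == 3 */
    /* shared VMAs whose pgoff range reaches pages 1..3 veto the truncate
     * with -ETXTBSY; surviving regions get vm_top clipped to the new size */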
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..b68e802a7a7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/oom.h> | 18 | #include <linux/oom.h> |
19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
20 | #include <linux/err.h> | 20 | #include <linux/err.h> |
21 | #include <linux/gfp.h> | ||
21 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
22 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
23 | #include <linux/timex.h> | 24 | #include <linux/timex.h> |
@@ -196,27 +197,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
196 | /* | 197 | /* |
197 | * Determine the type of allocation constraint. | 198 | * Determine the type of allocation constraint. |
198 | */ | 199 | */ |
199 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
200 | gfp_t gfp_mask) | ||
201 | { | ||
202 | #ifdef CONFIG_NUMA | 200 | #ifdef CONFIG_NUMA |
201 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
202 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
203 | { | ||
203 | struct zone *zone; | 204 | struct zone *zone; |
204 | struct zoneref *z; | 205 | struct zoneref *z; |
205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 206 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
206 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; | ||
207 | 207 | ||
208 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 208 | /* |
209 | if (cpuset_zone_allowed_softwall(zone, gfp_mask)) | 209 | * We reach here only when __GFP_NOFAIL is used, so we should avoid |
210 | node_clear(zone_to_nid(zone), nodes); | 210 | * killing current; we have to fall back to a random task kill here. |
211 | else | 211 | * Hopefully CONSTRAINT_THISNODE would apply, but there is no way to handle it now. |
212 | return CONSTRAINT_CPUSET; | 212 | */ |
213 | if (gfp_mask & __GFP_THISNODE) | ||
214 | return CONSTRAINT_NONE; | ||
213 | 215 | ||
214 | if (!nodes_empty(nodes)) | 216 | /* |
217 | * The nodemask here is a nodemask passed to alloc_pages(). Now, | ||
218 | * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy | ||
219 | * feature. mempolicy is the only user of the nodemask here. | ||
220 | * Check whether mempolicy's nodemask contains all N_HIGH_MEMORY nodes. | ||
221 | */ | ||
222 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) | ||
215 | return CONSTRAINT_MEMORY_POLICY; | 223 | return CONSTRAINT_MEMORY_POLICY; |
216 | #endif | 224 | |
225 | /* Check this allocation failure is caused by cpuset's wall function */ | ||
226 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
227 | high_zoneidx, nodemask) | ||
228 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) | ||
229 | return CONSTRAINT_CPUSET; | ||
217 | 230 | ||
218 | return CONSTRAINT_NONE; | 231 | return CONSTRAINT_NONE; |
219 | } | 232 | } |
233 | #else | ||
234 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
235 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
236 | { | ||
237 | return CONSTRAINT_NONE; | ||
238 | } | ||
239 | #endif | ||
220 | 240 | ||
221 | /* | 241 | /* |
222 | * Simple selection loop. We choose the process with the highest | 242 | * Simple selection loop. We choose the process with the highest |
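The rewritten constrained_alloc() above keys its CONSTRAINT_MEMORY_POLICY case on nodes_subset(): only when the passed-in mempolicy nodemask fails to cover every N_HIGH_MEMORY node is the failure blamed on the policy. The predicate over a single word of node bits, as a hedged sketch (the real nodemask API works on multi-word bitmaps):

    /* subset(a, b): every node set in a is also set in b */
    static inline int nodes_subset_1word(unsigned long a, unsigned long b)
    {
            return (a & ~b) == 0;
    }
    /* policy-constrained iff !subset(all_highmem_nodes, policy_nodes) */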
@@ -337,6 +357,24 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
337 | } while_each_thread(g, p); | 357 | } while_each_thread(g, p); |
338 | } | 358 | } |
339 | 359 | ||
360 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | ||
361 | struct mem_cgroup *mem) | ||
362 | { | ||
363 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | ||
364 | "oom_adj=%d\n", | ||
365 | current->comm, gfp_mask, order, current->signal->oom_adj); | ||
366 | task_lock(current); | ||
367 | cpuset_print_task_mems_allowed(current); | ||
368 | task_unlock(current); | ||
369 | dump_stack(); | ||
370 | mem_cgroup_print_oom_info(mem, p); | ||
371 | show_mem(); | ||
372 | if (sysctl_oom_dump_tasks) | ||
373 | dump_tasks(mem); | ||
374 | } | ||
375 | |||
376 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
377 | |||
340 | /* | 378 | /* |
341 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 379 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
342 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 380 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
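dump_header() and the enriched kill message report vsz and rss in kilobytes via the K() macro defined just above: a page count shifted left by PAGE_SHIFT - 10, since a kilobyte is 2^10 bytes. For example, assuming 4 KiB pages:

    #define PAGE_SHIFT 12                    /* assumed: 4 KiB pages */
    #define K(x) ((x) << (PAGE_SHIFT - 10))  /* pages -> kB */

    unsigned long example_kb = K(300);       /* 300 << 2 == 1200 kB */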
@@ -350,15 +388,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
350 | return; | 388 | return; |
351 | } | 389 | } |
352 | 390 | ||
391 | task_lock(p); | ||
353 | if (!p->mm) { | 392 | if (!p->mm) { |
354 | WARN_ON(1); | 393 | WARN_ON(1); |
355 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 394 | printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", |
395 | task_pid_nr(p), p->comm); | ||
396 | task_unlock(p); | ||
356 | return; | 397 | return; |
357 | } | 398 | } |
358 | 399 | ||
359 | if (verbose) | 400 | if (verbose) |
360 | printk(KERN_ERR "Killed process %d (%s)\n", | 401 | printk(KERN_ERR "Killed process %d (%s) " |
361 | task_pid_nr(p), p->comm); | 402 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
403 | task_pid_nr(p), p->comm, | ||
404 | K(p->mm->total_vm), | ||
405 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | ||
406 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | ||
407 | task_unlock(p); | ||
362 | 408 | ||
363 | /* | 409 | /* |
364 | * We give our sacrificial lamb high priority and access to | 410 | * We give our sacrificial lamb high priority and access to |
@@ -395,20 +441,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
395 | { | 441 | { |
396 | struct task_struct *c; | 442 | struct task_struct *c; |
397 | 443 | ||
398 | if (printk_ratelimit()) { | 444 | if (printk_ratelimit()) |
399 | printk(KERN_WARNING "%s invoked oom-killer: " | 445 | dump_header(p, gfp_mask, order, mem); |
400 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
401 | current->comm, gfp_mask, order, | ||
402 | current->signal->oom_adj); | ||
403 | task_lock(current); | ||
404 | cpuset_print_task_mems_allowed(current); | ||
405 | task_unlock(current); | ||
406 | dump_stack(); | ||
407 | mem_cgroup_print_oom_info(mem, current); | ||
408 | show_mem(); | ||
409 | if (sysctl_oom_dump_tasks) | ||
410 | dump_tasks(mem); | ||
411 | } | ||
412 | 446 | ||
413 | /* | 447 | /* |
414 | * If the task is already exiting, don't alarm the sysadmin or kill | 448 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -426,6 +460,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
426 | list_for_each_entry(c, &p->children, sibling) { | 460 | list_for_each_entry(c, &p->children, sibling) { |
427 | if (c->mm == p->mm) | 461 | if (c->mm == p->mm) |
428 | continue; | 462 | continue; |
463 | if (mem && !task_in_mem_cgroup(c, mem)) | ||
464 | continue; | ||
429 | if (!oom_kill_task(c)) | 465 | if (!oom_kill_task(c)) |
430 | return 0; | 466 | return 0; |
431 | } | 467 | } |
@@ -438,6 +474,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
438 | unsigned long points = 0; | 474 | unsigned long points = 0; |
439 | struct task_struct *p; | 475 | struct task_struct *p; |
440 | 476 | ||
477 | if (sysctl_panic_on_oom == 2) | ||
478 | panic("out of memory(memcg). panic_on_oom is selected.\n"); | ||
441 | read_lock(&tasklist_lock); | 479 | read_lock(&tasklist_lock); |
442 | retry: | 480 | retry: |
443 | p = select_bad_process(&points, mem); | 481 | p = select_bad_process(&points, mem); |
@@ -544,6 +582,7 @@ retry: | |||
544 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 582 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
545 | if (!p) { | 583 | if (!p) { |
546 | read_unlock(&tasklist_lock); | 584 | read_unlock(&tasklist_lock); |
585 | dump_header(NULL, gfp_mask, order, NULL); | ||
547 | panic("Out of memory and no killable processes...\n"); | 586 | panic("Out of memory and no killable processes...\n"); |
548 | } | 587 | } |
549 | 588 | ||
@@ -565,13 +604,6 @@ void pagefault_out_of_memory(void) | |||
565 | /* Got some memory back in the last second. */ | 604 | /* Got some memory back in the last second. */ |
566 | return; | 605 | return; |
567 | 606 | ||
568 | /* | ||
569 | * If this is from memcg, oom-killer is already invoked. | ||
570 | * and not worth to go system-wide-oom. | ||
571 | */ | ||
572 | if (mem_cgroup_oom_called(current)) | ||
573 | goto rest_and_return; | ||
574 | |||
575 | if (sysctl_panic_on_oom) | 607 | if (sysctl_panic_on_oom) |
576 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | 608 | panic("out of memory from page fault. panic_on_oom is selected.\n"); |
577 | 609 | ||
@@ -583,7 +615,6 @@ void pagefault_out_of_memory(void) | |||
583 | * Give "p" a good chance of killing itself before we | 615 | * Give "p" a good chance of killing itself before we |
584 | * retry to allocate memory. | 616 | * retry to allocate memory. |
585 | */ | 617 | */ |
586 | rest_and_return: | ||
587 | if (!test_thread_flag(TIF_MEMDIE)) | 618 | if (!test_thread_flag(TIF_MEMDIE)) |
588 | schedule_timeout_uninterruptible(1); | 619 | schedule_timeout_uninterruptible(1); |
589 | } | 620 | } |
@@ -599,7 +630,8 @@ rest_and_return: | |||
599 | * OR try to be smart about which process to kill. Note that we | 630 | * OR try to be smart about which process to kill. Note that we |
600 | * don't have to be perfect here, we just have to be good. | 631 | * don't have to be perfect here, we just have to be good. |
601 | */ | 632 | */ |
602 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 633 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
634 | int order, nodemask_t *nodemask) | ||
603 | { | 635 | { |
604 | unsigned long freed = 0; | 636 | unsigned long freed = 0; |
605 | enum oom_constraint constraint; | 637 | enum oom_constraint constraint; |
@@ -609,14 +641,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
609 | /* Got some memory back in the last second. */ | 641 | /* Got some memory back in the last second. */ |
610 | return; | 642 | return; |
611 | 643 | ||
612 | if (sysctl_panic_on_oom == 2) | 644 | if (sysctl_panic_on_oom == 2) { |
645 | dump_header(NULL, gfp_mask, order, NULL); | ||
613 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 646 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
647 | } | ||
614 | 648 | ||
615 | /* | 649 | /* |
616 | * Check if there were limitations on the allocation (only relevant for | 650 | * Check if there were limitations on the allocation (only relevant for |
617 | * NUMA) that may require different handling. | 651 | * NUMA) that may require different handling. |
618 | */ | 652 | */ |
619 | constraint = constrained_alloc(zonelist, gfp_mask); | 653 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask); |
620 | read_lock(&tasklist_lock); | 654 | read_lock(&tasklist_lock); |
621 | 655 | ||
622 | switch (constraint) { | 656 | switch (constraint) { |
@@ -626,8 +660,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
626 | break; | 660 | break; |
627 | 661 | ||
628 | case CONSTRAINT_NONE: | 662 | case CONSTRAINT_NONE: |
629 | if (sysctl_panic_on_oom) | 663 | if (sysctl_panic_on_oom) { |
664 | dump_header(NULL, gfp_mask, order, NULL); | ||
630 | panic("out of memory. panic_on_oom is selected\n"); | 665 | panic("out of memory. panic_on_oom is selected\n"); |
666 | } | ||
631 | /* Fall-through */ | 667 | /* Fall-through */ |
632 | case CONSTRAINT_CPUSET: | 668 | case CONSTRAINT_CPUSET: |
633 | __out_of_memory(gfp_mask, order); | 669 | __out_of_memory(gfp_mask, order); |
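The warning and diagnostic dumps deleted above are consolidated into dump_header(), which the OOM paths in this hunk now call before killing or panicking. A sketch of such a helper, reconstructed from the deleted statements (the real function lives elsewhere in oom_kill.c and may differ in detail):

    static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
                            struct mem_cgroup *mem)
    {
            printk(KERN_WARNING "%s invoked oom-killer: "
                    "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
                    current->comm, gfp_mask, order, current->signal->oom_adj);
            task_lock(current);
            cpuset_print_task_mems_allowed(current);
            task_unlock(current);
            dump_stack();
            mem_cgroup_print_oom_info(mem, current);  /* memcg state, if any */
            show_mem();
            if (sysctl_oom_dump_tasks)
                    dump_tasks(mem);
    }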
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c5d79236ead..0b19943ecf8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, | |||
821 | struct writeback_control *wbc, writepage_t writepage, | 821 | struct writeback_control *wbc, writepage_t writepage, |
822 | void *data) | 822 | void *data) |
823 | { | 823 | { |
824 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
825 | int ret = 0; | 824 | int ret = 0; |
826 | int done = 0; | 825 | int done = 0; |
827 | struct pagevec pvec; | 826 | struct pagevec pvec; |
@@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, | |||
834 | int range_whole = 0; | 833 | int range_whole = 0; |
835 | long nr_to_write = wbc->nr_to_write; | 834 | long nr_to_write = wbc->nr_to_write; |
836 | 835 | ||
837 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
838 | wbc->encountered_congestion = 1; | ||
839 | return 0; | ||
840 | } | ||
841 | |||
842 | pagevec_init(&pvec, 0); | 836 | pagevec_init(&pvec, 0); |
843 | if (wbc->range_cyclic) { | 837 | if (wbc->range_cyclic) { |
844 | writeback_index = mapping->writeback_index; /* prev offset */ | 838 | writeback_index = mapping->writeback_index; /* prev offset */ |
@@ -957,12 +951,6 @@ continue_unlock: | |||
957 | break; | 951 | break; |
958 | } | 952 | } |
959 | } | 953 | } |
960 | |||
961 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
962 | wbc->encountered_congestion = 1; | ||
963 | done = 1; | ||
964 | break; | ||
965 | } | ||
966 | } | 954 | } |
967 | pagevec_release(&pvec); | 955 | pagevec_release(&pvec); |
968 | cond_resched(); | 956 | cond_resched(); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac63f41e..d03c946d5566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,7 +48,9 @@ | |||
48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | ||
51 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
53 | #include <linux/ftrace_event.h> | ||
52 | 54 | ||
53 | #include <asm/tlbflush.h> | 55 | #include <asm/tlbflush.h> |
54 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
@@ -75,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly; | |||
75 | int percpu_pagelist_fraction; | 77 | int percpu_pagelist_fraction; |
76 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 78 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
77 | 79 | ||
80 | #ifdef CONFIG_PM_SLEEP | ||
81 | /* | ||
82 | * The following functions are used by the suspend/hibernate code to temporarily | ||
83 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | ||
84 | * while devices are suspended. To avoid races with the suspend/hibernate code, | ||
85 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | ||
86 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | ||
87 | * guaranteed not to run in parallel with that modification). | ||
88 | */ | ||
89 | void set_gfp_allowed_mask(gfp_t mask) | ||
90 | { | ||
91 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
92 | gfp_allowed_mask = mask; | ||
93 | } | ||
94 | |||
95 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | ||
96 | { | ||
97 | gfp_t ret = gfp_allowed_mask; | ||
98 | |||
99 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
100 | gfp_allowed_mask &= ~mask; | ||
101 | return ret; | ||
102 | } | ||
103 | #endif /* CONFIG_PM_SLEEP */ | ||
104 | |||
78 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 105 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
79 | int pageblock_order __read_mostly; | 106 | int pageblock_order __read_mostly; |
80 | #endif | 107 | #endif |
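set_gfp_allowed_mask() and clear_gfp_allowed_mask() are meant to bracket the window in which devices are suspended, with pm_mutex held as the comment requires. A minimal sketch of a hypothetical caller in the hibernation path (the function names and saved_gfp_mask are illustrative, not taken from this diff):

    static gfp_t saved_gfp_mask;  /* hypothetical holder for the old mask */

    static void pm_restrict_gfp_mask(void)
    {
            mutex_lock(&pm_mutex);
            /* forbid I/O and FS allocations while devices are asleep */
            saved_gfp_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
            mutex_unlock(&pm_mutex);
    }

    static void pm_restore_gfp_mask(void)
    {
            mutex_lock(&pm_mutex);
            set_gfp_allowed_mask(saved_gfp_mask);
            mutex_unlock(&pm_mutex);
    }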
@@ -262,10 +289,7 @@ static void bad_page(struct page *page) | |||
262 | 289 | ||
263 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 290 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
264 | current->comm, page_to_pfn(page)); | 291 | current->comm, page_to_pfn(page)); |
265 | printk(KERN_ALERT | 292 | dump_page(page); |
266 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
267 | page, (void *)page->flags, page_count(page), | ||
268 | page_mapcount(page), page->mapping, page->index); | ||
269 | 293 | ||
270 | dump_stack(); | 294 | dump_stack(); |
271 | out: | 295 | out: |
@@ -486,7 +510,6 @@ static inline void __free_one_page(struct page *page, | |||
486 | zone->free_area[order].nr_free++; | 510 | zone->free_area[order].nr_free++; |
487 | } | 511 | } |
488 | 512 | ||
489 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
490 | /* | 513 | /* |
491 | * free_page_mlock() -- clean up attempts to free an mlocked page. | 514 | * free_page_mlock() -- clean up attempts to free an mlocked page. |
492 | * Page should not be on lru, so no need to fix that up. | 515 | * Page should not be on lru, so no need to fix that up. |
@@ -497,9 +520,6 @@ static inline void free_page_mlock(struct page *page) | |||
497 | __dec_zone_page_state(page, NR_MLOCK); | 520 | __dec_zone_page_state(page, NR_MLOCK); |
498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 521 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
499 | } | 522 | } |
500 | #else | ||
501 | static void free_page_mlock(struct page *page) { } | ||
502 | #endif | ||
503 | 523 | ||
504 | static inline int free_pages_check(struct page *page) | 524 | static inline int free_pages_check(struct page *page) |
505 | { | 525 | { |
@@ -533,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
533 | int batch_free = 0; | 553 | int batch_free = 0; |
534 | 554 | ||
535 | spin_lock(&zone->lock); | 555 | spin_lock(&zone->lock); |
536 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 556 | zone->all_unreclaimable = 0; |
537 | zone->pages_scanned = 0; | 557 | zone->pages_scanned = 0; |
538 | 558 | ||
539 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 559 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
@@ -559,8 +579,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
559 | page = list_entry(list->prev, struct page, lru); | 579 | page = list_entry(list->prev, struct page, lru); |
560 | /* must delete as __free_one_page list manipulates */ | 580 | /* must delete as __free_one_page list manipulates */ |
561 | list_del(&page->lru); | 581 | list_del(&page->lru); |
562 | __free_one_page(page, zone, 0, migratetype); | 582 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
563 | trace_mm_page_pcpu_drain(page, 0, migratetype); | 583 | __free_one_page(page, zone, 0, page_private(page)); |
584 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | ||
564 | } while (--count && --batch_free && !list_empty(list)); | 585 | } while (--count && --batch_free && !list_empty(list)); |
565 | } | 586 | } |
566 | spin_unlock(&zone->lock); | 587 | spin_unlock(&zone->lock); |
@@ -570,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
570 | int migratetype) | 591 | int migratetype) |
571 | { | 592 | { |
572 | spin_lock(&zone->lock); | 593 | spin_lock(&zone->lock); |
573 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 594 | zone->all_unreclaimable = 0; |
574 | zone->pages_scanned = 0; | 595 | zone->pages_scanned = 0; |
575 | 596 | ||
576 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 597 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
@@ -585,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
585 | int bad = 0; | 606 | int bad = 0; |
586 | int wasMlocked = __TestClearPageMlocked(page); | 607 | int wasMlocked = __TestClearPageMlocked(page); |
587 | 608 | ||
609 | trace_mm_page_free_direct(page, order); | ||
588 | kmemcheck_free_shadow(page, order); | 610 | kmemcheck_free_shadow(page, order); |
589 | 611 | ||
590 | for (i = 0 ; i < (1 << order) ; ++i) | 612 | for (i = 0 ; i < (1 << order) ; ++i) |
@@ -1011,10 +1033,10 @@ static void drain_pages(unsigned int cpu) | |||
1011 | struct per_cpu_pageset *pset; | 1033 | struct per_cpu_pageset *pset; |
1012 | struct per_cpu_pages *pcp; | 1034 | struct per_cpu_pages *pcp; |
1013 | 1035 | ||
1014 | pset = zone_pcp(zone, cpu); | 1036 | local_irq_save(flags); |
1037 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
1015 | 1038 | ||
1016 | pcp = &pset->pcp; | 1039 | pcp = &pset->pcp; |
1017 | local_irq_save(flags); | ||
1018 | free_pcppages_bulk(zone, pcp->count, pcp); | 1040 | free_pcppages_bulk(zone, pcp->count, pcp); |
1019 | pcp->count = 0; | 1041 | pcp->count = 0; |
1020 | local_irq_restore(flags); | 1042 | local_irq_restore(flags); |
@@ -1075,8 +1097,9 @@ void mark_free_pages(struct zone *zone) | |||
1075 | 1097 | ||
1076 | /* | 1098 | /* |
1077 | * Free a 0-order page | 1099 | * Free a 0-order page |
1100 | * cold == 1 ? free a cold page : free a hot page | ||
1078 | */ | 1101 | */ |
1079 | static void free_hot_cold_page(struct page *page, int cold) | 1102 | void free_hot_cold_page(struct page *page, int cold) |
1080 | { | 1103 | { |
1081 | struct zone *zone = page_zone(page); | 1104 | struct zone *zone = page_zone(page); |
1082 | struct per_cpu_pages *pcp; | 1105 | struct per_cpu_pages *pcp; |
@@ -1084,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1084 | int migratetype; | 1107 | int migratetype; |
1085 | int wasMlocked = __TestClearPageMlocked(page); | 1108 | int wasMlocked = __TestClearPageMlocked(page); |
1086 | 1109 | ||
1110 | trace_mm_page_free_direct(page, 0); | ||
1087 | kmemcheck_free_shadow(page, 0); | 1111 | kmemcheck_free_shadow(page, 0); |
1088 | 1112 | ||
1089 | if (PageAnon(page)) | 1113 | if (PageAnon(page)) |
@@ -1098,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1098 | arch_free_page(page, 0); | 1122 | arch_free_page(page, 0); |
1099 | kernel_map_pages(page, 1, 0); | 1123 | kernel_map_pages(page, 1, 0); |
1100 | 1124 | ||
1101 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
1102 | migratetype = get_pageblock_migratetype(page); | 1125 | migratetype = get_pageblock_migratetype(page); |
1103 | set_page_private(page, migratetype); | 1126 | set_page_private(page, migratetype); |
1104 | local_irq_save(flags); | 1127 | local_irq_save(flags); |
@@ -1121,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1121 | migratetype = MIGRATE_MOVABLE; | 1144 | migratetype = MIGRATE_MOVABLE; |
1122 | } | 1145 | } |
1123 | 1146 | ||
1147 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1124 | if (cold) | 1148 | if (cold) |
1125 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1149 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1126 | else | 1150 | else |
@@ -1133,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1133 | 1157 | ||
1134 | out: | 1158 | out: |
1135 | local_irq_restore(flags); | 1159 | local_irq_restore(flags); |
1136 | put_cpu(); | ||
1137 | } | 1160 | } |
1138 | 1161 | ||
1139 | void free_hot_page(struct page *page) | ||
1140 | { | ||
1141 | trace_mm_page_free_direct(page, 0); | ||
1142 | free_hot_cold_page(page, 0); | ||
1143 | } | ||
1144 | |||
1145 | /* | 1162 | /* |
1146 | * split_page takes a non-compound higher-order page, and splits it into | 1163 | * split_page takes a non-compound higher-order page, and splits it into |
1147 | * n (1<<order) sub-pages: page[0..n] | 1164 | * n (1<<order) sub-pages: page[0..n] |
@@ -1183,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1183 | unsigned long flags; | 1200 | unsigned long flags; |
1184 | struct page *page; | 1201 | struct page *page; |
1185 | int cold = !!(gfp_flags & __GFP_COLD); | 1202 | int cold = !!(gfp_flags & __GFP_COLD); |
1186 | int cpu; | ||
1187 | 1203 | ||
1188 | again: | 1204 | again: |
1189 | cpu = get_cpu(); | ||
1190 | if (likely(order == 0)) { | 1205 | if (likely(order == 0)) { |
1191 | struct per_cpu_pages *pcp; | 1206 | struct per_cpu_pages *pcp; |
1192 | struct list_head *list; | 1207 | struct list_head *list; |
1193 | 1208 | ||
1194 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
1195 | list = &pcp->lists[migratetype]; | ||
1196 | local_irq_save(flags); | 1209 | local_irq_save(flags); |
1210 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1211 | list = &pcp->lists[migratetype]; | ||
1197 | if (list_empty(list)) { | 1212 | if (list_empty(list)) { |
1198 | pcp->count += rmqueue_bulk(zone, 0, | 1213 | pcp->count += rmqueue_bulk(zone, 0, |
1199 | pcp->batch, list, | 1214 | pcp->batch, list, |
@@ -1225,16 +1240,15 @@ again: | |||
1225 | } | 1240 | } |
1226 | spin_lock_irqsave(&zone->lock, flags); | 1241 | spin_lock_irqsave(&zone->lock, flags); |
1227 | page = __rmqueue(zone, order, migratetype); | 1242 | page = __rmqueue(zone, order, migratetype); |
1228 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1229 | spin_unlock(&zone->lock); | 1243 | spin_unlock(&zone->lock); |
1230 | if (!page) | 1244 | if (!page) |
1231 | goto failed; | 1245 | goto failed; |
1246 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1232 | } | 1247 | } |
1233 | 1248 | ||
1234 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1249 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1235 | zone_statistics(preferred_zone, zone); | 1250 | zone_statistics(preferred_zone, zone); |
1236 | local_irq_restore(flags); | 1251 | local_irq_restore(flags); |
1237 | put_cpu(); | ||
1238 | 1252 | ||
1239 | VM_BUG_ON(bad_range(zone, page)); | 1253 | VM_BUG_ON(bad_range(zone, page)); |
1240 | if (prep_new_page(page, order, gfp_flags)) | 1254 | if (prep_new_page(page, order, gfp_flags)) |
@@ -1243,7 +1257,6 @@ again: | |||
1243 | 1257 | ||
1244 | failed: | 1258 | failed: |
1245 | local_irq_restore(flags); | 1259 | local_irq_restore(flags); |
1246 | put_cpu(); | ||
1247 | return NULL; | 1260 | return NULL; |
1248 | } | 1261 | } |
1249 | 1262 | ||
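Both free_hot_cold_page() and buffered_rmqueue() now follow the same discipline: interrupts go off first, and only then is the pageset looked up with this_cpu_ptr(). Fetching the pointer before local_irq_save() would be racy, since the task could migrate to another cpu in between. A minimal sketch of the pattern, assuming an initialized zone->pageset:

    static void pcp_access_pattern(struct zone *zone)
    {
            struct per_cpu_pages *pcp;
            unsigned long flags;

            local_irq_save(flags);          /* pins execution to this cpu */
            pcp = &this_cpu_ptr(zone->pageset)->pcp;
            /* ... manipulate pcp->lists[] and pcp->count safely ... */
            local_irq_restore(flags);
    }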
@@ -1658,12 +1671,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1658 | if (page) | 1671 | if (page) |
1659 | goto out; | 1672 | goto out; |
1660 | 1673 | ||
1661 | /* The OOM killer will not help higher order allocs */ | 1674 | if (!(gfp_mask & __GFP_NOFAIL)) { |
1662 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | 1675 | /* The OOM killer will not help higher order allocs */ |
1663 | goto out; | 1676 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1664 | 1677 | goto out; | |
1678 | /* | ||
1679 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
1680 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
1681 | * The caller should handle page allocation failure by itself if | ||
1682 | * it specifies __GFP_THISNODE. | ||
1683 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
1684 | */ | ||
1685 | if (gfp_mask & __GFP_THISNODE) | ||
1686 | goto out; | ||
1687 | } | ||
1665 | /* Exhausted what can be done so it's blamo time */ | 1688 | /* Exhausted what can be done so it's blamo time */ |
1666 | out_of_memory(zonelist, gfp_mask, order); | 1689 | out_of_memory(zonelist, gfp_mask, order, nodemask); |
1667 | 1690 | ||
1668 | out: | 1691 | out: |
1669 | clear_zonelist_oom(zonelist, gfp_mask); | 1692 | clear_zonelist_oom(zonelist, gfp_mask); |
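The new comment hinges on the composite GFP_THISNODE flag including __GFP_NORETRY, so such allocations give up before the OOM killer is ever consulted; only callers passing a bare __GFP_THISNODE reach this check. For reference, gfp.h in kernels of this vintage defines the NUMA variant roughly as:

    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)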
@@ -2005,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec) | |||
2005 | void __free_pages(struct page *page, unsigned int order) | 2028 | void __free_pages(struct page *page, unsigned int order) |
2006 | { | 2029 | { |
2007 | if (put_page_testzero(page)) { | 2030 | if (put_page_testzero(page)) { |
2008 | trace_mm_page_free_direct(page, order); | ||
2009 | if (order == 0) | 2031 | if (order == 0) |
2010 | free_hot_page(page); | 2032 | free_hot_cold_page(page, 0); |
2011 | else | 2033 | else |
2012 | __free_pages_ok(page, order); | 2034 | __free_pages_ok(page, order); |
2013 | } | 2035 | } |
@@ -2172,7 +2194,7 @@ void show_free_areas(void) | |||
2172 | for_each_online_cpu(cpu) { | 2194 | for_each_online_cpu(cpu) { |
2173 | struct per_cpu_pageset *pageset; | 2195 | struct per_cpu_pageset *pageset; |
2174 | 2196 | ||
2175 | pageset = zone_pcp(zone, cpu); | 2197 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2176 | 2198 | ||
2177 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2199 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2178 | cpu, pageset->pcp.high, | 2200 | cpu, pageset->pcp.high, |
@@ -2263,7 +2285,7 @@ void show_free_areas(void) | |||
2263 | K(zone_page_state(zone, NR_BOUNCE)), | 2285 | K(zone_page_state(zone, NR_BOUNCE)), |
2264 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2286 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2265 | zone->pages_scanned, | 2287 | zone->pages_scanned, |
2266 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2288 | (zone->all_unreclaimable ? "yes" : "no") |
2267 | ); | 2289 | ); |
2268 | printk("lowmem_reserve[]:"); | 2290 | printk("lowmem_reserve[]:"); |
2269 | for (i = 0; i < MAX_NR_ZONES; i++) | 2291 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -2395,13 +2417,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2395 | { | 2417 | { |
2396 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2418 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
2397 | int ret; | 2419 | int ret; |
2420 | static DEFINE_MUTEX(zl_order_mutex); | ||
2398 | 2421 | ||
2422 | mutex_lock(&zl_order_mutex); | ||
2399 | if (write) | 2423 | if (write) |
2400 | strncpy(saved_string, (char*)table->data, | 2424 | strcpy(saved_string, (char*)table->data); |
2401 | NUMA_ZONELIST_ORDER_LEN); | ||
2402 | ret = proc_dostring(table, write, buffer, length, ppos); | 2425 | ret = proc_dostring(table, write, buffer, length, ppos); |
2403 | if (ret) | 2426 | if (ret) |
2404 | return ret; | 2427 | goto out; |
2405 | if (write) { | 2428 | if (write) { |
2406 | int oldval = user_zonelist_order; | 2429 | int oldval = user_zonelist_order; |
2407 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2430 | if (__parse_numa_zonelist_order((char*)table->data)) { |
@@ -2414,7 +2437,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2414 | } else if (oldval != user_zonelist_order) | 2437 | } else if (oldval != user_zonelist_order) |
2415 | build_all_zonelists(); | 2438 | build_all_zonelists(); |
2416 | } | 2439 | } |
2417 | return 0; | 2440 | out: |
2441 | mutex_unlock(&zl_order_mutex); | ||
2442 | return ret; | ||
2418 | } | 2443 | } |
2419 | 2444 | ||
2420 | 2445 | ||
@@ -2734,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2734 | 2759 | ||
2735 | #endif /* CONFIG_NUMA */ | 2760 | #endif /* CONFIG_NUMA */ |
2736 | 2761 | ||
2762 | /* | ||
2763 | * Boot pageset table. One per cpu which is going to be used for all | ||
2764 | * zones and all nodes. The parameters will be set in such a way | ||
2765 | * that an item put on a list will immediately be handed over to | ||
2766 | * the buddy list. This is safe since pageset manipulation is done | ||
2767 | * with interrupts disabled. | ||
2768 | * | ||
2769 | * The boot_pagesets must be kept even after bootup is complete for | ||
2770 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2771 | * hotplugged processors. | ||
2772 | * | ||
2773 | * zoneinfo_show() and maybe other functions do | ||
2774 | * not check if the processor is online before following the pageset pointer. | ||
2775 | * Other parts of the kernel may not check if the zone is available. | ||
2776 | */ | ||
2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
2779 | |||
2737 | /* return values int ....just for stop_machine() */ | 2780 | /* return values int ....just for stop_machine() */ |
2738 | static int __build_all_zonelists(void *dummy) | 2781 | static int __build_all_zonelists(void *dummy) |
2739 | { | 2782 | { |
2740 | int nid; | 2783 | int nid; |
2784 | int cpu; | ||
2741 | 2785 | ||
2742 | #ifdef CONFIG_NUMA | 2786 | #ifdef CONFIG_NUMA |
2743 | memset(node_load, 0, sizeof(node_load)); | 2787 | memset(node_load, 0, sizeof(node_load)); |
@@ -2748,6 +2792,23 @@ static int __build_all_zonelists(void *dummy) | |||
2748 | build_zonelists(pgdat); | 2792 | build_zonelists(pgdat); |
2749 | build_zonelist_cache(pgdat); | 2793 | build_zonelist_cache(pgdat); |
2750 | } | 2794 | } |
2795 | |||
2796 | /* | ||
2797 | * Initialize the boot_pagesets that are going to be used | ||
2798 | * for bootstrapping processors. The real pagesets for | ||
2799 | * each zone will be allocated later when the per cpu | ||
2800 | * allocator is available. | ||
2801 | * | ||
2802 | * boot_pagesets are also used for bootstrapping offline | ||
2803 | * cpus if the system is already booted because the pagesets | ||
2804 | * are needed to initialize allocators on a specific cpu too. | ||
2805 | * E.g. the percpu allocator needs the page allocator, which | ||
2806 | * needs the percpu allocator in order to allocate its pagesets | ||
2807 | * (a chicken-and-egg dilemma). | ||
2808 | */ | ||
2809 | for_each_possible_cpu(cpu) | ||
2810 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
2811 | |||
2751 | return 0; | 2812 | return 0; |
2752 | } | 2813 | } |
2753 | 2814 | ||
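The bootstrap pageset turns from a manually indexed NR_CPUS array (deleted further down in this file) into a true static per-cpu variable. Side by side, the declaration and access idioms look like this (sketch):

    /* before: one array slot per possible cpu, indexed by hand */
    static struct per_cpu_pageset boot_pageset[NR_CPUS];
    struct per_cpu_pageset *p = &boot_pageset[cpu];

    /* after: a static per-cpu variable resolved through per-cpu offsets */
    static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
    struct per_cpu_pageset *p = &per_cpu(boot_pageset, cpu);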
@@ -3085,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3085 | pcp->batch = PAGE_SHIFT * 8; | 3146 | pcp->batch = PAGE_SHIFT * 8; |
3086 | } | 3147 | } |
3087 | 3148 | ||
3088 | |||
3089 | #ifdef CONFIG_NUMA | ||
3090 | /* | ||
3091 | * Boot pageset table. One per cpu which is going to be used for all | ||
3092 | * zones and all nodes. The parameters will be set in such a way | ||
3093 | * that an item put on a list will immediately be handed over to | ||
3094 | * the buddy list. This is safe since pageset manipulation is done | ||
3095 | * with interrupts disabled. | ||
3096 | * | ||
3097 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
3098 | * | ||
3099 | * The boot_pagesets must be kept even after bootup is complete for | ||
3100 | * unused processors and/or zones. They do play a role for bootstrapping | ||
3101 | * hotplugged processors. | ||
3102 | * | ||
3103 | * zoneinfo_show() and maybe other functions do | ||
3104 | * not check if the processor is online before following the pageset pointer. | ||
3105 | * Other parts of the kernel may not check if the zone is available. | ||
3106 | */ | ||
3107 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
3108 | |||
3109 | /* | 3149 | /* |
3110 | * Dynamically allocate memory for the | 3150 | * Allocate per cpu pagesets and initialize them. |
3111 | * per cpu pageset array in struct zone. | 3151 | * Before this call only boot pagesets were available. |
3152 | * Boot pagesets will no longer be used by this processor | ||
3153 | * after setup_per_cpu_pageset(). | ||
3112 | */ | 3154 | */ |
3113 | static int __cpuinit process_zones(int cpu) | 3155 | void __init setup_per_cpu_pageset(void) |
3114 | { | 3156 | { |
3115 | struct zone *zone, *dzone; | 3157 | struct zone *zone; |
3116 | int node = cpu_to_node(cpu); | 3158 | int cpu; |
3117 | |||
3118 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
3119 | 3159 | ||
3120 | for_each_populated_zone(zone) { | 3160 | for_each_populated_zone(zone) { |
3121 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3122 | GFP_KERNEL, node); | ||
3123 | if (!zone_pcp(zone, cpu)) | ||
3124 | goto bad; | ||
3125 | 3162 | ||
3126 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 3163 | for_each_possible_cpu(cpu) { |
3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
3127 | 3165 | ||
3128 | if (percpu_pagelist_fraction) | 3166 | setup_pageset(pcp, zone_batchsize(zone)); |
3129 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
3130 | (zone->present_pages / percpu_pagelist_fraction)); | ||
3131 | } | ||
3132 | 3167 | ||
3133 | return 0; | 3168 | if (percpu_pagelist_fraction) |
3134 | bad: | 3169 | setup_pagelist_highmark(pcp, |
3135 | for_each_zone(dzone) { | 3170 | (zone->present_pages / |
3136 | if (!populated_zone(dzone)) | 3171 | percpu_pagelist_fraction)); |
3137 | continue; | 3172 | } |
3138 | if (dzone == zone) | ||
3139 | break; | ||
3140 | kfree(zone_pcp(dzone, cpu)); | ||
3141 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
3142 | } | ||
3143 | return -ENOMEM; | ||
3144 | } | ||
3145 | |||
3146 | static inline void free_zone_pagesets(int cpu) | ||
3147 | { | ||
3148 | struct zone *zone; | ||
3149 | |||
3150 | for_each_zone(zone) { | ||
3151 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
3152 | |||
3153 | /* Free per_cpu_pageset if it is slab allocated */ | ||
3154 | if (pset != &boot_pageset[cpu]) | ||
3155 | kfree(pset); | ||
3156 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3157 | } | ||
3158 | } | ||
3159 | |||
3160 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
3161 | unsigned long action, | ||
3162 | void *hcpu) | ||
3163 | { | ||
3164 | int cpu = (long)hcpu; | ||
3165 | int ret = NOTIFY_OK; | ||
3166 | |||
3167 | switch (action) { | ||
3168 | case CPU_UP_PREPARE: | ||
3169 | case CPU_UP_PREPARE_FROZEN: | ||
3170 | if (process_zones(cpu)) | ||
3171 | ret = NOTIFY_BAD; | ||
3172 | break; | ||
3173 | case CPU_UP_CANCELED: | ||
3174 | case CPU_UP_CANCELED_FROZEN: | ||
3175 | case CPU_DEAD: | ||
3176 | case CPU_DEAD_FROZEN: | ||
3177 | free_zone_pagesets(cpu); | ||
3178 | break; | ||
3179 | default: | ||
3180 | break; | ||
3181 | } | 3173 | } |
3182 | return ret; | ||
3183 | } | 3174 | } |
3184 | 3175 | ||
3185 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
3186 | { &pageset_cpuup_callback, NULL, 0 }; | ||
3187 | |||
3188 | void __init setup_per_cpu_pageset(void) | ||
3189 | { | ||
3190 | int err; | ||
3191 | |||
3192 | /* Initialize per_cpu_pageset for cpu 0. | ||
3193 | * A cpuup callback will do this for every cpu | ||
3194 | * as it comes online | ||
3195 | */ | ||
3196 | err = process_zones(smp_processor_id()); | ||
3197 | BUG_ON(err); | ||
3198 | register_cpu_notifier(&pageset_notifier); | ||
3199 | } | ||
3200 | |||
3201 | #endif | ||
3202 | |||
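setup_per_cpu_pageset() now leans on the generic percpu allocator instead of kmalloc_node() plus a CPU-hotplug notifier. The allocate-then-initialize idiom it uses has this general shape (struct foo_stats is hypothetical, for illustration only):

    struct foo_stats { unsigned long hits, misses; };  /* hypothetical */

    struct foo_stats __percpu *stats = alloc_percpu(struct foo_stats);
    int cpu;

    if (stats) {
            for_each_possible_cpu(cpu) {
                    struct foo_stats *s = per_cpu_ptr(stats, cpu);
                    /* alloc_percpu() returns zeroed memory; further
                     * per-cpu setup goes here, as setup_pageset() does */
                    s->hits = s->misses = 0;
            }
    }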
3203 | static noinline __init_refok | 3176 | static noinline __init_refok |
3204 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3177 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3205 | { | 3178 | { |
@@ -3249,11 +3222,11 @@ static int __zone_pcp_update(void *data) | |||
3249 | int cpu; | 3222 | int cpu; |
3250 | unsigned long batch = zone_batchsize(zone), flags; | 3223 | unsigned long batch = zone_batchsize(zone), flags; |
3251 | 3224 | ||
3252 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 3225 | for_each_possible_cpu(cpu) { |
3253 | struct per_cpu_pageset *pset; | 3226 | struct per_cpu_pageset *pset; |
3254 | struct per_cpu_pages *pcp; | 3227 | struct per_cpu_pages *pcp; |
3255 | 3228 | ||
3256 | pset = zone_pcp(zone, cpu); | 3229 | pset = per_cpu_ptr(zone->pageset, cpu); |
3257 | pcp = &pset->pcp; | 3230 | pcp = &pset->pcp; |
3258 | 3231 | ||
3259 | local_irq_save(flags); | 3232 | local_irq_save(flags); |
@@ -3271,21 +3244,17 @@ void zone_pcp_update(struct zone *zone) | |||
3271 | 3244 | ||
3272 | static __meminit void zone_pcp_init(struct zone *zone) | 3245 | static __meminit void zone_pcp_init(struct zone *zone) |
3273 | { | 3246 | { |
3274 | int cpu; | 3247 | /* |
3275 | unsigned long batch = zone_batchsize(zone); | 3248 | * per cpu subsystem is not up at this point. The following code |
3249 | * relies on the ability of the linker to provide the | ||
3250 | * offset of a (static) per cpu variable into the per cpu area. | ||
3251 | */ | ||
3252 | zone->pageset = &boot_pageset; | ||
3276 | 3253 | ||
3277 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
3278 | #ifdef CONFIG_NUMA | ||
3279 | /* Early boot. Slab allocator not functional yet */ | ||
3280 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3281 | setup_pageset(&boot_pageset[cpu],0); | ||
3282 | #else | ||
3283 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
3284 | #endif | ||
3285 | } | ||
3286 | if (zone->present_pages) | 3254 | if (zone->present_pages) |
3287 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3255 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3288 | zone->name, zone->present_pages, batch); | 3256 | zone->name, zone->present_pages, |
3257 | zone_batchsize(zone)); | ||
3289 | } | 3258 | } |
3290 | 3259 | ||
3291 | __meminit int init_currently_empty_zone(struct zone *zone, | 3260 | __meminit int init_currently_empty_zone(struct zone *zone, |
@@ -3424,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3424 | } | 3393 | } |
3425 | } | 3394 | } |
3426 | 3395 | ||
3396 | int __init add_from_early_node_map(struct range *range, int az, | ||
3397 | int nr_range, int nid) | ||
3398 | { | ||
3399 | int i; | ||
3400 | u64 start, end; | ||
3401 | |||
3402 | /* need to go over early_node_map to find out good range for node */ | ||
3403 | for_each_active_range_index_in_nid(i, nid) { | ||
3404 | start = early_node_map[i].start_pfn; | ||
3405 | end = early_node_map[i].end_pfn; | ||
3406 | nr_range = add_range(range, az, nr_range, start, end); | ||
3407 | } | ||
3408 | return nr_range; | ||
3409 | } | ||
3410 | |||
3411 | #ifdef CONFIG_NO_BOOTMEM | ||
3412 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
3413 | u64 goal, u64 limit) | ||
3414 | { | ||
3415 | int i; | ||
3416 | void *ptr; | ||
3417 | |||
3418 | /* need to go over early_node_map to find out good range for node */ | ||
3419 | for_each_active_range_index_in_nid(i, nid) { | ||
3420 | u64 addr; | ||
3421 | u64 ei_start, ei_last; | ||
3422 | |||
3423 | ei_last = early_node_map[i].end_pfn; | ||
3424 | ei_last <<= PAGE_SHIFT; | ||
3425 | ei_start = early_node_map[i].start_pfn; | ||
3426 | ei_start <<= PAGE_SHIFT; | ||
3427 | addr = find_early_area(ei_start, ei_last, | ||
3428 | goal, limit, size, align); | ||
3429 | |||
3430 | if (addr == -1ULL) | ||
3431 | continue; | ||
3432 | |||
3433 | #if 0 | ||
3434 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
3435 | nid, | ||
3436 | ei_start, ei_last, goal, limit, size, | ||
3437 | align, addr); | ||
3438 | #endif | ||
3439 | |||
3440 | ptr = phys_to_virt(addr); | ||
3441 | memset(ptr, 0, size); | ||
3442 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | ||
3443 | return ptr; | ||
3444 | } | ||
3445 | |||
3446 | return NULL; | ||
3447 | } | ||
3448 | #endif | ||
3449 | |||
3450 | |||
3427 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3451 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3428 | { | 3452 | { |
3429 | int i; | 3453 | int i; |
@@ -3573,7 +3597,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
3573 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3597 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3574 | * then all holes in the requested range will be accounted for. | 3598 | * then all holes in the requested range will be accounted for. |
3575 | */ | 3599 | */ |
3576 | static unsigned long __meminit __absent_pages_in_range(int nid, | 3600 | unsigned long __meminit __absent_pages_in_range(int nid, |
3577 | unsigned long range_start_pfn, | 3601 | unsigned long range_start_pfn, |
3578 | unsigned long range_end_pfn) | 3602 | unsigned long range_end_pfn) |
3579 | { | 3603 | { |
@@ -3988,7 +4012,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
3988 | } | 4012 | } |
3989 | 4013 | ||
3990 | /* Merge backward if suitable */ | 4014 | /* Merge backward if suitable */ |
3991 | if (start_pfn < early_node_map[i].end_pfn && | 4015 | if (start_pfn < early_node_map[i].start_pfn && |
3992 | end_pfn >= early_node_map[i].start_pfn) { | 4016 | end_pfn >= early_node_map[i].start_pfn) { |
3993 | early_node_map[i].start_pfn = start_pfn; | 4017 | early_node_map[i].start_pfn = start_pfn; |
3994 | return; | 4018 | return; |
@@ -4102,7 +4126,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) | |||
4102 | } | 4126 | } |
4103 | 4127 | ||
4104 | /* sort the node_map by start_pfn */ | 4128 | /* sort the node_map by start_pfn */ |
4105 | static void __init sort_node_map(void) | 4129 | void __init sort_node_map(void) |
4106 | { | 4130 | { |
4107 | sort(early_node_map, (size_t)nr_nodemap_entries, | 4131 | sort(early_node_map, (size_t)nr_nodemap_entries, |
4108 | sizeof(struct node_active_region), | 4132 | sizeof(struct node_active_region), |
@@ -4366,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4366 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4390 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4367 | if (i == ZONE_MOVABLE) | 4391 | if (i == ZONE_MOVABLE) |
4368 | continue; | 4392 | continue; |
4369 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4393 | printk(" %-8s ", zone_names[i]); |
4370 | zone_names[i], | 4394 | if (arch_zone_lowest_possible_pfn[i] == |
4395 | arch_zone_highest_possible_pfn[i]) | ||
4396 | printk("empty\n"); | ||
4397 | else | ||
4398 | printk("%0#10lx -> %0#10lx\n", | ||
4371 | arch_zone_lowest_possible_pfn[i], | 4399 | arch_zone_lowest_possible_pfn[i], |
4372 | arch_zone_highest_possible_pfn[i]); | 4400 | arch_zone_highest_possible_pfn[i]); |
4373 | } | 4401 | } |
@@ -4456,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4456 | } | 4484 | } |
4457 | 4485 | ||
4458 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4486 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4459 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4487 | struct pglist_data __refdata contig_page_data = { |
4488 | #ifndef CONFIG_NO_BOOTMEM | ||
4489 | .bdata = &bootmem_node_data[0] | ||
4490 | #endif | ||
4491 | }; | ||
4460 | EXPORT_SYMBOL(contig_page_data); | 4492 | EXPORT_SYMBOL(contig_page_data); |
4461 | #endif | 4493 | #endif |
4462 | 4494 | ||
@@ -4799,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
4799 | if (!write || (ret == -EINVAL)) | 4831 | if (!write || (ret == -EINVAL)) |
4800 | return ret; | 4832 | return ret; |
4801 | for_each_populated_zone(zone) { | 4833 | for_each_populated_zone(zone) { |
4802 | for_each_online_cpu(cpu) { | 4834 | for_each_possible_cpu(cpu) { |
4803 | unsigned long high; | 4835 | unsigned long high; |
4804 | high = zone->present_pages / percpu_pagelist_fraction; | 4836 | high = zone->present_pages / percpu_pagelist_fraction; |
4805 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4837 | setup_pagelist_highmark( |
4838 | per_cpu_ptr(zone->pageset, cpu), high); | ||
4806 | } | 4839 | } |
4807 | } | 4840 | } |
4808 | return 0; | 4841 | return 0; |
@@ -5002,23 +5035,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5002 | int set_migratetype_isolate(struct page *page) | 5035 | int set_migratetype_isolate(struct page *page) |
5003 | { | 5036 | { |
5004 | struct zone *zone; | 5037 | struct zone *zone; |
5005 | unsigned long flags; | 5038 | struct page *curr_page; |
5039 | unsigned long flags, pfn, iter; | ||
5040 | unsigned long immobile = 0; | ||
5041 | struct memory_isolate_notify arg; | ||
5042 | int notifier_ret; | ||
5006 | int ret = -EBUSY; | 5043 | int ret = -EBUSY; |
5007 | int zone_idx; | 5044 | int zone_idx; |
5008 | 5045 | ||
5009 | zone = page_zone(page); | 5046 | zone = page_zone(page); |
5010 | zone_idx = zone_idx(zone); | 5047 | zone_idx = zone_idx(zone); |
5048 | |||
5011 | spin_lock_irqsave(&zone->lock, flags); | 5049 | spin_lock_irqsave(&zone->lock, flags); |
5050 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || | ||
5051 | zone_idx == ZONE_MOVABLE) { | ||
5052 | ret = 0; | ||
5053 | goto out; | ||
5054 | } | ||
5055 | |||
5056 | pfn = page_to_pfn(page); | ||
5057 | arg.start_pfn = pfn; | ||
5058 | arg.nr_pages = pageblock_nr_pages; | ||
5059 | arg.pages_found = 0; | ||
5060 | |||
5012 | /* | 5061 | /* |
5013 | * In future, more migrate types will be able to be isolation target. | 5062 | * It may be possible to isolate a pageblock even if the |
5063 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5064 | * notifier chain is used by balloon drivers to return the | ||
5065 | * number of pages in a range that are held by the balloon | ||
5066 | * driver to shrink memory. If all the pages are accounted for | ||
5067 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5068 | * Later, for example, when memory hotplug notifier runs, these | ||
5069 | * pages reported as "can be isolated" should be isolated (freed) | ||
5070 | * by the balloon driver through the memory notifier chain. | ||
5014 | */ | 5071 | */ |
5015 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && | 5072 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5016 | zone_idx != ZONE_MOVABLE) | 5073 | notifier_ret = notifier_to_errno(notifier_ret); |
5074 | if (notifier_ret || !arg.pages_found) | ||
5017 | goto out; | 5075 | goto out; |
5018 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5076 | |
5019 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5077 | for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { |
5020 | ret = 0; | 5078 | if (!pfn_valid_within(pfn)) |
5079 | continue; | ||
5080 | |||
5081 | curr_page = pfn_to_page(iter); | ||
5082 | if (!page_count(curr_page) || PageLRU(curr_page)) | ||
5083 | continue; | ||
5084 | |||
5085 | immobile++; | ||
5086 | } | ||
5087 | |||
5088 | if (arg.pages_found == immobile) | ||
5089 | ret = 0; | ||
5090 | |||
5021 | out: | 5091 | out: |
5092 | if (!ret) { | ||
5093 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5094 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5095 | } | ||
5096 | |||
5022 | spin_unlock_irqrestore(&zone->lock, flags); | 5097 | spin_unlock_irqrestore(&zone->lock, flags); |
5023 | if (!ret) | 5098 | if (!ret) |
5024 | drain_all_pages(); | 5099 | drain_all_pages(); |
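The MEM_ISOLATE_COUNT notification described in the comment is answered by balloon drivers, which count how many pages of the block they hold. A hypothetical callback, with balloon_owns_page() invented for illustration:

    static int balloon_isolate_notify(struct notifier_block *nb,
                                      unsigned long action, void *data)
    {
            struct memory_isolate_notify *arg = data;
            unsigned long pfn;

            if (action != MEM_ISOLATE_COUNT)
                    return NOTIFY_OK;

            for (pfn = arg->start_pfn;
                 pfn < arg->start_pfn + arg->nr_pages; pfn++)
                    if (balloon_owns_page(pfn))  /* hypothetical helper */
                            arg->pages_found++;

            return NOTIFY_OK;
    }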
@@ -5085,3 +5160,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5085 | spin_unlock_irqrestore(&zone->lock, flags); | 5160 | spin_unlock_irqrestore(&zone->lock, flags); |
5086 | } | 5161 | } |
5087 | #endif | 5162 | #endif |
5163 | |||
5164 | #ifdef CONFIG_MEMORY_FAILURE | ||
5165 | bool is_free_buddy_page(struct page *page) | ||
5166 | { | ||
5167 | struct zone *zone = page_zone(page); | ||
5168 | unsigned long pfn = page_to_pfn(page); | ||
5169 | unsigned long flags; | ||
5170 | int order; | ||
5171 | |||
5172 | spin_lock_irqsave(&zone->lock, flags); | ||
5173 | for (order = 0; order < MAX_ORDER; order++) { | ||
5174 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
5175 | |||
5176 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
5177 | break; | ||
5178 | } | ||
5179 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5180 | |||
5181 | return order < MAX_ORDER; | ||
5182 | } | ||
5183 | #endif | ||
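is_free_buddy_page() climbs the orders looking for a buddy head that covers the page; the head of the order-N block containing a pfn is reached by clearing the low N bits. A worked example:

    /* pfn = 0x1234, order = 3:
     *   pfn & ((1 << 3) - 1) = 0x1234 & 0x7 = 4
     *   page_head = page - 4, i.e. pfn 0x1230, start of the order-3 block.
     * If PageBuddy(page_head) && page_order(page_head) >= 3, the whole
     * block, our page included, sits in the buddy free lists. */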
5184 | |||
5185 | static struct trace_print_flags pageflag_names[] = { | ||
5186 | {1UL << PG_locked, "locked" }, | ||
5187 | {1UL << PG_error, "error" }, | ||
5188 | {1UL << PG_referenced, "referenced" }, | ||
5189 | {1UL << PG_uptodate, "uptodate" }, | ||
5190 | {1UL << PG_dirty, "dirty" }, | ||
5191 | {1UL << PG_lru, "lru" }, | ||
5192 | {1UL << PG_active, "active" }, | ||
5193 | {1UL << PG_slab, "slab" }, | ||
5194 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
5195 | {1UL << PG_arch_1, "arch_1" }, | ||
5196 | {1UL << PG_reserved, "reserved" }, | ||
5197 | {1UL << PG_private, "private" }, | ||
5198 | {1UL << PG_private_2, "private_2" }, | ||
5199 | {1UL << PG_writeback, "writeback" }, | ||
5200 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
5201 | {1UL << PG_head, "head" }, | ||
5202 | {1UL << PG_tail, "tail" }, | ||
5203 | #else | ||
5204 | {1UL << PG_compound, "compound" }, | ||
5205 | #endif | ||
5206 | {1UL << PG_swapcache, "swapcache" }, | ||
5207 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
5208 | {1UL << PG_reclaim, "reclaim" }, | ||
5209 | {1UL << PG_buddy, "buddy" }, | ||
5210 | {1UL << PG_swapbacked, "swapbacked" }, | ||
5211 | {1UL << PG_unevictable, "unevictable" }, | ||
5212 | #ifdef CONFIG_MMU | ||
5213 | {1UL << PG_mlocked, "mlocked" }, | ||
5214 | #endif | ||
5215 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
5216 | {1UL << PG_uncached, "uncached" }, | ||
5217 | #endif | ||
5218 | #ifdef CONFIG_MEMORY_FAILURE | ||
5219 | {1UL << PG_hwpoison, "hwpoison" }, | ||
5220 | #endif | ||
5221 | {-1UL, NULL }, | ||
5222 | }; | ||
5223 | |||
5224 | static void dump_page_flags(unsigned long flags) | ||
5225 | { | ||
5226 | const char *delim = ""; | ||
5227 | unsigned long mask; | ||
5228 | int i; | ||
5229 | |||
5230 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
5231 | |||
5232 | /* remove zone id */ | ||
5233 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
5234 | |||
5235 | for (i = 0; pageflag_names[i].name && flags; i++) { | ||
5236 | |||
5237 | mask = pageflag_names[i].mask; | ||
5238 | if ((flags & mask) != mask) | ||
5239 | continue; | ||
5240 | |||
5241 | flags &= ~mask; | ||
5242 | printk("%s%s", delim, pageflag_names[i].name); | ||
5243 | delim = "|"; | ||
5244 | } | ||
5245 | |||
5246 | /* check for left over flags */ | ||
5247 | if (flags) | ||
5248 | printk("%s%#lx", delim, flags); | ||
5249 | |||
5250 | printk(")\n"); | ||
5251 | } | ||
5252 | |||
5253 | void dump_page(struct page *page) | ||
5254 | { | ||
5255 | printk(KERN_ALERT | ||
5256 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
5257 | page, page_count(page), page_mapcount(page), | ||
5258 | page->mapping, page->index); | ||
5259 | dump_page_flags(page->flags); | ||
5260 | } | ||
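bad_page() above already switched to this helper; any other debugging site can use it the same way. A minimal usage sketch:

    if (unlikely(page_mapcount(page) < 0))
            dump_page(page);  /* count/mapcount/mapping/index + decoded flags */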
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..6c0081441a32 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex); | |||
284 | struct swap_cgroup_ctrl { | 284 | struct swap_cgroup_ctrl { |
285 | struct page **map; | 285 | struct page **map; |
286 | unsigned long length; | 286 | unsigned long length; |
287 | spinlock_t lock; | ||
287 | }; | 288 | }; |
288 | 289 | ||
289 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | 290 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; |
@@ -335,6 +336,43 @@ not_enough_page: | |||
335 | } | 336 | } |
336 | 337 | ||
337 | /** | 338 | /** |
339 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
340 | * @ent: swap entry to be cmpxchged | ||
341 | * @old: old id | ||
342 | * @new: new id | ||
343 | * | ||
344 | * Returns old id at success, 0 at failure. | ||
345 | * (There is no mem_cgroup using 0 as its id) | ||
346 | */ | ||
347 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
348 | unsigned short old, unsigned short new) | ||
349 | { | ||
350 | int type = swp_type(ent); | ||
351 | unsigned long offset = swp_offset(ent); | ||
352 | unsigned long idx = offset / SC_PER_PAGE; | ||
353 | unsigned long pos = offset & SC_POS_MASK; | ||
354 | struct swap_cgroup_ctrl *ctrl; | ||
355 | struct page *mappage; | ||
356 | struct swap_cgroup *sc; | ||
357 | unsigned long flags; | ||
358 | unsigned short retval; | ||
359 | |||
360 | ctrl = &swap_cgroup_ctrl[type]; | ||
361 | |||
362 | mappage = ctrl->map[idx]; | ||
363 | sc = page_address(mappage); | ||
364 | sc += pos; | ||
365 | spin_lock_irqsave(&ctrl->lock, flags); | ||
366 | retval = sc->id; | ||
367 | if (retval == old) | ||
368 | sc->id = new; | ||
369 | else | ||
370 | retval = 0; | ||
371 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
372 | return retval; | ||
373 | } | ||
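As with any compare-and-swap primitive, callers act only when the old value still matched. A sketch of the expected calling pattern (the surrounding memcg charge-moving logic is elided):

    /* reassign the swap entry from old_id to new_id, but only if
     * nobody changed it under us; 0 signals a lost race */
    if (swap_cgroup_cmpxchg(ent, old_id, new_id) == old_id) {
            /* success: the entry is now accounted to new_id */
    } else {
            /* raced with another updater; retry or give up */
    }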
374 | |||
375 | /** | ||
338 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 376 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
339 | * @ent: swap entry to be recorded into | 377 | * @ent: swap entry to be recorded into |
340 | * @mem: mem_cgroup to be recorded | 378 | * @mem: mem_cgroup to be recorded |
@@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
352 | struct page *mappage; | 390 | struct page *mappage; |
353 | struct swap_cgroup *sc; | 391 | struct swap_cgroup *sc; |
354 | unsigned short old; | 392 | unsigned short old; |
393 | unsigned long flags; | ||
355 | 394 | ||
356 | ctrl = &swap_cgroup_ctrl[type]; | 395 | ctrl = &swap_cgroup_ctrl[type]; |
357 | 396 | ||
358 | mappage = ctrl->map[idx]; | 397 | mappage = ctrl->map[idx]; |
359 | sc = page_address(mappage); | 398 | sc = page_address(mappage); |
360 | sc += pos; | 399 | sc += pos; |
400 | spin_lock_irqsave(&ctrl->lock, flags); | ||
361 | old = sc->id; | 401 | old = sc->id; |
362 | sc->id = id; | 402 | sc->id = id; |
403 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
363 | 404 | ||
364 | return old; | 405 | return old; |
365 | } | 406 | } |
@@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
411 | mutex_lock(&swap_cgroup_mutex); | 452 | mutex_lock(&swap_cgroup_mutex); |
412 | ctrl->length = length; | 453 | ctrl->length = length; |
413 | ctrl->map = array; | 454 | ctrl->map = array; |
455 | spin_lock_init(&ctrl->lock); | ||
414 | if (swap_cgroup_prepare(type)) { | 456 | if (swap_cgroup_prepare(type)) { |
415 | /* memory shortage */ | 457 | /* memory shortage */ |
416 | ctrl->map = NULL; | 458 | ctrl->map = NULL; |
diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e5071de3..31a3b962230a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -12,6 +12,7 @@ | |||
12 | 12 | ||
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/gfp.h> | ||
15 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
16 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
17 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
@@ -19,20 +20,15 @@ | |||
19 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
20 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
21 | 22 | ||
22 | static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, | 23 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
23 | struct page *page, bio_end_io_t end_io) | 24 | struct page *page, bio_end_io_t end_io) |
24 | { | 25 | { |
25 | struct bio *bio; | 26 | struct bio *bio; |
26 | 27 | ||
27 | bio = bio_alloc(gfp_flags, 1); | 28 | bio = bio_alloc(gfp_flags, 1); |
28 | if (bio) { | 29 | if (bio) { |
29 | struct swap_info_struct *sis; | 30 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); |
30 | swp_entry_t entry = { .val = index, }; | 31 | bio->bi_sector <<= PAGE_SHIFT - 9; |
31 | |||
32 | sis = get_swap_info_struct(swp_type(entry)); | ||
33 | bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * | ||
34 | (PAGE_SIZE >> 9); | ||
35 | bio->bi_bdev = sis->bdev; | ||
36 | bio->bi_io_vec[0].bv_page = page; | 32 | bio->bi_io_vec[0].bv_page = page; |
37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 33 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
38 | bio->bi_io_vec[0].bv_offset = 0; | 34 | bio->bi_io_vec[0].bv_offset = 0; |
@@ -102,8 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
102 | unlock_page(page); | 98 | unlock_page(page); |
103 | goto out; | 99 | goto out; |
104 | } | 100 | } |
105 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, | 101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
106 | end_swap_bio_write); | ||
107 | if (bio == NULL) { | 102 | if (bio == NULL) { |
108 | set_page_dirty(page); | 103 | set_page_dirty(page); |
109 | unlock_page(page); | 104 | unlock_page(page); |
@@ -127,8 +122,7 @@ int swap_readpage(struct page *page) | |||
127 | 122 | ||
128 | VM_BUG_ON(!PageLocked(page)); | 123 | VM_BUG_ON(!PageLocked(page)); |
129 | VM_BUG_ON(PageUptodate(page)); | 124 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
131 | end_swap_bio_read); | ||
132 | if (bio == NULL) { | 126 | if (bio == NULL) { |
133 | unlock_page(page); | 127 | unlock_page(page); |
134 | ret = -ENOMEM; | 128 | ret = -ENOMEM; |
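map_swap_page() now returns the swap slot's page offset on the device and fills in the block device itself, so get_swap_bio() no longer digs through the swap_info_struct. The shift converts page units into 512-byte sectors; with 4 KiB pages (PAGE_SHIFT = 12) that is a factor of eight:

    /* sectors per page = 1 << (PAGE_SHIFT - 9) = 1 << 3 = 8,
     * so page offset 100 becomes sector 100 << 3 = 800 */
    bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
    bio->bi_sector <<= PAGE_SHIFT - 9;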
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..8b1a2ce21ee5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/highmem.h> | 2 | #include <linux/highmem.h> |
3 | #include <linux/sched.h> | 3 | #include <linux/sched.h> |
4 | #include <linux/hugetlb.h> | ||
4 | 5 | ||
5 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 6 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
6 | struct mm_walk *walk) | 7 | struct mm_walk *walk) |
@@ -79,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
79 | return err; | 80 | return err; |
80 | } | 81 | } |
81 | 82 | ||
83 | #ifdef CONFIG_HUGETLB_PAGE | ||
84 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | ||
85 | unsigned long end) | ||
86 | { | ||
87 | unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); | ||
88 | return boundary < end ? boundary : end; | ||
89 | } | ||
90 | |||
91 | static int walk_hugetlb_range(struct vm_area_struct *vma, | ||
92 | unsigned long addr, unsigned long end, | ||
93 | struct mm_walk *walk) | ||
94 | { | ||
95 | struct hstate *h = hstate_vma(vma); | ||
96 | unsigned long next; | ||
97 | unsigned long hmask = huge_page_mask(h); | ||
98 | pte_t *pte; | ||
99 | int err = 0; | ||
100 | |||
101 | do { | ||
102 | next = hugetlb_entry_end(h, addr, end); | ||
103 | pte = huge_pte_offset(walk->mm, addr & hmask); | ||
104 | if (pte && walk->hugetlb_entry) | ||
105 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | ||
106 | if (err) | ||
107 | return err; | ||
108 | } while (addr = next, addr != end); | ||
109 | |||
110 | return 0; | ||
111 | } | ||
112 | #endif | ||
113 | |||
82 | /** | 114 | /** |
83 | * walk_page_range - walk a memory map's page tables with a callback | 115 | * walk_page_range - walk a memory map's page tables with a callback |
84 | * @mm: memory map to walk | 116 | * @mm: memory map to walk |
@@ -107,6 +139,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
107 | pgd_t *pgd; | 139 | pgd_t *pgd; |
108 | unsigned long next; | 140 | unsigned long next; |
109 | int err = 0; | 141 | int err = 0; |
142 | struct vm_area_struct *vma; | ||
110 | 143 | ||
111 | if (addr >= end) | 144 | if (addr >= end) |
112 | return err; | 145 | return err; |
@@ -117,11 +150,34 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
117 | pgd = pgd_offset(walk->mm, addr); | 150 | pgd = pgd_offset(walk->mm, addr); |
118 | do { | 151 | do { |
119 | next = pgd_addr_end(addr, end); | 152 | next = pgd_addr_end(addr, end); |
153 | |||
154 | /* | ||
155 | * handle hugetlb vma individually because pagetable walk for | ||
156 | * the hugetlb page is dependent on the architecture and | ||
157 | * we can't handle it in the same manner as non-huge pages. | ||
158 | */ | ||
159 | vma = find_vma(walk->mm, addr); | ||
160 | #ifdef CONFIG_HUGETLB_PAGE | ||
161 | if (vma && is_vm_hugetlb_page(vma)) { | ||
162 | if (vma->vm_end < next) | ||
163 | next = vma->vm_end; | ||
164 | /* | ||
165 | * Hugepage is very tightly coupled with vma, so | ||
166 | * walk through hugetlb entries within a given vma. | ||
167 | */ | ||
168 | err = walk_hugetlb_range(vma, addr, next, walk); | ||
169 | if (err) | ||
170 | break; | ||
171 | pgd = pgd_offset(walk->mm, next); | ||
172 | continue; | ||
173 | } | ||
174 | #endif | ||
120 | if (pgd_none_or_clear_bad(pgd)) { | 175 | if (pgd_none_or_clear_bad(pgd)) { |
121 | if (walk->pte_hole) | 176 | if (walk->pte_hole) |
122 | err = walk->pte_hole(addr, next, walk); | 177 | err = walk->pte_hole(addr, next, walk); |
123 | if (err) | 178 | if (err) |
124 | break; | 179 | break; |
180 | pgd++; | ||
125 | continue; | 181 | continue; |
126 | } | 182 | } |
127 | if (walk->pgd_entry) | 183 | if (walk->pgd_entry) |
@@ -131,7 +187,8 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
131 | err = walk_pud_range(pgd, addr, next, walk); | 187 | err = walk_pud_range(pgd, addr, next, walk); |
132 | if (err) | 188 | if (err) |
133 | break; | 189 | break; |
134 | } while (pgd++, addr = next, addr != end); | 190 | pgd++; |
191 | } while (addr = next, addr != end); | ||
135 | 192 | ||
136 | return err; | 193 | return err; |
137 | } | 194 | } |
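A walker opts in to the new hugetlb handling by filling in the hugetlb_entry callback of its struct mm_walk; walk_hugetlb_range() above fires it once per huge page within the vma. A minimal sketch (callback name and body are illustrative):

    static int show_huge_pte(pte_t *pte, unsigned long hmask,
                             unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
    {
            if (!pte_none(*pte))
                    printk(KERN_DEBUG "huge mapping at %#lx\n", addr & hmask);
            return 0;  /* non-zero aborts the walk */
    }

    static struct mm_walk huge_walk = {
            .hugetlb_entry  = show_huge_pte,
            /* .mm must point at the target mm before walk_page_range() */
    };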
diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..6e09741ddc62 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -46,8 +46,6 @@ | |||
46 | * | 46 | * |
47 | * To use this allocator, arch code should do the following. | 47 | * To use this allocator, arch code should do the following. |
48 | * | 48 | * |
49 | * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA | ||
50 | * | ||
51 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate | 49 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate |
52 | * regular address to percpu pointer and back if they need to be | 50 | * regular address to percpu pointer and back if they need to be |
53 | * different from the default | 51 | * different from the default |
@@ -74,6 +72,7 @@ | |||
74 | #include <asm/cacheflush.h> | 72 | #include <asm/cacheflush.h> |
75 | #include <asm/sections.h> | 73 | #include <asm/sections.h> |
76 | #include <asm/tlbflush.h> | 74 | #include <asm/tlbflush.h> |
75 | #include <asm/io.h> | ||
77 | 76 | ||
78 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
79 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
@@ -81,13 +80,15 @@ | |||
81 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
82 | #ifndef __addr_to_pcpu_ptr | 81 | #ifndef __addr_to_pcpu_ptr |
83 | #define __addr_to_pcpu_ptr(addr) \ | 82 | #define __addr_to_pcpu_ptr(addr) \ |
84 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | 83 | (void __percpu *)((unsigned long)(addr) - \ |
85 | + (unsigned long)__per_cpu_start) | 84 | (unsigned long)pcpu_base_addr + \ |
85 | (unsigned long)__per_cpu_start) | ||
86 | #endif | 86 | #endif |
87 | #ifndef __pcpu_ptr_to_addr | 87 | #ifndef __pcpu_ptr_to_addr |
88 | #define __pcpu_ptr_to_addr(ptr) \ | 88 | #define __pcpu_ptr_to_addr(ptr) \ |
89 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | 89 | (void __force *)((unsigned long)(ptr) + \ |
90 | - (unsigned long)__per_cpu_start) | 90 | (unsigned long)pcpu_base_addr - \ |
91 | (unsigned long)__per_cpu_start) | ||
91 | #endif | 92 | #endif |
92 | 93 | ||
93 | struct pcpu_chunk { | 94 | struct pcpu_chunk { |
@@ -914,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
914 | int rs, re; | 915 | int rs, re; |
915 | 916 | ||
916 | /* quick path, check whether it's empty already */ | 917 | /* quick path, check whether it's empty already */ |
917 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 918 | rs = page_start; |
918 | if (rs == page_start && re == page_end) | 919 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
919 | return; | 920 | if (rs == page_start && re == page_end) |
920 | break; | 921 | return; |
921 | } | ||
922 | 922 | ||
923 | /* immutable chunks can't be depopulated */ | 923 | /* immutable chunks can't be depopulated */ |
924 | WARN_ON(chunk->immutable); | 924 | WARN_ON(chunk->immutable); |
@@ -969,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
969 | int rs, re, rc; | 969 | int rs, re, rc; |
970 | 970 | ||
971 | /* quick path, check whether all pages are already there */ | 971 | /* quick path, check whether all pages are already there */ |
972 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { | 972 | rs = page_start; |
973 | if (rs == page_start && re == page_end) | 973 | pcpu_next_pop(chunk, &rs, &re, page_end); |
974 | goto clear; | 974 | if (rs == page_start && re == page_end) |
975 | break; | 975 | goto clear; |
976 | } | ||
977 | 976 | ||
978 | /* need to allocate and map pages, this chunk can't be immutable */ | 977 | /* need to allocate and map pages, this chunk can't be immutable */ |
979 | WARN_ON(chunk->immutable); | 978 | WARN_ON(chunk->immutable); |
@@ -1068,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
1068 | * RETURNS: | 1067 | * RETURNS: |
1069 | * Percpu pointer to the allocated area on success, NULL on failure. | 1068 | * Percpu pointer to the allocated area on success, NULL on failure. |
1070 | */ | 1069 | */ |
1071 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1070 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) |
1072 | { | 1071 | { |
1073 | static int warn_limit = 10; | 1072 | static int warn_limit = 10; |
1074 | struct pcpu_chunk *chunk; | 1073 | struct pcpu_chunk *chunk; |
@@ -1197,7 +1196,7 @@ fail_unlock_mutex: | |||
1197 | * RETURNS: | 1196 | * RETURNS: |
1198 | * Percpu pointer to the allocated area on success, NULL on failure. | 1197 | * Percpu pointer to the allocated area on success, NULL on failure. |
1199 | */ | 1198 | */ |
1200 | void *__alloc_percpu(size_t size, size_t align) | 1199 | void __percpu *__alloc_percpu(size_t size, size_t align) |
1201 | { | 1200 | { |
1202 | return pcpu_alloc(size, align, false); | 1201 | return pcpu_alloc(size, align, false); |
1203 | } | 1202 | } |
@@ -1218,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
1218 | * RETURNS: | 1217 | * RETURNS: |
1219 | * Percpu pointer to the allocated area on success, NULL on failure. | 1218 | * Percpu pointer to the allocated area on success, NULL on failure. |
1220 | */ | 1219 | */ |
1221 | void *__alloc_reserved_percpu(size_t size, size_t align) | 1220 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
1222 | { | 1221 | { |
1223 | return pcpu_alloc(size, align, true); | 1222 | return pcpu_alloc(size, align, true); |
1224 | } | 1223 | } |
@@ -1270,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work) | |||
1270 | * CONTEXT: | 1269 | * CONTEXT: |
1271 | * Can be called from atomic context. | 1270 | * Can be called from atomic context. |
1272 | */ | 1271 | */ |
1273 | void free_percpu(void *ptr) | 1272 | void free_percpu(void __percpu *ptr) |
1274 | { | 1273 | { |
1275 | void *addr = __pcpu_ptr_to_addr(ptr); | 1274 | void *addr; |
1276 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
1277 | unsigned long flags; | 1276 | unsigned long flags; |
1278 | int off; | 1277 | int off; |
@@ -1280,6 +1279,8 @@ void free_percpu(void *ptr) | |||
1280 | if (!ptr) | 1279 | if (!ptr) |
1281 | return; | 1280 | return; |
1282 | 1281 | ||
1282 | addr = __pcpu_ptr_to_addr(ptr); | ||
1283 | |||
1283 | spin_lock_irqsave(&pcpu_lock, flags); | 1284 | spin_lock_irqsave(&pcpu_lock, flags); |
1284 | 1285 | ||
1285 | chunk = pcpu_chunk_addr_search(addr); | 1286 | chunk = pcpu_chunk_addr_search(addr); |
@@ -1302,6 +1303,53 @@ void free_percpu(void *ptr) | |||
1302 | } | 1303 | } |
1303 | EXPORT_SYMBOL_GPL(free_percpu); | 1304 | EXPORT_SYMBOL_GPL(free_percpu); |
1304 | 1305 | ||
1306 | /** | ||
1307 | * is_kernel_percpu_address - test whether address is from static percpu area | ||
1308 | * @addr: address to test | ||
1309 | * | ||
1310 | * Test whether @addr belongs to the in-kernel static percpu area. Module | ||
1311 | * static percpu areas are not considered. For those, use | ||
1312 | * is_module_percpu_address(). | ||
1313 | * | ||
1314 | * RETURNS: | ||
1315 | * %true if @addr is from the in-kernel static percpu area, %false otherwise. | ||
1316 | */ | ||
1317 | bool is_kernel_percpu_address(unsigned long addr) | ||
1318 | { | ||
1319 | const size_t static_size = __per_cpu_end - __per_cpu_start; | ||
1320 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); | ||
1321 | unsigned int cpu; | ||
1322 | |||
1323 | for_each_possible_cpu(cpu) { | ||
1324 | void *start = per_cpu_ptr(base, cpu); | ||
1325 | |||
1326 | if ((void *)addr >= start && (void *)addr < start + static_size) | ||
1327 | return true; | ||
1328 | } | ||
1329 | return false; | ||
1330 | } | ||
1331 | |||
1332 | /** | ||
1333 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | ||
1334 | * @addr: the address to be converted to physical address | ||
1335 | * | ||
1336 | * Given @addr, a dereferenceable address obtained via one of the | ||
1337 | * percpu access macros, this function translates it into its physical | ||
1338 | * address. The caller is responsible for ensuring @addr stays valid | ||
1339 | * until this function finishes. | ||
1340 | * | ||
1341 | * RETURNS: | ||
1342 | * The physical address for @addr. | ||
1343 | */ | ||
1344 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
1345 | { | ||
1346 | if ((unsigned long)addr < VMALLOC_START || | ||
1347 | (unsigned long)addr >= VMALLOC_END) | ||
1348 | return __pa(addr); | ||
1349 | else | ||
1350 | return page_to_phys(vmalloc_to_page(addr)); | ||
1351 | } | ||
1352 | |||
1305 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | 1353 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, |
1306 | size_t reserved_size, | 1354 | size_t reserved_size, |
1307 | ssize_t *dyn_sizep) | 1355 | ssize_t *dyn_sizep) |
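The __percpu annotations introduced above are sparse address-space markers: a percpu pointer is an offset cookie rather than a directly dereferenceable address, and has to be translated with per_cpu_ptr() before use, after which helpers such as the new per_cpu_ptr_to_phys() apply. A minimal sketch of that contract, with illustrative names:

/* Sketch: allocate a dynamic percpu int, touch each cpu's copy,
 * report its physical address, then free it. */
static int __percpu *example_ctr;

static int __init example_init(void)
{
	int cpu;

	example_ctr = alloc_percpu(int);	/* wraps __alloc_percpu() */
	if (!example_ctr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		int *p = per_cpu_ptr(example_ctr, cpu);	/* translated */

		*p = 0;
		printk(KERN_DEBUG "cpu%d ctr at %llx\n", cpu,
		       (unsigned long long)per_cpu_ptr_to_phys(p));
	}
	/* dereferencing example_ctr directly would now draw a sparse
	 * warning, since it lives in the __percpu address space */
	free_percpu(example_ctr);
	return 0;
}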
diff --git a/mm/percpu_up.c b/mm/percpu_up.c new file mode 100644 index 000000000000..c4351c7f57d2 --- /dev/null +++ b/mm/percpu_up.c | |||
@@ -0,0 +1,30 @@ | |||
1 | /* | ||
2 | * mm/percpu_up.c - dummy percpu memory allocator implementation for UP | ||
3 | */ | ||
4 | |||
5 | #include <linux/module.h> | ||
6 | #include <linux/percpu.h> | ||
7 | #include <linux/slab.h> | ||
8 | |||
9 | void __percpu *__alloc_percpu(size_t size, size_t align) | ||
10 | { | ||
11 | /* | ||
12 | * Can't easily make larger alignment work with kmalloc. WARN | ||
13 | * on it. Larger alignment should only be used for module | ||
14 | * percpu sections on SMP for which this path isn't used. | ||
15 | */ | ||
16 | WARN_ON_ONCE(align > SMP_CACHE_BYTES); | ||
17 | return kzalloc(size, GFP_KERNEL); | ||
18 | } | ||
19 | EXPORT_SYMBOL_GPL(__alloc_percpu); | ||
20 | |||
21 | void free_percpu(void __percpu *p) | ||
22 | { | ||
23 | kfree(p); | ||
24 | } | ||
25 | EXPORT_SYMBOL_GPL(free_percpu); | ||
26 | |||
27 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
28 | { | ||
29 | return __pa(addr); | ||
30 | } | ||
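This dummy implementation keeps percpu users source-compatible on UP: there is only one copy of each object, so the pointer returned by kzalloc() doubles as the percpu cookie and per_cpu_ptr() reduces to the identity, which is also why free_percpu() can be a plain kfree(). A usage sketch that builds the same way on UP and SMP (hit_stats and the function names are illustrative):

struct hit_stats {
	unsigned long hits;
};

static struct hit_stats __percpu *stats;

static int stats_init(void)
{
	stats = alloc_percpu(struct hit_stats);
	return stats ? 0 : -ENOMEM;
}

static void stats_hit(void)
{
	/* get_cpu() pins us to a cpu; on UP it trivially returns 0 */
	per_cpu_ptr(stats, get_cpu())->hits++;
	put_cpu();
}

static void stats_exit(void)
{
	free_percpu(stats);	/* kfree() on UP, chunk free on SMP */
}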
diff --git a/mm/quicklist.c b/mm/quicklist.c index 6633965bb27b..2876349339a7 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | 16 | ||
17 | #include <linux/gfp.h> | ||
17 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
18 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
19 | #include <linux/module.h> | 20 | #include <linux/module.h> |
diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..dfa9a1a03a11 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -9,6 +9,7 @@ | |||
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/gfp.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
14 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
@@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
501 | if (!ra->ra_pages) | 502 | if (!ra->ra_pages) |
502 | return; | 503 | return; |
503 | 504 | ||
505 | /* be dumb */ | ||
506 | if (filp && (filp->f_mode & FMODE_RANDOM)) { | ||
507 | force_page_cache_readahead(mapping, filp, offset, req_size); | ||
508 | return; | ||
509 | } | ||
510 | |||
504 | /* do read-ahead */ | 511 | /* do read-ahead */ |
505 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); | 512 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); |
506 | } | 513 | } |
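FMODE_RANDOM, tested above, is set on the struct file when userspace declares a random access pattern through posix_fadvise() (POSIX_FADV_RANDOM is handled in mm/fadvise.c as of this series), so such readers skip the ondemand heuristics and get exactly the pages they asked for. A userspace sketch of what flips the mode:

#include <fcntl.h>

/* Sketch: after this call, page_cache_sync_readahead() takes the
 * force_page_cache_readahead() fast path above for reads on fd. */
int set_random_access(int fd)
{
	return posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
}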
@@ -547,5 +554,17 @@ page_cache_async_readahead(struct address_space *mapping, | |||
547 | 554 | ||
548 | /* do read-ahead */ | 555 | /* do read-ahead */ |
549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 556 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
557 | |||
558 | #ifdef CONFIG_BLOCK | ||
559 | /* | ||
560 | * Normally the current page is !uptodate and lock_page() will be | ||
561 | * immediately called to implicitly unplug the device. However this | ||
562 | * is not always true for RAID configurations, where data may | ||
563 | * arrive out of submission order. In that case we need to | ||
564 | * explicitly kick off the IO. | ||
565 | */ | ||
566 | if (PageUptodate(page)) | ||
567 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
568 | #endif | ||
550 | } | 569 | } |
551 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 570 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/ksm.h> | ||
52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
@@ -61,17 +62,28 @@ | |||
61 | #include "internal.h" | 62 | #include "internal.h" |
62 | 63 | ||
63 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
64 | 66 | ||
65 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
66 | { | 68 | { |
67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 69 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
68 | } | 70 | } |
69 | 71 | ||
70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 72 | void anon_vma_free(struct anon_vma *anon_vma) |
71 | { | 73 | { |
72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
73 | } | 75 | } |
74 | 76 | ||
77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
78 | { | ||
79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
80 | } | ||
81 | |||
82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
83 | { | ||
84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
85 | } | ||
86 | |||
75 | /** | 87 | /** |
76 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
77 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
@@ -102,87 +114,167 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
102 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
103 | { | 115 | { |
104 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | struct anon_vma_chain *avc; | ||
105 | 118 | ||
106 | might_sleep(); | 119 | might_sleep(); |
107 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
108 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
109 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
110 | 123 | ||
124 | avc = anon_vma_chain_alloc(); | ||
125 | if (!avc) | ||
126 | goto out_enomem; | ||
127 | |||
111 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
112 | allocated = NULL; | 129 | allocated = NULL; |
113 | if (!anon_vma) { | 130 | if (!anon_vma) { |
114 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
115 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
116 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
117 | allocated = anon_vma; | 134 | allocated = anon_vma; |
118 | } | 135 | } |
119 | spin_lock(&anon_vma->lock); | ||
120 | 136 | ||
137 | spin_lock(&anon_vma->lock); | ||
121 | /* page_table_lock to protect against threads */ | 138 | /* page_table_lock to protect against threads */ |
122 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
123 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
124 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
125 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
143 | avc->vma = vma; | ||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
126 | allocated = NULL; | 146 | allocated = NULL; |
147 | avc = NULL; | ||
127 | } | 148 | } |
128 | spin_unlock(&mm->page_table_lock); | 149 | spin_unlock(&mm->page_table_lock); |
129 | |||
130 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
151 | |||
131 | if (unlikely(allocated)) | 152 | if (unlikely(allocated)) |
132 | anon_vma_free(allocated); | 153 | anon_vma_free(allocated); |
154 | if (unlikely(avc)) | ||
155 | anon_vma_chain_free(avc); | ||
133 | } | 156 | } |
134 | return 0; | 157 | return 0; |
158 | |||
159 | out_enomem_free_avc: | ||
160 | anon_vma_chain_free(avc); | ||
161 | out_enomem: | ||
162 | return -ENOMEM; | ||
135 | } | 163 | } |
136 | 164 | ||
137 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 165 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
166 | struct anon_vma_chain *avc, | ||
167 | struct anon_vma *anon_vma) | ||
138 | { | 168 | { |
139 | BUG_ON(vma->anon_vma != next->anon_vma); | 169 | avc->vma = vma; |
140 | list_del(&next->anon_vma_node); | 170 | avc->anon_vma = anon_vma; |
171 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
172 | |||
173 | spin_lock(&anon_vma->lock); | ||
174 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
175 | spin_unlock(&anon_vma->lock); | ||
141 | } | 176 | } |
142 | 177 | ||
143 | void __anon_vma_link(struct vm_area_struct *vma) | 178 | /* |
179 | * Attach the anon_vmas from src to dst. | ||
180 | * Returns 0 on success, -ENOMEM on failure. | ||
181 | */ | ||
182 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
144 | { | 183 | { |
145 | struct anon_vma *anon_vma = vma->anon_vma; | 184 | struct anon_vma_chain *avc, *pavc; |
146 | 185 | ||
147 | if (anon_vma) | 186 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
148 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 187 | avc = anon_vma_chain_alloc(); |
188 | if (!avc) | ||
189 | goto enomem_failure; | ||
190 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
191 | } | ||
192 | return 0; | ||
193 | |||
194 | enomem_failure: | ||
195 | unlink_anon_vmas(dst); | ||
196 | return -ENOMEM; | ||
149 | } | 197 | } |
150 | 198 | ||
151 | void anon_vma_link(struct vm_area_struct *vma) | 199 | /* |
200 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
201 | * the corresponding VMA in the parent process is attached to. | ||
202 | * Returns 0 on success, non-zero on failure. | ||
203 | */ | ||
204 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
152 | { | 205 | { |
153 | struct anon_vma *anon_vma = vma->anon_vma; | 206 | struct anon_vma_chain *avc; |
207 | struct anon_vma *anon_vma; | ||
154 | 208 | ||
155 | if (anon_vma) { | 209 | /* Don't bother if the parent process has no anon_vma here. */ |
156 | spin_lock(&anon_vma->lock); | 210 | if (!pvma->anon_vma) |
157 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 211 | return 0; |
158 | spin_unlock(&anon_vma->lock); | 212 | |
159 | } | 213 | /* |
214 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
215 | * so rmap can find non-COWed pages in child processes. | ||
216 | */ | ||
217 | if (anon_vma_clone(vma, pvma)) | ||
218 | return -ENOMEM; | ||
219 | |||
220 | /* Then add our own anon_vma. */ | ||
221 | anon_vma = anon_vma_alloc(); | ||
222 | if (!anon_vma) | ||
223 | goto out_error; | ||
224 | avc = anon_vma_chain_alloc(); | ||
225 | if (!avc) | ||
226 | goto out_error_free_anon_vma; | ||
227 | anon_vma_chain_link(vma, avc, anon_vma); | ||
228 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
229 | vma->anon_vma = anon_vma; | ||
230 | |||
231 | return 0; | ||
232 | |||
233 | out_error_free_anon_vma: | ||
234 | anon_vma_free(anon_vma); | ||
235 | out_error: | ||
236 | unlink_anon_vmas(vma); | ||
237 | return -ENOMEM; | ||
160 | } | 238 | } |
161 | 239 | ||
162 | void anon_vma_unlink(struct vm_area_struct *vma) | 240 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
163 | { | 241 | { |
164 | struct anon_vma *anon_vma = vma->anon_vma; | 242 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
165 | int empty; | 243 | int empty; |
166 | 244 | ||
245 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
167 | if (!anon_vma) | 246 | if (!anon_vma) |
168 | return; | 247 | return; |
169 | 248 | ||
170 | spin_lock(&anon_vma->lock); | 249 | spin_lock(&anon_vma->lock); |
171 | list_del(&vma->anon_vma_node); | 250 | list_del(&anon_vma_chain->same_anon_vma); |
172 | 251 | ||
173 | /* We must garbage collect the anon_vma if it's empty */ | 252 | /* We must garbage collect the anon_vma if it's empty */ |
174 | empty = list_empty(&anon_vma->head); | 253 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
175 | spin_unlock(&anon_vma->lock); | 254 | spin_unlock(&anon_vma->lock); |
176 | 255 | ||
177 | if (empty) | 256 | if (empty) |
178 | anon_vma_free(anon_vma); | 257 | anon_vma_free(anon_vma); |
179 | } | 258 | } |
180 | 259 | ||
260 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
261 | { | ||
262 | struct anon_vma_chain *avc, *next; | ||
263 | |||
264 | /* Unlink each anon_vma chained to the VMA. */ | ||
265 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
266 | anon_vma_unlink(avc); | ||
267 | list_del(&avc->same_vma); | ||
268 | anon_vma_chain_free(avc); | ||
269 | } | ||
270 | } | ||
271 | |||
181 | static void anon_vma_ctor(void *data) | 272 | static void anon_vma_ctor(void *data) |
182 | { | 273 | { |
183 | struct anon_vma *anon_vma = data; | 274 | struct anon_vma *anon_vma = data; |
184 | 275 | ||
185 | spin_lock_init(&anon_vma->lock); | 276 | spin_lock_init(&anon_vma->lock); |
277 | ksm_refcount_init(anon_vma); | ||
186 | INIT_LIST_HEAD(&anon_vma->head); | 278 | INIT_LIST_HEAD(&anon_vma->head); |
187 | } | 279 | } |
188 | 280 | ||
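The anon_vma_chain (AVC) objects managed above form the new many-to-many link between vmas and anon_vmas: each chain entry sits on one vma's same_vma list and on one anon_vma's same_anon_vma list, so anon_vma_clone() gives a child vma one link per ancestor anon_vma and rmap can still find non-COWed pages in forked children. Roughly, the structure as declared in include/linux/rmap.h for this release:

struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_sem & page_table_lock */
	struct list_head same_anon_vma;	/* locked by anon_vma->lock */
};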
@@ -190,6 +282,7 @@ void __init anon_vma_init(void) | |||
190 | { | 282 | { |
191 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 283 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
192 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 284 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
285 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
193 | } | 286 | } |
194 | 287 | ||
195 | /* | 288 | /* |
@@ -202,8 +295,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
202 | unsigned long anon_mapping; | 295 | unsigned long anon_mapping; |
203 | 296 | ||
204 | rcu_read_lock(); | 297 | rcu_read_lock(); |
205 | anon_mapping = (unsigned long) page->mapping; | 298 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 299 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
207 | goto out; | 300 | goto out; |
208 | if (!page_mapped(page)) | 301 | if (!page_mapped(page)) |
209 | goto out; | 302 | goto out; |
@@ -243,15 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
243 | 336 | ||
244 | /* | 337 | /* |
245 | * At what user virtual address is page expected in vma? | 338 | * At what user virtual address is page expected in vma? |
246 | * checking that the page matches the vma. | 339 | * Caller should check the page is actually part of the vma. |
247 | */ | 340 | */ |
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 341 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 342 | { |
250 | if (PageAnon(page)) { | 343 | if (PageAnon(page)) |
251 | if ((void *)vma->anon_vma != | 344 | ; |
252 | (void *)page->mapping - PAGE_MAPPING_ANON) | 345 | else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
253 | return -EFAULT; | ||
254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | ||
255 | if (!vma->vm_file || | 346 | if (!vma->vm_file || |
256 | vma->vm_file->f_mapping != page->mapping) | 347 | vma->vm_file->f_mapping != page->mapping) |
257 | return -EFAULT; | 348 | return -EFAULT; |
@@ -337,21 +428,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
337 | * Subfunctions of page_referenced: page_referenced_one called | 428 | * Subfunctions of page_referenced: page_referenced_one called |
338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 429 | * repeatedly from either page_referenced_anon or page_referenced_file. |
339 | */ | 430 | */ |
340 | static int page_referenced_one(struct page *page, | 431 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
341 | struct vm_area_struct *vma, | 432 | unsigned long address, unsigned int *mapcount, |
342 | unsigned int *mapcount, | 433 | unsigned long *vm_flags) |
343 | unsigned long *vm_flags) | ||
344 | { | 434 | { |
345 | struct mm_struct *mm = vma->vm_mm; | 435 | struct mm_struct *mm = vma->vm_mm; |
346 | unsigned long address; | ||
347 | pte_t *pte; | 436 | pte_t *pte; |
348 | spinlock_t *ptl; | 437 | spinlock_t *ptl; |
349 | int referenced = 0; | 438 | int referenced = 0; |
350 | 439 | ||
351 | address = vma_address(page, vma); | ||
352 | if (address == -EFAULT) | ||
353 | goto out; | ||
354 | |||
355 | pte = page_check_address(page, mm, address, &ptl, 0); | 440 | pte = page_check_address(page, mm, address, &ptl, 0); |
356 | if (!pte) | 441 | if (!pte) |
357 | goto out; | 442 | goto out; |
@@ -388,9 +473,10 @@ static int page_referenced_one(struct page *page, | |||
388 | out_unmap: | 473 | out_unmap: |
389 | (*mapcount)--; | 474 | (*mapcount)--; |
390 | pte_unmap_unlock(pte, ptl); | 475 | pte_unmap_unlock(pte, ptl); |
391 | out: | 476 | |
392 | if (referenced) | 477 | if (referenced) |
393 | *vm_flags |= vma->vm_flags; | 478 | *vm_flags |= vma->vm_flags; |
479 | out: | ||
394 | return referenced; | 480 | return referenced; |
395 | } | 481 | } |
396 | 482 | ||
@@ -400,7 +486,7 @@ static int page_referenced_anon(struct page *page, | |||
400 | { | 486 | { |
401 | unsigned int mapcount; | 487 | unsigned int mapcount; |
402 | struct anon_vma *anon_vma; | 488 | struct anon_vma *anon_vma; |
403 | struct vm_area_struct *vma; | 489 | struct anon_vma_chain *avc; |
404 | int referenced = 0; | 490 | int referenced = 0; |
405 | 491 | ||
406 | anon_vma = page_lock_anon_vma(page); | 492 | anon_vma = page_lock_anon_vma(page); |
@@ -408,7 +494,11 @@ static int page_referenced_anon(struct page *page, | |||
408 | return referenced; | 494 | return referenced; |
409 | 495 | ||
410 | mapcount = page_mapcount(page); | 496 | mapcount = page_mapcount(page); |
411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 497 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
498 | struct vm_area_struct *vma = avc->vma; | ||
499 | unsigned long address = vma_address(page, vma); | ||
500 | if (address == -EFAULT) | ||
501 | continue; | ||
412 | /* | 502 | /* |
413 | * If we are reclaiming on behalf of a cgroup, skip | 503 | * If we are reclaiming on behalf of a cgroup, skip |
414 | * counting on behalf of references from different | 504 | * counting on behalf of references from different |
@@ -416,7 +506,7 @@ static int page_referenced_anon(struct page *page, | |||
416 | */ | 506 | */ |
417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 507 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
418 | continue; | 508 | continue; |
419 | referenced += page_referenced_one(page, vma, | 509 | referenced += page_referenced_one(page, vma, address, |
420 | &mapcount, vm_flags); | 510 | &mapcount, vm_flags); |
421 | if (!mapcount) | 511 | if (!mapcount) |
422 | break; | 512 | break; |
@@ -474,6 +564,9 @@ static int page_referenced_file(struct page *page, | |||
474 | mapcount = page_mapcount(page); | 564 | mapcount = page_mapcount(page); |
475 | 565 | ||
476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 566 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
567 | unsigned long address = vma_address(page, vma); | ||
568 | if (address == -EFAULT) | ||
569 | continue; | ||
477 | /* | 570 | /* |
478 | * If we are reclaiming on behalf of a cgroup, skip | 571 | * If we are reclaiming on behalf of a cgroup, skip |
479 | * counting on behalf of references from different | 572 | * counting on behalf of references from different |
@@ -481,7 +574,7 @@ static int page_referenced_file(struct page *page, | |||
481 | */ | 574 | */ |
482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 575 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
483 | continue; | 576 | continue; |
484 | referenced += page_referenced_one(page, vma, | 577 | referenced += page_referenced_one(page, vma, address, |
485 | &mapcount, vm_flags); | 578 | &mapcount, vm_flags); |
486 | if (!mapcount) | 579 | if (!mapcount) |
487 | break; | 580 | break; |
@@ -507,46 +600,44 @@ int page_referenced(struct page *page, | |||
507 | unsigned long *vm_flags) | 600 | unsigned long *vm_flags) |
508 | { | 601 | { |
509 | int referenced = 0; | 602 | int referenced = 0; |
510 | 603 | int we_locked = 0; | |
511 | if (TestClearPageReferenced(page)) | ||
512 | referenced++; | ||
513 | 604 | ||
514 | *vm_flags = 0; | 605 | *vm_flags = 0; |
515 | if (page_mapped(page) && page->mapping) { | 606 | if (page_mapped(page) && page_rmapping(page)) { |
516 | if (PageAnon(page)) | 607 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
608 | we_locked = trylock_page(page); | ||
609 | if (!we_locked) { | ||
610 | referenced++; | ||
611 | goto out; | ||
612 | } | ||
613 | } | ||
614 | if (unlikely(PageKsm(page))) | ||
615 | referenced += page_referenced_ksm(page, mem_cont, | ||
616 | vm_flags); | ||
617 | else if (PageAnon(page)) | ||
517 | referenced += page_referenced_anon(page, mem_cont, | 618 | referenced += page_referenced_anon(page, mem_cont, |
518 | vm_flags); | 619 | vm_flags); |
519 | else if (is_locked) | 620 | else if (page->mapping) |
520 | referenced += page_referenced_file(page, mem_cont, | 621 | referenced += page_referenced_file(page, mem_cont, |
521 | vm_flags); | 622 | vm_flags); |
522 | else if (!trylock_page(page)) | 623 | if (we_locked) |
523 | referenced++; | ||
524 | else { | ||
525 | if (page->mapping) | ||
526 | referenced += page_referenced_file(page, | ||
527 | mem_cont, vm_flags); | ||
528 | unlock_page(page); | 624 | unlock_page(page); |
529 | } | ||
530 | } | 625 | } |
531 | 626 | out: | |
532 | if (page_test_and_clear_young(page)) | 627 | if (page_test_and_clear_young(page)) |
533 | referenced++; | 628 | referenced++; |
534 | 629 | ||
535 | return referenced; | 630 | return referenced; |
536 | } | 631 | } |
537 | 632 | ||
538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 633 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
634 | unsigned long address) | ||
539 | { | 635 | { |
540 | struct mm_struct *mm = vma->vm_mm; | 636 | struct mm_struct *mm = vma->vm_mm; |
541 | unsigned long address; | ||
542 | pte_t *pte; | 637 | pte_t *pte; |
543 | spinlock_t *ptl; | 638 | spinlock_t *ptl; |
544 | int ret = 0; | 639 | int ret = 0; |
545 | 640 | ||
546 | address = vma_address(page, vma); | ||
547 | if (address == -EFAULT) | ||
548 | goto out; | ||
549 | |||
550 | pte = page_check_address(page, mm, address, &ptl, 1); | 641 | pte = page_check_address(page, mm, address, &ptl, 1); |
551 | if (!pte) | 642 | if (!pte) |
552 | goto out; | 643 | goto out; |
@@ -578,8 +669,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
578 | 669 | ||
579 | spin_lock(&mapping->i_mmap_lock); | 670 | spin_lock(&mapping->i_mmap_lock); |
580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 671 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
581 | if (vma->vm_flags & VM_SHARED) | 672 | if (vma->vm_flags & VM_SHARED) { |
582 | ret += page_mkclean_one(page, vma); | 673 | unsigned long address = vma_address(page, vma); |
674 | if (address == -EFAULT) | ||
675 | continue; | ||
676 | ret += page_mkclean_one(page, vma, address); | ||
677 | } | ||
583 | } | 678 | } |
584 | spin_unlock(&mapping->i_mmap_lock); | 679 | spin_unlock(&mapping->i_mmap_lock); |
585 | return ret; | 680 | return ret; |
@@ -607,27 +702,60 @@ int page_mkclean(struct page *page) | |||
607 | EXPORT_SYMBOL_GPL(page_mkclean); | 702 | EXPORT_SYMBOL_GPL(page_mkclean); |
608 | 703 | ||
609 | /** | 704 | /** |
705 | * page_move_anon_rmap - move a page to our anon_vma | ||
706 | * @page: the page to move to our anon_vma | ||
707 | * @vma: the vma the page belongs to | ||
708 | * @address: the user virtual address mapped | ||
709 | * | ||
710 | * When a page belongs exclusively to one process after a COW event, | ||
711 | * that page can be moved into the anon_vma that belongs to just that | ||
712 | * process, so the rmap code will not search the parent or sibling | ||
713 | * processes. | ||
714 | */ | ||
715 | void page_move_anon_rmap(struct page *page, | ||
716 | struct vm_area_struct *vma, unsigned long address) | ||
717 | { | ||
718 | struct anon_vma *anon_vma = vma->anon_vma; | ||
719 | |||
720 | VM_BUG_ON(!PageLocked(page)); | ||
721 | VM_BUG_ON(!anon_vma); | ||
722 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
723 | |||
724 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
725 | page->mapping = (struct address_space *) anon_vma; | ||
726 | } | ||
727 | |||
728 | /** | ||
610 | * __page_set_anon_rmap - setup new anonymous rmap | 729 | * __page_set_anon_rmap - setup new anonymous rmap |
611 | * @page: the page to add the mapping to | 730 | * @page: the page to add the mapping to |
612 | * @vma: the vm area in which the mapping is added | 731 | * @vma: the vm area in which the mapping is added |
613 | * @address: the user virtual address mapped | 732 | * @address: the user virtual address mapped |
733 | * @exclusive: the page is exclusively owned by the current process | ||
614 | */ | 734 | */ |
615 | static void __page_set_anon_rmap(struct page *page, | 735 | static void __page_set_anon_rmap(struct page *page, |
616 | struct vm_area_struct *vma, unsigned long address) | 736 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
617 | { | 737 | { |
618 | struct anon_vma *anon_vma = vma->anon_vma; | 738 | struct anon_vma *anon_vma = vma->anon_vma; |
619 | 739 | ||
620 | BUG_ON(!anon_vma); | 740 | BUG_ON(!anon_vma); |
621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
622 | page->mapping = (struct address_space *) anon_vma; | ||
623 | |||
624 | page->index = linear_page_index(vma, address); | ||
625 | 741 | ||
626 | /* | 742 | /* |
627 | * nr_mapped state can be updated without turning off | 743 | * If the page isn't exclusively mapped into this vma, |
628 | * interrupts because it is not modified via interrupt. | 744 | * we must use the _oldest_ possible anon_vma for the |
745 | * page mapping! | ||
746 | * | ||
747 | * So take the last AVC chain entry in the vma, which is | ||
748 | * the deepest ancestor, and use the anon_vma from that. | ||
629 | */ | 749 | */ |
630 | __inc_zone_page_state(page, NR_ANON_PAGES); | 750 | if (!exclusive) { |
751 | struct anon_vma_chain *avc; | ||
752 | avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); | ||
753 | anon_vma = avc->anon_vma; | ||
754 | } | ||
755 | |||
756 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
757 | page->mapping = (struct address_space *) anon_vma; | ||
758 | page->index = linear_page_index(vma, address); | ||
631 | } | 759 | } |
632 | 760 | ||
633 | /** | 761 | /** |
@@ -652,9 +780,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
652 | * are initially only visible via the pagetables, and the pte is locked | 780 | * are initially only visible via the pagetables, and the pte is locked |
653 | * over the call to page_add_new_anon_rmap. | 781 | * over the call to page_add_new_anon_rmap. |
654 | */ | 782 | */ |
655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
658 | BUG_ON(page->index != linear_page_index(vma, address)); | 783 | BUG_ON(page->index != linear_page_index(vma, address)); |
659 | #endif | 784 | #endif |
660 | } | 785 | } |
@@ -665,15 +790,24 @@ static void __page_check_anon_rmap(struct page *page, | |||
665 | * @vma: the vm area in which the mapping is added | 790 | * @vma: the vm area in which the mapping is added |
666 | * @address: the user virtual address mapped | 791 | * @address: the user virtual address mapped |
667 | * | 792 | * |
668 | * The caller needs to hold the pte lock and the page must be locked. | 793 | * The caller needs to hold the pte lock, and the page must be locked in |
794 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
795 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
796 | * (but PageKsm is never downgraded to PageAnon). | ||
669 | */ | 797 | */ |
670 | void page_add_anon_rmap(struct page *page, | 798 | void page_add_anon_rmap(struct page *page, |
671 | struct vm_area_struct *vma, unsigned long address) | 799 | struct vm_area_struct *vma, unsigned long address) |
672 | { | 800 | { |
801 | int first = atomic_inc_and_test(&page->_mapcount); | ||
802 | if (first) | ||
803 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
804 | if (unlikely(PageKsm(page))) | ||
805 | return; | ||
806 | |||
673 | VM_BUG_ON(!PageLocked(page)); | 807 | VM_BUG_ON(!PageLocked(page)); |
674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 808 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
675 | if (atomic_inc_and_test(&page->_mapcount)) | 809 | if (first) |
676 | __page_set_anon_rmap(page, vma, address); | 810 | __page_set_anon_rmap(page, vma, address, 0); |
677 | else | 811 | else |
678 | __page_check_anon_rmap(page, vma, address); | 812 | __page_check_anon_rmap(page, vma, address); |
679 | } | 813 | } |
@@ -694,7 +828,8 @@ void page_add_new_anon_rmap(struct page *page, | |||
694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 828 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
695 | SetPageSwapBacked(page); | 829 | SetPageSwapBacked(page); |
696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 830 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
697 | __page_set_anon_rmap(page, vma, address); | 831 | __inc_zone_page_state(page, NR_ANON_PAGES); |
832 | __page_set_anon_rmap(page, vma, address, 1); | ||
698 | if (page_evictable(page, vma)) | 833 | if (page_evictable(page, vma)) |
699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 834 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
700 | else | 835 | else |
@@ -711,7 +846,7 @@ void page_add_file_rmap(struct page *page) | |||
711 | { | 846 | { |
712 | if (atomic_inc_and_test(&page->_mapcount)) { | 847 | if (atomic_inc_and_test(&page->_mapcount)) { |
713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 848 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
714 | mem_cgroup_update_mapped_file_stat(page, 1); | 849 | mem_cgroup_update_file_mapped(page, 1); |
715 | } | 850 | } |
716 | } | 851 | } |
717 | 852 | ||
@@ -743,8 +878,8 @@ void page_remove_rmap(struct page *page) | |||
743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 878 | __dec_zone_page_state(page, NR_ANON_PAGES); |
744 | } else { | 879 | } else { |
745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 880 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
881 | mem_cgroup_update_file_mapped(page, -1); | ||
746 | } | 882 | } |
747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
748 | /* | 883 | /* |
749 | * It would be tidy to reset the PageAnon mapping here, | 884 | * It would be tidy to reset the PageAnon mapping here, |
750 | * but that might overwrite a racing page_add_anon_rmap | 885 | * but that might overwrite a racing page_add_anon_rmap |
@@ -760,20 +895,15 @@ void page_remove_rmap(struct page *page) | |||
760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 895 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 896 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
762 | */ | 897 | */ |
763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 898 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
764 | enum ttu_flags flags) | 899 | unsigned long address, enum ttu_flags flags) |
765 | { | 900 | { |
766 | struct mm_struct *mm = vma->vm_mm; | 901 | struct mm_struct *mm = vma->vm_mm; |
767 | unsigned long address; | ||
768 | pte_t *pte; | 902 | pte_t *pte; |
769 | pte_t pteval; | 903 | pte_t pteval; |
770 | spinlock_t *ptl; | 904 | spinlock_t *ptl; |
771 | int ret = SWAP_AGAIN; | 905 | int ret = SWAP_AGAIN; |
772 | 906 | ||
773 | address = vma_address(page, vma); | ||
774 | if (address == -EFAULT) | ||
775 | goto out; | ||
776 | |||
777 | pte = page_check_address(page, mm, address, &ptl, 0); | 907 | pte = page_check_address(page, mm, address, &ptl, 0); |
778 | if (!pte) | 908 | if (!pte) |
779 | goto out; | 909 | goto out; |
@@ -784,10 +914,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
784 | * skipped over this mm) then we should reactivate it. | 914 | * skipped over this mm) then we should reactivate it. |
785 | */ | 915 | */ |
786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 916 | if (!(flags & TTU_IGNORE_MLOCK)) { |
787 | if (vma->vm_flags & VM_LOCKED) { | 917 | if (vma->vm_flags & VM_LOCKED) |
788 | ret = SWAP_MLOCK; | 918 | goto out_mlock; |
919 | |||
920 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
789 | goto out_unmap; | 921 | goto out_unmap; |
790 | } | ||
791 | } | 922 | } |
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 923 | if (!(flags & TTU_IGNORE_ACCESS)) { |
793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 924 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -809,9 +940,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
809 | 940 | ||
810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 941 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
811 | if (PageAnon(page)) | 942 | if (PageAnon(page)) |
812 | dec_mm_counter(mm, anon_rss); | 943 | dec_mm_counter(mm, MM_ANONPAGES); |
813 | else | 944 | else |
814 | dec_mm_counter(mm, file_rss); | 945 | dec_mm_counter(mm, MM_FILEPAGES); |
815 | set_pte_at(mm, address, pte, | 946 | set_pte_at(mm, address, pte, |
816 | swp_entry_to_pte(make_hwpoison_entry(page))); | 947 | swp_entry_to_pte(make_hwpoison_entry(page))); |
817 | } else if (PageAnon(page)) { | 948 | } else if (PageAnon(page)) { |
@@ -822,14 +953,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
822 | * Store the swap location in the pte. | 953 | * Store the swap location in the pte. |
823 | * See handle_pte_fault() ... | 954 | * See handle_pte_fault() ... |
824 | */ | 955 | */ |
825 | swap_duplicate(entry); | 956 | if (swap_duplicate(entry) < 0) { |
957 | set_pte_at(mm, address, pte, pteval); | ||
958 | ret = SWAP_FAIL; | ||
959 | goto out_unmap; | ||
960 | } | ||
826 | if (list_empty(&mm->mmlist)) { | 961 | if (list_empty(&mm->mmlist)) { |
827 | spin_lock(&mmlist_lock); | 962 | spin_lock(&mmlist_lock); |
828 | if (list_empty(&mm->mmlist)) | 963 | if (list_empty(&mm->mmlist)) |
829 | list_add(&mm->mmlist, &init_mm.mmlist); | 964 | list_add(&mm->mmlist, &init_mm.mmlist); |
830 | spin_unlock(&mmlist_lock); | 965 | spin_unlock(&mmlist_lock); |
831 | } | 966 | } |
832 | dec_mm_counter(mm, anon_rss); | 967 | dec_mm_counter(mm, MM_ANONPAGES); |
968 | inc_mm_counter(mm, MM_SWAPENTS); | ||
833 | } else if (PAGE_MIGRATION) { | 969 | } else if (PAGE_MIGRATION) { |
834 | /* | 970 | /* |
835 | * Store the pfn of the page in a special migration | 971 | * Store the pfn of the page in a special migration |
@@ -847,8 +983,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
847 | entry = make_migration_entry(page, pte_write(pteval)); | 983 | entry = make_migration_entry(page, pte_write(pteval)); |
848 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 984 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
849 | } else | 985 | } else |
850 | dec_mm_counter(mm, file_rss); | 986 | dec_mm_counter(mm, MM_FILEPAGES); |
851 | |||
852 | 987 | ||
853 | page_remove_rmap(page); | 988 | page_remove_rmap(page); |
854 | page_cache_release(page); | 989 | page_cache_release(page); |
@@ -857,6 +992,27 @@ out_unmap: | |||
857 | pte_unmap_unlock(pte, ptl); | 992 | pte_unmap_unlock(pte, ptl); |
858 | out: | 993 | out: |
859 | return ret; | 994 | return ret; |
995 | |||
996 | out_mlock: | ||
997 | pte_unmap_unlock(pte, ptl); | ||
998 | |||
999 | |||
1000 | /* | ||
1001 | * We need mmap_sem locking; otherwise the VM_LOCKED check is | ||
1002 | * racy and gives an unstable result. We also can't wait here | ||
1003 | * because we now hold anon_vma->lock or mapping->i_mmap_lock. | ||
1004 | * If the trylock fails, the page remains on the evictable lru | ||
1005 | * and vmscan can later retry moving it to the unevictable lru | ||
1006 | * if the page is actually mlocked. | ||
1007 | */ | ||
1008 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
1009 | if (vma->vm_flags & VM_LOCKED) { | ||
1010 | mlock_vma_page(page); | ||
1011 | ret = SWAP_MLOCK; | ||
1012 | } | ||
1013 | up_read(&vma->vm_mm->mmap_sem); | ||
1014 | } | ||
1015 | return ret; | ||
860 | } | 1016 | } |
861 | 1017 | ||
862 | /* | 1018 | /* |
@@ -922,11 +1078,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
922 | return ret; | 1078 | return ret; |
923 | 1079 | ||
924 | /* | 1080 | /* |
925 | * MLOCK_PAGES => feature is configured. | 1081 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
927 | * keep the sem while scanning the cluster for mlocking pages. | 1082 | * keep the sem while scanning the cluster for mlocking pages. |
928 | */ | 1083 | */ |
929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 1084 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 1085 | locked_vma = (vma->vm_flags & VM_LOCKED); |
931 | if (!locked_vma) | 1086 | if (!locked_vma) |
932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 1087 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
@@ -967,7 +1122,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
967 | 1122 | ||
968 | page_remove_rmap(page); | 1123 | page_remove_rmap(page); |
969 | page_cache_release(page); | 1124 | page_cache_release(page); |
970 | dec_mm_counter(mm, file_rss); | 1125 | dec_mm_counter(mm, MM_FILEPAGES); |
971 | (*mapcount)--; | 1126 | (*mapcount)--; |
972 | } | 1127 | } |
973 | pte_unmap_unlock(pte - 1, ptl); | 1128 | pte_unmap_unlock(pte - 1, ptl); |
@@ -976,29 +1131,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
976 | return ret; | 1131 | return ret; |
977 | } | 1132 | } |
978 | 1133 | ||
979 | /* | ||
980 | * common handling for pages mapped in VM_LOCKED vmas | ||
981 | */ | ||
982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
983 | { | ||
984 | int mlocked = 0; | ||
985 | |||
986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
987 | if (vma->vm_flags & VM_LOCKED) { | ||
988 | mlock_vma_page(page); | ||
989 | mlocked++; /* really mlocked the page */ | ||
990 | } | ||
991 | up_read(&vma->vm_mm->mmap_sem); | ||
992 | } | ||
993 | return mlocked; | ||
994 | } | ||
995 | |||
996 | /** | 1134 | /** |
997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1135 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
998 | * rmap method | 1136 | * rmap method |
999 | * @page: the page to unmap/unlock | 1137 | * @page: the page to unmap/unlock |
1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1138 | * @flags: action and flags |
1001 | * @migration: unmapping for migration - ignored if @unlock | ||
1002 | * | 1139 | * |
1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1140 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1004 | * contained in the anon_vma struct it points to. | 1141 | * contained in the anon_vma struct it points to. |
@@ -1013,43 +1150,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | |||
1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1150 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1014 | { | 1151 | { |
1015 | struct anon_vma *anon_vma; | 1152 | struct anon_vma *anon_vma; |
1016 | struct vm_area_struct *vma; | 1153 | struct anon_vma_chain *avc; |
1017 | unsigned int mlocked = 0; | ||
1018 | int ret = SWAP_AGAIN; | 1154 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1020 | |||
1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1023 | 1155 | ||
1024 | anon_vma = page_lock_anon_vma(page); | 1156 | anon_vma = page_lock_anon_vma(page); |
1025 | if (!anon_vma) | 1157 | if (!anon_vma) |
1026 | return ret; | 1158 | return ret; |
1027 | 1159 | ||
1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1160 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1161 | struct vm_area_struct *vma = avc->vma; |
1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1162 | unsigned long address = vma_address(page, vma); |
1031 | page_mapped_in_vma(page, vma))) | 1163 | if (address == -EFAULT) |
1032 | continue; /* must visit all unlocked vmas */ | 1164 | continue; |
1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1165 | ret = try_to_unmap_one(page, vma, address, flags); |
1034 | } else { | 1166 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1035 | ret = try_to_unmap_one(page, vma, flags); | 1167 | break; |
1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1037 | break; | ||
1038 | } | ||
1039 | if (ret == SWAP_MLOCK) { | ||
1040 | mlocked = try_to_mlock_page(page, vma); | ||
1041 | if (mlocked) | ||
1042 | break; /* stop if actually mlocked page */ | ||
1043 | } | ||
1044 | } | 1168 | } |
1045 | 1169 | ||
1046 | page_unlock_anon_vma(anon_vma); | 1170 | page_unlock_anon_vma(anon_vma); |
1047 | |||
1048 | if (mlocked) | ||
1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1050 | else if (ret == SWAP_MLOCK) | ||
1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1052 | |||
1053 | return ret; | 1171 | return ret; |
1054 | } | 1172 | } |
1055 | 1173 | ||
@@ -1079,48 +1197,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1079 | unsigned long max_nl_cursor = 0; | 1197 | unsigned long max_nl_cursor = 0; |
1080 | unsigned long max_nl_size = 0; | 1198 | unsigned long max_nl_size = 0; |
1081 | unsigned int mapcount; | 1199 | unsigned int mapcount; |
1082 | unsigned int mlocked = 0; | ||
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1084 | |||
1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1087 | 1200 | ||
1088 | spin_lock(&mapping->i_mmap_lock); | 1201 | spin_lock(&mapping->i_mmap_lock); |
1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1202 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1203 | unsigned long address = vma_address(page, vma); |
1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1204 | if (address == -EFAULT) |
1092 | page_mapped_in_vma(page, vma))) | 1205 | continue; |
1093 | continue; /* must visit all vmas */ | 1206 | ret = try_to_unmap_one(page, vma, address, flags); |
1094 | ret = SWAP_MLOCK; | 1207 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1095 | } else { | 1208 | goto out; |
1096 | ret = try_to_unmap_one(page, vma, flags); | ||
1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1098 | goto out; | ||
1099 | } | ||
1100 | if (ret == SWAP_MLOCK) { | ||
1101 | mlocked = try_to_mlock_page(page, vma); | ||
1102 | if (mlocked) | ||
1103 | break; /* stop if actually mlocked page */ | ||
1104 | } | ||
1105 | } | 1209 | } |
1106 | 1210 | ||
1107 | if (mlocked) | 1211 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1108 | goto out; | 1212 | goto out; |
1109 | 1213 | ||
1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1214 | /* |
1215 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1216 | * It's costly. Instead, page reclaim logic may later call | ||
1217 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1218 | */ | ||
1219 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1111 | goto out; | 1220 | goto out; |
1112 | 1221 | ||
1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1222 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1114 | shared.vm_set.list) { | 1223 | shared.vm_set.list) { |
1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1117 | continue; /* must visit all vmas */ | ||
1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1119 | goto out; /* no need to look further */ | ||
1120 | } | ||
1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1123 | continue; | ||
1124 | cursor = (unsigned long) vma->vm_private_data; | 1224 | cursor = (unsigned long) vma->vm_private_data; |
1125 | if (cursor > max_nl_cursor) | 1225 | if (cursor > max_nl_cursor) |
1126 | max_nl_cursor = cursor; | 1226 | max_nl_cursor = cursor; |
@@ -1153,16 +1253,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1153 | do { | 1253 | do { |
1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1254 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1155 | shared.vm_set.list) { | 1255 | shared.vm_set.list) { |
1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1157 | (vma->vm_flags & VM_LOCKED)) | ||
1158 | continue; | ||
1159 | cursor = (unsigned long) vma->vm_private_data; | 1256 | cursor = (unsigned long) vma->vm_private_data; |
1160 | while ( cursor < max_nl_cursor && | 1257 | while ( cursor < max_nl_cursor && |
1161 | cursor < vma->vm_end - vma->vm_start) { | 1258 | cursor < vma->vm_end - vma->vm_start) { |
1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1259 | if (try_to_unmap_cluster(cursor, &mapcount, |
1163 | vma, page); | 1260 | vma, page) == SWAP_MLOCK) |
1164 | if (ret == SWAP_MLOCK) | 1261 | ret = SWAP_MLOCK; |
1165 | mlocked = 2; /* to return below */ | ||
1166 | cursor += CLUSTER_SIZE; | 1262 | cursor += CLUSTER_SIZE; |
1167 | vma->vm_private_data = (void *) cursor; | 1263 | vma->vm_private_data = (void *) cursor; |
1168 | if ((int)mapcount <= 0) | 1264 | if ((int)mapcount <= 0) |
@@ -1183,10 +1279,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1183 | vma->vm_private_data = NULL; | 1279 | vma->vm_private_data = NULL; |
1184 | out: | 1280 | out: |
1185 | spin_unlock(&mapping->i_mmap_lock); | 1281 | spin_unlock(&mapping->i_mmap_lock); |
1186 | if (mlocked) | ||
1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1188 | else if (ret == SWAP_MLOCK) | ||
1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1190 | return ret; | 1282 | return ret; |
1191 | } | 1283 | } |
1192 | 1284 | ||
@@ -1210,7 +1302,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1210 | 1302 | ||
1211 | BUG_ON(!PageLocked(page)); | 1303 | BUG_ON(!PageLocked(page)); |
1212 | 1304 | ||
1213 | if (PageAnon(page)) | 1305 | if (unlikely(PageKsm(page))) |
1306 | ret = try_to_unmap_ksm(page, flags); | ||
1307 | else if (PageAnon(page)) | ||
1214 | ret = try_to_unmap_anon(page, flags); | 1308 | ret = try_to_unmap_anon(page, flags); |
1215 | else | 1309 | else |
1216 | ret = try_to_unmap_file(page, flags); | 1310 | ret = try_to_unmap_file(page, flags); |
@@ -1229,17 +1323,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1229 | * | 1323 | * |
1230 | * Return values are: | 1324 | * Return values are: |
1231 | * | 1325 | * |
1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1326 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1327 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
1328 | * SWAP_FAIL - page cannot be located at present | ||
1234 | * SWAP_MLOCK - page is now mlocked. | 1329 | * SWAP_MLOCK - page is now mlocked. |
1235 | */ | 1330 | */ |
1236 | int try_to_munlock(struct page *page) | 1331 | int try_to_munlock(struct page *page) |
1237 | { | 1332 | { |
1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1333 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1239 | 1334 | ||
1240 | if (PageAnon(page)) | 1335 | if (unlikely(PageKsm(page))) |
1336 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
1337 | else if (PageAnon(page)) | ||
1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1338 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1242 | else | 1339 | else |
1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1340 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1244 | } | 1341 | } |
1245 | 1342 | ||
1343 | #ifdef CONFIG_MIGRATION | ||
1344 | /* | ||
1345 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1346 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1347 | */ | ||
1348 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1349 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1350 | { | ||
1351 | struct anon_vma *anon_vma; | ||
1352 | struct anon_vma_chain *avc; | ||
1353 | int ret = SWAP_AGAIN; | ||
1354 | |||
1355 | /* | ||
1356 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
1357 | * because that depends on page_mapped(); but not all its usages | ||
1358 | * are holding mmap_sem, which also gave the necessary guarantee | ||
1359 | * (that this anon_vma's slab has not already been destroyed). | ||
1360 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
1361 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
1362 | */ | ||
1363 | anon_vma = page_anon_vma(page); | ||
1364 | if (!anon_vma) | ||
1365 | return ret; | ||
1366 | spin_lock(&anon_vma->lock); | ||
1367 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1368 | struct vm_area_struct *vma = avc->vma; | ||
1369 | unsigned long address = vma_address(page, vma); | ||
1370 | if (address == -EFAULT) | ||
1371 | continue; | ||
1372 | ret = rmap_one(page, vma, address, arg); | ||
1373 | if (ret != SWAP_AGAIN) | ||
1374 | break; | ||
1375 | } | ||
1376 | spin_unlock(&anon_vma->lock); | ||
1377 | return ret; | ||
1378 | } | ||
1379 | |||
1380 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
1381 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1382 | { | ||
1383 | struct address_space *mapping = page->mapping; | ||
1384 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1385 | struct vm_area_struct *vma; | ||
1386 | struct prio_tree_iter iter; | ||
1387 | int ret = SWAP_AGAIN; | ||
1388 | |||
1389 | if (!mapping) | ||
1390 | return ret; | ||
1391 | spin_lock(&mapping->i_mmap_lock); | ||
1392 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1393 | unsigned long address = vma_address(page, vma); | ||
1394 | if (address == -EFAULT) | ||
1395 | continue; | ||
1396 | ret = rmap_one(page, vma, address, arg); | ||
1397 | if (ret != SWAP_AGAIN) | ||
1398 | break; | ||
1399 | } | ||
1400 | /* | ||
1401 | * No nonlinear handling: being always shared, nonlinear vmas | ||
1402 | * never contain migration ptes. Decide what to do about this | ||
1403 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
1404 | */ | ||
1405 | spin_unlock(&mapping->i_mmap_lock); | ||
1406 | return ret; | ||
1407 | } | ||
1408 | |||
1409 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
1410 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1411 | { | ||
1412 | VM_BUG_ON(!PageLocked(page)); | ||
1413 | |||
1414 | if (unlikely(PageKsm(page))) | ||
1415 | return rmap_walk_ksm(page, rmap_one, arg); | ||
1416 | else if (PageAnon(page)) | ||
1417 | return rmap_walk_anon(page, rmap_one, arg); | ||
1418 | else | ||
1419 | return rmap_walk_file(page, rmap_one, arg); | ||
1420 | } | ||
1421 | #endif /* CONFIG_MIGRATION */ | ||
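The new rmap_walk() is a thin dispatcher: it picks the KSM, anonymous, or file walker from the page type, then hands every mapping to the caller-supplied rmap_one() callback until that callback returns something other than SWAP_AGAIN. A minimal userspace model of that walk-with-callback contract follows; the struct and function names are illustrative, not kernel API.

    #include <stdio.h>

    enum { SWAP_AGAIN, SWAP_FAIL };   /* stand-ins for the rmap return codes */

    struct vma { const char *name; unsigned long address; };

    /* Visit every "mapping" of a page, stopping early when the callback
     * asks to: this mirrors the rmap_one()/SWAP_AGAIN contract above. */
    static int rmap_walk_model(struct vma *vmas, int n,
                               int (*rmap_one)(struct vma *, void *), void *arg)
    {
        int ret = SWAP_AGAIN;

        for (int i = 0; i < n; i++) {
            ret = rmap_one(&vmas[i], arg);
            if (ret != SWAP_AGAIN)    /* callback aborted the walk */
                break;
        }
        return ret;   /* last callback result, as rmap_walk() returns it */
    }

    static int print_one(struct vma *v, void *arg)
    {
        printf("visit %s @ %#lx\n", v->name, v->address);
        return SWAP_AGAIN;            /* keep walking */
    }

    int main(void)
    {
        struct vma vmas[] = { { "a", 0x1000 }, { "b", 0x2000 } };
        return rmap_walk_model(vmas, 2, print_one, NULL) == SWAP_AGAIN ? 0 : 1;
    }

Returning the last callback result lets a caller such as remove_migration_ptes() tell a completed walk apart from one the callback cut short.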
diff --git a/mm/shmem.c b/mm/shmem.c index 356dd99566ec..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/swap.h> | 31 | #include <linux/swap.h> |
32 | #include <linux/ima.h> | ||
33 | 32 | ||
34 | static struct vfsmount *shm_mnt; | 33 | static struct vfsmount *shm_mnt; |
35 | 34 | ||
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; | |||
42 | 41 | ||
43 | #include <linux/xattr.h> | 42 | #include <linux/xattr.h> |
44 | #include <linux/exportfs.h> | 43 | #include <linux/exportfs.h> |
44 | #include <linux/posix_acl.h> | ||
45 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
46 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | error = inode_setattr(inode, attr); | 810 | error = inode_setattr(inode, attr); |
811 | #ifdef CONFIG_TMPFS_POSIX_ACL | 811 | #ifdef CONFIG_TMPFS_POSIX_ACL |
812 | if (!error && (attr->ia_valid & ATTR_MODE)) | 812 | if (!error && (attr->ia_valid & ATTR_MODE)) |
813 | error = generic_acl_chmod(inode, &shmem_acl_ops); | 813 | error = generic_acl_chmod(inode); |
814 | #endif | 814 | #endif |
815 | if (page) | 815 | if (page) |
816 | page_cache_release(page); | 816 | page_cache_release(page); |
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1017 | goto out; | 1017 | goto out; |
1018 | } | 1018 | } |
1019 | mutex_unlock(&shmem_swaplist_mutex); | 1019 | mutex_unlock(&shmem_swaplist_mutex); |
1020 | out: return found; /* 0 or 1 or -ENOMEM */ | 1020 | /* |
1021 | * Can some race bring us here? We've been holding page lock, | ||
1022 | * so I think not; but would rather try again later than BUG() | ||
1023 | */ | ||
1024 | unlock_page(page); | ||
1025 | page_cache_release(page); | ||
1026 | out: | ||
1027 | return (found < 0) ? found : 0; | ||
1021 | } | 1028 | } |
1022 | 1029 | ||
1023 | /* | 1030 | /* |
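The rewritten exit path also changes the calling convention: the internal 0/1/-ENOMEM "found" tri-state is collapsed to plain 0-or-negative-errno before returning, after the page lock and reference are dropped. A small sketch of that convention, with hypothetical names standing in for the shmem internals:

    #include <errno.h>
    #include <stdio.h>

    /* Internal helper: 1 = found, 0 = not found, negative errno = failure. */
    static int lookup_internal(int key)
    {
        if (key < 0)
            return -ENOMEM;
        return key == 42;
    }

    /* Public wrapper hides found/not-found, the same shape as
     * "return (found < 0) ? found : 0;" in the patched shmem_unuse(). */
    static int lookup(int key)
    {
        int found = lookup_internal(key);

        return (found < 0) ? found : 0;
    }

    int main(void)
    {
        printf("%d %d %d\n", lookup(42), lookup(7), lookup(-1));  /* 0 0 -12 */
        return 0;
    }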
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1080 | else | 1087 | else |
1081 | inode = NULL; | 1088 | inode = NULL; |
1082 | spin_unlock(&info->lock); | 1089 | spin_unlock(&info->lock); |
1083 | swap_duplicate(swap); | 1090 | swap_shmem_alloc(swap); |
1084 | BUG_ON(page_mapped(page)); | 1091 | BUG_ON(page_mapped(page)); |
1085 | page_cache_release(page); /* pagecache ref */ | 1092 | page_cache_release(page); /* pagecache ref */ |
1086 | swap_writepage(page, wbc); | 1093 | swap_writepage(page, wbc); |
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1817 | return error; | 1824 | return error; |
1818 | } | 1825 | } |
1819 | } | 1826 | } |
1820 | error = shmem_acl_init(inode, dir); | 1827 | #ifdef CONFIG_TMPFS_POSIX_ACL |
1828 | error = generic_acl_init(inode, dir); | ||
1821 | if (error) { | 1829 | if (error) { |
1822 | iput(inode); | 1830 | iput(inode); |
1823 | return error; | 1831 | return error; |
1824 | } | 1832 | } |
1833 | #else | ||
1834 | error = 0; | ||
1835 | #endif | ||
1825 | if (dir->i_mode & S_ISGID) { | 1836 | if (dir->i_mode & S_ISGID) { |
1826 | inode->i_gid = dir->i_gid; | 1837 | inode->i_gid = dir->i_gid; |
1827 | if (S_ISDIR(mode)) | 1838 | if (S_ISDIR(mode)) |
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
2036 | * filesystem level, though. | 2047 | * filesystem level, though. |
2037 | */ | 2048 | */ |
2038 | 2049 | ||
2039 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2050 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
2040 | size_t list_len, const char *name, | 2051 | size_t list_len, const char *name, |
2041 | size_t name_len) | 2052 | size_t name_len, int handler_flags) |
2042 | { | 2053 | { |
2043 | return security_inode_listsecurity(inode, list, list_len); | 2054 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
2044 | } | 2055 | } |
2045 | 2056 | ||
2046 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2057 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
2047 | void *buffer, size_t size) | 2058 | void *buffer, size_t size, int handler_flags) |
2048 | { | 2059 | { |
2049 | if (strcmp(name, "") == 0) | 2060 | if (strcmp(name, "") == 0) |
2050 | return -EINVAL; | 2061 | return -EINVAL; |
2051 | return xattr_getsecurity(inode, name, buffer, size); | 2062 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
2052 | } | 2063 | } |
2053 | 2064 | ||
2054 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2065 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
2055 | const void *value, size_t size, int flags) | 2066 | const void *value, size_t size, int flags, int handler_flags) |
2056 | { | 2067 | { |
2057 | if (strcmp(name, "") == 0) | 2068 | if (strcmp(name, "") == 0) |
2058 | return -EINVAL; | 2069 | return -EINVAL; |
2059 | return security_inode_setsecurity(inode, name, value, size, flags); | 2070 | return security_inode_setsecurity(dentry->d_inode, name, value, |
2071 | size, flags); | ||
2060 | } | 2072 | } |
2061 | 2073 | ||
2062 | static struct xattr_handler shmem_xattr_security_handler = { | 2074 | static struct xattr_handler shmem_xattr_security_handler = { |
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
2067 | }; | 2079 | }; |
2068 | 2080 | ||
2069 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2081 | static struct xattr_handler *shmem_xattr_handlers[] = { |
2070 | &shmem_xattr_acl_access_handler, | 2082 | &generic_acl_access_handler, |
2071 | &shmem_xattr_acl_default_handler, | 2083 | &generic_acl_default_handler, |
2072 | &shmem_xattr_security_handler, | 2084 | &shmem_xattr_security_handler, |
2073 | NULL | 2085 | NULL |
2074 | }; | 2086 | }; |
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2447 | .getxattr = generic_getxattr, | 2459 | .getxattr = generic_getxattr, |
2448 | .listxattr = generic_listxattr, | 2460 | .listxattr = generic_listxattr, |
2449 | .removexattr = generic_removexattr, | 2461 | .removexattr = generic_removexattr, |
2450 | .check_acl = shmem_check_acl, | 2462 | .check_acl = generic_check_acl, |
2451 | #endif | 2463 | #endif |
2452 | 2464 | ||
2453 | }; | 2465 | }; |
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2470 | .getxattr = generic_getxattr, | 2482 | .getxattr = generic_getxattr, |
2471 | .listxattr = generic_listxattr, | 2483 | .listxattr = generic_listxattr, |
2472 | .removexattr = generic_removexattr, | 2484 | .removexattr = generic_removexattr, |
2473 | .check_acl = shmem_check_acl, | 2485 | .check_acl = generic_check_acl, |
2474 | #endif | 2486 | #endif |
2475 | }; | 2487 | }; |
2476 | 2488 | ||
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2481 | .getxattr = generic_getxattr, | 2493 | .getxattr = generic_getxattr, |
2482 | .listxattr = generic_listxattr, | 2494 | .listxattr = generic_listxattr, |
2483 | .removexattr = generic_removexattr, | 2495 | .removexattr = generic_removexattr, |
2484 | .check_acl = shmem_check_acl, | 2496 | .check_acl = generic_check_acl, |
2485 | #endif | 2497 | #endif |
2486 | }; | 2498 | }; |
2487 | 2499 | ||
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2619 | int error; | 2631 | int error; |
2620 | struct file *file; | 2632 | struct file *file; |
2621 | struct inode *inode; | 2633 | struct inode *inode; |
2622 | struct dentry *dentry, *root; | 2634 | struct path path; |
2635 | struct dentry *root; | ||
2623 | struct qstr this; | 2636 | struct qstr this; |
2624 | 2637 | ||
2625 | if (IS_ERR(shm_mnt)) | 2638 | if (IS_ERR(shm_mnt)) |
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2636 | this.len = strlen(name); | 2649 | this.len = strlen(name); |
2637 | this.hash = 0; /* will go */ | 2650 | this.hash = 0; /* will go */ |
2638 | root = shm_mnt->mnt_root; | 2651 | root = shm_mnt->mnt_root; |
2639 | dentry = d_alloc(root, &this); | 2652 | path.dentry = d_alloc(root, &this); |
2640 | if (!dentry) | 2653 | if (!path.dentry) |
2641 | goto put_memory; | 2654 | goto put_memory; |
2642 | | 2655 | path.mnt = mntget(shm_mnt); | |
2643 | error = -ENFILE; | ||
2644 | file = get_empty_filp(); | ||
2645 | if (!file) | ||
2646 | goto put_dentry; | ||
2647 | 2656 | ||
2648 | error = -ENOSPC; | 2657 | error = -ENOSPC; |
2649 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
2650 | if (!inode) | 2659 | if (!inode) |
2651 | goto close_file; | 2660 | goto put_dentry; |
2652 | 2661 | ||
2653 | d_instantiate(dentry, inode); | 2662 | d_instantiate(path.dentry, inode); |
2654 | inode->i_size = size; | 2663 | inode->i_size = size; |
2655 | inode->i_nlink = 0; /* It is unlinked */ | 2664 | inode->i_nlink = 0; /* It is unlinked */ |
2656 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
2657 | &shmem_file_operations); | ||
2658 | |||
2659 | #ifndef CONFIG_MMU | 2665 | #ifndef CONFIG_MMU |
2660 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2666 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2661 | if (error) | 2667 | if (error) |
2662 | goto close_file; | 2668 | goto put_dentry; |
2663 | #endif | 2669 | #endif |
2664 | ima_counts_get(file); | 2670 | |
2671 | error = -ENFILE; | ||
2672 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2673 | &shmem_file_operations); | ||
2674 | if (!file) | ||
2675 | goto put_dentry; | ||
2676 | |||
2665 | return file; | 2677 | return file; |
2666 | 2678 | ||
2667 | close_file: | ||
2668 | put_filp(file); | ||
2669 | put_dentry: | 2679 | put_dentry: |
2670 | dput(dentry); | 2680 | path_put(&path); |
2671 | put_memory: | 2681 | put_memory: |
2672 | shmem_unacct_size(flags, size); | 2682 | shmem_unacct_size(flags, size); |
2673 | return ERR_PTR(error); | 2683 | return ERR_PTR(error); |
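shmem_file_setup() now fills a struct path (dentry plus vfsmount) first and creates the struct file last, so every failure after the dentry allocation unwinds through a single path_put(). The general acquire-in-order, release-in-reverse pattern, sketched with stand-in resources rather than VFS objects:

    #include <stdlib.h>

    /* Acquire resources in order; each failure site jumps to the first
     * label of the reverse-order release chain, the same shape as the
     * put_dentry/put_memory labels above. */
    static int setup(void)
    {
        char *dentry, *inode, *file;

        dentry = malloc(1);
        if (!dentry)
            goto out;
        inode = malloc(1);
        if (!inode)
            goto put_dentry;
        file = malloc(1);   /* created last: its failure unwinds everything */
        if (!file)
            goto put_inode;

        free(file);
        free(inode);
        free(dentry);
        return 0;

    put_inode:
        free(inode);
    put_dentry:
        free(dentry);
    out:
        return -1;
    }

    int main(void) { return setup() ? 1 : 0; }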
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(inode->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(inode->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = inode->i_acl; | ||
49 | inode->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = inode->i_default_acl; | ||
54 | inode->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
159 | */ | ||
160 | int | ||
161 | shmem_check_acl(struct inode *inode, int mask) | ||
162 | { | ||
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
164 | |||
165 | if (acl) { | ||
166 | int error = posix_acl_permission(inode, acl, mask); | ||
167 | posix_acl_release(acl); | ||
168 | return error; | ||
169 | } | ||
170 | return -EAGAIN; | ||
171 | } | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
490 | 490 | ||
491 | #endif | 491 | #endif |
492 | 492 | ||
493 | #ifdef CONFIG_KMEMTRACE | 493 | #ifdef CONFIG_TRACING |
494 | size_t slab_buffer_size(struct kmem_cache *cachep) | 494 | size_t slab_buffer_size(struct kmem_cache *cachep) |
495 | { | 495 | { |
496 | return cachep->buffer_size; | 496 | return cachep->buffer_size; |
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { | |||
604 | 604 | ||
605 | #define BAD_ALIEN_MAGIC 0x01020304ul | 605 | #define BAD_ALIEN_MAGIC 0x01020304ul |
606 | 606 | ||
607 | /* | ||
608 | * chicken and egg problem: delay the per-cpu array allocation | ||
609 | * until the general caches are up. | ||
610 | */ | ||
611 | static enum { | ||
612 | NONE, | ||
613 | PARTIAL_AC, | ||
614 | PARTIAL_L3, | ||
615 | EARLY, | ||
616 | FULL | ||
617 | } g_cpucache_up; | ||
618 | |||
619 | /* | ||
620 | * used by boot code to determine if it can use slab based allocator | ||
621 | */ | ||
622 | int slab_is_available(void) | ||
623 | { | ||
624 | return g_cpucache_up >= EARLY; | ||
625 | } | ||
626 | |||
607 | #ifdef CONFIG_LOCKDEP | 627 | #ifdef CONFIG_LOCKDEP |
608 | 628 | ||
609 | /* | 629 | /* |
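Hoisting g_cpucache_up and slab_is_available() above the lockdep code lets init_node_lock_keys() check the bootstrap phase. The enum is an ordered progress gate: because the phases are declared in boot order, a single >= comparison means "at least this far along". A compact userspace model of the idiom, with illustrative names:

    #include <stdio.h>

    /* Declared in boot order so ">= EARLY" reads "at least EARLY",
     * mirroring g_cpucache_up >= EARLY in slab_is_available(). */
    enum boot_phase { NONE, PARTIAL, EARLY, FULL };

    static enum boot_phase phase = NONE;

    static int allocator_is_available(void)
    {
        return phase >= EARLY;
    }

    static void early_consumer(void)
    {
        if (!allocator_is_available()) {
            puts("too early: use a boot-time fallback allocator");
            return;
        }
        puts("allocator ready");
    }

    int main(void)
    {
        early_consumer();   /* prints the fallback message */
        phase = EARLY;
        early_consumer();   /* prints "allocator ready" */
        return 0;
    }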
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { | |||
620 | static struct lock_class_key on_slab_l3_key; | 640 | static struct lock_class_key on_slab_l3_key; |
621 | static struct lock_class_key on_slab_alc_key; | 641 | static struct lock_class_key on_slab_alc_key; |
622 | 642 | ||
623 | static inline void init_lock_keys(void) | 643 | static void init_node_lock_keys(int q) |
624 | |||
625 | { | 644 | { |
626 | int q; | ||
627 | struct cache_sizes *s = malloc_sizes; | 645 | struct cache_sizes *s = malloc_sizes; |
628 | 646 | ||
629 | while (s->cs_size != ULONG_MAX) { | 647 | if (g_cpucache_up != FULL) |
630 | for_each_node(q) { | 648 | return; |
631 | struct array_cache **alc; | 649 | |
632 | int r; | 650 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
633 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | 651 | struct array_cache **alc; |
634 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 652 | struct kmem_list3 *l3; |
635 | continue; | 653 | int r; |
636 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 654 | |
637 | alc = l3->alien; | 655 | l3 = s->cs_cachep->nodelists[q]; |
638 | /* | 656 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
639 | * FIXME: This check for BAD_ALIEN_MAGIC | 657 | continue; |
640 | * should go away when common slab code is taught to | 658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); |
641 | * work even without alien caches. | 659 | alc = l3->alien; |
642 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | 660 | /* |
643 | * for alloc_alien_cache, | 661 | * FIXME: This check for BAD_ALIEN_MAGIC |
644 | */ | 662 | * should go away when common slab code is taught to |
645 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | 663 | * work even without alien caches. |
646 | continue; | 664 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC |
647 | for_each_node(r) { | 665 | * for alloc_alien_cache, |
648 | if (alc[r]) | 666 | */ |
649 | lockdep_set_class(&alc[r]->lock, | 667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) |
650 | &on_slab_alc_key); | 668 | continue; |
651 | } | 669 | for_each_node(r) { |
670 | if (alc[r]) | ||
671 | lockdep_set_class(&alc[r]->lock, | ||
672 | &on_slab_alc_key); | ||
652 | } | 673 | } |
653 | s++; | ||
654 | } | 674 | } |
655 | } | 675 | } |
676 | |||
677 | static inline void init_lock_keys(void) | ||
678 | { | ||
679 | int node; | ||
680 | |||
681 | for_each_node(node) | ||
682 | init_node_lock_keys(node); | ||
683 | } | ||
656 | #else | 684 | #else |
685 | static void init_node_lock_keys(int q) | ||
686 | { | ||
687 | } | ||
688 | |||
657 | static inline void init_lock_keys(void) | 689 | static inline void init_lock_keys(void) |
658 | { | 690 | { |
659 | } | 691 | } |
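The loop nest is inverted so the per-node body stands alone as init_node_lock_keys(q); boot keeps its old behavior by iterating it from init_lock_keys(), while cpuup_prepare() can now call it for just the node whose CPU is coming online. The shape of that refactor, reduced to hypothetical names:

    #include <stdio.h>

    #define NR_NODES 4

    /* Per-node body, callable on its own from a hotplug-style path. */
    static void init_node(int node)
    {
        printf("set up lock classes for node %d\n", node);
    }

    /* Boot-time path: the old all-nodes loop, now one line. */
    static void init_all_nodes(void)
    {
        for (int node = 0; node < NR_NODES; node++)
            init_node(node);
    }

    int main(void)
    {
        init_all_nodes();   /* boot */
        init_node(2);       /* later: a single node brought online */
        return 0;
    }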
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void) | |||
665 | static DEFINE_MUTEX(cache_chain_mutex); | 697 | static DEFINE_MUTEX(cache_chain_mutex); |
666 | static struct list_head cache_chain; | 698 | static struct list_head cache_chain; |
667 | 699 | ||
668 | /* | 700 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
669 | * chicken and egg problem: delay the per-cpu array allocation | ||
670 | * until the general caches are up. | ||
671 | */ | ||
672 | static enum { | ||
673 | NONE, | ||
674 | PARTIAL_AC, | ||
675 | PARTIAL_L3, | ||
676 | EARLY, | ||
677 | FULL | ||
678 | } g_cpucache_up; | ||
679 | |||
680 | /* | ||
681 | * used by boot code to determine if it can use slab based allocator | ||
682 | */ | ||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return g_cpucache_up >= EARLY; | ||
686 | } | ||
687 | |||
688 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
689 | 701 | ||
690 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 702 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
691 | { | 703 | { |
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup); | |||
826 | * objects freed on different nodes from which they were allocated) and the | 838 | * objects freed on different nodes from which they were allocated) and the |
827 | * flushing of remote pcps by calling drain_node_pages. | 839 | * flushing of remote pcps by calling drain_node_pages. |
828 | */ | 840 | */ |
829 | static DEFINE_PER_CPU(unsigned long, reap_node); | 841 | static DEFINE_PER_CPU(unsigned long, slab_reap_node); |
830 | 842 | ||
831 | static void init_reap_node(int cpu) | 843 | static void init_reap_node(int cpu) |
832 | { | 844 | { |
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu) | |||
836 | if (node == MAX_NUMNODES) | 848 | if (node == MAX_NUMNODES) |
837 | node = first_node(node_online_map); | 849 | node = first_node(node_online_map); |
838 | 850 | ||
839 | per_cpu(reap_node, cpu) = node; | 851 | per_cpu(slab_reap_node, cpu) = node; |
840 | } | 852 | } |
841 | 853 | ||
842 | static void next_reap_node(void) | 854 | static void next_reap_node(void) |
843 | { | 855 | { |
844 | int node = __get_cpu_var(reap_node); | 856 | int node = __get_cpu_var(slab_reap_node); |
845 | 857 | ||
846 | node = next_node(node, node_online_map); | 858 | node = next_node(node, node_online_map); |
847 | if (unlikely(node >= MAX_NUMNODES)) | 859 | if (unlikely(node >= MAX_NUMNODES)) |
848 | node = first_node(node_online_map); | 860 | node = first_node(node_online_map); |
849 | __get_cpu_var(reap_node) = node; | 861 | __get_cpu_var(slab_reap_node) = node; |
850 | } | 862 | } |
851 | 863 | ||
852 | #else | 864 | #else |
@@ -863,7 +875,7 @@ static void next_reap_node(void) | |||
863 | */ | 875 | */ |
864 | static void __cpuinit start_cpu_timer(int cpu) | 876 | static void __cpuinit start_cpu_timer(int cpu) |
865 | { | 877 | { |
866 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | 878 | struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); |
867 | 879 | ||
868 | /* | 880 | /* |
869 | * When this gets called from do_initcalls via cpucache_init(), | 881 | * When this gets called from do_initcalls via cpucache_init(), |
@@ -923,7 +935,6 @@ static int transfer_objects(struct array_cache *to, | |||
923 | 935 | ||
924 | from->avail -= nr; | 936 | from->avail -= nr; |
925 | to->avail += nr; | 937 | to->avail += nr; |
926 | to->touched = 1; | ||
927 | return nr; | 938 | return nr; |
928 | } | 939 | } |
929 | 940 | ||
@@ -971,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | |||
971 | 982 | ||
972 | if (limit > 1) | 983 | if (limit > 1) |
973 | limit = 12; | 984 | limit = 12; |
974 | ac_ptr = kmalloc_node(memsize, gfp, node); | 985 | ac_ptr = kzalloc_node(memsize, gfp, node); |
975 | if (ac_ptr) { | 986 | if (ac_ptr) { |
976 | for_each_node(i) { | 987 | for_each_node(i) { |
977 | if (i == node || !node_online(i)) { | 988 | if (i == node || !node_online(i)) |
978 | ac_ptr[i] = NULL; | ||
979 | continue; | 989 | continue; |
980 | } | ||
981 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 990 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
982 | if (!ac_ptr[i]) { | 991 | if (!ac_ptr[i]) { |
983 | for (i--; i >= 0; i--) | 992 | for (i--; i >= 0; i--) |
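Switching alloc_alien_cache() to kzalloc_node() makes every slot start out NULL, so the loop no longer stores NULL for skipped nodes and the partial-failure unwind can free slots unconditionally. The same simplification in userspace terms, with calloc standing in for kzalloc:

    #include <stdlib.h>

    #define N 8

    static void **alloc_table(int skip)
    {
        /* zeroed allocation: skipped slots are already NULL */
        void **t = calloc(N, sizeof(*t));
        if (!t)
            return NULL;

        for (int i = 0; i < N; i++) {
            if (i == skip)   /* model of "i == node || !node_online(i)" */
                continue;
            t[i] = malloc(16);
            if (!t[i]) {
                /* unwind: free(NULL) on a skipped slot is harmless */
                while (--i >= 0)
                    free(t[i]);
                free(t);
                return NULL;
            }
        }
        return t;
    }

    int main(void)
    {
        void **t = alloc_table(3);
        if (!t)
            return 1;
        for (int i = 0; i < N; i++)
            free(t[i]);
        free(t);
        return 0;
    }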
@@ -1027,7 +1036,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
1027 | */ | 1036 | */ |
1028 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1037 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
1029 | { | 1038 | { |
1030 | int node = __get_cpu_var(reap_node); | 1039 | int node = __get_cpu_var(slab_reap_node); |
1031 | 1040 | ||
1032 | if (l3->alien) { | 1041 | if (l3->alien) { |
1033 | struct array_cache *ac = l3->alien[node]; | 1042 | struct array_cache *ac = l3->alien[node]; |
@@ -1120,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1120 | if (nc) | 1129 | if (nc) |
1121 | free_block(cachep, nc->entry, nc->avail, node); | 1130 | free_block(cachep, nc->entry, nc->avail, node); |
1122 | 1131 | ||
1123 | if (!cpus_empty(*mask)) { | 1132 | if (!cpumask_empty(mask)) { |
1124 | spin_unlock_irq(&l3->list_lock); | 1133 | spin_unlock_irq(&l3->list_lock); |
1125 | goto free_array_cache; | 1134 | goto free_array_cache; |
1126 | } | 1135 | } |
@@ -1254,6 +1263,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1254 | kfree(shared); | 1263 | kfree(shared); |
1255 | free_alien_cache(alien); | 1264 | free_alien_cache(alien); |
1256 | } | 1265 | } |
1266 | init_node_lock_keys(node); | ||
1267 | |||
1257 | return 0; | 1268 | return 0; |
1258 | bad: | 1269 | bad: |
1259 | cpuup_canceled(cpu); | 1270 | cpuup_canceled(cpu); |
@@ -1286,9 +1297,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1286 | * anything expensive but will only modify reap_work | 1297 | * anything expensive but will only modify reap_work |
1287 | * and reschedule the timer. | 1298 | * and reschedule the timer. |
1288 | */ | 1299 | */ |
1289 | cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); | 1300 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); |
1290 | /* Now the cache_reaper is guaranteed to be not running. */ | 1301 | /* Now the cache_reaper is guaranteed to be not running. */ |
1291 | per_cpu(reap_work, cpu).work.func = NULL; | 1302 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
1292 | break; | 1303 | break; |
1293 | case CPU_DOWN_FAILED: | 1304 | case CPU_DOWN_FAILED: |
1294 | case CPU_DOWN_FAILED_FROZEN: | 1305 | case CPU_DOWN_FAILED_FROZEN: |
@@ -2261,9 +2272,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2261 | /* | 2272 | /* |
2262 | * Determine if the slab management is 'on' or 'off' slab. | 2273 | * Determine if the slab management is 'on' or 'off' slab. |
2263 | * (bootstrapping cannot cope with offslab caches so don't do | 2274 | * (bootstrapping cannot cope with offslab caches so don't do |
2264 | * it too early on.) | 2275 | * it too early on. Always use on-slab management when |
2276 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | ||
2265 | */ | 2277 | */ |
2266 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | 2278 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && |
2279 | !(flags & SLAB_NOLEAKTRACE)) | ||
2267 | /* | 2280 | /* |
2268 | * Size is large, assume best to place the slab management obj | 2281 | * Size is large, assume best to place the slab management obj |
2269 | * off-slab (should allow better packing of objs). | 2282 | * off-slab (should allow better packing of objs). |
@@ -2582,8 +2595,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2582 | * kmemleak does not treat the ->s_mem pointer as a reference | 2595 | * kmemleak does not treat the ->s_mem pointer as a reference |
2583 | * to the object. Otherwise we will not report the leak. | 2596 | * to the object. Otherwise we will not report the leak. |
2584 | */ | 2597 | */ |
2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | 2598 | kmemleak_scan_area(&slabp->list, sizeof(struct list_head), |
2586 | sizeof(struct list_head), local_flags); | 2599 | local_flags); |
2587 | if (!slabp) | 2600 | if (!slabp) |
2588 | return NULL; | 2601 | return NULL; |
2589 | } else { | 2602 | } else { |
@@ -2947,8 +2960,10 @@ retry: | |||
2947 | spin_lock(&l3->list_lock); | 2960 | spin_lock(&l3->list_lock); |
2948 | 2961 | ||
2949 | /* See if we can refill from the shared array */ | 2962 | /* See if we can refill from the shared array */ |
2950 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) | 2963 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { |
2964 | l3->shared->touched = 1; | ||
2951 | goto alloc_done; | 2965 | goto alloc_done; |
2966 | } | ||
2952 | 2967 | ||
2953 | while (batchcount > 0) { | 2968 | while (batchcount > 0) { |
2954 | struct list_head *entry; | 2969 | struct list_head *entry; |
@@ -3085,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3085 | if (cachep == &cache_cache) | 3100 | if (cachep == &cache_cache) |
3086 | return false; | 3101 | return false; |
3087 | 3102 | ||
3088 | return should_failslab(obj_size(cachep), flags); | 3103 | return should_failslab(obj_size(cachep), flags, cachep->flags); |
3089 | } | 3104 | } |
3090 | 3105 | ||
3091 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3106 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
@@ -3103,13 +3118,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3103 | } else { | 3118 | } else { |
3104 | STATS_INC_ALLOCMISS(cachep); | 3119 | STATS_INC_ALLOCMISS(cachep); |
3105 | objp = cache_alloc_refill(cachep, flags); | 3120 | objp = cache_alloc_refill(cachep, flags); |
3121 | /* | ||
3122 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3123 | * and kmemleak_erase() requires its correct value. | ||
3124 | */ | ||
3125 | ac = cpu_cache_get(cachep); | ||
3106 | } | 3126 | } |
3107 | /* | 3127 | /* |
3108 | * To avoid a false negative, if an object that is in one of the | 3128 | * To avoid a false negative, if an object that is in one of the |
3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3129 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
3110 | * treat the array pointers as a reference to the object. | 3130 | * treat the array pointers as a reference to the object. |
3111 | */ | 3131 | */ |
3112 | kmemleak_erase(&ac->entry[ac->avail]); | 3132 | if (objp) |
3133 | kmemleak_erase(&ac->entry[ac->avail]); | ||
3113 | return objp; | 3134 | return objp; |
3114 | } | 3135 | } |
3115 | 3136 | ||
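The added re-read matters because cache_alloc_refill() can replace the per-CPU array_cache, leaving the local ac pointer stale, and the new NULL test keeps kmemleak_erase() away from an empty array. The underlying rule, never reuse a cached pointer across a call that may invalidate it, in a small model:

    #include <stdio.h>
    #include <stdlib.h>

    struct cache { int avail; };

    static struct cache *cur;

    /* Stand-in for cache_alloc_refill(): may reallocate the cache,
     * invalidating any pointer taken before the call. */
    static int refill(void)
    {
        struct cache *c = calloc(1, sizeof(*c));
        if (!c)
            return 0;
        free(cur);
        cur = c;
        return 1;
    }

    static void alloc_one(void)
    {
        struct cache *c = cur;

        if (!c->avail) {
            if (!refill())
                return;
            c = cur;   /* re-fetch: the old 'c' now points at freed memory */
        }
        printf("avail after refill: %d\n", c->avail);
    }

    int main(void)
    {
        cur = calloc(1, sizeof(*cur));
        if (!cur)
            return 1;
        alloc_one();
        free(cur);
        return 0;
    }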
@@ -3306,7 +3327,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3306 | cache_alloc_debugcheck_before(cachep, flags); | 3327 | cache_alloc_debugcheck_before(cachep, flags); |
3307 | local_irq_save(save_flags); | 3328 | local_irq_save(save_flags); |
3308 | 3329 | ||
3309 | if (unlikely(nodeid == -1)) | 3330 | if (nodeid == -1) |
3310 | nodeid = numa_node_id(); | 3331 | nodeid = numa_node_id(); |
3311 | 3332 | ||
3312 | if (unlikely(!cachep->nodelists[nodeid])) { | 3333 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3558,7 +3579,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3558 | } | 3579 | } |
3559 | EXPORT_SYMBOL(kmem_cache_alloc); | 3580 | EXPORT_SYMBOL(kmem_cache_alloc); |
3560 | 3581 | ||
3561 | #ifdef CONFIG_KMEMTRACE | 3582 | #ifdef CONFIG_TRACING |
3562 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3583 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) |
3563 | { | 3584 | { |
3564 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3585 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); |
@@ -3581,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace); | |||
3581 | */ | 3602 | */ |
3582 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) | 3603 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) |
3583 | { | 3604 | { |
3584 | unsigned long addr = (unsigned long)ptr; | ||
3585 | unsigned long min_addr = PAGE_OFFSET; | ||
3586 | unsigned long align_mask = BYTES_PER_WORD - 1; | ||
3587 | unsigned long size = cachep->buffer_size; | 3605 | unsigned long size = cachep->buffer_size; |
3588 | struct page *page; | 3606 | struct page *page; |
3589 | 3607 | ||
3590 | if (unlikely(addr < min_addr)) | 3608 | if (unlikely(!kern_ptr_validate(ptr, size))) |
3591 | goto out; | ||
3592 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
3593 | goto out; | ||
3594 | if (unlikely(addr & align_mask)) | ||
3595 | goto out; | ||
3596 | if (unlikely(!kern_addr_valid(addr))) | ||
3597 | goto out; | ||
3598 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
3599 | goto out; | 3609 | goto out; |
3600 | page = virt_to_page(ptr); | 3610 | page = virt_to_page(ptr); |
3601 | if (unlikely(!PageSlab(page))) | 3611 | if (unlikely(!PageSlab(page))) |
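The open-coded tests collapse into kern_ptr_validate(), which centralizes the low-bound, high-bound, and alignment checks on the candidate object. Roughly what such a validator does, modeled here for a userspace buffer; the bounds and the word-size alignment rule are assumptions of the sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Valid iff [ptr, ptr + size) is word-aligned and inside [lo, hi). */
    static int ptr_validate(const void *ptr, size_t size,
                            uintptr_t lo, uintptr_t hi)
    {
        uintptr_t addr = (uintptr_t)ptr;

        if (addr < lo || addr > hi)
            return 0;
        if (size > hi - addr)          /* object overruns the region */
            return 0;
        if (addr & (sizeof(void *) - 1))
            return 0;                  /* misaligned pointer */
        return 1;
    }

    int main(void)
    {
        long backing[32];              /* word-aligned region */
        char *region = (char *)backing;
        uintptr_t lo = (uintptr_t)region;
        uintptr_t hi = lo + sizeof(backing);

        printf("%d\n", ptr_validate(region, 16, lo, hi));        /* 1 */
        printf("%d\n", ptr_validate(region + 1, 16, lo, hi));    /* 0: misaligned */
        printf("%d\n", ptr_validate(region + 250, 16, lo, hi));  /* 0: overruns */
        return 0;
    }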
@@ -3621,7 +3631,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3621 | } | 3631 | } |
3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3632 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3623 | 3633 | ||
3624 | #ifdef CONFIG_KMEMTRACE | 3634 | #ifdef CONFIG_TRACING |
3625 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3635 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, |
3626 | gfp_t flags, | 3636 | gfp_t flags, |
3627 | int nodeid) | 3637 | int nodeid) |
@@ -3649,7 +3659,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | |||
3649 | return ret; | 3659 | return ret; |
3650 | } | 3660 | } |
3651 | 3661 | ||
3652 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3662 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3653 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3663 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3654 | { | 3664 | { |
3655 | return __do_kmalloc_node(size, flags, node, | 3665 | return __do_kmalloc_node(size, flags, node, |
@@ -3669,7 +3679,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3669 | return __do_kmalloc_node(size, flags, node, NULL); | 3679 | return __do_kmalloc_node(size, flags, node, NULL); |
3670 | } | 3680 | } |
3671 | EXPORT_SYMBOL(__kmalloc_node); | 3681 | EXPORT_SYMBOL(__kmalloc_node); |
3672 | #endif /* CONFIG_DEBUG_SLAB */ | 3682 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
3673 | #endif /* CONFIG_NUMA */ | 3683 | #endif /* CONFIG_NUMA */ |
3674 | 3684 | ||
3675 | /** | 3685 | /** |
@@ -3701,7 +3711,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3701 | } | 3711 | } |
3702 | 3712 | ||
3703 | 3713 | ||
3704 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3714 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3705 | void *__kmalloc(size_t size, gfp_t flags) | 3715 | void *__kmalloc(size_t size, gfp_t flags) |
3706 | { | 3716 | { |
3707 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3717 | return __do_kmalloc(size, flags, __builtin_return_address(0)); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -151,7 +151,8 @@ | |||
151 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
152 | */ | 152 | */ |
153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
155 | SLAB_FAILSLAB) | ||
155 | 156 | ||
156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
157 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
217 | 218 | ||
218 | #endif | 219 | #endif |
219 | 220 | ||
220 | static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) | 221 | static inline void stat(struct kmem_cache *s, enum stat_item si) |
221 | { | 222 | { |
222 | #ifdef CONFIG_SLUB_STATS | 223 | #ifdef CONFIG_SLUB_STATS |
223 | c->stat[si]++; | 224 | __this_cpu_inc(s->cpu_slab->stat[si]); |
224 | #endif | 225 | #endif |
225 | } | 226 | } |
226 | 227 | ||
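stat() now takes the kmem_cache itself and bumps the counter with __this_cpu_inc(), so the hot paths no longer need a kmem_cache_cpu pointer just for bookkeeping. Thread-local counters give the same contention-free property in ordinary C11; this is a loose analogue, not the kernel per-CPU API:

    #include <stdio.h>

    enum stat_item { ALLOC_FASTPATH, FREE_FASTPATH, NR_ITEMS };

    /* One counter array per thread: increments never contend, and a
     * reporter can sum across threads, the same shape as
     * __this_cpu_inc(s->cpu_slab->stat[si]). */
    static _Thread_local unsigned long stat_counters[NR_ITEMS];

    static inline void stat(enum stat_item si)
    {
        stat_counters[si]++;
    }

    int main(void)
    {
        for (int i = 0; i < 5; i++)
            stat(ALLOC_FASTPATH);
        printf("fastpath allocs: %lu\n", stat_counters[ALLOC_FASTPATH]);
        return 0;
    }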
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
242 | #endif | 243 | #endif |
243 | } | 244 | } |
244 | 245 | ||
245 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
246 | { | ||
247 | #ifdef CONFIG_SMP | ||
248 | return s->cpu_slab[cpu]; | ||
249 | #else | ||
250 | return &s->cpu_slab; | ||
251 | #endif | ||
252 | } | ||
253 | |||
254 | /* Verify that a pointer has an address that is valid within a slab page */ | 246 | /* Verify that a pointer has an address that is valid within a slab page */ |
255 | static inline int check_valid_pointer(struct kmem_cache *s, | 247 | static inline int check_valid_pointer(struct kmem_cache *s, |
256 | struct page *page, const void *object) | 248 | struct page *page, const void *object) |
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
269 | return 1; | 261 | return 1; |
270 | } | 262 | } |
271 | 263 | ||
272 | /* | ||
273 | * Slow version of get and set free pointer. | ||
274 | * | ||
275 | * This version requires touching the cache lines of kmem_cache which | ||
276 | * we avoid to do in the fast alloc free paths. There we obtain the offset | ||
277 | * from the page struct. | ||
278 | */ | ||
279 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 264 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
280 | { | 265 | { |
281 | return *(void **)(object + s->offset); | 266 | return *(void **)(object + s->offset); |
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) | |||
1020 | case 't': | 1005 | case 't': |
1021 | slub_debug |= SLAB_TRACE; | 1006 | slub_debug |= SLAB_TRACE; |
1022 | break; | 1007 | break; |
1008 | case 'a': | ||
1009 | slub_debug |= SLAB_FAILSLAB; | ||
1010 | break; | ||
1023 | default: | 1011 | default: |
1024 | printk(KERN_ERR "slub_debug option '%c' " | 1012 | printk(KERN_ERR "slub_debug option '%c' " |
1025 | "unknown. skipped\n", *str); | 1013 | "unknown. skipped\n", *str); |
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1124 | if (!page) | 1112 | if (!page) |
1125 | return NULL; | 1113 | return NULL; |
1126 | 1114 | ||
1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1115 | stat(s, ORDER_FALLBACK); |
1128 | } | 1116 | } |
1129 | 1117 | ||
1130 | if (kmemcheck_enabled | 1118 | if (kmemcheck_enabled |
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1422 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | 1410 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
1423 | { | 1411 | { |
1424 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1412 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1425 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | ||
1426 | 1413 | ||
1427 | __ClearPageSlubFrozen(page); | 1414 | __ClearPageSlubFrozen(page); |
1428 | if (page->inuse) { | 1415 | if (page->inuse) { |
1429 | 1416 | ||
1430 | if (page->freelist) { | 1417 | if (page->freelist) { |
1431 | add_partial(n, page, tail); | 1418 | add_partial(n, page, tail); |
1432 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1419 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
1433 | } else { | 1420 | } else { |
1434 | stat(c, DEACTIVATE_FULL); | 1421 | stat(s, DEACTIVATE_FULL); |
1435 | if (SLABDEBUG && PageSlubDebug(page) && | 1422 | if (SLABDEBUG && PageSlubDebug(page) && |
1436 | (s->flags & SLAB_STORE_USER)) | 1423 | (s->flags & SLAB_STORE_USER)) |
1437 | add_full(n, page); | 1424 | add_full(n, page); |
1438 | } | 1425 | } |
1439 | slab_unlock(page); | 1426 | slab_unlock(page); |
1440 | } else { | 1427 | } else { |
1441 | stat(c, DEACTIVATE_EMPTY); | 1428 | stat(s, DEACTIVATE_EMPTY); |
1442 | if (n->nr_partial < s->min_partial) { | 1429 | if (n->nr_partial < s->min_partial) { |
1443 | /* | 1430 | /* |
1444 | * Adding an empty slab to the partial slabs in order | 1431 | * Adding an empty slab to the partial slabs in order |
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1454 | slab_unlock(page); | 1441 | slab_unlock(page); |
1455 | } else { | 1442 | } else { |
1456 | slab_unlock(page); | 1443 | slab_unlock(page); |
1457 | stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); | 1444 | stat(s, FREE_SLAB); |
1458 | discard_slab(s, page); | 1445 | discard_slab(s, page); |
1459 | } | 1446 | } |
1460 | } | 1447 | } |
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1469 | int tail = 1; | 1456 | int tail = 1; |
1470 | 1457 | ||
1471 | if (page->freelist) | 1458 | if (page->freelist) |
1472 | stat(c, DEACTIVATE_REMOTE_FREES); | 1459 | stat(s, DEACTIVATE_REMOTE_FREES); |
1473 | /* | 1460 | /* |
1474 | * Merge cpu freelist into slab freelist. Typically we get here | 1461 | * Merge cpu freelist into slab freelist. Typically we get here |
1475 | * because both freelists are empty. So this is unlikely | 1462 | * because both freelists are empty. So this is unlikely |
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1482 | 1469 | ||
1483 | /* Retrieve object from cpu_freelist */ | 1470 | /* Retrieve object from cpu_freelist */ |
1484 | object = c->freelist; | 1471 | object = c->freelist; |
1485 | c->freelist = c->freelist[c->offset]; | 1472 | c->freelist = get_freepointer(s, c->freelist); |
1486 | 1473 | ||
1487 | /* And put onto the regular freelist */ | 1474 | /* And put onto the regular freelist */ |
1488 | object[c->offset] = page->freelist; | 1475 | set_freepointer(s, object, page->freelist); |
1489 | page->freelist = object; | 1476 | page->freelist = object; |
1490 | page->inuse--; | 1477 | page->inuse--; |
1491 | } | 1478 | } |
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1495 | 1482 | ||
1496 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1483 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1497 | { | 1484 | { |
1498 | stat(c, CPUSLAB_FLUSH); | 1485 | stat(s, CPUSLAB_FLUSH); |
1499 | slab_lock(c->page); | 1486 | slab_lock(c->page); |
1500 | deactivate_slab(s, c); | 1487 | deactivate_slab(s, c); |
1501 | } | 1488 | } |
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1507 | */ | 1494 | */ |
1508 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1495 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
1509 | { | 1496 | { |
1510 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 1497 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
1511 | 1498 | ||
1512 | if (likely(c && c->page)) | 1499 | if (likely(c && c->page)) |
1513 | flush_slab(s, c); | 1500 | flush_slab(s, c); |
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1635 | if (unlikely(!node_match(c, node))) | 1622 | if (unlikely(!node_match(c, node))) |
1636 | goto another_slab; | 1623 | goto another_slab; |
1637 | 1624 | ||
1638 | stat(c, ALLOC_REFILL); | 1625 | stat(s, ALLOC_REFILL); |
1639 | 1626 | ||
1640 | load_freelist: | 1627 | load_freelist: |
1641 | object = c->page->freelist; | 1628 | object = c->page->freelist; |
@@ -1644,13 +1631,13 @@ load_freelist: | |||
1644 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) | 1631 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
1645 | goto debug; | 1632 | goto debug; |
1646 | 1633 | ||
1647 | c->freelist = object[c->offset]; | 1634 | c->freelist = get_freepointer(s, object); |
1648 | c->page->inuse = c->page->objects; | 1635 | c->page->inuse = c->page->objects; |
1649 | c->page->freelist = NULL; | 1636 | c->page->freelist = NULL; |
1650 | c->node = page_to_nid(c->page); | 1637 | c->node = page_to_nid(c->page); |
1651 | unlock_out: | 1638 | unlock_out: |
1652 | slab_unlock(c->page); | 1639 | slab_unlock(c->page); |
1653 | stat(c, ALLOC_SLOWPATH); | 1640 | stat(s, ALLOC_SLOWPATH); |
1654 | return object; | 1641 | return object; |
1655 | 1642 | ||
1656 | another_slab: | 1643 | another_slab: |
@@ -1660,7 +1647,7 @@ new_slab: | |||
1660 | new = get_partial(s, gfpflags, node); | 1647 | new = get_partial(s, gfpflags, node); |
1661 | if (new) { | 1648 | if (new) { |
1662 | c->page = new; | 1649 | c->page = new; |
1663 | stat(c, ALLOC_FROM_PARTIAL); | 1650 | stat(s, ALLOC_FROM_PARTIAL); |
1664 | goto load_freelist; | 1651 | goto load_freelist; |
1665 | } | 1652 | } |
1666 | 1653 | ||
@@ -1673,8 +1660,8 @@ new_slab: | |||
1673 | local_irq_disable(); | 1660 | local_irq_disable(); |
1674 | 1661 | ||
1675 | if (new) { | 1662 | if (new) { |
1676 | c = get_cpu_slab(s, smp_processor_id()); | 1663 | c = __this_cpu_ptr(s->cpu_slab); |
1677 | stat(c, ALLOC_SLAB); | 1664 | stat(s, ALLOC_SLAB); |
1678 | if (c->page) | 1665 | if (c->page) |
1679 | flush_slab(s, c); | 1666 | flush_slab(s, c); |
1680 | slab_lock(new); | 1667 | slab_lock(new); |
@@ -1690,7 +1677,7 @@ debug: | |||
1690 | goto another_slab; | 1677 | goto another_slab; |
1691 | 1678 | ||
1692 | c->page->inuse++; | 1679 | c->page->inuse++; |
1693 | c->page->freelist = object[c->offset]; | 1680 | c->page->freelist = get_freepointer(s, object); |
1694 | c->node = -1; | 1681 | c->node = -1; |
1695 | goto unlock_out; | 1682 | goto unlock_out; |
1696 | } | 1683 | } |
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1711 | void **object; | 1698 | void **object; |
1712 | struct kmem_cache_cpu *c; | 1699 | struct kmem_cache_cpu *c; |
1713 | unsigned long flags; | 1700 | unsigned long flags; |
1714 | unsigned int objsize; | ||
1715 | 1701 | ||
1716 | gfpflags &= gfp_allowed_mask; | 1702 | gfpflags &= gfp_allowed_mask; |
1717 | 1703 | ||
1718 | lockdep_trace_alloc(gfpflags); | 1704 | lockdep_trace_alloc(gfpflags); |
1719 | might_sleep_if(gfpflags & __GFP_WAIT); | 1705 | might_sleep_if(gfpflags & __GFP_WAIT); |
1720 | 1706 | ||
1721 | if (should_failslab(s->objsize, gfpflags)) | 1707 | if (should_failslab(s->objsize, gfpflags, s->flags)) |
1722 | return NULL; | 1708 | return NULL; |
1723 | 1709 | ||
1724 | local_irq_save(flags); | 1710 | local_irq_save(flags); |
1725 | c = get_cpu_slab(s, smp_processor_id()); | 1711 | c = __this_cpu_ptr(s->cpu_slab); |
1726 | objsize = c->objsize; | 1712 | object = c->freelist; |
1727 | if (unlikely(!c->freelist || !node_match(c, node))) | 1713 | if (unlikely(!object || !node_match(c, node))) |
1728 | 1714 | ||
1729 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1715 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1730 | 1716 | ||
1731 | else { | 1717 | else { |
1732 | object = c->freelist; | 1718 | c->freelist = get_freepointer(s, object); |
1733 | c->freelist = object[c->offset]; | 1719 | stat(s, ALLOC_FASTPATH); |
1734 | stat(c, ALLOC_FASTPATH); | ||
1735 | } | 1720 | } |
1736 | local_irq_restore(flags); | 1721 | local_irq_restore(flags); |
1737 | 1722 | ||
1738 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1723 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1739 | memset(object, 0, objsize); | 1724 | memset(object, 0, s->objsize); |
1740 | 1725 | ||
1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1726 | kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); |
1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1727 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); |
1743 | 1728 | ||
1744 | return object; | 1729 | return object; |
1745 | } | 1730 | } |
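The fastpath now reads the object straight off c->freelist, and get_freepointer() follows a next-object pointer stored inside the free object at s->offset: free objects are their own list nodes, so no side metadata is needed. A freestanding model of that intrusive free list (object size and offset are arbitrary choices of the sketch):

    #include <stdio.h>

    #define OBJ_SIZE 32
    #define NR_OBJS  4

    static _Alignas(void *) char slab[NR_OBJS * OBJ_SIZE];
    static void *freelist;
    enum { OFFSET = 0 };               /* where the next pointer lives */

    static void *get_freepointer(void *object)
    {
        return *(void **)((char *)object + OFFSET);
    }

    static void set_freepointer(void *object, void *fp)
    {
        *(void **)((char *)object + OFFSET) = fp;
    }

    static void *alloc_obj(void)
    {
        void *object = freelist;
        if (object)
            freelist = get_freepointer(object);   /* pop: the alloc fastpath */
        return object;
    }

    static void free_obj(void *object)
    {
        set_freepointer(object, freelist);        /* push: the free fastpath */
        freelist = object;
    }

    int main(void)
    {
        for (int i = NR_OBJS - 1; i >= 0; i--)    /* thread all objects on */
            free_obj(slab + i * OBJ_SIZE);

        void *a = alloc_obj(), *b = alloc_obj();
        printf("a=%p b=%p\n", a, b);
        free_obj(a);
        free_obj(b);
        return 0;
    }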
@@ -1754,7 +1739,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
1754 | } | 1739 | } |
1755 | EXPORT_SYMBOL(kmem_cache_alloc); | 1740 | EXPORT_SYMBOL(kmem_cache_alloc); |
1756 | 1741 | ||
1757 | #ifdef CONFIG_KMEMTRACE | 1742 | #ifdef CONFIG_TRACING |
1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1743 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) |
1759 | { | 1744 | { |
1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | 1745 | return slab_alloc(s, gfpflags, -1, _RET_IP_); |
@@ -1775,7 +1760,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1760 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
1776 | #endif | 1761 | #endif |
1777 | 1762 | ||
1778 | #ifdef CONFIG_KMEMTRACE | 1763 | #ifdef CONFIG_TRACING |
1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1764 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, |
1780 | gfp_t gfpflags, | 1765 | gfp_t gfpflags, |
1781 | int node) | 1766 | int node) |
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | |||
1794 | * handling required then we can return immediately. | 1779 | * handling required then we can return immediately. |
1795 | */ | 1780 | */ |
1796 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1781 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1797 | void *x, unsigned long addr, unsigned int offset) | 1782 | void *x, unsigned long addr) |
1798 | { | 1783 | { |
1799 | void *prior; | 1784 | void *prior; |
1800 | void **object = (void *)x; | 1785 | void **object = (void *)x; |
1801 | struct kmem_cache_cpu *c; | ||
1802 | 1786 | ||
1803 | c = get_cpu_slab(s, raw_smp_processor_id()); | 1787 | stat(s, FREE_SLOWPATH); |
1804 | stat(c, FREE_SLOWPATH); | ||
1805 | slab_lock(page); | 1788 | slab_lock(page); |
1806 | 1789 | ||
1807 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) | 1790 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
1808 | goto debug; | 1791 | goto debug; |
1809 | 1792 | ||
1810 | checks_ok: | 1793 | checks_ok: |
1811 | prior = object[offset] = page->freelist; | 1794 | prior = page->freelist; |
1795 | set_freepointer(s, object, prior); | ||
1812 | page->freelist = object; | 1796 | page->freelist = object; |
1813 | page->inuse--; | 1797 | page->inuse--; |
1814 | 1798 | ||
1815 | if (unlikely(PageSlubFrozen(page))) { | 1799 | if (unlikely(PageSlubFrozen(page))) { |
1816 | stat(c, FREE_FROZEN); | 1800 | stat(s, FREE_FROZEN); |
1817 | goto out_unlock; | 1801 | goto out_unlock; |
1818 | } | 1802 | } |
1819 | 1803 | ||
@@ -1826,7 +1810,7 @@ checks_ok: | |||
1826 | */ | 1810 | */ |
1827 | if (unlikely(!prior)) { | 1811 | if (unlikely(!prior)) { |
1828 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 1812 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
1829 | stat(c, FREE_ADD_PARTIAL); | 1813 | stat(s, FREE_ADD_PARTIAL); |
1830 | } | 1814 | } |
1831 | 1815 | ||
1832 | out_unlock: | 1816 | out_unlock: |
@@ -1839,10 +1823,10 @@ slab_empty: | |||
1839 | * Slab still on the partial list. | 1823 | * Slab still on the partial list. |
1840 | */ | 1824 | */ |
1841 | remove_partial(s, page); | 1825 | remove_partial(s, page); |
1842 | stat(c, FREE_REMOVE_PARTIAL); | 1826 | stat(s, FREE_REMOVE_PARTIAL); |
1843 | } | 1827 | } |
1844 | slab_unlock(page); | 1828 | slab_unlock(page); |
1845 | stat(c, FREE_SLAB); | 1829 | stat(s, FREE_SLAB); |
1846 | discard_slab(s, page); | 1830 | discard_slab(s, page); |
1847 | return; | 1831 | return; |
1848 | 1832 | ||
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1872 | 1856 | ||
1873 | kmemleak_free_recursive(x, s->flags); | 1857 | kmemleak_free_recursive(x, s->flags); |
1874 | local_irq_save(flags); | 1858 | local_irq_save(flags); |
1875 | c = get_cpu_slab(s, smp_processor_id()); | 1859 | c = __this_cpu_ptr(s->cpu_slab); |
1876 | kmemcheck_slab_free(s, object, c->objsize); | 1860 | kmemcheck_slab_free(s, object, s->objsize); |
1877 | debug_check_no_locks_freed(object, c->objsize); | 1861 | debug_check_no_locks_freed(object, s->objsize); |
1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1862 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1879 | debug_check_no_obj_freed(object, c->objsize); | 1863 | debug_check_no_obj_freed(object, s->objsize); |
1880 | if (likely(page == c->page && c->node >= 0)) { | 1864 | if (likely(page == c->page && c->node >= 0)) { |
1881 | object[c->offset] = c->freelist; | 1865 | set_freepointer(s, object, c->freelist); |
1882 | c->freelist = object; | 1866 | c->freelist = object; |
1883 | stat(c, FREE_FASTPATH); | 1867 | stat(s, FREE_FASTPATH); |
1884 | } else | 1868 | } else |
1885 | __slab_free(s, page, x, addr, c->offset); | 1869 | __slab_free(s, page, x, addr); |
1886 | 1870 | ||
1887 | local_irq_restore(flags); | 1871 | local_irq_restore(flags); |
1888 | } | 1872 | } |
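
The fastpath above no longer needs per-cpu copies of offset and objsize: set_freepointer() threads a freed object back onto c->freelist by storing the next-free pointer inside the object itself, at the cache's free-pointer offset. A minimal userspace sketch of that idea (fake_cache and the fixed offset of 0 are illustrative stand-ins, not kernel structures):

#include <stdio.h>
#include <stddef.h>

struct fake_cache {
    size_t offset;   /* byte offset of the free pointer inside an object */
    void  *freelist; /* head of the (per-cpu) free list */
};

/* mirrors the kernel helper: store the next-free pointer in the object */
static void set_freepointer(struct fake_cache *s, void *object, void *fp)
{
    *(void **)((char *)object + s->offset) = fp;
}

static void fastpath_free(struct fake_cache *s, void *object)
{
    set_freepointer(s, object, s->freelist); /* object -> old head */
    s->freelist = object;                    /* object becomes new head */
}

int main(void)
{
    void *obj1[8], *obj2[8]; /* two fake 64-byte objects */
    struct fake_cache s = { .offset = 0, .freelist = NULL };

    fastpath_free(&s, obj1);
    fastpath_free(&s, obj2);
    printf("head=%p, next=%p\n", s.freelist,
           *(void **)((char *)s.freelist + s.offset));
    return 0;
}
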
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
2069 | return ALIGN(align, sizeof(void *)); | 2053 | return ALIGN(align, sizeof(void *)); |
2070 | } | 2054 | } |
2071 | 2055 | ||
2072 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
2073 | struct kmem_cache_cpu *c) | ||
2074 | { | ||
2075 | c->page = NULL; | ||
2076 | c->freelist = NULL; | ||
2077 | c->node = 0; | ||
2078 | c->offset = s->offset / sizeof(void *); | ||
2079 | c->objsize = s->objsize; | ||
2080 | #ifdef CONFIG_SLUB_STATS | ||
2081 | memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); | ||
2082 | #endif | ||
2083 | } | ||
2084 | |||
2085 | static void | 2056 | static void |
2086 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2057 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) |
2087 | { | 2058 | { |
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
2095 | #endif | 2066 | #endif |
2096 | } | 2067 | } |
2097 | 2068 | ||
2098 | #ifdef CONFIG_SMP | 2069 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); |
2099 | /* | ||
2100 | * Per cpu array for per cpu structures. | ||
2101 | * | ||
2102 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
2103 | * close together meaning that it becomes possible that multiple per cpu | ||
2104 | * structures are contained in one cacheline. This may be particularly | ||
2105 | * beneficial for the kmalloc caches. | ||
2106 | * | ||
2107 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
2108 | * likely able to get per cpu structures for all caches from the array defined | ||
2109 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
2110 | * | ||
2111 | * If the per cpu array is exhausted then fall back to kmalloc | ||
2112 | * of individual cachelines. No sharing is possible then. | ||
2113 | */ | ||
2114 | #define NR_KMEM_CACHE_CPU 100 | ||
2115 | |||
2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], | ||
2117 | kmem_cache_cpu); | ||
2118 | |||
2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | ||
2121 | |||
2122 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
2123 | int cpu, gfp_t flags) | ||
2124 | { | ||
2125 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
2126 | |||
2127 | if (c) | ||
2128 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
2129 | (void *)c->freelist; | ||
2130 | else { | ||
2131 | /* Table overflow: So allocate ourselves */ | ||
2132 | c = kmalloc_node( | ||
2133 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
2134 | flags, cpu_to_node(cpu)); | ||
2135 | if (!c) | ||
2136 | return NULL; | ||
2137 | } | ||
2138 | |||
2139 | init_kmem_cache_cpu(s, c); | ||
2140 | return c; | ||
2141 | } | ||
2142 | |||
2143 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
2144 | { | ||
2145 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
2146 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
2147 | kfree(c); | ||
2148 | return; | ||
2149 | } | ||
2150 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
2151 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
2152 | } | ||
2153 | |||
2154 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
2155 | { | ||
2156 | int cpu; | ||
2157 | |||
2158 | for_each_online_cpu(cpu) { | ||
2159 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2160 | |||
2161 | if (c) { | ||
2162 | s->cpu_slab[cpu] = NULL; | ||
2163 | free_kmem_cache_cpu(c, cpu); | ||
2164 | } | ||
2165 | } | ||
2166 | } | ||
2167 | |||
2168 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2169 | { | ||
2170 | int cpu; | ||
2171 | |||
2172 | for_each_online_cpu(cpu) { | ||
2173 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2174 | |||
2175 | if (c) | ||
2176 | continue; | ||
2177 | |||
2178 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
2179 | if (!c) { | ||
2180 | free_kmem_cache_cpus(s); | ||
2181 | return 0; | ||
2182 | } | ||
2183 | s->cpu_slab[cpu] = c; | ||
2184 | } | ||
2185 | return 1; | ||
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Initialize the per cpu array. | ||
2190 | */ | ||
2191 | static void init_alloc_cpu_cpu(int cpu) | ||
2192 | { | ||
2193 | int i; | ||
2194 | |||
2195 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) | ||
2196 | return; | ||
2197 | |||
2198 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
2199 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
2200 | |||
2201 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); | ||
2202 | } | ||
2203 | 2070 | ||
2204 | static void __init init_alloc_cpu(void) | 2071 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) |
2205 | { | 2072 | { |
2206 | int cpu; | 2073 | if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) |
2207 | 2074 | /* | |
2208 | for_each_online_cpu(cpu) | 2075 | * Boot time creation of the kmalloc array. Use static per cpu data |
2209 | init_alloc_cpu_cpu(cpu); | 2076 | * since the per cpu allocator is not available yet. |
2210 | } | 2077 | */ |
2078 | s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); | ||
2079 | else | ||
2080 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
2211 | 2081 | ||
2212 | #else | 2082 | if (!s->cpu_slab) |
2213 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | 2083 | return 0; |
2214 | static inline void init_alloc_cpu(void) {} | ||
2215 | 2084 | ||
2216 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2217 | { | ||
2218 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
2219 | return 1; | 2085 | return 1; |
2220 | } | 2086 | } |
2221 | #endif | ||
2222 | 2087 | ||
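
The replacement alloc_kmem_cache_cpus() above rests on one trick: the statically declared kmalloc caches can be recognized by address, so their per-cpu state comes from a matching static array (the dynamic per-cpu allocator is not up yet at that point in boot), while every later cache goes through alloc_percpu(). A userspace sketch under those assumptions (malloc stands in for alloc_percpu, and the array size is a placeholder):

#include <stdlib.h>

#define KMALLOC_CACHES 32 /* stand-in for the kernel constant */

struct cpu_slab { void *freelist; };
struct cache    { struct cpu_slab *cpu_slab; };

static struct cache    kmalloc_caches[KMALLOC_CACHES];
/* stands in for the static DEFINE_PER_CPU(..., kmalloc_percpu[...]) array */
static struct cpu_slab kmalloc_percpu[KMALLOC_CACHES];

static int alloc_cpu_slab(struct cache *s)
{
    if (s >= kmalloc_caches && s < kmalloc_caches + KMALLOC_CACHES)
        /* boot-time kmalloc cache: reuse the static slot at the same index */
        s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
    else
        /* any later cache: dynamic allocation (alloc_percpu in the kernel) */
        s->cpu_slab = malloc(sizeof(*s->cpu_slab));

    return s->cpu_slab != NULL;
}
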
2223 | #ifdef CONFIG_NUMA | 2088 | #ifdef CONFIG_NUMA |
2224 | /* | 2089 | /* |
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2287 | int node; | 2152 | int node; |
2288 | int local_node; | 2153 | int local_node; |
2289 | 2154 | ||
2290 | if (slab_state >= UP) | 2155 | if (slab_state >= UP && (s < kmalloc_caches || |
2156 | s >= kmalloc_caches + KMALLOC_CACHES)) | ||
2291 | local_node = page_to_nid(virt_to_page(s)); | 2157 | local_node = page_to_nid(virt_to_page(s)); |
2292 | else | 2158 | else |
2293 | local_node = 0; | 2159 | local_node = 0; |
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2502 | 2368 | ||
2503 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) | 2369 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
2504 | return 1; | 2370 | return 1; |
2371 | |||
2505 | free_kmem_cache_nodes(s); | 2372 | free_kmem_cache_nodes(s); |
2506 | error: | 2373 | error: |
2507 | if (flags & SLAB_PANIC) | 2374 | if (flags & SLAB_PANIC) |
@@ -2519,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) | |||
2519 | { | 2386 | { |
2520 | struct page *page; | 2387 | struct page *page; |
2521 | 2388 | ||
2389 | if (!kern_ptr_validate(object, s->size)) | ||
2390 | return 0; | ||
2391 | |||
2522 | page = get_object_page(object); | 2392 | page = get_object_page(object); |
2523 | 2393 | ||
2524 | if (!page || s != page->slab) | 2394 | if (!page || s != page->slab) |
@@ -2609,9 +2479,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2609 | int node; | 2479 | int node; |
2610 | 2480 | ||
2611 | flush_all(s); | 2481 | flush_all(s); |
2612 | 2482 | free_percpu(s->cpu_slab); | |
2613 | /* Attempt to free all objects */ | 2483 | /* Attempt to free all objects */ |
2614 | free_kmem_cache_cpus(s); | ||
2615 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2484 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2616 | struct kmem_cache_node *n = get_node(s, node); | 2485 | struct kmem_cache_node *n = get_node(s, node); |
2617 | 2486 | ||
@@ -2651,7 +2520,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
2651 | * Kmalloc subsystem | 2520 | * Kmalloc subsystem |
2652 | *******************************************************************/ | 2521 | *******************************************************************/ |
2653 | 2522 | ||
2654 | struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; | 2523 | struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; |
2655 | EXPORT_SYMBOL(kmalloc_caches); | 2524 | EXPORT_SYMBOL(kmalloc_caches); |
2656 | 2525 | ||
2657 | static int __init setup_slub_min_order(char *str) | 2526 | static int __init setup_slub_min_order(char *str) |
@@ -2741,6 +2610,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2741 | char *text; | 2610 | char *text; |
2742 | size_t realsize; | 2611 | size_t realsize; |
2743 | unsigned long slabflags; | 2612 | unsigned long slabflags; |
2613 | int i; | ||
2744 | 2614 | ||
2745 | s = kmalloc_caches_dma[index]; | 2615 | s = kmalloc_caches_dma[index]; |
2746 | if (s) | 2616 | if (s) |
@@ -2760,7 +2630,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2760 | realsize = kmalloc_caches[index].objsize; | 2630 | realsize = kmalloc_caches[index].objsize; |
2761 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", | 2631 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", |
2762 | (unsigned int)realsize); | 2632 | (unsigned int)realsize); |
2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2633 | |
2634 | s = NULL; | ||
2635 | for (i = 0; i < KMALLOC_CACHES; i++) | ||
2636 | if (!kmalloc_caches[i].size) | ||
2637 | break; | ||
2638 | |||
2639 | BUG_ON(i >= KMALLOC_CACHES); | ||
2640 | s = kmalloc_caches + i; | ||
2764 | 2641 | ||
2765 | /* | 2642 | /* |
2766 | * Must defer sysfs creation to a workqueue because we don't know | 2643 | * Must defer sysfs creation to a workqueue because we don't know |
@@ -2772,9 +2649,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2772 | if (slab_state >= SYSFS) | 2649 | if (slab_state >= SYSFS) |
2773 | slabflags |= __SYSFS_ADD_DEFERRED; | 2650 | slabflags |= __SYSFS_ADD_DEFERRED; |
2774 | 2651 | ||
2775 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2652 | if (!text || !kmem_cache_open(s, flags, text, |
2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { | 2653 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
2777 | kfree(s); | 2654 | s->size = 0; |
2778 | kfree(text); | 2655 | kfree(text); |
2779 | goto unlock_out; | 2656 | goto unlock_out; |
2780 | } | 2657 | } |
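
The DMA-cache path now carves its kmem_cache out of the enlarged static array instead of kmalloc'ing one: a slot whose size is still zero is free, and the error path releases it simply by writing size = 0 back. A sketch of that slot discipline (simplified struct; 32 is a placeholder for the real KMALLOC_CACHES):

#include <stddef.h>

struct cache { int size; };

#define KMALLOC_CACHES 32
static struct cache kmalloc_caches[KMALLOC_CACHES];

/* a slot is free while its size is still 0; callers "free" by resetting size */
static struct cache *grab_free_slot(void)
{
    int i;

    for (i = 0; i < KMALLOC_CACHES; i++)
        if (!kmalloc_caches[i].size)
            return &kmalloc_caches[i];
    return NULL; /* the kernel BUG()s here rather than returning */
}
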
@@ -3086,7 +2963,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3086 | /* | 2963 | /* |
3087 | * if n->nr_slabs > 0, slabs still exist on the node | 2964 | * if n->nr_slabs > 0, slabs still exist on the node |
3088 | * that is going down. We were unable to free them, | 2965 | * that is going down. We were unable to free them, |
3089 | * and offline_pages() function shoudn't call this | 2966 | * and offline_pages() function shouldn't call this |
3090 | * callback. So, we must fail. | 2967 | * callback. So, we must fail. |
3091 | */ | 2968 | */ |
3092 | BUG_ON(slabs_node(s, offline_node)); | 2969 | BUG_ON(slabs_node(s, offline_node)); |
@@ -3176,8 +3053,6 @@ void __init kmem_cache_init(void) | |||
3176 | int i; | 3053 | int i; |
3177 | int caches = 0; | 3054 | int caches = 0; |
3178 | 3055 | ||
3179 | init_alloc_cpu(); | ||
3180 | |||
3181 | #ifdef CONFIG_NUMA | 3056 | #ifdef CONFIG_NUMA |
3182 | /* | 3057 | /* |
3183 | * Must first have the slab cache available for the allocations of the | 3058 | * Must first have the slab cache available for the allocations of the |
@@ -3261,8 +3136,10 @@ void __init kmem_cache_init(void) | |||
3261 | 3136 | ||
3262 | #ifdef CONFIG_SMP | 3137 | #ifdef CONFIG_SMP |
3263 | register_cpu_notifier(&slab_notifier); | 3138 | register_cpu_notifier(&slab_notifier); |
3264 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | 3139 | #endif |
3265 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | 3140 | #ifdef CONFIG_NUMA |
3141 | kmem_size = offsetof(struct kmem_cache, node) + | ||
3142 | nr_node_ids * sizeof(struct kmem_cache_node *); | ||
3266 | #else | 3143 | #else |
3267 | kmem_size = sizeof(struct kmem_cache); | 3144 | kmem_size = sizeof(struct kmem_cache); |
3268 | #endif | 3145 | #endif |
@@ -3351,22 +3228,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3351 | down_write(&slub_lock); | 3228 | down_write(&slub_lock); |
3352 | s = find_mergeable(size, align, flags, name, ctor); | 3229 | s = find_mergeable(size, align, flags, name, ctor); |
3353 | if (s) { | 3230 | if (s) { |
3354 | int cpu; | ||
3355 | |||
3356 | s->refcount++; | 3231 | s->refcount++; |
3357 | /* | 3232 | /* |
3358 | * Adjust the object sizes so that we clear | 3233 | * Adjust the object sizes so that we clear |
3359 | * the complete object on kzalloc. | 3234 | * the complete object on kzalloc. |
3360 | */ | 3235 | */ |
3361 | s->objsize = max(s->objsize, (int)size); | 3236 | s->objsize = max(s->objsize, (int)size); |
3362 | |||
3363 | /* | ||
3364 | * And then we need to update the object size in the | ||
3365 | * per cpu structures | ||
3366 | */ | ||
3367 | for_each_online_cpu(cpu) | ||
3368 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
3369 | |||
3370 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3237 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3371 | up_write(&slub_lock); | 3238 | up_write(&slub_lock); |
3372 | 3239 | ||
@@ -3420,29 +3287,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
3420 | unsigned long flags; | 3287 | unsigned long flags; |
3421 | 3288 | ||
3422 | switch (action) { | 3289 | switch (action) { |
3423 | case CPU_UP_PREPARE: | ||
3424 | case CPU_UP_PREPARE_FROZEN: | ||
3425 | init_alloc_cpu_cpu(cpu); | ||
3426 | down_read(&slub_lock); | ||
3427 | list_for_each_entry(s, &slab_caches, list) | ||
3428 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
3429 | GFP_KERNEL); | ||
3430 | up_read(&slub_lock); | ||
3431 | break; | ||
3432 | |||
3433 | case CPU_UP_CANCELED: | 3290 | case CPU_UP_CANCELED: |
3434 | case CPU_UP_CANCELED_FROZEN: | 3291 | case CPU_UP_CANCELED_FROZEN: |
3435 | case CPU_DEAD: | 3292 | case CPU_DEAD: |
3436 | case CPU_DEAD_FROZEN: | 3293 | case CPU_DEAD_FROZEN: |
3437 | down_read(&slub_lock); | 3294 | down_read(&slub_lock); |
3438 | list_for_each_entry(s, &slab_caches, list) { | 3295 | list_for_each_entry(s, &slab_caches, list) { |
3439 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3440 | |||
3441 | local_irq_save(flags); | 3296 | local_irq_save(flags); |
3442 | __flush_cpu_slab(s, cpu); | 3297 | __flush_cpu_slab(s, cpu); |
3443 | local_irq_restore(flags); | 3298 | local_irq_restore(flags); |
3444 | free_kmem_cache_cpu(c, cpu); | ||
3445 | s->cpu_slab[cpu] = NULL; | ||
3446 | } | 3299 | } |
3447 | up_read(&slub_lock); | 3300 | up_read(&slub_lock); |
3448 | break; | 3301 | break; |
@@ -3928,7 +3781,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3928 | int cpu; | 3781 | int cpu; |
3929 | 3782 | ||
3930 | for_each_possible_cpu(cpu) { | 3783 | for_each_possible_cpu(cpu) { |
3931 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3784 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
3932 | 3785 | ||
3933 | if (!c || c->node < 0) | 3786 | if (!c || c->node < 0) |
3934 | continue; | 3787 | continue; |
@@ -4171,6 +4024,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4171 | } | 4024 | } |
4172 | SLAB_ATTR(trace); | 4025 | SLAB_ATTR(trace); |
4173 | 4026 | ||
4027 | #ifdef CONFIG_FAILSLAB | ||
4028 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
4029 | { | ||
4030 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
4031 | } | ||
4032 | |||
4033 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
4034 | size_t length) | ||
4035 | { | ||
4036 | s->flags &= ~SLAB_FAILSLAB; | ||
4037 | if (buf[0] == '1') | ||
4038 | s->flags |= SLAB_FAILSLAB; | ||
4039 | return length; | ||
4040 | } | ||
4041 | SLAB_ATTR(failslab); | ||
4042 | #endif | ||
4043 | |||
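
With these two handlers in place, SLAB_FAILSLAB becomes a per-cache toggle: reading the attribute reports whether fault injection is enabled for the cache, and writing '1' sets the flag (any other input clears it). Under the existing SLAB_ATTR conventions this would presumably surface as /sys/kernel/slab/<cache>/failslab; that path is inferred from how the other slab attributes are exposed, not shown in this diff.
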
4174 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4044 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
4175 | { | 4045 | { |
4176 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4046 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
@@ -4353,7 +4223,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4353 | return -ENOMEM; | 4223 | return -ENOMEM; |
4354 | 4224 | ||
4355 | for_each_online_cpu(cpu) { | 4225 | for_each_online_cpu(cpu) { |
4356 | unsigned x = get_cpu_slab(s, cpu)->stat[si]; | 4226 | unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; |
4357 | 4227 | ||
4358 | data[cpu] = x; | 4228 | data[cpu] = x; |
4359 | sum += x; | 4229 | sum += x; |
@@ -4371,12 +4241,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4371 | return len + sprintf(buf + len, "\n"); | 4241 | return len + sprintf(buf + len, "\n"); |
4372 | } | 4242 | } |
4373 | 4243 | ||
4244 | static void clear_stat(struct kmem_cache *s, enum stat_item si) | ||
4245 | { | ||
4246 | int cpu; | ||
4247 | |||
4248 | for_each_online_cpu(cpu) | ||
4249 | per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; | ||
4250 | } | ||
4251 | |||
4374 | #define STAT_ATTR(si, text) \ | 4252 | #define STAT_ATTR(si, text) \ |
4375 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ | 4253 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ |
4376 | { \ | 4254 | { \ |
4377 | return show_stat(s, buf, si); \ | 4255 | return show_stat(s, buf, si); \ |
4378 | } \ | 4256 | } \ |
4379 | SLAB_ATTR_RO(text); \ | 4257 | static ssize_t text##_store(struct kmem_cache *s, \ |
4258 | const char *buf, size_t length) \ | ||
4259 | { \ | ||
4260 | if (buf[0] != '0') \ | ||
4261 | return -EINVAL; \ | ||
4262 | clear_stat(s, si); \ | ||
4263 | return length; \ | ||
4264 | } \ | ||
4265 | SLAB_ATTR(text); \ | ||
4380 | 4266 | ||
4381 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); | 4267 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); |
4382 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); | 4268 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); |
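
Previously STAT_ATTR generated read-only attributes; the added text##_store makes every statistic resettable. Writing '0' clears the counter on each online CPU via clear_stat(), and any other input is rejected with -EINVAL, so a benchmark run can presumably zero a counter such as alloc_fastpath between iterations (usage inferred from the attribute definitions above, not from this diff).
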
@@ -4451,6 +4337,10 @@ static struct attribute *slab_attrs[] = { | |||
4451 | &deactivate_remote_frees_attr.attr, | 4337 | &deactivate_remote_frees_attr.attr, |
4452 | &order_fallback_attr.attr, | 4338 | &order_fallback_attr.attr, |
4453 | #endif | 4339 | #endif |
4340 | #ifdef CONFIG_FAILSLAB | ||
4341 | &failslab_attr.attr, | ||
4342 | #endif | ||
4343 | |||
4454 | NULL | 4344 | NULL |
4455 | }; | 4345 | }; |
4456 | 4346 | ||
@@ -4503,7 +4393,7 @@ static void kmem_cache_release(struct kobject *kobj) | |||
4503 | kfree(s); | 4393 | kfree(s); |
4504 | } | 4394 | } |
4505 | 4395 | ||
4506 | static struct sysfs_ops slab_sysfs_ops = { | 4396 | static const struct sysfs_ops slab_sysfs_ops = { |
4507 | .show = slab_attr_show, | 4397 | .show = slab_attr_show, |
4508 | .store = slab_attr_store, | 4398 | .store = slab_attr_store, |
4509 | }; | 4399 | }; |
@@ -4522,7 +4412,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
4522 | return 0; | 4412 | return 0; |
4523 | } | 4413 | } |
4524 | 4414 | ||
4525 | static struct kset_uevent_ops slab_uevent_ops = { | 4415 | static const struct kset_uevent_ops slab_uevent_ops = { |
4526 | .filter = uevent_filter, | 4416 | .filter = uevent_filter, |
4527 | }; | 4417 | }; |
4528 | 4418 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..aa33fd67fa41 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/slab.h> | ||
25 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
26 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
27 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
@@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 41 | unsigned long align, |
41 | unsigned long goal) | 42 | unsigned long goal) |
42 | { | 43 | { |
43 | return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); | 44 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); |
44 | } | 45 | } |
45 | 46 | ||
47 | static void *vmemmap_buf; | ||
48 | static void *vmemmap_buf_end; | ||
46 | 49 | ||
47 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | 50 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) |
48 | { | 51 | { |
@@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
64 | __pa(MAX_DMA_ADDRESS)); | 67 | __pa(MAX_DMA_ADDRESS)); |
65 | } | 68 | } |
66 | 69 | ||
70 | /* callers must all request the same size during the early stage */ | ||
71 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | ||
72 | { | ||
73 | void *ptr; | ||
74 | |||
75 | if (!vmemmap_buf) | ||
76 | return vmemmap_alloc_block(size, node); | ||
77 | |||
78 | /* take it from the buf */ | ||
79 | ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); | ||
80 | if (ptr + size > vmemmap_buf_end) | ||
81 | return vmemmap_alloc_block(size, node); | ||
82 | |||
83 | vmemmap_buf = ptr + size; | ||
84 | |||
85 | return ptr; | ||
86 | } | ||
87 | |||
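
vmemmap_alloc_block_buf() above is a bump allocator over a region grabbed once per node: align the cursor up to the request size, check against the end, advance, and fall back to vmemmap_alloc_block() when the buffer is absent or exhausted. A userspace sketch, assuming (as the comment above requires) that every early request uses the same power-of-two size such as PAGE_SIZE:

#include <stdint.h>
#include <stdlib.h>

static char *buf;      /* vmemmap_buf     */
static char *buf_end;  /* vmemmap_buf_end */

/* hand out size-aligned chunks from the preallocated buffer; fall back to
 * the ordinary allocator once the buffer is exhausted (or was never set) */
static void *alloc_block_buf(size_t size)
{
    char *ptr;

    if (!buf)
        return malloc(size);

    /* ALIGN(buf, size): only correct because size is a power of two */
    ptr = (char *)(((uintptr_t)buf + size - 1) & ~(uintptr_t)(size - 1));
    if (ptr + size > buf_end)
        return malloc(size);

    buf = ptr + size;
    return ptr;
}
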
67 | void __meminit vmemmap_verify(pte_t *pte, int node, | 88 | void __meminit vmemmap_verify(pte_t *pte, int node, |
68 | unsigned long start, unsigned long end) | 89 | unsigned long start, unsigned long end) |
69 | { | 90 | { |
@@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
80 | pte_t *pte = pte_offset_kernel(pmd, addr); | 101 | pte_t *pte = pte_offset_kernel(pmd, addr); |
81 | if (pte_none(*pte)) { | 102 | if (pte_none(*pte)) { |
82 | pte_t entry; | 103 | pte_t entry; |
83 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 104 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); |
84 | if (!p) | 105 | if (!p) |
85 | return NULL; | 106 | return NULL; |
86 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 107 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
@@ -163,3 +184,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | |||
163 | 184 | ||
164 | return map; | 185 | return map; |
165 | } | 186 | } |
187 | |||
188 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
189 | unsigned long pnum_begin, | ||
190 | unsigned long pnum_end, | ||
191 | unsigned long map_count, int nodeid) | ||
192 | { | ||
193 | unsigned long pnum; | ||
194 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
195 | void *vmemmap_buf_start; | ||
196 | |||
197 | size = ALIGN(size, PMD_SIZE); | ||
198 | vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, | ||
199 | PMD_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
200 | |||
201 | if (vmemmap_buf_start) { | ||
202 | vmemmap_buf = vmemmap_buf_start; | ||
203 | vmemmap_buf_end = vmemmap_buf_start + size * map_count; | ||
204 | } | ||
205 | |||
206 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
207 | struct mem_section *ms; | ||
208 | |||
209 | if (!present_section_nr(pnum)) | ||
210 | continue; | ||
211 | |||
212 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
213 | if (map_map[pnum]) | ||
214 | continue; | ||
215 | ms = __nr_to_section(pnum); | ||
216 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
217 | "some memory will not be available.\n", __func__); | ||
218 | ms->section_mem_map = 0; | ||
219 | } | ||
220 | |||
221 | if (vmemmap_buf_start) { | ||
222 | /* need to free the leftover buf */ | ||
223 | #ifdef CONFIG_NO_BOOTMEM | ||
224 | free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); | ||
225 | if (vmemmap_buf_start < vmemmap_buf) { | ||
226 | char name[15]; | ||
227 | |||
228 | snprintf(name, sizeof(name), "MEMMAP %d", nodeid); | ||
229 | reserve_early_without_check(__pa(vmemmap_buf_start), | ||
230 | __pa(vmemmap_buf), name); | ||
231 | } | ||
232 | #else | ||
233 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | ||
234 | #endif | ||
235 | vmemmap_buf = NULL; | ||
236 | vmemmap_buf_end = NULL; | ||
237 | } | ||
238 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..dc0cc4d43ff3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -2,6 +2,7 @@ | |||
2 | * sparse memory mappings. | 2 | * sparse memory mappings. |
3 | */ | 3 | */ |
4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
5 | #include <linux/slab.h> | ||
5 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
6 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
7 | #include <linux/highmem.h> | 8 | #include <linux/highmem.h> |
@@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
271 | 272 | ||
272 | #ifdef CONFIG_MEMORY_HOTREMOVE | 273 | #ifdef CONFIG_MEMORY_HOTREMOVE |
273 | static unsigned long * __init | 274 | static unsigned long * __init |
274 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long count) | ||
275 | { | 277 | { |
276 | unsigned long section_nr; | 278 | unsigned long section_nr; |
277 | 279 | ||
@@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | |||
286 | * this problem. | 288 | * this problem. |
287 | */ | 289 | */ |
288 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 290 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); |
289 | return alloc_bootmem_section(usemap_size(), section_nr); | 291 | return alloc_bootmem_section(usemap_size() * count, section_nr); |
290 | } | 292 | } |
291 | 293 | ||
292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 294 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
329 | } | 331 | } |
330 | #else | 332 | #else |
331 | static unsigned long * __init | 333 | static unsigned long * __init |
332 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 334 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
335 | unsigned long count) | ||
333 | { | 336 | { |
334 | return NULL; | 337 | return NULL; |
335 | } | 338 | } |
@@ -339,27 +342,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
339 | } | 342 | } |
340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 343 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
341 | 344 | ||
342 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 345 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, |
346 | unsigned long pnum_begin, | ||
347 | unsigned long pnum_end, | ||
348 | unsigned long usemap_count, int nodeid) | ||
343 | { | 349 | { |
344 | unsigned long *usemap; | 350 | void *usemap; |
345 | struct mem_section *ms = __nr_to_section(pnum); | 351 | unsigned long pnum; |
346 | int nid = sparse_early_nid(ms); | 352 | int size = usemap_size(); |
347 | |||
348 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); | ||
349 | if (usemap) | ||
350 | return usemap; | ||
351 | 353 | ||
352 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | ||
353 | if (usemap) { | 356 | if (usemap) { |
354 | check_usemap_section_nr(nid, usemap); | 357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
355 | return usemap; | 358 | if (!present_section_nr(pnum)) |
359 | continue; | ||
360 | usemap_map[pnum] = usemap; | ||
361 | usemap += size; | ||
362 | } | ||
363 | return; | ||
356 | } | 364 | } |
357 | 365 | ||
358 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
359 | nid = 0; | 367 | if (usemap) { |
368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
369 | if (!present_section_nr(pnum)) | ||
370 | continue; | ||
371 | usemap_map[pnum] = usemap; | ||
372 | usemap += size; | ||
373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
374 | } | ||
375 | return; | ||
376 | } | ||
360 | 377 | ||
361 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | 378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
362 | return NULL; | ||
363 | } | 379 | } |
364 | 380 | ||
365 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -375,8 +391,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
375 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 391 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
376 | return map; | 392 | return map; |
377 | } | 393 | } |
394 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
395 | unsigned long pnum_begin, | ||
396 | unsigned long pnum_end, | ||
397 | unsigned long map_count, int nodeid) | ||
398 | { | ||
399 | void *map; | ||
400 | unsigned long pnum; | ||
401 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
402 | |||
403 | map = alloc_remap(nodeid, size * map_count); | ||
404 | if (map) { | ||
405 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
406 | if (!present_section_nr(pnum)) | ||
407 | continue; | ||
408 | map_map[pnum] = map; | ||
409 | map += size; | ||
410 | } | ||
411 | return; | ||
412 | } | ||
413 | |||
414 | size = PAGE_ALIGN(size); | ||
415 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | ||
416 | if (map) { | ||
417 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
418 | if (!present_section_nr(pnum)) | ||
419 | continue; | ||
420 | map_map[pnum] = map; | ||
421 | map += size; | ||
422 | } | ||
423 | return; | ||
424 | } | ||
425 | |||
426 | /* fallback */ | ||
427 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
428 | struct mem_section *ms; | ||
429 | |||
430 | if (!present_section_nr(pnum)) | ||
431 | continue; | ||
432 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
433 | if (map_map[pnum]) | ||
434 | continue; | ||
435 | ms = __nr_to_section(pnum); | ||
436 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
437 | "some memory will not be available.\n", __func__); | ||
438 | ms->section_mem_map = 0; | ||
439 | } | ||
440 | } | ||
378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 441 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
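
sparse_mem_maps_populate_node() above tries three strategies in order: a node-local alloc_remap() of one large region carved into per-section maps, then a page-aligned bootmem allocation sized for the whole run of sections, and only if both batched paths fail does it fall back to per-section sparse_mem_map_populate() calls, marking any section whose map cannot be backed as absent (section_mem_map = 0).
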
379 | 442 | ||
443 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
444 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | ||
445 | unsigned long pnum_begin, | ||
446 | unsigned long pnum_end, | ||
447 | unsigned long map_count, int nodeid) | ||
448 | { | ||
449 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | ||
450 | map_count, nodeid); | ||
451 | } | ||
452 | #else | ||
380 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 453 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
381 | { | 454 | { |
382 | struct page *map; | 455 | struct page *map; |
@@ -392,10 +465,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
392 | ms->section_mem_map = 0; | 465 | ms->section_mem_map = 0; |
393 | return NULL; | 466 | return NULL; |
394 | } | 467 | } |
468 | #endif | ||
395 | 469 | ||
396 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 470 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) |
397 | { | 471 | { |
398 | } | 472 | } |
473 | |||
399 | /* | 474 | /* |
400 | * Allocate the accumulated non-linear sections, allocate a mem_map | 475 | * Allocate the accumulated non-linear sections, allocate a mem_map |
401 | * for each and record the physical to section mapping. | 476 | * for each and record the physical to section mapping. |
@@ -407,6 +482,14 @@ void __init sparse_init(void) | |||
407 | unsigned long *usemap; | 482 | unsigned long *usemap; |
408 | unsigned long **usemap_map; | 483 | unsigned long **usemap_map; |
409 | int size; | 484 | int size; |
485 | int nodeid_begin = 0; | ||
486 | unsigned long pnum_begin = 0; | ||
487 | unsigned long usemap_count; | ||
488 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
489 | unsigned long map_count; | ||
490 | int size2; | ||
491 | struct page **map_map; | ||
492 | #endif | ||
410 | 493 | ||
411 | /* | 494 | /* |
412 | * map is using big page (aka 2M in x86 64 bit) | 495 | * map is using big page (aka 2M in x86 64 bit) |
@@ -425,10 +508,81 @@ void __init sparse_init(void) | |||
425 | panic("can not allocate usemap_map\n"); | 508 | panic("can not allocate usemap_map\n"); |
426 | 509 | ||
427 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 510 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
511 | struct mem_section *ms; | ||
512 | |||
428 | if (!present_section_nr(pnum)) | 513 | if (!present_section_nr(pnum)) |
429 | continue; | 514 | continue; |
430 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | 515 | ms = __nr_to_section(pnum); |
516 | nodeid_begin = sparse_early_nid(ms); | ||
517 | pnum_begin = pnum; | ||
518 | break; | ||
431 | } | 519 | } |
520 | usemap_count = 1; | ||
521 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
522 | struct mem_section *ms; | ||
523 | int nodeid; | ||
524 | |||
525 | if (!present_section_nr(pnum)) | ||
526 | continue; | ||
527 | ms = __nr_to_section(pnum); | ||
528 | nodeid = sparse_early_nid(ms); | ||
529 | if (nodeid == nodeid_begin) { | ||
530 | usemap_count++; | ||
531 | continue; | ||
532 | } | ||
533 | /* ok, we need to take care of pnum_begin to pnum - 1 */ | ||
534 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
535 | usemap_count, nodeid_begin); | ||
536 | /* new start, update count etc. */ | ||
537 | nodeid_begin = nodeid; | ||
538 | pnum_begin = pnum; | ||
539 | usemap_count = 1; | ||
540 | } | ||
541 | /* ok, last chunk */ | ||
542 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
543 | usemap_count, nodeid_begin); | ||
544 | |||
545 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
546 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | ||
547 | map_map = alloc_bootmem(size2); | ||
548 | if (!map_map) | ||
549 | panic("can not allocate map_map\n"); | ||
550 | |||
551 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
552 | struct mem_section *ms; | ||
553 | |||
554 | if (!present_section_nr(pnum)) | ||
555 | continue; | ||
556 | ms = __nr_to_section(pnum); | ||
557 | nodeid_begin = sparse_early_nid(ms); | ||
558 | pnum_begin = pnum; | ||
559 | break; | ||
560 | } | ||
561 | map_count = 1; | ||
562 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
563 | struct mem_section *ms; | ||
564 | int nodeid; | ||
565 | |||
566 | if (!present_section_nr(pnum)) | ||
567 | continue; | ||
568 | ms = __nr_to_section(pnum); | ||
569 | nodeid = sparse_early_nid(ms); | ||
570 | if (nodeid == nodeid_begin) { | ||
571 | map_count++; | ||
572 | continue; | ||
573 | } | ||
574 | /* ok, we need to take care of pnum_begin to pnum - 1 */ | ||
575 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
576 | map_count, nodeid_begin); | ||
577 | /* new start, update count etc. */ | ||
578 | nodeid_begin = nodeid; | ||
579 | pnum_begin = pnum; | ||
580 | map_count = 1; | ||
581 | } | ||
582 | /* ok, last chunk */ | ||
583 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
584 | map_count, nodeid_begin); | ||
585 | #endif | ||
432 | 586 | ||
433 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 587 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
434 | if (!present_section_nr(pnum)) | 588 | if (!present_section_nr(pnum)) |
@@ -438,7 +592,11 @@ void __init sparse_init(void) | |||
438 | if (!usemap) | 592 | if (!usemap) |
439 | continue; | 593 | continue; |
440 | 594 | ||
595 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
596 | map = map_map[pnum]; | ||
597 | #else | ||
441 | map = sparse_early_mem_map_alloc(pnum); | 598 | map = sparse_early_mem_map_alloc(pnum); |
599 | #endif | ||
442 | if (!map) | 600 | if (!map) |
443 | continue; | 601 | continue; |
444 | 602 | ||
@@ -448,6 +606,9 @@ void __init sparse_init(void) | |||
448 | 606 | ||
449 | vmemmap_populate_print_last(); | 607 | vmemmap_populate_print_last(); |
450 | 608 | ||
609 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
610 | free_bootmem(__pa(map_map), size2); | ||
611 | #endif | ||
451 | free_bootmem(__pa(usemap_map), size); | 612 | free_bootmem(__pa(usemap_map), size); |
452 | } | 613 | } |
453 | 614 | ||
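
Both new passes in sparse_init() above use the same run-grouping walk: find the first present section, extend the run while the node id matches, flush a batch allocation when it changes, and flush the final run after the loop. A self-contained sketch of that pattern (toy section/node tables, printf in place of the batch allocators):

#include <stdio.h>

#define NR_SECTIONS 16

static int present[NR_SECTIONS] = {1,1,1,0,1,1,1,1,0,0,1,1,0,1,1,1};
static int nid[NR_SECTIONS]     = {0,0,0,0,0,1,1,1,1,1,1,1,1,1,2,2};

static void alloc_node_batch(int begin, int end, int count, int node)
{
    printf("node %d: %d present sections in [%d, %d)\n",
           node, count, begin, end);
}

int main(void)
{
    int pnum, begin = -1, node = -1, count = 0;

    for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
        if (!present[pnum])
            continue;
        if (begin < 0) {            /* first present section */
            begin = pnum; node = nid[pnum]; count = 1;
        } else if (nid[pnum] == node) {
            count++;                /* same node: extend the run */
        } else {                    /* node changed: flush the run */
            alloc_node_batch(begin, pnum, count, node);
            begin = pnum; node = nid[pnum]; count = 1;
        }
    }
    if (begin >= 0)                 /* last chunk, as in the kernel loop */
        alloc_node_batch(begin, NR_SECTIONS, count, node);
    return 0;
}
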
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
33 | #include <linux/gfp.h> | ||
33 | 34 | ||
34 | #include "internal.h" | 35 | #include "internal.h" |
35 | 36 | ||
@@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page) | |||
55 | del_page_from_lru(zone, page); | 56 | del_page_from_lru(zone, page); |
56 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
57 | } | 58 | } |
58 | free_hot_page(page); | 59 | free_hot_cold_page(page, 0); |
59 | } | 60 | } |
60 | 61 | ||
61 | static void put_compound_page(struct page *page) | 62 | static void put_compound_page(struct page *page) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d1daeb1cb4a..e10f5833167f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/gfp.h> | ||
11 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
13 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/ksm.h> | ||
25 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 27 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
@@ -35,11 +36,15 @@ | |||
35 | #include <linux/swapops.h> | 36 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 37 | #include <linux/page_cgroup.h> |
37 | 38 | ||
39 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | ||
40 | unsigned char); | ||
41 | static void free_swap_count_continuations(struct swap_info_struct *); | ||
42 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | ||
43 | |||
38 | static DEFINE_SPINLOCK(swap_lock); | 44 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 45 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 46 | long nr_swap_pages; |
41 | long total_swap_pages; | 47 | long total_swap_pages; |
42 | static int swap_overflow; | ||
43 | static int least_priority; | 48 | static int least_priority; |
44 | 49 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 50 | static const char Bad_file[] = "Bad swap file entry "; |
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
49 | 54 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 55 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 56 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 57 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
53 | 58 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 59 | static DEFINE_MUTEX(swapon_mutex); |
55 | 60 | ||
56 | /* For reference count accounting in swap_map */ | 61 | static inline unsigned char swap_count(unsigned char ent) |
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | 62 | { |
75 | unsigned short ret = count; | 63 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | 64 | } |
81 | 65 | ||
82 | /* returnes 1 if swap entry is freed */ | 66 | /* returns 1 if swap entry is freed */ |
83 | static int | 67 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 68 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 69 | { |
86 | int type = si - swap_info; | 70 | swp_entry_t entry = swp_entry(si->type, offset); |
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | 71 | struct page *page; |
89 | int ret = 0; | 72 | int ret = 0; |
90 | 73 | ||
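
The one-byte swap_map encoding that replaces encode_swapmap()/swap_has_cache() above packs a reference count and two flag bits into an unsigned char: one bit for a swapcache reference and one marking a continued (overflowed) count. A small sketch of the decode step; the flag values mirror what include/linux/swap.h defines in this series, but treat them as illustrative:

#include <stdio.h>

#define SWAP_HAS_CACHE  0x40  /* entry also referenced by the swap cache */
#define COUNT_CONTINUED 0x80  /* count continued in an overflow page */

static unsigned char swap_count(unsigned char ent)
{
    return ent & ~SWAP_HAS_CACHE; /* may still include COUNT_CONTINUED */
}

int main(void)
{
    unsigned char ent = 3 | SWAP_HAS_CACHE; /* 3 users + a cache reference */

    printf("count=%u has_cache=%d continued=%d\n",
           swap_count(ent) & ~COUNT_CONTINUED,
           !!(ent & SWAP_HAS_CACHE),
           !!(ent & COUNT_CONTINUED));
    return 0;
}
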
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
120 | down_read(&swap_unplug_sem); | 103 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 104 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 105 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 106 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; |
124 | struct backing_dev_info *bdi; | 107 | struct backing_dev_info *bdi; |
125 | 108 | ||
126 | /* | 109 | /* |
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
146 | static int discard_swap(struct swap_info_struct *si) | 129 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 130 | { |
148 | struct swap_extent *se; | 131 | struct swap_extent *se; |
132 | sector_t start_block; | ||
133 | sector_t nr_blocks; | ||
149 | int err = 0; | 134 | int err = 0; |
150 | 135 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 136 | /* Do not discard the swap header page! */ |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 137 | se = &si->first_swap_extent; |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 138 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | ||
140 | if (nr_blocks) { | ||
141 | err = blkdev_issue_discard(si->bdev, start_block, | ||
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | ||
143 | if (err) | ||
144 | return err; | ||
145 | cond_resched(); | ||
146 | } | ||
154 | 147 | ||
155 | if (se->start_page == 0) { | 148 | list_for_each_entry(se, &si->first_swap_extent.list, list) { |
156 | /* Do not discard the swap header page! */ | 149 | start_block = se->start_block << (PAGE_SHIFT - 9); |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
159 | if (!nr_blocks) | ||
160 | continue; | ||
161 | } | ||
162 | 151 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 152 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL, | 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); |
165 | DISCARD_FL_BARRIER); | ||
166 | if (err) | 154 | if (err) |
167 | break; | 155 | break; |
168 | 156 | ||
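
The PAGE_SHIFT - 9 shifts convert page numbers into 512-byte sectors; with 4 KiB pages that is a shift by 3, so skipping the swap header page advances start_block by one page (8 sectors) and shortens the first extent by the same amount, which is why nr_blocks can legitimately be zero when the first extent is a single page. (The 4 KiB figure assumes a typical x86 configuration.)
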
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
201 | start_block <<= PAGE_SHIFT - 9; | 189 | start_block <<= PAGE_SHIFT - 9; |
202 | nr_blocks <<= PAGE_SHIFT - 9; | 190 | nr_blocks <<= PAGE_SHIFT - 9; |
203 | if (blkdev_issue_discard(si->bdev, start_block, | 191 | if (blkdev_issue_discard(si->bdev, start_block, |
204 | nr_blocks, GFP_NOIO, | 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) |
205 | DISCARD_FL_BARRIER)) | ||
206 | break; | 193 | break; |
207 | } | 194 | } |
208 | 195 | ||
209 | lh = se->list.next; | 196 | lh = se->list.next; |
210 | if (lh == &si->extent_list) | ||
211 | lh = lh->next; | ||
212 | se = list_entry(lh, struct swap_extent, list); | 197 | se = list_entry(lh, struct swap_extent, list); |
213 | } | 198 | } |
214 | } | 199 | } |
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word) | |||
223 | #define LATENCY_LIMIT 256 | 208 | #define LATENCY_LIMIT 256 |
224 | 209 | ||
225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 210 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
226 | int cache) | 211 | unsigned char usage) |
227 | { | 212 | { |
228 | unsigned long offset; | 213 | unsigned long offset; |
229 | unsigned long scan_base; | 214 | unsigned long scan_base; |
@@ -354,10 +339,7 @@ checks: | |||
354 | si->lowest_bit = si->max; | 339 | si->lowest_bit = si->max; |
355 | si->highest_bit = 0; | 340 | si->highest_bit = 0; |
356 | } | 341 | } |
357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 342 | si->swap_map[offset] = usage; |
358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
359 | else /* at suspend */ | ||
360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
361 | si->cluster_next = offset + 1; | 343 | si->cluster_next = offset + 1; |
362 | si->flags -= SWP_SCANNING; | 344 | si->flags -= SWP_SCANNING; |
363 | 345 | ||
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) | |||
467 | nr_swap_pages--; | 449 | nr_swap_pages--; |
468 | 450 | ||
469 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 451 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
470 | si = swap_info + type; | 452 | si = swap_info[type]; |
471 | next = si->next; | 453 | next = si->next; |
472 | if (next < 0 || | 454 | if (next < 0 || |
473 | (!wrapped && si->prio != swap_info[next].prio)) { | 455 | (!wrapped && si->prio != swap_info[next]->prio)) { |
474 | next = swap_list.head; | 456 | next = swap_list.head; |
475 | wrapped++; | 457 | wrapped++; |
476 | } | 458 | } |
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) | |||
482 | 464 | ||
483 | swap_list.next = next; | 465 | swap_list.next = next; |
484 | /* This is called for allocating swap entry for cache */ | 466 | /* This is called for allocating swap entry for cache */ |
485 | offset = scan_swap_map(si, SWAP_CACHE); | 467 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
486 | if (offset) { | 468 | if (offset) { |
487 | spin_unlock(&swap_lock); | 469 | spin_unlock(&swap_lock); |
488 | return swp_entry(type, offset); | 470 | return swp_entry(type, offset); |
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) | |||
503 | pgoff_t offset; | 485 | pgoff_t offset; |
504 | 486 | ||
505 | spin_lock(&swap_lock); | 487 | spin_lock(&swap_lock); |
506 | si = swap_info + type; | 488 | si = swap_info[type]; |
507 | if (si->flags & SWP_WRITEOK) { | 489 | if (si && (si->flags & SWP_WRITEOK)) { |
508 | nr_swap_pages--; | 490 | nr_swap_pages--; |
509 | /* This is called for allocating swap entry, not cache */ | 491 | /* This is called for allocating swap entry, not cache */ |
510 | offset = scan_swap_map(si, SWAP_MAP); | 492 | offset = scan_swap_map(si, 1); |
511 | if (offset) { | 493 | if (offset) { |
512 | spin_unlock(&swap_lock); | 494 | spin_unlock(&swap_lock); |
513 | return swp_entry(type, offset); | 495 | return swp_entry(type, offset); |
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) | |||
518 | return (swp_entry_t) {0}; | 500 | return (swp_entry_t) {0}; |
519 | } | 501 | } |
520 | 502 | ||
521 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 503 | static struct swap_info_struct *swap_info_get(swp_entry_t entry) |
522 | { | 504 | { |
523 | struct swap_info_struct * p; | 505 | struct swap_info_struct *p; |
524 | unsigned long offset, type; | 506 | unsigned long offset, type; |
525 | 507 | ||
526 | if (!entry.val) | 508 | if (!entry.val) |
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) | |||
528 | type = swp_type(entry); | 510 | type = swp_type(entry); |
529 | if (type >= nr_swapfiles) | 511 | if (type >= nr_swapfiles) |
530 | goto bad_nofile; | 512 | goto bad_nofile; |
531 | p = & swap_info[type]; | 513 | p = swap_info[type]; |
532 | if (!(p->flags & SWP_USED)) | 514 | if (!(p->flags & SWP_USED)) |
533 | goto bad_device; | 515 | goto bad_device; |
534 | offset = swp_offset(entry); | 516 | offset = swp_offset(entry); |
@@ -554,41 +536,56 @@ out: | |||
554 | return NULL; | 536 | return NULL; |
555 | } | 537 | } |
556 | 538 | ||
557 | static int swap_entry_free(struct swap_info_struct *p, | 539 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
558 | swp_entry_t ent, int cache) | 540 | swp_entry_t entry, unsigned char usage) |
559 | { | 541 | { |
560 | unsigned long offset = swp_offset(ent); | 542 | unsigned long offset = swp_offset(entry); |
561 | int count = swap_count(p->swap_map[offset]); | 543 | unsigned char count; |
562 | bool has_cache; | 544 | unsigned char has_cache; |
563 | 545 | ||
564 | has_cache = swap_has_cache(p->swap_map[offset]); | 546 | count = p->swap_map[offset]; |
547 | has_cache = count & SWAP_HAS_CACHE; | ||
548 | count &= ~SWAP_HAS_CACHE; | ||
565 | 549 | ||
566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 550 | if (usage == SWAP_HAS_CACHE) { |
567 | if (count < SWAP_MAP_MAX) { | ||
568 | count--; | ||
569 | p->swap_map[offset] = encode_swapmap(count, has_cache); | ||
570 | } | ||
571 | } else { /* dropping swap cache flag */ | ||
572 | VM_BUG_ON(!has_cache); | 551 | VM_BUG_ON(!has_cache); |
573 | p->swap_map[offset] = encode_swapmap(count, false); | 552 | has_cache = 0; |
574 | 553 | } else if (count == SWAP_MAP_SHMEM) { | |
554 | /* | ||
555 | * Or we could insist on shmem.c using a special | ||
556 | * swap_shmem_free() and free_shmem_swap_and_cache()... | ||
557 | */ | ||
558 | count = 0; | ||
559 | } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { | ||
560 | if (count == COUNT_CONTINUED) { | ||
561 | if (swap_count_continued(p, offset, count)) | ||
562 | count = SWAP_MAP_MAX | COUNT_CONTINUED; | ||
563 | else | ||
564 | count = SWAP_MAP_MAX; | ||
565 | } else | ||
566 | count--; | ||
575 | } | 567 | } |
576 | /* return code. */ | 568 | |
577 | count = p->swap_map[offset]; | 569 | if (!count) |
570 | mem_cgroup_uncharge_swap(entry); | ||
571 | |||
572 | usage = count | has_cache; | ||
573 | p->swap_map[offset] = usage; | ||
574 | |||
578 | /* free if no reference */ | 575 | /* free if no reference */ |
579 | if (!count) { | 576 | if (!usage) { |
580 | if (offset < p->lowest_bit) | 577 | if (offset < p->lowest_bit) |
581 | p->lowest_bit = offset; | 578 | p->lowest_bit = offset; |
582 | if (offset > p->highest_bit) | 579 | if (offset > p->highest_bit) |
583 | p->highest_bit = offset; | 580 | p->highest_bit = offset; |
584 | if (p->prio > swap_info[swap_list.next].prio) | 581 | if (swap_list.next >= 0 && |
585 | swap_list.next = p - swap_info; | 582 | p->prio > swap_info[swap_list.next]->prio) |
583 | swap_list.next = p->type; | ||
586 | nr_swap_pages++; | 584 | nr_swap_pages++; |
587 | p->inuse_pages--; | 585 | p->inuse_pages--; |
588 | } | 586 | } |
589 | if (!swap_count(count)) | 587 | |
590 | mem_cgroup_uncharge_swap(ent); | 588 | return usage; |
591 | return count; | ||
592 | } | 589 | } |
593 | 590 | ||
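
Note the new return convention: swap_entry_free() now hands back the combined usage byte (remaining count plus the SWAP_HAS_CACHE bit), which is what lets free_swap_and_cache() below compare the result against SWAP_HAS_CACHE directly instead of decoding a separate has_cache flag.
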
594 | /* | 591 | /* |
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, | |||
597 | */ | 594 | */ |
598 | void swap_free(swp_entry_t entry) | 595 | void swap_free(swp_entry_t entry) |
599 | { | 596 | { |
600 | struct swap_info_struct * p; | 597 | struct swap_info_struct *p; |
601 | 598 | ||
602 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
603 | if (p) { | 600 | if (p) { |
604 | swap_entry_free(p, entry, SWAP_MAP); | 601 | swap_entry_free(p, entry, 1); |
605 | spin_unlock(&swap_lock); | 602 | spin_unlock(&swap_lock); |
606 | } | 603 | } |
607 | } | 604 | } |
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) | |||
612 | void swapcache_free(swp_entry_t entry, struct page *page) | 609 | void swapcache_free(swp_entry_t entry, struct page *page) |
613 | { | 610 | { |
614 | struct swap_info_struct *p; | 611 | struct swap_info_struct *p; |
615 | int ret; | 612 | unsigned char count; |
616 | 613 | ||
617 | p = swap_info_get(entry); | 614 | p = swap_info_get(entry); |
618 | if (p) { | 615 | if (p) { |
619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 616 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
620 | if (page) { | 617 | if (page) |
621 | bool swapout; | 618 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
622 | if (ret) | ||
623 | swapout = true; /* the end of swap out */ | ||
624 | else | ||
625 | swapout = false; /* no more swap users! */ | ||
626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
627 | } | ||
628 | spin_unlock(&swap_lock); | 619 | spin_unlock(&swap_lock); |
629 | } | 620 | } |
630 | return; | ||
631 | } | 621 | } |
632 | 622 | ||
633 | /* | 623 | /* |
634 | * How many references to page are currently swapped out? | 624 | * How many references to page are currently swapped out? |
625 | * This does not give an exact answer when swap count is continued, | ||
626 | * but does include the high COUNT_CONTINUED flag to allow for that. | ||
635 | */ | 627 | */ |
636 | static inline int page_swapcount(struct page *page) | 628 | static inline int page_swapcount(struct page *page) |
637 | { | 629 | { |
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) | |||
659 | int count; | 651 | int count; |
660 | 652 | ||
661 | VM_BUG_ON(!PageLocked(page)); | 653 | VM_BUG_ON(!PageLocked(page)); |
654 | if (unlikely(PageKsm(page))) | ||
655 | return 0; | ||
662 | count = page_mapcount(page); | 656 | count = page_mapcount(page); |
663 | if (count <= 1 && PageSwapCache(page)) { | 657 | if (count <= 1 && PageSwapCache(page)) { |
664 | count += page_swapcount(page); | 658 | count += page_swapcount(page); |
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) | |||
667 | SetPageDirty(page); | 661 | SetPageDirty(page); |
668 | } | 662 | } |
669 | } | 663 | } |
670 | return count == 1; | 664 | return count <= 1; |
671 | } | 665 | } |
672 | 666 | ||
673 | /* | 667 | /* |
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
704 | 698 | ||
705 | p = swap_info_get(entry); | 699 | p = swap_info_get(entry); |
706 | if (p) { | 700 | if (p) { |
707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
708 | page = find_get_page(&swapper_space, entry.val); | 702 | page = find_get_page(&swapper_space, entry.val); |
709 | if (page && !trylock_page(page)) { | 703 | if (page && !trylock_page(page)) { |
710 | page_cache_release(page); | 704 | page_cache_release(page); |
@@ -729,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
729 | return p != NULL; | 723 | return p != NULL; |
730 | } | 724 | } |
731 | 725 | ||
726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
727 | /** | ||
728 | * mem_cgroup_count_swap_user - count the users of a swap entry | ||
729 | * @ent: the swap entry to be checked | ||
730 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
731 | * | ||
732 | * Returns the number of users of the swap entry. The number is valid only | ||
733 | * for swaps of anonymous pages. | ||
734 | * If the entry is found in the swap cache, the page is stored in *pagep with | ||
735 | * its refcount incremented. | ||
736 | */ | ||
737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
738 | { | ||
739 | struct page *page; | ||
740 | struct swap_info_struct *p; | ||
741 | int count = 0; | ||
742 | |||
743 | page = find_get_page(&swapper_space, ent.val); | ||
744 | if (page) | ||
745 | count += page_mapcount(page); | ||
746 | p = swap_info_get(ent); | ||
747 | if (p) { | ||
748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
749 | spin_unlock(&swap_lock); | ||
750 | } | ||
751 | |||
752 | *pagep = page; | ||
753 | return count; | ||
754 | } | ||
755 | #endif | ||
756 | |||
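A usage sketch for the helper above (assumption: modeled on the memcg charge-moving code rather than copied from it):

	struct page *page;
	int count;

	count = mem_cgroup_count_swap_user(ent, &page);
	if (count == 1) {
		/* sole user of the entry: safe to move its charge */
	}
	if (page)
		page_cache_release(page);	/* drop the find_get_page() ref */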
732 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
733 | /* | 758 | /* |
734 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |
@@ -741,14 +766,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
741 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 766 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
742 | { | 767 | { |
743 | struct block_device *bdev = NULL; | 768 | struct block_device *bdev = NULL; |
744 | int i; | 769 | int type; |
745 | 770 | ||
746 | if (device) | 771 | if (device) |
747 | bdev = bdget(device); | 772 | bdev = bdget(device); |
748 | 773 | ||
749 | spin_lock(&swap_lock); | 774 | spin_lock(&swap_lock); |
750 | for (i = 0; i < nr_swapfiles; i++) { | 775 | for (type = 0; type < nr_swapfiles; type++) { |
751 | struct swap_info_struct *sis = swap_info + i; | 776 | struct swap_info_struct *sis = swap_info[type]; |
752 | 777 | ||
753 | if (!(sis->flags & SWP_WRITEOK)) | 778 | if (!(sis->flags & SWP_WRITEOK)) |
754 | continue; | 779 | continue; |
@@ -758,20 +783,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
758 | *bdev_p = bdgrab(sis->bdev); | 783 | *bdev_p = bdgrab(sis->bdev); |
759 | 784 | ||
760 | spin_unlock(&swap_lock); | 785 | spin_unlock(&swap_lock); |
761 | return i; | 786 | return type; |
762 | } | 787 | } |
763 | if (bdev == sis->bdev) { | 788 | if (bdev == sis->bdev) { |
764 | struct swap_extent *se; | 789 | struct swap_extent *se = &sis->first_swap_extent; |
765 | 790 | ||
766 | se = list_entry(sis->extent_list.next, | ||
767 | struct swap_extent, list); | ||
768 | if (se->start_block == offset) { | 791 | if (se->start_block == offset) { |
769 | if (bdev_p) | 792 | if (bdev_p) |
770 | *bdev_p = bdgrab(sis->bdev); | 793 | *bdev_p = bdgrab(sis->bdev); |
771 | 794 | ||
772 | spin_unlock(&swap_lock); | 795 | spin_unlock(&swap_lock); |
773 | bdput(bdev); | 796 | bdput(bdev); |
774 | return i; | 797 | return type; |
775 | } | 798 | } |
776 | } | 799 | } |
777 | } | 800 | } |
@@ -783,6 +806,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
783 | } | 806 | } |
784 | 807 | ||
785 | /* | 808 | /* |
809 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
810 | * corresponding to given index in swap_info (swap type). | ||
811 | */ | ||
812 | sector_t swapdev_block(int type, pgoff_t offset) | ||
813 | { | ||
814 | struct block_device *bdev; | ||
815 | |||
816 | if ((unsigned int)type >= nr_swapfiles) | ||
817 | return 0; | ||
818 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
819 | return 0; | ||
820 | return map_swap_entry(swp_entry(type, offset), &bdev); | ||
821 | } | ||
822 | |||
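swapdev_block() is consumed by the hibernation code; a minimal sketch of that pattern (assumption: simplified from the kernel/power/swap.c usage):

	sector_t block = swapdev_block(type, offset);
	if (!block)
		return -ENOSPC;	/* type out of range, or not SWP_WRITEOK */
	/* block is a PAGE_SIZE-unit offset into the swap bdev */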
823 | /* | ||
786 | * Return either the total number of swap pages of given type, or the number | 824 | * Return either the total number of swap pages of given type, or the number |
787 | * of free pages of that type (depending on @free) | 825 | * of free pages of that type (depending on @free) |
788 | * | 826 | * |
@@ -792,18 +830,20 @@ unsigned int count_swap_pages(int type, int free) | |||
792 | { | 830 | { |
793 | unsigned int n = 0; | 831 | unsigned int n = 0; |
794 | 832 | ||
795 | if (type < nr_swapfiles) { | 833 | spin_lock(&swap_lock); |
796 | spin_lock(&swap_lock); | 834 | if ((unsigned int)type < nr_swapfiles) { |
797 | if (swap_info[type].flags & SWP_WRITEOK) { | 835 | struct swap_info_struct *sis = swap_info[type]; |
798 | n = swap_info[type].pages; | 836 | |
837 | if (sis->flags & SWP_WRITEOK) { | ||
838 | n = sis->pages; | ||
799 | if (free) | 839 | if (free) |
800 | n -= swap_info[type].inuse_pages; | 840 | n -= sis->inuse_pages; |
801 | } | 841 | } |
802 | spin_unlock(&swap_lock); | ||
803 | } | 842 | } |
843 | spin_unlock(&swap_lock); | ||
804 | return n; | 844 | return n; |
805 | } | 845 | } |
806 | #endif | 846 | #endif /* CONFIG_HIBERNATION */ |
807 | 847 | ||
808 | /* | 848 | /* |
809 | * No need to decide whether this PTE shares the swap entry with others, | 849 | * No need to decide whether this PTE shares the swap entry with others, |
@@ -831,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
831 | goto out; | 871 | goto out; |
832 | } | 872 | } |
833 | 873 | ||
834 | inc_mm_counter(vma->vm_mm, anon_rss); | 874 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
875 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
835 | get_page(page); | 876 | get_page(page); |
836 | set_pte_at(vma->vm_mm, addr, pte, | 877 | set_pte_at(vma->vm_mm, addr, pte, |
837 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 878 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
@@ -932,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
932 | unsigned long addr, end, next; | 973 | unsigned long addr, end, next; |
933 | int ret; | 974 | int ret; |
934 | 975 | ||
935 | if (page->mapping) { | 976 | if (page_anon_vma(page)) { |
936 | addr = page_address_in_vma(page, vma); | 977 | addr = page_address_in_vma(page, vma); |
937 | if (addr == -EFAULT) | 978 | if (addr == -EFAULT) |
938 | return 0; | 979 | return 0; |
@@ -988,7 +1029,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
988 | { | 1029 | { |
989 | unsigned int max = si->max; | 1030 | unsigned int max = si->max; |
990 | unsigned int i = prev; | 1031 | unsigned int i = prev; |
991 | int count; | 1032 | unsigned char count; |
992 | 1033 | ||
993 | /* | 1034 | /* |
994 | * No need for swap_lock here: we're just looking | 1035 | * No need for swap_lock here: we're just looking |
@@ -1024,16 +1065,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1024 | */ | 1065 | */ |
1025 | static int try_to_unuse(unsigned int type) | 1066 | static int try_to_unuse(unsigned int type) |
1026 | { | 1067 | { |
1027 | struct swap_info_struct * si = &swap_info[type]; | 1068 | struct swap_info_struct *si = swap_info[type]; |
1028 | struct mm_struct *start_mm; | 1069 | struct mm_struct *start_mm; |
1029 | unsigned short *swap_map; | 1070 | unsigned char *swap_map; |
1030 | unsigned short swcount; | 1071 | unsigned char swcount; |
1031 | struct page *page; | 1072 | struct page *page; |
1032 | swp_entry_t entry; | 1073 | swp_entry_t entry; |
1033 | unsigned int i = 0; | 1074 | unsigned int i = 0; |
1034 | int retval = 0; | 1075 | int retval = 0; |
1035 | int reset_overflow = 0; | ||
1036 | int shmem; | ||
1037 | 1076 | ||
1038 | /* | 1077 | /* |
1039 | * When searching mms for an entry, a good strategy is to | 1078 | * When searching mms for an entry, a good strategy is to |
@@ -1047,8 +1086,7 @@ static int try_to_unuse(unsigned int type) | |||
1047 | * together, child after parent. If we race with dup_mmap(), we | 1086 | * together, child after parent. If we race with dup_mmap(), we |
1048 | * prefer to resolve parent before child, lest we miss entries | 1087 | * prefer to resolve parent before child, lest we miss entries |
1049 | * duplicated after we scanned child: using last mm would invert | 1088 | * duplicated after we scanned child: using last mm would invert |
1050 | * that. Though it's only a serious concern when an overflowed | 1089 | * that. |
1051 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
1052 | */ | 1090 | */ |
1053 | start_mm = &init_mm; | 1091 | start_mm = &init_mm; |
1054 | atomic_inc(&init_mm.mm_users); | 1092 | atomic_inc(&init_mm.mm_users); |
@@ -1110,17 +1148,18 @@ static int try_to_unuse(unsigned int type) | |||
1110 | 1148 | ||
1111 | /* | 1149 | /* |
1112 | * Remove all references to entry. | 1150 | * Remove all references to entry. |
1113 | * Whenever we reach init_mm, there's no address space | ||
1114 | * to search, but use it as a reminder to search shmem. | ||
1115 | */ | 1151 | */ |
1116 | shmem = 0; | ||
1117 | swcount = *swap_map; | 1152 | swcount = *swap_map; |
1118 | if (swap_count(swcount)) { | 1153 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
1119 | if (start_mm == &init_mm) | 1154 | retval = shmem_unuse(entry, page); |
1120 | shmem = shmem_unuse(entry, page); | 1155 | /* page has already been unlocked and released */ |
1121 | else | 1156 | if (retval < 0) |
1122 | retval = unuse_mm(start_mm, entry, page); | 1157 | break; |
1158 | continue; | ||
1123 | } | 1159 | } |
1160 | if (swap_count(swcount) && start_mm != &init_mm) | ||
1161 | retval = unuse_mm(start_mm, entry, page); | ||
1162 | |||
1124 | if (swap_count(*swap_map)) { | 1163 | if (swap_count(*swap_map)) { |
1125 | int set_start_mm = (*swap_map >= swcount); | 1164 | int set_start_mm = (*swap_map >= swcount); |
1126 | struct list_head *p = &start_mm->mmlist; | 1165 | struct list_head *p = &start_mm->mmlist; |
@@ -1131,7 +1170,7 @@ static int try_to_unuse(unsigned int type) | |||
1131 | atomic_inc(&new_start_mm->mm_users); | 1170 | atomic_inc(&new_start_mm->mm_users); |
1132 | atomic_inc(&prev_mm->mm_users); | 1171 | atomic_inc(&prev_mm->mm_users); |
1133 | spin_lock(&mmlist_lock); | 1172 | spin_lock(&mmlist_lock); |
1134 | while (swap_count(*swap_map) && !retval && !shmem && | 1173 | while (swap_count(*swap_map) && !retval && |
1135 | (p = p->next) != &start_mm->mmlist) { | 1174 | (p = p->next) != &start_mm->mmlist) { |
1136 | mm = list_entry(p, struct mm_struct, mmlist); | 1175 | mm = list_entry(p, struct mm_struct, mmlist); |
1137 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1176 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1145,10 +1184,9 @@ static int try_to_unuse(unsigned int type) | |||
1145 | swcount = *swap_map; | 1184 | swcount = *swap_map; |
1146 | if (!swap_count(swcount)) /* any usage ? */ | 1185 | if (!swap_count(swcount)) /* any usage ? */ |
1147 | ; | 1186 | ; |
1148 | else if (mm == &init_mm) { | 1187 | else if (mm == &init_mm) |
1149 | set_start_mm = 1; | 1188 | set_start_mm = 1; |
1150 | shmem = shmem_unuse(entry, page); | 1189 | else |
1151 | } else | ||
1152 | retval = unuse_mm(mm, entry, page); | 1190 | retval = unuse_mm(mm, entry, page); |
1153 | 1191 | ||
1154 | if (set_start_mm && *swap_map < swcount) { | 1192 | if (set_start_mm && *swap_map < swcount) { |
@@ -1164,13 +1202,6 @@ static int try_to_unuse(unsigned int type) | |||
1164 | mmput(start_mm); | 1202 | mmput(start_mm); |
1165 | start_mm = new_start_mm; | 1203 | start_mm = new_start_mm; |
1166 | } | 1204 | } |
1167 | if (shmem) { | ||
1168 | /* page has already been unlocked and released */ | ||
1169 | if (shmem > 0) | ||
1170 | continue; | ||
1171 | retval = shmem; | ||
1172 | break; | ||
1173 | } | ||
1174 | if (retval) { | 1205 | if (retval) { |
1175 | unlock_page(page); | 1206 | unlock_page(page); |
1176 | page_cache_release(page); | 1207 | page_cache_release(page); |
@@ -1178,30 +1209,6 @@ static int try_to_unuse(unsigned int type) | |||
1178 | } | 1209 | } |
1179 | 1210 | ||
1180 | /* | 1211 | /* |
1181 | * How could swap count reach 0x7ffe ? | ||
1182 | * There's no way to repeat a swap page within an mm | ||
1183 | * (except in shmem, where it's the shared object which takes | ||
1184 | * the reference count)? | ||
1185 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned | ||
1186 | * short is too small....) | ||
1187 | * If that's wrong, then we should worry more about | ||
1188 | * exit_mmap() and do_munmap() cases described above: | ||
1189 | * we might be resetting SWAP_MAP_MAX too early here. | ||
1190 | * We know "Undead"s can happen, they're okay, so don't | ||
1191 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
1192 | */ | ||
1193 | /* We might release the lock_page() in unuse_mm(). */ | ||
1194 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1195 | goto retry; | ||
1196 | |||
1197 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1198 | spin_lock(&swap_lock); | ||
1199 | *swap_map = encode_swapmap(0, true); | ||
1200 | spin_unlock(&swap_lock); | ||
1201 | reset_overflow = 1; | ||
1202 | } | ||
1203 | |||
1204 | /* | ||
1205 | * If a reference remains (rare), we would like to leave | 1212 | * If a reference remains (rare), we would like to leave |
1206 | * the page in the swap cache; but try_to_unmap could | 1213 | * the page in the swap cache; but try_to_unmap could |
1207 | * then re-duplicate the entry once we drop page lock, | 1214 | * then re-duplicate the entry once we drop page lock, |
@@ -1213,6 +1220,12 @@ static int try_to_unuse(unsigned int type) | |||
1213 | * read from disk into another page. Splitting into two | 1220 | * read from disk into another page. Splitting into two |
1214 | * pages would be incorrect if swap supported "shared | 1221 | * pages would be incorrect if swap supported "shared |
1215 | * private" pages, but they are handled by tmpfs files. | 1222 | * private" pages, but they are handled by tmpfs files. |
1223 | * | ||
1224 | * Given how unuse_vma() targets one particular offset | ||
1225 | * in an anon_vma, once the anon_vma has been determined, | ||
1226 | * this splitting happens to be just what is needed to | ||
1227 | * handle where KSM pages have been swapped out: re-reading | ||
1228 | * is unnecessarily slow, but we can fix that later on. | ||
1216 | */ | 1229 | */ |
1217 | if (swap_count(*swap_map) && | 1230 | if (swap_count(*swap_map) && |
1218 | PageDirty(page) && PageSwapCache(page)) { | 1231 | PageDirty(page) && PageSwapCache(page)) { |
@@ -1242,7 +1255,6 @@ static int try_to_unuse(unsigned int type) | |||
1242 | * mark page dirty so shrink_page_list will preserve it. | 1255 | * mark page dirty so shrink_page_list will preserve it. |
1243 | */ | 1256 | */ |
1244 | SetPageDirty(page); | 1257 | SetPageDirty(page); |
1245 | retry: | ||
1246 | unlock_page(page); | 1258 | unlock_page(page); |
1247 | page_cache_release(page); | 1259 | page_cache_release(page); |
1248 | 1260 | ||
@@ -1254,10 +1266,6 @@ retry: | |||
1254 | } | 1266 | } |
1255 | 1267 | ||
1256 | mmput(start_mm); | 1268 | mmput(start_mm); |
1257 | if (reset_overflow) { | ||
1258 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
1259 | swap_overflow = 0; | ||
1260 | } | ||
1261 | return retval; | 1269 | return retval; |
1262 | } | 1270 | } |
1263 | 1271 | ||
@@ -1270,10 +1278,10 @@ retry: | |||
1270 | static void drain_mmlist(void) | 1278 | static void drain_mmlist(void) |
1271 | { | 1279 | { |
1272 | struct list_head *p, *next; | 1280 | struct list_head *p, *next; |
1273 | unsigned int i; | 1281 | unsigned int type; |
1274 | 1282 | ||
1275 | for (i = 0; i < nr_swapfiles; i++) | 1283 | for (type = 0; type < nr_swapfiles; type++) |
1276 | if (swap_info[i].inuse_pages) | 1284 | if (swap_info[type]->inuse_pages) |
1277 | return; | 1285 | return; |
1278 | spin_lock(&mmlist_lock); | 1286 | spin_lock(&mmlist_lock); |
1279 | list_for_each_safe(p, next, &init_mm.mmlist) | 1287 | list_for_each_safe(p, next, &init_mm.mmlist) |
@@ -1283,12 +1291,23 @@ static void drain_mmlist(void) | |||
1283 | 1291 | ||
1284 | /* | 1292 | /* |
1285 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1293 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1286 | * corresponds to page offset `offset'. | 1294 | * corresponds to page offset for the specified swap entry. |
1295 | * Note that the type of this function is sector_t, but it returns page offset | ||
1296 | * into the bdev, not sector offset. | ||
1287 | */ | 1297 | */ |
1288 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1298 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
1289 | { | 1299 | { |
1290 | struct swap_extent *se = sis->curr_swap_extent; | 1300 | struct swap_info_struct *sis; |
1291 | struct swap_extent *start_se = se; | 1301 | struct swap_extent *start_se; |
1302 | struct swap_extent *se; | ||
1303 | pgoff_t offset; | ||
1304 | |||
1305 | sis = swap_info[swp_type(entry)]; | ||
1306 | *bdev = sis->bdev; | ||
1307 | |||
1308 | offset = swp_offset(entry); | ||
1309 | start_se = sis->curr_swap_extent; | ||
1310 | se = start_se; | ||
1292 | 1311 | ||
1293 | for ( ; ; ) { | 1312 | for ( ; ; ) { |
1294 | struct list_head *lh; | 1313 | struct list_head *lh; |
@@ -1298,40 +1317,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
1298 | return se->start_block + (offset - se->start_page); | 1317 | return se->start_block + (offset - se->start_page); |
1299 | } | 1318 | } |
1300 | lh = se->list.next; | 1319 | lh = se->list.next; |
1301 | if (lh == &sis->extent_list) | ||
1302 | lh = lh->next; | ||
1303 | se = list_entry(lh, struct swap_extent, list); | 1320 | se = list_entry(lh, struct swap_extent, list); |
1304 | sis->curr_swap_extent = se; | 1321 | sis->curr_swap_extent = se; |
1305 | BUG_ON(se == start_se); /* It *must* be present */ | 1322 | BUG_ON(se == start_se); /* It *must* be present */ |
1306 | } | 1323 | } |
1307 | } | 1324 | } |
1308 | 1325 | ||
1309 | #ifdef CONFIG_HIBERNATION | ||
1310 | /* | 1326 | /* |
1311 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1327 | * Returns the page offset into bdev for the specified page's swap entry. |
1312 | * corresponding to given index in swap_info (swap type). | ||
1313 | */ | 1328 | */ |
1314 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1329 | sector_t map_swap_page(struct page *page, struct block_device **bdev) |
1315 | { | 1330 | { |
1316 | struct swap_info_struct *sis; | 1331 | swp_entry_t entry; |
1317 | 1332 | entry.val = page_private(page); | |
1318 | if (swap_type >= nr_swapfiles) | 1333 | return map_swap_entry(entry, bdev); |
1319 | return 0; | ||
1320 | |||
1321 | sis = swap_info + swap_type; | ||
1322 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
1323 | } | 1334 | } |
1324 | #endif /* CONFIG_HIBERNATION */ | ||
1325 | 1335 | ||
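With the new signature the swap I/O path resolves block and device in one call; a sketch (assumption: modeled on get_swap_bio() in mm/page_io.c):

	struct block_device *bdev;
	sector_t block = map_swap_page(page, &bdev);

	/*
	 * map_swap_page() returns a page offset into bdev, not a sector:
	 * convert to 512-byte sectors before handing it to the block layer.
	 */
	bio->bi_sector = block << (PAGE_SHIFT - 9);
	bio->bi_bdev = bdev;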
1326 | /* | 1336 | /* |
1327 | * Free all of a swapdev's extent information | 1337 | * Free all of a swapdev's extent information |
1328 | */ | 1338 | */ |
1329 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1339 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1330 | { | 1340 | { |
1331 | while (!list_empty(&sis->extent_list)) { | 1341 | while (!list_empty(&sis->first_swap_extent.list)) { |
1332 | struct swap_extent *se; | 1342 | struct swap_extent *se; |
1333 | 1343 | ||
1334 | se = list_entry(sis->extent_list.next, | 1344 | se = list_entry(sis->first_swap_extent.list.next, |
1335 | struct swap_extent, list); | 1345 | struct swap_extent, list); |
1336 | list_del(&se->list); | 1346 | list_del(&se->list); |
1337 | kfree(se); | 1347 | kfree(se); |
@@ -1352,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1352 | struct swap_extent *new_se; | 1362 | struct swap_extent *new_se; |
1353 | struct list_head *lh; | 1363 | struct list_head *lh; |
1354 | 1364 | ||
1355 | lh = sis->extent_list.prev; /* The highest page extent */ | 1365 | if (start_page == 0) { |
1356 | if (lh != &sis->extent_list) { | 1366 | se = &sis->first_swap_extent; |
1367 | sis->curr_swap_extent = se; | ||
1368 | se->start_page = 0; | ||
1369 | se->nr_pages = nr_pages; | ||
1370 | se->start_block = start_block; | ||
1371 | return 1; | ||
1372 | } else { | ||
1373 | lh = sis->first_swap_extent.list.prev; /* Highest extent */ | ||
1357 | se = list_entry(lh, struct swap_extent, list); | 1374 | se = list_entry(lh, struct swap_extent, list); |
1358 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1375 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1359 | if (se->start_block + se->nr_pages == start_block) { | 1376 | if (se->start_block + se->nr_pages == start_block) { |
@@ -1373,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1373 | new_se->nr_pages = nr_pages; | 1390 | new_se->nr_pages = nr_pages; |
1374 | new_se->start_block = start_block; | 1391 | new_se->start_block = start_block; |
1375 | 1392 | ||
1376 | list_add_tail(&new_se->list, &sis->extent_list); | 1393 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
1377 | return 1; | 1394 | return 1; |
1378 | } | 1395 | } |
1379 | 1396 | ||
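A worked example of the extent bookkeeping above, with hypothetical page and block numbers:

	/*
	 * add_swap_extent(sis, 0, 8, 100)  -> first_swap_extent =
	 *                                     {page 0, 8 pages, block 100}
	 * add_swap_extent(sis, 8, 4, 108)  -> contiguous on disk (100 + 8
	 *                                     == 108): merged in place to
	 *                                     {page 0, 12 pages, block 100}
	 * add_swap_extent(sis, 12, 4, 300) -> discontiguous: a new extent
	 *                                     {page 12, 4 pages, block 300}
	 *                                     is kmalloc'ed and appended
	 */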
@@ -1425,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1425 | if (S_ISBLK(inode->i_mode)) { | 1442 | if (S_ISBLK(inode->i_mode)) { |
1426 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1443 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1427 | *span = sis->pages; | 1444 | *span = sis->pages; |
1428 | goto done; | 1445 | goto out; |
1429 | } | 1446 | } |
1430 | 1447 | ||
1431 | blkbits = inode->i_blkbits; | 1448 | blkbits = inode->i_blkbits; |
@@ -1496,25 +1513,22 @@ reprobe: | |||
1496 | sis->max = page_no; | 1513 | sis->max = page_no; |
1497 | sis->pages = page_no - 1; | 1514 | sis->pages = page_no - 1; |
1498 | sis->highest_bit = page_no - 1; | 1515 | sis->highest_bit = page_no - 1; |
1499 | done: | 1516 | out: |
1500 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1517 | return ret; |
1501 | struct swap_extent, list); | ||
1502 | goto out; | ||
1503 | bad_bmap: | 1518 | bad_bmap: |
1504 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1519 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1505 | ret = -EINVAL; | 1520 | ret = -EINVAL; |
1506 | out: | 1521 | goto out; |
1507 | return ret; | ||
1508 | } | 1522 | } |
1509 | 1523 | ||
1510 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1524 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1511 | { | 1525 | { |
1512 | struct swap_info_struct * p = NULL; | 1526 | struct swap_info_struct *p = NULL; |
1513 | unsigned short *swap_map; | 1527 | unsigned char *swap_map; |
1514 | struct file *swap_file, *victim; | 1528 | struct file *swap_file, *victim; |
1515 | struct address_space *mapping; | 1529 | struct address_space *mapping; |
1516 | struct inode *inode; | 1530 | struct inode *inode; |
1517 | char * pathname; | 1531 | char *pathname; |
1518 | int i, type, prev; | 1532 | int i, type, prev; |
1519 | int err; | 1533 | int err; |
1520 | 1534 | ||
@@ -1535,8 +1549,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1535 | mapping = victim->f_mapping; | 1549 | mapping = victim->f_mapping; |
1536 | prev = -1; | 1550 | prev = -1; |
1537 | spin_lock(&swap_lock); | 1551 | spin_lock(&swap_lock); |
1538 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1552 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
1539 | p = swap_info + type; | 1553 | p = swap_info[type]; |
1540 | if (p->flags & SWP_WRITEOK) { | 1554 | if (p->flags & SWP_WRITEOK) { |
1541 | if (p->swap_file->f_mapping == mapping) | 1555 | if (p->swap_file->f_mapping == mapping) |
1542 | break; | 1556 | break; |
@@ -1555,18 +1569,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1555 | spin_unlock(&swap_lock); | 1569 | spin_unlock(&swap_lock); |
1556 | goto out_dput; | 1570 | goto out_dput; |
1557 | } | 1571 | } |
1558 | if (prev < 0) { | 1572 | if (prev < 0) |
1559 | swap_list.head = p->next; | 1573 | swap_list.head = p->next; |
1560 | } else { | 1574 | else |
1561 | swap_info[prev].next = p->next; | 1575 | swap_info[prev]->next = p->next; |
1562 | } | ||
1563 | if (type == swap_list.next) { | 1576 | if (type == swap_list.next) { |
1564 | /* just pick something that's safe... */ | 1577 | /* just pick something that's safe... */ |
1565 | swap_list.next = swap_list.head; | 1578 | swap_list.next = swap_list.head; |
1566 | } | 1579 | } |
1567 | if (p->prio < 0) { | 1580 | if (p->prio < 0) { |
1568 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1581 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1569 | swap_info[i].prio = p->prio--; | 1582 | swap_info[i]->prio = p->prio--; |
1570 | least_priority++; | 1583 | least_priority++; |
1571 | } | 1584 | } |
1572 | nr_swap_pages -= p->pages; | 1585 | nr_swap_pages -= p->pages; |
@@ -1584,16 +1597,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1584 | if (p->prio < 0) | 1597 | if (p->prio < 0) |
1585 | p->prio = --least_priority; | 1598 | p->prio = --least_priority; |
1586 | prev = -1; | 1599 | prev = -1; |
1587 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1600 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
1588 | if (p->prio >= swap_info[i].prio) | 1601 | if (p->prio >= swap_info[i]->prio) |
1589 | break; | 1602 | break; |
1590 | prev = i; | 1603 | prev = i; |
1591 | } | 1604 | } |
1592 | p->next = i; | 1605 | p->next = i; |
1593 | if (prev < 0) | 1606 | if (prev < 0) |
1594 | swap_list.head = swap_list.next = p - swap_info; | 1607 | swap_list.head = swap_list.next = type; |
1595 | else | 1608 | else |
1596 | swap_info[prev].next = p - swap_info; | 1609 | swap_info[prev]->next = type; |
1597 | nr_swap_pages += p->pages; | 1610 | nr_swap_pages += p->pages; |
1598 | total_swap_pages += p->pages; | 1611 | total_swap_pages += p->pages; |
1599 | p->flags |= SWP_WRITEOK; | 1612 | p->flags |= SWP_WRITEOK; |
@@ -1606,6 +1619,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1606 | up_write(&swap_unplug_sem); | 1619 | up_write(&swap_unplug_sem); |
1607 | 1620 | ||
1608 | destroy_swap_extents(p); | 1621 | destroy_swap_extents(p); |
1622 | if (p->flags & SWP_CONTINUED) | ||
1623 | free_swap_count_continuations(p); | ||
1624 | |||
1609 | mutex_lock(&swapon_mutex); | 1625 | mutex_lock(&swapon_mutex); |
1610 | spin_lock(&swap_lock); | 1626 | spin_lock(&swap_lock); |
1611 | drain_mmlist(); | 1627 | drain_mmlist(); |
@@ -1653,8 +1669,8 @@ out: | |||
1653 | /* iterator */ | 1669 | /* iterator */ |
1654 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1670 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1655 | { | 1671 | { |
1656 | struct swap_info_struct *ptr = swap_info; | 1672 | struct swap_info_struct *si; |
1657 | int i; | 1673 | int type; |
1658 | loff_t l = *pos; | 1674 | loff_t l = *pos; |
1659 | 1675 | ||
1660 | mutex_lock(&swapon_mutex); | 1676 | mutex_lock(&swapon_mutex); |
@@ -1662,11 +1678,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1662 | if (!l) | 1678 | if (!l) |
1663 | return SEQ_START_TOKEN; | 1679 | return SEQ_START_TOKEN; |
1664 | 1680 | ||
1665 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1681 | for (type = 0; type < nr_swapfiles; type++) { |
1666 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1682 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1683 | si = swap_info[type]; | ||
1684 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1667 | continue; | 1685 | continue; |
1668 | if (!--l) | 1686 | if (!--l) |
1669 | return ptr; | 1687 | return si; |
1670 | } | 1688 | } |
1671 | 1689 | ||
1672 | return NULL; | 1690 | return NULL; |
@@ -1674,21 +1692,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1674 | 1692 | ||
1675 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1693 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1676 | { | 1694 | { |
1677 | struct swap_info_struct *ptr; | 1695 | struct swap_info_struct *si = v; |
1678 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1696 | int type; |
1679 | 1697 | ||
1680 | if (v == SEQ_START_TOKEN) | 1698 | if (v == SEQ_START_TOKEN) |
1681 | ptr = swap_info; | 1699 | type = 0; |
1682 | else { | 1700 | else |
1683 | ptr = v; | 1701 | type = si->type + 1; |
1684 | ptr++; | ||
1685 | } | ||
1686 | 1702 | ||
1687 | for (; ptr < endptr; ptr++) { | 1703 | for (; type < nr_swapfiles; type++) { |
1688 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1704 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1705 | si = swap_info[type]; | ||
1706 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1689 | continue; | 1707 | continue; |
1690 | ++*pos; | 1708 | ++*pos; |
1691 | return ptr; | 1709 | return si; |
1692 | } | 1710 | } |
1693 | 1711 | ||
1694 | return NULL; | 1712 | return NULL; |
@@ -1701,24 +1719,24 @@ static void swap_stop(struct seq_file *swap, void *v) | |||
1701 | 1719 | ||
1702 | static int swap_show(struct seq_file *swap, void *v) | 1720 | static int swap_show(struct seq_file *swap, void *v) |
1703 | { | 1721 | { |
1704 | struct swap_info_struct *ptr = v; | 1722 | struct swap_info_struct *si = v; |
1705 | struct file *file; | 1723 | struct file *file; |
1706 | int len; | 1724 | int len; |
1707 | 1725 | ||
1708 | if (ptr == SEQ_START_TOKEN) { | 1726 | if (si == SEQ_START_TOKEN) { |
1709 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1727 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1710 | return 0; | 1728 | return 0; |
1711 | } | 1729 | } |
1712 | 1730 | ||
1713 | file = ptr->swap_file; | 1731 | file = si->swap_file; |
1714 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1732 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1715 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1733 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1716 | len < 40 ? 40 - len : 1, " ", | 1734 | len < 40 ? 40 - len : 1, " ", |
1717 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1735 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1718 | "partition" : "file\t", | 1736 | "partition" : "file\t", |
1719 | ptr->pages << (PAGE_SHIFT - 10), | 1737 | si->pages << (PAGE_SHIFT - 10), |
1720 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1738 | si->inuse_pages << (PAGE_SHIFT - 10), |
1721 | ptr->prio); | 1739 | si->prio); |
1722 | return 0; | 1740 | return 0; |
1723 | } | 1741 | } |
1724 | 1742 | ||
@@ -1765,7 +1783,7 @@ late_initcall(max_swapfiles_check); | |||
1765 | */ | 1783 | */ |
1766 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1784 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1767 | { | 1785 | { |
1768 | struct swap_info_struct * p; | 1786 | struct swap_info_struct *p; |
1769 | char *name = NULL; | 1787 | char *name = NULL; |
1770 | struct block_device *bdev = NULL; | 1788 | struct block_device *bdev = NULL; |
1771 | struct file *swap_file = NULL; | 1789 | struct file *swap_file = NULL; |
@@ -1773,36 +1791,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1773 | unsigned int type; | 1791 | unsigned int type; |
1774 | int i, prev; | 1792 | int i, prev; |
1775 | int error; | 1793 | int error; |
1776 | union swap_header *swap_header = NULL; | 1794 | union swap_header *swap_header; |
1777 | unsigned int nr_good_pages = 0; | 1795 | unsigned int nr_good_pages; |
1778 | int nr_extents = 0; | 1796 | int nr_extents = 0; |
1779 | sector_t span; | 1797 | sector_t span; |
1780 | unsigned long maxpages = 1; | 1798 | unsigned long maxpages; |
1781 | unsigned long swapfilepages; | 1799 | unsigned long swapfilepages; |
1782 | unsigned short *swap_map = NULL; | 1800 | unsigned char *swap_map = NULL; |
1783 | struct page *page = NULL; | 1801 | struct page *page = NULL; |
1784 | struct inode *inode = NULL; | 1802 | struct inode *inode = NULL; |
1785 | int did_down = 0; | 1803 | int did_down = 0; |
1786 | 1804 | ||
1787 | if (!capable(CAP_SYS_ADMIN)) | 1805 | if (!capable(CAP_SYS_ADMIN)) |
1788 | return -EPERM; | 1806 | return -EPERM; |
1807 | |||
1808 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
1809 | if (!p) | ||
1810 | return -ENOMEM; | ||
1811 | |||
1789 | spin_lock(&swap_lock); | 1812 | spin_lock(&swap_lock); |
1790 | p = swap_info; | 1813 | for (type = 0; type < nr_swapfiles; type++) { |
1791 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1814 | if (!(swap_info[type]->flags & SWP_USED)) |
1792 | if (!(p->flags & SWP_USED)) | ||
1793 | break; | 1815 | break; |
1816 | } | ||
1794 | error = -EPERM; | 1817 | error = -EPERM; |
1795 | if (type >= MAX_SWAPFILES) { | 1818 | if (type >= MAX_SWAPFILES) { |
1796 | spin_unlock(&swap_lock); | 1819 | spin_unlock(&swap_lock); |
1820 | kfree(p); | ||
1797 | goto out; | 1821 | goto out; |
1798 | } | 1822 | } |
1799 | if (type >= nr_swapfiles) | 1823 | if (type >= nr_swapfiles) { |
1800 | nr_swapfiles = type+1; | 1824 | p->type = type; |
1801 | memset(p, 0, sizeof(*p)); | 1825 | swap_info[type] = p; |
1802 | INIT_LIST_HEAD(&p->extent_list); | 1826 | /* |
1827 | * Write swap_info[type] before nr_swapfiles, in case a | ||
1828 | * racing procfs swap_start() or swap_next() is reading them. | ||
1829 | * (We never shrink nr_swapfiles, we never free this entry.) | ||
1830 | */ | ||
1831 | smp_wmb(); | ||
1832 | nr_swapfiles++; | ||
1833 | } else { | ||
1834 | kfree(p); | ||
1835 | p = swap_info[type]; | ||
1836 | /* | ||
1837 | * Do not memset this entry: a racing procfs swap_next() | ||
1838 | * would be relying on p->type to remain valid. | ||
1839 | */ | ||
1840 | } | ||
1841 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
1803 | p->flags = SWP_USED; | 1842 | p->flags = SWP_USED; |
1804 | p->next = -1; | 1843 | p->next = -1; |
1805 | spin_unlock(&swap_lock); | 1844 | spin_unlock(&swap_lock); |
1845 | |||
1806 | name = getname(specialfile); | 1846 | name = getname(specialfile); |
1807 | error = PTR_ERR(name); | 1847 | error = PTR_ERR(name); |
1808 | if (IS_ERR(name)) { | 1848 | if (IS_ERR(name)) { |
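The smp_wmb() above pairs with the smp_rmb() added to swap_start() and swap_next(); the publish/consume protocol, as a minimal sketch:

	/* publisher (swapon), under swap_lock: */
	swap_info[type] = p;
	smp_wmb();			/* publish pointer before count */
	nr_swapfiles++;

	/* lockless /proc/swaps iterator: */
	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();		/* pairs with smp_wmb() in swapon */
		si = swap_info[type];	/* non-NULL once type < nr_swapfiles */
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
	}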
@@ -1822,7 +1862,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1822 | 1862 | ||
1823 | error = -EBUSY; | 1863 | error = -EBUSY; |
1824 | for (i = 0; i < nr_swapfiles; i++) { | 1864 | for (i = 0; i < nr_swapfiles; i++) { |
1825 | struct swap_info_struct *q = &swap_info[i]; | 1865 | struct swap_info_struct *q = swap_info[i]; |
1826 | 1866 | ||
1827 | if (i == type || !q->swap_file) | 1867 | if (i == type || !q->swap_file) |
1828 | continue; | 1868 | continue; |
@@ -1897,6 +1937,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1897 | 1937 | ||
1898 | p->lowest_bit = 1; | 1938 | p->lowest_bit = 1; |
1899 | p->cluster_next = 1; | 1939 | p->cluster_next = 1; |
1940 | p->cluster_nr = 0; | ||
1900 | 1941 | ||
1901 | /* | 1942 | /* |
1902 | * Find out how many pages are allowed for a single swap | 1943 | * Find out how many pages are allowed for a single swap |
@@ -1913,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1913 | * swap pte. | 1954 | * swap pte. |
1914 | */ | 1955 | */ |
1915 | maxpages = swp_offset(pte_to_swp_entry( | 1956 | maxpages = swp_offset(pte_to_swp_entry( |
1916 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1957 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1917 | if (maxpages > swap_header->info.last_page) | 1958 | if (maxpages > swap_header->info.last_page) { |
1918 | maxpages = swap_header->info.last_page; | 1959 | maxpages = swap_header->info.last_page + 1; |
1960 | /* p->max is an unsigned int: don't overflow it */ | ||
1961 | if ((unsigned int)maxpages == 0) | ||
1962 | maxpages = UINT_MAX; | ||
1963 | } | ||
1919 | p->highest_bit = maxpages - 1; | 1964 | p->highest_bit = maxpages - 1; |
1920 | 1965 | ||
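Why the probe now adds 1: the round-trip through the architecture's swap pte encoding truncates the offset to however many bits a pte can hold, so it yields the maximum representable offset; offsets are 0-based, hence +1 to turn that into a page count. Restated:

	/* largest offset that survives the arch's pte encoding */
	unsigned long max_offset = swp_offset(pte_to_swp_entry(
				swp_entry_to_pte(swp_entry(0, ~0UL))));
	unsigned long maxpages = max_offset + 1;	/* offsets start at 0 */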
1921 | error = -EINVAL; | 1966 | error = -EINVAL; |
@@ -1932,30 +1977,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1932 | goto bad_swap; | 1977 | goto bad_swap; |
1933 | 1978 | ||
1934 | /* OK, set up the swap map and apply the bad block list */ | 1979 | /* OK, set up the swap map and apply the bad block list */ |
1935 | swap_map = vmalloc(maxpages * sizeof(short)); | 1980 | swap_map = vmalloc(maxpages); |
1936 | if (!swap_map) { | 1981 | if (!swap_map) { |
1937 | error = -ENOMEM; | 1982 | error = -ENOMEM; |
1938 | goto bad_swap; | 1983 | goto bad_swap; |
1939 | } | 1984 | } |
1940 | 1985 | ||
1941 | memset(swap_map, 0, maxpages * sizeof(short)); | 1986 | memset(swap_map, 0, maxpages); |
1987 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
1988 | |||
1942 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1989 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1943 | int page_nr = swap_header->info.badpages[i]; | 1990 | unsigned int page_nr = swap_header->info.badpages[i]; |
1944 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1991 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
1945 | error = -EINVAL; | 1992 | error = -EINVAL; |
1946 | goto bad_swap; | 1993 | goto bad_swap; |
1947 | } | 1994 | } |
1948 | swap_map[page_nr] = SWAP_MAP_BAD; | 1995 | if (page_nr < maxpages) { |
1996 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1997 | nr_good_pages--; | ||
1998 | } | ||
1949 | } | 1999 | } |
1950 | 2000 | ||
1951 | error = swap_cgroup_swapon(type, maxpages); | 2001 | error = swap_cgroup_swapon(type, maxpages); |
1952 | if (error) | 2002 | if (error) |
1953 | goto bad_swap; | 2003 | goto bad_swap; |
1954 | 2004 | ||
1955 | nr_good_pages = swap_header->info.last_page - | ||
1956 | swap_header->info.nr_badpages - | ||
1957 | 1 /* header page */; | ||
1958 | |||
1959 | if (nr_good_pages) { | 2005 | if (nr_good_pages) { |
1960 | swap_map[0] = SWAP_MAP_BAD; | 2006 | swap_map[0] = SWAP_MAP_BAD; |
1961 | p->max = maxpages; | 2007 | p->max = maxpages; |
@@ -2003,18 +2049,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2003 | 2049 | ||
2004 | /* insert swap space into swap_list: */ | 2050 | /* insert swap space into swap_list: */ |
2005 | prev = -1; | 2051 | prev = -1; |
2006 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2052 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
2007 | if (p->prio >= swap_info[i].prio) { | 2053 | if (p->prio >= swap_info[i]->prio) |
2008 | break; | 2054 | break; |
2009 | } | ||
2010 | prev = i; | 2055 | prev = i; |
2011 | } | 2056 | } |
2012 | p->next = i; | 2057 | p->next = i; |
2013 | if (prev < 0) { | 2058 | if (prev < 0) |
2014 | swap_list.head = swap_list.next = p - swap_info; | 2059 | swap_list.head = swap_list.next = type; |
2015 | } else { | 2060 | else |
2016 | swap_info[prev].next = p - swap_info; | 2061 | swap_info[prev]->next = type; |
2017 | } | ||
2018 | spin_unlock(&swap_lock); | 2062 | spin_unlock(&swap_lock); |
2019 | mutex_unlock(&swapon_mutex); | 2063 | mutex_unlock(&swapon_mutex); |
2020 | error = 0; | 2064 | error = 0; |
@@ -2051,15 +2095,15 @@ out: | |||
2051 | 2095 | ||
2052 | void si_swapinfo(struct sysinfo *val) | 2096 | void si_swapinfo(struct sysinfo *val) |
2053 | { | 2097 | { |
2054 | unsigned int i; | 2098 | unsigned int type; |
2055 | unsigned long nr_to_be_unused = 0; | 2099 | unsigned long nr_to_be_unused = 0; |
2056 | 2100 | ||
2057 | spin_lock(&swap_lock); | 2101 | spin_lock(&swap_lock); |
2058 | for (i = 0; i < nr_swapfiles; i++) { | 2102 | for (type = 0; type < nr_swapfiles; type++) { |
2059 | if (!(swap_info[i].flags & SWP_USED) || | 2103 | struct swap_info_struct *si = swap_info[type]; |
2060 | (swap_info[i].flags & SWP_WRITEOK)) | 2104 | |
2061 | continue; | 2105 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2062 | nr_to_be_unused += swap_info[i].inuse_pages; | 2106 | nr_to_be_unused += si->inuse_pages; |
2063 | } | 2107 | } |
2064 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2108 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
2065 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2109 | val->totalswap = total_swap_pages + nr_to_be_unused; |
@@ -2069,101 +2113,111 @@ void si_swapinfo(struct sysinfo *val) | |||
2069 | /* | 2113 | /* |
2070 | * Verify that a swap entry is valid and increment its swap map count. | 2114 | * Verify that a swap entry is valid and increment its swap map count. |
2071 | * | 2115 | * |
2072 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | ||
2073 | * "permanent", but will be reclaimed by the next swapoff. | ||
2074 | * Returns error code in the following cases: | 2116 | * Returns error code in the following cases: |
2075 | * - success -> 0 | 2117 | * - success -> 0 |
2076 | * - swp_entry is invalid -> EINVAL | 2118 | * - swp_entry is invalid -> EINVAL |
2077 | * - swp_entry is migration entry -> EINVAL | 2119 | * - swp_entry is migration entry -> EINVAL |
2078 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2120 | * - swap-cache reference is requested but there is already one. -> EEXIST |
2079 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2121 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
2122 | * - swap-mapped reference requested but needs continued swap count. -> ENOMEM | ||
2080 | */ | 2123 | */ |
2081 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2124 | static int __swap_duplicate(swp_entry_t entry, unsigned char usage) |
2082 | { | 2125 | { |
2083 | struct swap_info_struct * p; | 2126 | struct swap_info_struct *p; |
2084 | unsigned long offset, type; | 2127 | unsigned long offset, type; |
2085 | int result = -EINVAL; | 2128 | unsigned char count; |
2086 | int count; | 2129 | unsigned char has_cache; |
2087 | bool has_cache; | 2130 | int err = -EINVAL; |
2088 | 2131 | ||
2089 | if (non_swap_entry(entry)) | 2132 | if (non_swap_entry(entry)) |
2090 | return -EINVAL; | 2133 | goto out; |
2091 | 2134 | ||
2092 | type = swp_type(entry); | 2135 | type = swp_type(entry); |
2093 | if (type >= nr_swapfiles) | 2136 | if (type >= nr_swapfiles) |
2094 | goto bad_file; | 2137 | goto bad_file; |
2095 | p = type + swap_info; | 2138 | p = swap_info[type]; |
2096 | offset = swp_offset(entry); | 2139 | offset = swp_offset(entry); |
2097 | 2140 | ||
2098 | spin_lock(&swap_lock); | 2141 | spin_lock(&swap_lock); |
2099 | |||
2100 | if (unlikely(offset >= p->max)) | 2142 | if (unlikely(offset >= p->max)) |
2101 | goto unlock_out; | 2143 | goto unlock_out; |
2102 | 2144 | ||
2103 | count = swap_count(p->swap_map[offset]); | 2145 | count = p->swap_map[offset]; |
2104 | has_cache = swap_has_cache(p->swap_map[offset]); | 2146 | has_cache = count & SWAP_HAS_CACHE; |
2147 | count &= ~SWAP_HAS_CACHE; | ||
2148 | err = 0; | ||
2105 | 2149 | ||
2106 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2150 | if (usage == SWAP_HAS_CACHE) { |
2107 | 2151 | ||
2108 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2152 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
2109 | if (!has_cache && count) { | 2153 | if (!has_cache && count) |
2110 | p->swap_map[offset] = encode_swapmap(count, true); | 2154 | has_cache = SWAP_HAS_CACHE; |
2111 | result = 0; | 2155 | else if (has_cache) /* someone else added cache */ |
2112 | } else if (has_cache) /* someone added cache */ | 2156 | err = -EEXIST; |
2113 | result = -EEXIST; | 2157 | else /* no users remaining */ |
2114 | else if (!count) /* no users */ | 2158 | err = -ENOENT; |
2115 | result = -ENOENT; | ||
2116 | 2159 | ||
2117 | } else if (count || has_cache) { | 2160 | } else if (count || has_cache) { |
2118 | if (count < SWAP_MAP_MAX - 1) { | 2161 | |
2119 | p->swap_map[offset] = encode_swapmap(count + 1, | 2162 | if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) |
2120 | has_cache); | 2163 | count += usage; |
2121 | result = 0; | 2164 | else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) |
2122 | } else if (count <= SWAP_MAP_MAX) { | 2165 | err = -EINVAL; |
2123 | if (swap_overflow++ < 5) | 2166 | else if (swap_count_continued(p, offset, count)) |
2124 | printk(KERN_WARNING | 2167 | count = COUNT_CONTINUED; |
2125 | "swap_dup: swap entry overflow\n"); | 2168 | else |
2126 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2169 | err = -ENOMEM; |
2127 | has_cache); | ||
2128 | result = 0; | ||
2129 | } | ||
2130 | } else | 2170 | } else |
2131 | result = -ENOENT; /* unused swap entry */ | 2171 | err = -ENOENT; /* unused swap entry */ |
2172 | |||
2173 | p->swap_map[offset] = count | has_cache; | ||
2174 | |||
2132 | unlock_out: | 2175 | unlock_out: |
2133 | spin_unlock(&swap_lock); | 2176 | spin_unlock(&swap_lock); |
2134 | out: | 2177 | out: |
2135 | return result; | 2178 | return err; |
2136 | 2179 | ||
2137 | bad_file: | 2180 | bad_file: |
2138 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2181 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
2139 | goto out; | 2182 | goto out; |
2140 | } | 2183 | } |
2184 | |||
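How a swap_map byte now decomposes, as assumed throughout __swap_duplicate() (assumption: per the companion <linux/swap.h> change, SWAP_HAS_CACHE and COUNT_CONTINUED occupy the two high bits of the byte):

	unsigned char entry = p->swap_map[offset];
	unsigned char has_cache = entry & SWAP_HAS_CACHE; /* cache ref held */
	unsigned char count = entry & ~SWAP_HAS_CACHE;	  /* pte references */

	if (count & COUNT_CONTINUED) {
		/* low "digit" saturated at SWAP_MAP_MAX: the rest of the
		 * count lives in this offset's continuation page(s) */
	}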
2185 | /* | ||
2186 | * Help swapoff by noting that swap entry belongs to shmem/tmpfs | ||
2187 | * (in which case its reference count is never incremented). | ||
2188 | */ | ||
2189 | void swap_shmem_alloc(swp_entry_t entry) | ||
2190 | { | ||
2191 | __swap_duplicate(entry, SWAP_MAP_SHMEM); | ||
2192 | } | ||
2193 | |||
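A caller sketch for swap_shmem_alloc() (assumption: mirrors shmem_writepage(), which takes exactly one SWAP_MAP_SHMEM reference per swapped-out tmpfs page and never duplicates it):

	swp_entry_t swap = get_swap_page();
	if (swap.val) {
		swap_shmem_alloc(swap);	/* one shared ref for the object */
		/* record swap.val in the shmem index, then write the page */
	}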
2141 | /* | 2194 | /* |
2142 | * increase reference count of swap entry by 1. | 2195 | * Increase reference count of swap entry by 1. |
2196 | * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required | ||
2197 | * but could not be atomically allocated. Returns 0, just as if it succeeded, | ||
2198 | * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which | ||
2199 | * might occur if a page table entry has got corrupted. | ||
2143 | */ | 2200 | */ |
2144 | void swap_duplicate(swp_entry_t entry) | 2201 | int swap_duplicate(swp_entry_t entry) |
2145 | { | 2202 | { |
2146 | __swap_duplicate(entry, SWAP_MAP); | 2203 | int err = 0; |
2204 | |||
2205 | while (!err && __swap_duplicate(entry, 1) == -ENOMEM) | ||
2206 | err = add_swap_count_continuation(entry, GFP_ATOMIC); | ||
2207 | return err; | ||
2147 | } | 2208 | } |
2148 | 2209 | ||
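The GFP_ATOMIC attempt above can fail under page table locks; the fork path then retries with GFP_KERNEL once they are dropped. A sketch of that pattern (assumption: condensed from the copy_pte_range() usage):

	if (swap_duplicate(entry) < 0) {
		/* back out, drop the page table locks, then: */
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		/* retry copying this pte */
	}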
2149 | /* | 2210 | /* |
2150 | * @entry: swap entry for which we allocate swap cache. | 2211 | * @entry: swap entry for which we allocate swap cache. |
2151 | * | 2212 | * |
2152 | * Called when allocating swap cache for exising swap entry, | 2213 | * Called when allocating swap cache for existing swap entry, |
2153 | * This can return error codes. Returns 0 at success. | 2214 | * This can return error codes. Returns 0 at success. |
2154 | * -EBUSY means there is a swap cache. | 2215 | * -EBUSY means there is a swap cache. |
2155 | * Note: return code is different from swap_duplicate(). | 2216 | * Note: return code is different from swap_duplicate(). |
2156 | */ | 2217 | */ |
2157 | int swapcache_prepare(swp_entry_t entry) | 2218 | int swapcache_prepare(swp_entry_t entry) |
2158 | { | 2219 | { |
2159 | return __swap_duplicate(entry, SWAP_CACHE); | 2220 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2160 | } | ||
2161 | |||
2162 | |||
2163 | struct swap_info_struct * | ||
2164 | get_swap_info_struct(unsigned type) | ||
2165 | { | ||
2166 | return &swap_info[type]; | ||
2167 | } | 2221 | } |
2168 | 2222 | ||
2169 | /* | 2223 | /* |
@@ -2181,7 +2235,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2181 | if (!our_page_cluster) /* no readahead */ | 2235 | if (!our_page_cluster) /* no readahead */ |
2182 | return 0; | 2236 | return 0; |
2183 | 2237 | ||
2184 | si = &swap_info[swp_type(entry)]; | 2238 | si = swap_info[swp_type(entry)]; |
2185 | target = swp_offset(entry); | 2239 | target = swp_offset(entry); |
2186 | base = (target >> our_page_cluster) << our_page_cluster; | 2240 | base = (target >> our_page_cluster) << our_page_cluster; |
2187 | end = base + (1 << our_page_cluster); | 2241 | end = base + (1 << our_page_cluster); |
@@ -2217,3 +2271,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2217 | *offset = ++toff; | 2271 | *offset = ++toff; |
2218 | return nr_pages? ++nr_pages: 0; | 2272 | return nr_pages? ++nr_pages: 0; |
2219 | } | 2273 | } |
2274 | |||
2275 | /* | ||
2276 | * add_swap_count_continuation - called when a swap count is duplicated | ||
2277 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | ||
2278 | * page of the original vmalloc'ed swap_map, to hold the continuation count | ||
2279 | * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called | ||
2280 | * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. | ||
2281 | * | ||
2282 | * These continuation pages are seldom referenced: the common paths all work | ||
2283 | * on the original swap_map, only referring to a continuation page when the | ||
2284 | * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. | ||
2285 | * | ||
2286 | * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding | ||
2287 | * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) | ||
2288 | * can be called after dropping locks. | ||
2289 | */ | ||
2290 | int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | ||
2291 | { | ||
2292 | struct swap_info_struct *si; | ||
2293 | struct page *head; | ||
2294 | struct page *page; | ||
2295 | struct page *list_page; | ||
2296 | pgoff_t offset; | ||
2297 | unsigned char count; | ||
2298 | |||
2299 | /* | ||
2300 | * When debugging, it's easier to use __GFP_ZERO here; but it's better | ||
2301 | * for latency not to zero a page while GFP_ATOMIC and holding locks. | ||
2302 | */ | ||
2303 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); | ||
2304 | |||
2305 | si = swap_info_get(entry); | ||
2306 | if (!si) { | ||
2307 | /* | ||
2308 | * An acceptable race has occurred since the failing | ||
2309 | * __swap_duplicate(): the swap entry has been freed, | ||
2310 | * perhaps even the whole swap_map cleared for swapoff. | ||
2311 | */ | ||
2312 | goto outer; | ||
2313 | } | ||
2314 | |||
2315 | offset = swp_offset(entry); | ||
2316 | count = si->swap_map[offset] & ~SWAP_HAS_CACHE; | ||
2317 | |||
2318 | if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { | ||
2319 | /* | ||
2320 | * The higher the swap count, the more likely it is that tasks | ||
2321 | * will race to add swap count continuation: we need to avoid | ||
2322 | * over-provisioning. | ||
2323 | */ | ||
2324 | goto out; | ||
2325 | } | ||
2326 | |||
2327 | if (!page) { | ||
2328 | spin_unlock(&swap_lock); | ||
2329 | return -ENOMEM; | ||
2330 | } | ||
2331 | |||
2332 | /* | ||
2333 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | ||
2334 | * no architecture is using highmem pages for kernel pagetables: so it | ||
2335 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | ||
2336 | */ | ||
2337 | head = vmalloc_to_page(si->swap_map + offset); | ||
2338 | offset &= ~PAGE_MASK; | ||
2339 | |||
2340 | /* | ||
2341 | * Page allocation does not initialize the page's lru field, | ||
2342 | * but it does always reset its private field. | ||
2343 | */ | ||
2344 | if (!page_private(head)) { | ||
2345 | BUG_ON(count & COUNT_CONTINUED); | ||
2346 | INIT_LIST_HEAD(&head->lru); | ||
2347 | set_page_private(head, SWP_CONTINUED); | ||
2348 | si->flags |= SWP_CONTINUED; | ||
2349 | } | ||
2350 | |||
2351 | list_for_each_entry(list_page, &head->lru, lru) { | ||
2352 | unsigned char *map; | ||
2353 | |||
2354 | /* | ||
2355 | * If the previous map said no continuation, but we've found | ||
2356 | * a continuation page, free our allocation and use this one. | ||
2357 | */ | ||
2358 | if (!(count & COUNT_CONTINUED)) | ||
2359 | goto out; | ||
2360 | |||
2361 | map = kmap_atomic(list_page, KM_USER0) + offset; | ||
2362 | count = *map; | ||
2363 | kunmap_atomic(map, KM_USER0); | ||
2364 | |||
2365 | /* | ||
2366 | * If this continuation count now has some space in it, | ||
2367 | * free our allocation and use this one. | ||
2368 | */ | ||
2369 | if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) | ||
2370 | goto out; | ||
2371 | } | ||
2372 | |||
2373 | list_add_tail(&page->lru, &head->lru); | ||
2374 | page = NULL; /* now it's attached, don't free it */ | ||
2375 | out: | ||
2376 | spin_unlock(&swap_lock); | ||
2377 | outer: | ||
2378 | if (page) | ||
2379 | __free_page(page); | ||
2380 | return 0; | ||
2381 | } | ||
2382 | |||
2383 | /* | ||
2384 | * swap_count_continued - when the original swap_map count is incremented | ||
2385 | * from SWAP_MAP_MAX, check if there is already a continuation page to carry | ||
2386 | * into, carry if so, or else fail until a new continuation page is allocated; | ||
2387 | * when the original swap_map count is decremented from 0 with continuation, | ||
2388 | * borrow from the continuation and report whether it still holds more. | ||
2389 | * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. | ||
2390 | */ | ||
2391 | static bool swap_count_continued(struct swap_info_struct *si, | ||
2392 | pgoff_t offset, unsigned char count) | ||
2393 | { | ||
2394 | struct page *head; | ||
2395 | struct page *page; | ||
2396 | unsigned char *map; | ||
2397 | |||
2398 | head = vmalloc_to_page(si->swap_map + offset); | ||
2399 | if (page_private(head) != SWP_CONTINUED) { | ||
2400 | BUG_ON(count & COUNT_CONTINUED); | ||
2401 | return false; /* need to add count continuation */ | ||
2402 | } | ||
2403 | |||
2404 | offset &= ~PAGE_MASK; | ||
2405 | page = list_entry(head->lru.next, struct page, lru); | ||
2406 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2407 | |||
2408 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ | ||
2409 | goto init_map; /* jump over SWAP_CONT_MAX checks */ | ||
2410 | |||
2411 | if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ | ||
2412 | /* | ||
2413 | * Think of how you add 1 to 999 | ||
2414 | */ | ||
2415 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { | ||
2416 | kunmap_atomic(map, KM_USER0); | ||
2417 | page = list_entry(page->lru.next, struct page, lru); | ||
2418 | BUG_ON(page == head); | ||
2419 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2420 | } | ||
2421 | if (*map == SWAP_CONT_MAX) { | ||
2422 | kunmap_atomic(map, KM_USER0); | ||
2423 | page = list_entry(page->lru.next, struct page, lru); | ||
2424 | if (page == head) | ||
2425 | return false; /* add count continuation */ | ||
2426 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2427 | init_map: *map = 0; /* we didn't zero the page */ | ||
2428 | } | ||
2429 | *map += 1; | ||
2430 | kunmap_atomic(map, KM_USER0); | ||
2431 | page = list_entry(page->lru.prev, struct page, lru); | ||
2432 | while (page != head) { | ||
2433 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2434 | *map = COUNT_CONTINUED; | ||
2435 | kunmap_atomic(map, KM_USER0); | ||
2436 | page = list_entry(page->lru.prev, struct page, lru); | ||
2437 | } | ||
2438 | return true; /* incremented */ | ||
2439 | |||
2440 | } else { /* decrementing */ | ||
2441 | /* | ||
2442 | * Think of how you subtract 1 from 1000 | ||
2443 | */ | ||
2444 | BUG_ON(count != COUNT_CONTINUED); | ||
2445 | while (*map == COUNT_CONTINUED) { | ||
2446 | kunmap_atomic(map, KM_USER0); | ||
2447 | page = list_entry(page->lru.next, struct page, lru); | ||
2448 | BUG_ON(page == head); | ||
2449 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2450 | } | ||
2451 | BUG_ON(*map == 0); | ||
2452 | *map -= 1; | ||
2453 | if (*map == 0) | ||
2454 | count = 0; | ||
2455 | kunmap_atomic(map, KM_USER0); | ||
2456 | page = list_entry(page->lru.prev, struct page, lru); | ||
2457 | while (page != head) { | ||
2458 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2459 | *map = SWAP_CONT_MAX | count; | ||
2460 | count = COUNT_CONTINUED; | ||
2461 | kunmap_atomic(map, KM_USER0); | ||
2462 | page = list_entry(page->lru.prev, struct page, lru); | ||
2463 | } | ||
2464 | return count == COUNT_CONTINUED; | ||
2465 | } | ||
2466 | } | ||
2467 | |||
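A worked decimal analogy for the carry/borrow logic above: treat swap_map[offset] as a ones digit saturating at SWAP_MAP_MAX, and each continuation page as one higher-order digit saturating at SWAP_CONT_MAX.

	/*
	 * Incrementing, like 999 + 1 = 1000:
	 *   skip digits already at SWAP_CONT_MAX | COUNT_CONTINUED ("9"s),
	 *   bump the first unsaturated digit (zeroing a fresh page via
	 *   init_map if the number grows a digit), then walk back setting
	 *   the skipped digits to COUNT_CONTINUED ("0"s, carry done).
	 *
	 * Decrementing, like 1000 - 1 = 999:
	 *   skip digits that are bare COUNT_CONTINUED ("0"s), decrement
	 *   the first non-zero digit, then walk back setting the skipped
	 *   digits to SWAP_CONT_MAX | count ("9"s, borrow done).
	 */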
2468 | /* | ||
2469 | * free_swap_count_continuations - swapoff free all the continuation pages | ||
2470 | * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. | ||
2471 | */ | ||
2472 | static void free_swap_count_continuations(struct swap_info_struct *si) | ||
2473 | { | ||
2474 | pgoff_t offset; | ||
2475 | |||
2476 | for (offset = 0; offset < si->max; offset += PAGE_SIZE) { | ||
2477 | struct page *head; | ||
2478 | head = vmalloc_to_page(si->swap_map + offset); | ||
2479 | if (page_private(head)) { | ||
2480 | struct list_head *this, *next; | ||
2481 | list_for_each_safe(this, next, &head->lru) { | ||
2482 | struct page *page; | ||
2483 | page = list_entry(this, struct page, lru); | ||
2484 | list_del(this); | ||
2485 | __free_page(page); | ||
2486 | } | ||
2487 | } | ||
2488 | } | ||
2489 | } | ||
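The add/subtract logic above treats the chain of continuation pages as the digits of a base-(SWAP_CONT_MAX + 1) number, one byte per page at the same offset, which is what the "add 1 to 999" and "subtract 1 from 1000" comments allude to. A minimal user-space sketch of the same carry/borrow scheme follows; CONT_MAX, NDIGITS and the digit array are illustrative, not kernel API:

#include <stdbool.h>

#define CONT_MAX 0x7f   /* stands in for SWAP_CONT_MAX */
#define NDIGITS  4      /* stands in for the continuation-page chain */

/* digit[0] is the least significant "page"; returns false on overflow,
 * which is when the kernel would append another continuation page */
static bool cont_increment(unsigned char digit[NDIGITS])
{
	int i = 0;

	while (i < NDIGITS && digit[i] == CONT_MAX)	/* carry: 999 -> 000 */
		digit[i++] = 0;
	if (i == NDIGITS)
		return false;
	digit[i] += 1;
	return true;
}

/* precondition: the multi-digit value is nonzero */
static void cont_decrement(unsigned char digit[NDIGITS])
{
	int i = 0;

	while (digit[i] == 0)		/* borrow: 1000 - 1 = 0999 */
		digit[i++] = CONT_MAX;
	digit[i] -= 1;
}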
diff --git a/mm/truncate.c b/mm/truncate.c index 450cebdabfc0..f42675a3615d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -9,6 +9,7 @@ | |||
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
12 | #include <linux/gfp.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
@@ -272,6 +273,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
272 | pagevec_release(&pvec); | 273 | pagevec_release(&pvec); |
273 | break; | 274 | break; |
274 | } | 275 | } |
276 | mem_cgroup_uncharge_start(); | ||
275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 277 | for (i = 0; i < pagevec_count(&pvec); i++) { |
276 | struct page *page = pvec.pages[i]; | 278 | struct page *page = pvec.pages[i]; |
277 | 279 | ||
@@ -286,6 +288,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
286 | unlock_page(page); | 288 | unlock_page(page); |
287 | } | 289 | } |
288 | pagevec_release(&pvec); | 290 | pagevec_release(&pvec); |
291 | mem_cgroup_uncharge_end(); | ||
289 | } | 292 | } |
290 | } | 293 | } |
291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 294 | EXPORT_SYMBOL(truncate_inode_pages_range); |
@@ -327,6 +330,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
327 | pagevec_init(&pvec, 0); | 330 | pagevec_init(&pvec, 0); |
328 | while (next <= end && | 331 | while (next <= end && |
329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 332 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
333 | mem_cgroup_uncharge_start(); | ||
330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 334 | for (i = 0; i < pagevec_count(&pvec); i++) { |
331 | struct page *page = pvec.pages[i]; | 335 | struct page *page = pvec.pages[i]; |
332 | pgoff_t index; | 336 | pgoff_t index; |
@@ -354,6 +358,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
354 | break; | 358 | break; |
355 | } | 359 | } |
356 | pagevec_release(&pvec); | 360 | pagevec_release(&pvec); |
361 | mem_cgroup_uncharge_end(); | ||
357 | cond_resched(); | 362 | cond_resched(); |
358 | } | 363 | } |
359 | return ret; | 364 | return ret; |
@@ -428,6 +433,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
428 | while (next <= end && !wrapped && | 433 | while (next <= end && !wrapped && |
429 | pagevec_lookup(&pvec, mapping, next, | 434 | pagevec_lookup(&pvec, mapping, next, |
430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 435 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
436 | mem_cgroup_uncharge_start(); | ||
431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 437 | for (i = 0; i < pagevec_count(&pvec); i++) { |
432 | struct page *page = pvec.pages[i]; | 438 | struct page *page = pvec.pages[i]; |
433 | pgoff_t page_index; | 439 | pgoff_t page_index; |
@@ -477,6 +483,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
477 | unlock_page(page); | 483 | unlock_page(page); |
478 | } | 484 | } |
479 | pagevec_release(&pvec); | 485 | pagevec_release(&pvec); |
486 | mem_cgroup_uncharge_end(); | ||
480 | cond_resched(); | 487 | cond_resched(); |
481 | } | 488 | } |
482 | return ret; | 489 | return ret; |
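All three truncate.c hunks above apply the same pattern: bracket each pagevec's worth of work with mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() so memcg uncharges are batched per pagevec instead of issued page by page. Schematically (process_page() is a placeholder for the per-page loop body, not a kernel function):

	mem_cgroup_uncharge_start();
	for (i = 0; i < pagevec_count(&pvec); i++)
		process_page(pvec.pages[i]);	/* placeholder */
	pagevec_release(&pvec);
	mem_cgroup_uncharge_end();		/* flush the batched uncharges */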
@@ -490,7 +497,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
490 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 497 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
491 | * invalidation. | 498 | * invalidation. |
492 | * | 499 | * |
493 | * Returns -EIO if any pages could not be invalidated. | 500 | * Returns -EBUSY if any pages could not be invalidated. |
494 | */ | 501 | */ |
495 | int invalidate_inode_pages2(struct address_space *mapping) | 502 | int invalidate_inode_pages2(struct address_space *mapping) |
496 | { | 503 | { |
@@ -516,22 +523,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
516 | */ | 523 | */ |
517 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 524 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) |
518 | { | 525 | { |
519 | if (new < old) { | 526 | struct address_space *mapping = inode->i_mapping; |
520 | struct address_space *mapping = inode->i_mapping; | 527 | |
521 | 528 | /* | |
522 | /* | 529 | * unmap_mapping_range is called twice, first simply for |
523 | * unmap_mapping_range is called twice, first simply for | 530 | * efficiency so that truncate_inode_pages does fewer |
524 | * efficiency so that truncate_inode_pages does fewer | 531 | * single-page unmaps. However after this first call, and |
525 | * single-page unmaps. However after this first call, and | 532 | * before truncate_inode_pages finishes, it is possible for |
526 | * before truncate_inode_pages finishes, it is possible for | 533 | * private pages to be COWed, which remain after |
527 | * private pages to be COWed, which remain after | 534 | * truncate_inode_pages finishes, hence the second |
528 | * truncate_inode_pages finishes, hence the second | 535 | * unmap_mapping_range call must be made for correctness. |
529 | * unmap_mapping_range call must be made for correctness. | 536 | */ |
530 | */ | 537 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
531 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 538 | truncate_inode_pages(mapping, new); |
532 | truncate_inode_pages(mapping, new); | 539 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
533 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
534 | } | ||
535 | } | 540 | } |
536 | EXPORT_SYMBOL(truncate_pagecache); | 541 | EXPORT_SYMBOL(truncate_pagecache); |
537 | 542 | ||
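The `new + PAGE_SIZE - 1` passed as holebegin deserves a worked example: unmap_mapping_range() rounds holebegin down to a page boundary, so rounding up by PAGE_SIZE - 1 first keeps a partially-valid final page mapped. Assuming PAGE_SIZE == 4096 and new == 10000: the page spanning 8192..12287 still holds valid data up to offset 9999; new + PAGE_SIZE - 1 == 14095, which rounds down to 12288, so unmapping begins at the first page wholly beyond the new size and the partial page survives.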
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -186,6 +186,27 @@ void kzfree(const void *p) | |||
186 | } | 186 | } |
187 | EXPORT_SYMBOL(kzfree); | 187 | EXPORT_SYMBOL(kzfree); |
188 | 188 | ||
189 | int kern_ptr_validate(const void *ptr, unsigned long size) | ||
190 | { | ||
191 | unsigned long addr = (unsigned long)ptr; | ||
192 | unsigned long min_addr = PAGE_OFFSET; | ||
193 | unsigned long align_mask = sizeof(void *) - 1; | ||
194 | |||
195 | if (unlikely(addr < min_addr)) | ||
196 | goto out; | ||
197 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
198 | goto out; | ||
199 | if (unlikely(addr & align_mask)) | ||
200 | goto out; | ||
201 | if (unlikely(!kern_addr_valid(addr))) | ||
202 | goto out; | ||
203 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
204 | goto out; | ||
205 | return 1; | ||
206 | out: | ||
207 | return 0; | ||
208 | } | ||
209 | |||
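kern_ptr_validate() answers "may this kernel pointer be dereferenced for size bytes": above PAGE_OFFSET, below high_memory, pointer-aligned, and with valid kernel mappings at both ends. A hedged caller sketch; struct foo and the helper name are hypothetical, used only to show the check-before-dereference idiom:

	/* hypothetical caller: reject a possibly-bogus pointer up front */
	static struct foo *foo_get_checked(const void *ptr)
	{
		if (!kern_ptr_validate(ptr, sizeof(struct foo)))
			return NULL;	/* out of range, misaligned, or unmapped */
		return (struct foo *)ptr;
	}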
189 | /* | 210 | /* |
190 | * strndup_user - duplicate an existing string from user space | 211 | * strndup_user - duplicate an existing string from user space |
191 | * @s: The string to duplicate | 212 | * @s: The string to duplicate |
@@ -220,7 +241,7 @@ char *strndup_user(const char __user *s, long n) | |||
220 | } | 241 | } |
221 | EXPORT_SYMBOL(strndup_user); | 242 | EXPORT_SYMBOL(strndup_user); |
222 | 243 | ||
223 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | 244 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
224 | void arch_pick_mmap_layout(struct mm_struct *mm) | 245 | void arch_pick_mmap_layout(struct mm_struct *mm) |
225 | { | 246 | { |
226 | mm->mmap_base = TASK_UNMAPPED_BASE; | 247 | mm->mmap_base = TASK_UNMAPPED_BASE; |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4a44cd..ae007462b7f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) | |||
509 | 509 | ||
510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | 510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); |
511 | 511 | ||
512 | /* for per-CPU blocks */ | ||
513 | static void purge_fragmented_blocks_allcpus(void); | ||
514 | |||
512 | /* | 515 | /* |
513 | * Purges all lazily-freed vmap areas. | 516 | * Purges all lazily-freed vmap areas. |
514 | * | 517 | * |
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
539 | } else | 542 | } else |
540 | spin_lock(&purge_lock); | 543 | spin_lock(&purge_lock); |
541 | 544 | ||
545 | if (sync) | ||
546 | purge_fragmented_blocks_allcpus(); | ||
547 | |||
542 | rcu_read_lock(); | 548 | rcu_read_lock(); |
543 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | 549 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
544 | if (va->flags & VM_LAZY_FREE) { | 550 | if (va->flags & VM_LAZY_FREE) { |
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
555 | } | 561 | } |
556 | rcu_read_unlock(); | 562 | rcu_read_unlock(); |
557 | 563 | ||
558 | if (nr) { | 564 | if (nr) |
559 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
560 | atomic_sub(nr, &vmap_lazy_nr); | 565 | atomic_sub(nr, &vmap_lazy_nr); |
561 | } | ||
562 | 566 | ||
563 | if (nr || force_flush) | 567 | if (nr || force_flush) |
564 | flush_tlb_kernel_range(*start, *end); | 568 | flush_tlb_kernel_range(*start, *end); |
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; | |||
669 | struct vmap_block_queue { | 673 | struct vmap_block_queue { |
670 | spinlock_t lock; | 674 | spinlock_t lock; |
671 | struct list_head free; | 675 | struct list_head free; |
672 | struct list_head dirty; | ||
673 | unsigned int nr_dirty; | ||
674 | }; | 676 | }; |
675 | 677 | ||
676 | struct vmap_block { | 678 | struct vmap_block { |
@@ -680,10 +682,9 @@ struct vmap_block { | |||
680 | unsigned long free, dirty; | 682 | unsigned long free, dirty; |
681 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | 683 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); |
682 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 684 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); |
683 | union { | 685 | struct list_head free_list; |
684 | struct list_head free_list; | 686 | struct rcu_head rcu_head; |
685 | struct rcu_head rcu_head; | 687 | struct list_head purge; |
686 | }; | ||
687 | }; | 688 | }; |
688 | 689 | ||
689 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | 690 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ |
@@ -759,9 +760,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
759 | vbq = &get_cpu_var(vmap_block_queue); | 760 | vbq = &get_cpu_var(vmap_block_queue); |
760 | vb->vbq = vbq; | 761 | vb->vbq = vbq; |
761 | spin_lock(&vbq->lock); | 762 | spin_lock(&vbq->lock); |
762 | list_add(&vb->free_list, &vbq->free); | 763 | list_add_rcu(&vb->free_list, &vbq->free); |
763 | spin_unlock(&vbq->lock); | 764 | spin_unlock(&vbq->lock); |
764 | put_cpu_var(vmap_cpu_blocks); | 765 | put_cpu_var(vmap_block_queue); |
765 | 766 | ||
766 | return vb; | 767 | return vb; |
767 | } | 768 | } |
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) | |||
778 | struct vmap_block *tmp; | 779 | struct vmap_block *tmp; |
779 | unsigned long vb_idx; | 780 | unsigned long vb_idx; |
780 | 781 | ||
781 | BUG_ON(!list_empty(&vb->free_list)); | ||
782 | |||
783 | vb_idx = addr_to_vb_idx(vb->va->va_start); | 782 | vb_idx = addr_to_vb_idx(vb->va->va_start); |
784 | spin_lock(&vmap_block_tree_lock); | 783 | spin_lock(&vmap_block_tree_lock); |
785 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | 784 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); |
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) | |||
790 | call_rcu(&vb->rcu_head, rcu_free_vb); | 789 | call_rcu(&vb->rcu_head, rcu_free_vb); |
791 | } | 790 | } |
792 | 791 | ||
792 | static void purge_fragmented_blocks(int cpu) | ||
793 | { | ||
794 | LIST_HEAD(purge); | ||
795 | struct vmap_block *vb; | ||
796 | struct vmap_block *n_vb; | ||
797 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
798 | |||
799 | rcu_read_lock(); | ||
800 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
801 | |||
802 | if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) | ||
803 | continue; | ||
804 | |||
805 | spin_lock(&vb->lock); | ||
806 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | ||
807 | vb->free = 0; /* prevent further allocs after releasing lock */ | ||
808 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | ||
809 | bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); | ||
810 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | ||
811 | spin_lock(&vbq->lock); | ||
812 | list_del_rcu(&vb->free_list); | ||
813 | spin_unlock(&vbq->lock); | ||
814 | spin_unlock(&vb->lock); | ||
815 | list_add_tail(&vb->purge, &purge); | ||
816 | } else | ||
817 | spin_unlock(&vb->lock); | ||
818 | } | ||
819 | rcu_read_unlock(); | ||
820 | |||
821 | list_for_each_entry_safe(vb, n_vb, &purge, purge) { | ||
822 | list_del(&vb->purge); | ||
823 | free_vmap_block(vb); | ||
824 | } | ||
825 | } | ||
826 | |||
827 | static void purge_fragmented_blocks_thiscpu(void) | ||
828 | { | ||
829 | purge_fragmented_blocks(smp_processor_id()); | ||
830 | } | ||
831 | |||
832 | static void purge_fragmented_blocks_allcpus(void) | ||
833 | { | ||
834 | int cpu; | ||
835 | |||
836 | for_each_possible_cpu(cpu) | ||
837 | purge_fragmented_blocks(cpu); | ||
838 | } | ||
839 | |||
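The purge condition is evaluated twice above, once optimistically under RCU and again under vb->lock before the block is pulled off the free list. Isolated as a predicate it reads as follows; this is a restatement of the code above, not an existing kernel helper:

	/* purgeable: every bit is either free or dirty (no live allocations),
	 * yet not all dirty (a fully-dirty block is already being freed) */
	static bool vb_is_purgeable(const struct vmap_block *vb)
	{
		return vb->free + vb->dirty == VMAP_BBMAP_BITS &&
		       vb->dirty != VMAP_BBMAP_BITS;
	}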
793 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | 840 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) |
794 | { | 841 | { |
795 | struct vmap_block_queue *vbq; | 842 | struct vmap_block_queue *vbq; |
796 | struct vmap_block *vb; | 843 | struct vmap_block *vb; |
797 | unsigned long addr = 0; | 844 | unsigned long addr = 0; |
798 | unsigned int order; | 845 | unsigned int order; |
846 | int purge = 0; | ||
799 | 847 | ||
800 | BUG_ON(size & ~PAGE_MASK); | 848 | BUG_ON(size & ~PAGE_MASK); |
801 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 849 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
@@ -808,25 +856,39 @@ again: | |||
808 | int i; | 856 | int i; |
809 | 857 | ||
810 | spin_lock(&vb->lock); | 858 | spin_lock(&vb->lock); |
859 | if (vb->free < 1UL << order) | ||
860 | goto next; | ||
861 | |||
811 | i = bitmap_find_free_region(vb->alloc_map, | 862 | i = bitmap_find_free_region(vb->alloc_map, |
812 | VMAP_BBMAP_BITS, order); | 863 | VMAP_BBMAP_BITS, order); |
813 | 864 | ||
814 | if (i >= 0) { | 865 | if (i < 0) { |
815 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 866 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { |
816 | BUG_ON(addr_to_vb_idx(addr) != | 867 | /* fragmented and no outstanding allocations */ |
817 | addr_to_vb_idx(vb->va->va_start)); | 868 | BUG_ON(vb->dirty != VMAP_BBMAP_BITS); |
818 | vb->free -= 1UL << order; | 869 | purge = 1; |
819 | if (vb->free == 0) { | ||
820 | spin_lock(&vbq->lock); | ||
821 | list_del_init(&vb->free_list); | ||
822 | spin_unlock(&vbq->lock); | ||
823 | } | 870 | } |
824 | spin_unlock(&vb->lock); | 871 | goto next; |
825 | break; | 872 | } |
873 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
874 | BUG_ON(addr_to_vb_idx(addr) != | ||
875 | addr_to_vb_idx(vb->va->va_start)); | ||
876 | vb->free -= 1UL << order; | ||
877 | if (vb->free == 0) { | ||
878 | spin_lock(&vbq->lock); | ||
879 | list_del_rcu(&vb->free_list); | ||
880 | spin_unlock(&vbq->lock); | ||
826 | } | 881 | } |
827 | spin_unlock(&vb->lock); | 882 | spin_unlock(&vb->lock); |
883 | break; | ||
884 | next: | ||
885 | spin_unlock(&vb->lock); | ||
828 | } | 886 | } |
829 | put_cpu_var(vmap_cpu_blocks); | 887 | |
888 | if (purge) | ||
889 | purge_fragmented_blocks_thiscpu(); | ||
890 | |||
891 | put_cpu_var(vmap_block_queue); | ||
830 | rcu_read_unlock(); | 892 | rcu_read_unlock(); |
831 | 893 | ||
832 | if (!addr) { | 894 | if (!addr) { |
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size) | |||
862 | BUG_ON(!vb); | 924 | BUG_ON(!vb); |
863 | 925 | ||
864 | spin_lock(&vb->lock); | 926 | spin_lock(&vb->lock); |
865 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | 927 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); |
866 | 928 | ||
867 | vb->dirty += 1UL << order; | 929 | vb->dirty += 1UL << order; |
868 | if (vb->dirty == VMAP_BBMAP_BITS) { | 930 | if (vb->dirty == VMAP_BBMAP_BITS) { |
869 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | 931 | BUG_ON(vb->free); |
870 | spin_unlock(&vb->lock); | 932 | spin_unlock(&vb->lock); |
871 | free_vmap_block(vb); | 933 | free_vmap_block(vb); |
872 | } else | 934 | } else |
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void) | |||
1035 | vbq = &per_cpu(vmap_block_queue, i); | 1097 | vbq = &per_cpu(vmap_block_queue, i); |
1036 | spin_lock_init(&vbq->lock); | 1098 | spin_lock_init(&vbq->lock); |
1037 | INIT_LIST_HEAD(&vbq->free); | 1099 | INIT_LIST_HEAD(&vbq->free); |
1038 | INIT_LIST_HEAD(&vbq->dirty); | ||
1039 | vbq->nr_dirty = 0; | ||
1040 | } | 1100 | } |
1041 | 1101 | ||
1042 | /* Import existing vmlist entries. */ | 1102 | /* Import existing vmlist entries. */ |
@@ -1411,6 +1471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1411 | { | 1471 | { |
1412 | struct page **pages; | 1472 | struct page **pages; |
1413 | unsigned int nr_pages, array_size, i; | 1473 | unsigned int nr_pages, array_size, i; |
1474 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | ||
1414 | 1475 | ||
1415 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | 1476 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; |
1416 | array_size = (nr_pages * sizeof(struct page *)); | 1477 | array_size = (nr_pages * sizeof(struct page *)); |
@@ -1418,13 +1479,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1418 | area->nr_pages = nr_pages; | 1479 | area->nr_pages = nr_pages; |
1419 | /* Please note that the recursion is strictly bounded. */ | 1480 | /* Please note that the recursion is strictly bounded. */ |
1420 | if (array_size > PAGE_SIZE) { | 1481 | if (array_size > PAGE_SIZE) { |
1421 | pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, | 1482 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
1422 | PAGE_KERNEL, node, caller); | 1483 | PAGE_KERNEL, node, caller); |
1423 | area->flags |= VM_VPAGES; | 1484 | area->flags |= VM_VPAGES; |
1424 | } else { | 1485 | } else { |
1425 | pages = kmalloc_node(array_size, | 1486 | pages = kmalloc_node(array_size, nested_gfp, node); |
1426 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, | ||
1427 | node); | ||
1428 | } | 1487 | } |
1429 | area->pages = pages; | 1488 | area->pages = pages; |
1430 | area->caller = caller; | 1489 | area->caller = caller; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..3ff3311447f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -13,7 +13,7 @@ | |||
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/gfp.h> |
17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
@@ -55,6 +55,11 @@ struct scan_control { | |||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | 55 | /* Number of pages freed so far during a call to shrink_zones() */ |
56 | unsigned long nr_reclaimed; | 56 | unsigned long nr_reclaimed; |
57 | 57 | ||
58 | /* How many pages shrink_list() should reclaim */ | ||
59 | unsigned long nr_to_reclaim; | ||
60 | |||
61 | unsigned long hibernation_mode; | ||
62 | |||
58 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
59 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
60 | 65 | ||
@@ -66,12 +71,6 @@ struct scan_control { | |||
66 | /* Can pages be swapped as part of reclaim? */ | 71 | /* Can pages be swapped as part of reclaim? */ |
67 | int may_swap; | 72 | int may_swap; |
68 | 73 | ||
69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | ||
70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | ||
71 | * In this context, it doesn't matter that we scan the | ||
72 | * whole list at once. */ | ||
73 | int swap_cluster_max; | ||
74 | |||
75 | int swappiness; | 74 | int swappiness; |
76 | 75 | ||
77 | int all_unreclaimable; | 76 | int all_unreclaimable; |
@@ -263,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
263 | return ret; | 262 | return ret; |
264 | } | 263 | } |
265 | 264 | ||
266 | /* Called without lock on whether page is mapped, so answer is unstable */ | ||
267 | static inline int page_mapping_inuse(struct page *page) | ||
268 | { | ||
269 | struct address_space *mapping; | ||
270 | |||
271 | /* Page is in somebody's page tables. */ | ||
272 | if (page_mapped(page)) | ||
273 | return 1; | ||
274 | |||
275 | /* Be more reluctant to reclaim swapcache than pagecache */ | ||
276 | if (PageSwapCache(page)) | ||
277 | return 1; | ||
278 | |||
279 | mapping = page_mapping(page); | ||
280 | if (!mapping) | ||
281 | return 0; | ||
282 | |||
283 | /* File is mmap'd by somebody? */ | ||
284 | return mapping_mapped(mapping); | ||
285 | } | ||
286 | |||
287 | static inline int is_page_cache_freeable(struct page *page) | 265 | static inline int is_page_cache_freeable(struct page *page) |
288 | { | 266 | { |
289 | /* | 267 | /* |
@@ -358,7 +336,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
358 | * stalls if we need to run get_block(). We could test | 336 | * stalls if we need to run get_block(). We could test |
359 | * PagePrivate for that. | 337 | * PagePrivate for that. |
360 | * | 338 | * |
361 | * If this process is currently in generic_file_write() against | 339 | * If this process is currently in __generic_file_aio_write() against |
362 | * this page's queue, we can perform writeback even if that | 340 | * this page's queue, we can perform writeback even if that |
363 | * will block. | 341 | * will block. |
364 | * | 342 | * |
@@ -580,6 +558,65 @@ redo: | |||
580 | put_page(page); /* drop ref from isolate */ | 558 | put_page(page); /* drop ref from isolate */ |
581 | } | 559 | } |
582 | 560 | ||
561 | enum page_references { | ||
562 | PAGEREF_RECLAIM, | ||
563 | PAGEREF_RECLAIM_CLEAN, | ||
564 | PAGEREF_KEEP, | ||
565 | PAGEREF_ACTIVATE, | ||
566 | }; | ||
567 | |||
568 | static enum page_references page_check_references(struct page *page, | ||
569 | struct scan_control *sc) | ||
570 | { | ||
571 | int referenced_ptes, referenced_page; | ||
572 | unsigned long vm_flags; | ||
573 | |||
574 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | ||
575 | referenced_page = TestClearPageReferenced(page); | ||
576 | |||
577 | /* Lumpy reclaim - ignore references */ | ||
578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
579 | return PAGEREF_RECLAIM; | ||
580 | |||
581 | /* | ||
582 | * Mlock lost the isolation race with us. Let try_to_unmap() | ||
583 | * move the page to the unevictable list. | ||
584 | */ | ||
585 | if (vm_flags & VM_LOCKED) | ||
586 | return PAGEREF_RECLAIM; | ||
587 | |||
588 | if (referenced_ptes) { | ||
589 | if (PageAnon(page)) | ||
590 | return PAGEREF_ACTIVATE; | ||
591 | /* | ||
592 | * All mapped pages start out with page table | ||
593 | * references from the instantiating fault, so we need | ||
594 | * to look twice if a mapped file page is used more | ||
595 | * than once. | ||
596 | * | ||
597 | * Mark it and spare it for another trip around the | ||
598 | * inactive list. Another page table reference will | ||
599 | * lead to its activation. | ||
600 | * | ||
601 | * Note: the mark is set for activated pages as well | ||
602 | * so that recently deactivated but used pages are | ||
603 | * quickly recovered. | ||
604 | */ | ||
605 | SetPageReferenced(page); | ||
606 | |||
607 | if (referenced_page) | ||
608 | return PAGEREF_ACTIVATE; | ||
609 | |||
610 | return PAGEREF_KEEP; | ||
611 | } | ||
612 | |||
613 | /* Reclaim if clean, defer dirty pages to writeback */ | ||
614 | if (referenced_page) | ||
615 | return PAGEREF_RECLAIM_CLEAN; | ||
616 | |||
617 | return PAGEREF_RECLAIM; | ||
618 | } | ||
619 | |||
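The return value of page_check_references() summarized as a decision table (derived from the function above; lumpy reclaim and VM_LOCKED override everything, and the referenced bit is cleared as a side effect in all cases):

	overrides: sc->order > PAGE_ALLOC_COSTLY_ORDER, or VM_LOCKED  -> PAGEREF_RECLAIM

	referenced_ptes  referenced_page  page type  result
	yes              -                anon       PAGEREF_ACTIVATE
	yes              yes              file       PAGEREF_ACTIVATE
	yes              no               file       PAGEREF_KEEP (marked referenced)
	no               yes              any        PAGEREF_RECLAIM_CLEAN
	no               no               any        PAGEREF_RECLAIM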
583 | /* | 620 | /* |
584 | * shrink_page_list() returns the number of reclaimed pages | 621 | * shrink_page_list() returns the number of reclaimed pages |
585 | */ | 622 | */ |
@@ -591,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
591 | struct pagevec freed_pvec; | 628 | struct pagevec freed_pvec; |
592 | int pgactivate = 0; | 629 | int pgactivate = 0; |
593 | unsigned long nr_reclaimed = 0; | 630 | unsigned long nr_reclaimed = 0; |
594 | unsigned long vm_flags; | ||
595 | 631 | ||
596 | cond_resched(); | 632 | cond_resched(); |
597 | 633 | ||
598 | pagevec_init(&freed_pvec, 1); | 634 | pagevec_init(&freed_pvec, 1); |
599 | while (!list_empty(page_list)) { | 635 | while (!list_empty(page_list)) { |
636 | enum page_references references; | ||
600 | struct address_space *mapping; | 637 | struct address_space *mapping; |
601 | struct page *page; | 638 | struct page *page; |
602 | int may_enter_fs; | 639 | int may_enter_fs; |
603 | int referenced; | ||
604 | 640 | ||
605 | cond_resched(); | 641 | cond_resched(); |
606 | 642 | ||
@@ -642,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
642 | goto keep_locked; | 678 | goto keep_locked; |
643 | } | 679 | } |
644 | 680 | ||
645 | referenced = page_referenced(page, 1, | 681 | references = page_check_references(page, sc); |
646 | sc->mem_cgroup, &vm_flags); | 682 | switch (references) { |
647 | /* | 683 | case PAGEREF_ACTIVATE: |
648 | * In active use or really unfreeable? Activate it. | ||
649 | * If a page which has PG_mlocked lost the isolation race, | ||
650 | * try_to_unmap moves it to the unevictable list | ||
651 | */ | ||
652 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | ||
653 | referenced && page_mapping_inuse(page) | ||
654 | && !(vm_flags & VM_LOCKED)) | ||
655 | goto activate_locked; | 684 | goto activate_locked; |
685 | case PAGEREF_KEEP: | ||
686 | goto keep_locked; | ||
687 | case PAGEREF_RECLAIM: | ||
688 | case PAGEREF_RECLAIM_CLEAN: | ||
689 | ; /* try to reclaim the page below */ | ||
690 | } | ||
656 | 691 | ||
657 | /* | 692 | /* |
658 | * Anonymous process memory has backing store? | 693 | * Anonymous process memory has backing store? |
@@ -686,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
686 | } | 721 | } |
687 | 722 | ||
688 | if (PageDirty(page)) { | 723 | if (PageDirty(page)) { |
689 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 724 | if (references == PAGEREF_RECLAIM_CLEAN) |
690 | goto keep_locked; | 725 | goto keep_locked; |
691 | if (!may_enter_fs) | 726 | if (!may_enter_fs) |
692 | goto keep_locked; | 727 | goto keep_locked; |
@@ -1132,7 +1167,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1132 | unsigned long nr_anon; | 1167 | unsigned long nr_anon; |
1133 | unsigned long nr_file; | 1168 | unsigned long nr_file; |
1134 | 1169 | ||
1135 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1170 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, |
1136 | &page_list, &nr_scan, sc->order, mode, | 1171 | &page_list, &nr_scan, sc->order, mode, |
1137 | zone, sc->mem_cgroup, 0, file); | 1172 | zone, sc->mem_cgroup, 0, file); |
1138 | 1173 | ||
@@ -1166,10 +1201,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1166 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | 1201 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); |
1167 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | 1202 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); |
1168 | 1203 | ||
1169 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1204 | reclaim_stat->recent_scanned[0] += nr_anon; |
1170 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1205 | reclaim_stat->recent_scanned[1] += nr_file; |
1171 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
1172 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
1173 | 1206 | ||
1174 | spin_unlock_irq(&zone->lru_lock); | 1207 | spin_unlock_irq(&zone->lru_lock); |
1175 | 1208 | ||
@@ -1353,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1353 | continue; | 1386 | continue; |
1354 | } | 1387 | } |
1355 | 1388 | ||
1356 | /* page_referenced clears PageReferenced */ | 1389 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1357 | if (page_mapping_inuse(page) && | ||
1358 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | ||
1359 | nr_rotated++; | 1390 | nr_rotated++; |
1360 | /* | 1391 | /* |
1361 | * Identify referenced, file-backed active pages and | 1392 | * Identify referenced, file-backed active pages and |
@@ -1464,20 +1495,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1464 | return low; | 1495 | return low; |
1465 | } | 1496 | } |
1466 | 1497 | ||
1498 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | ||
1499 | int file) | ||
1500 | { | ||
1501 | if (file) | ||
1502 | return inactive_file_is_low(zone, sc); | ||
1503 | else | ||
1504 | return inactive_anon_is_low(zone, sc); | ||
1505 | } | ||
1506 | |||
1467 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1507 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1468 | struct zone *zone, struct scan_control *sc, int priority) | 1508 | struct zone *zone, struct scan_control *sc, int priority) |
1469 | { | 1509 | { |
1470 | int file = is_file_lru(lru); | 1510 | int file = is_file_lru(lru); |
1471 | 1511 | ||
1472 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { | 1512 | if (is_active_lru(lru)) { |
1473 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1513 | if (inactive_list_is_low(zone, sc, file)) |
1514 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1474 | return 0; | 1515 | return 0; |
1475 | } | 1516 | } |
1476 | 1517 | ||
1477 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { | ||
1478 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1479 | return 0; | ||
1480 | } | ||
1481 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1518 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1482 | } | 1519 | } |
1483 | 1520 | ||
@@ -1567,15 +1604,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1567 | * until we collected @swap_cluster_max pages to scan. | 1604 | * until we collected @swap_cluster_max pages to scan. |
1568 | */ | 1605 | */ |
1569 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | 1606 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, |
1570 | unsigned long *nr_saved_scan, | 1607 | unsigned long *nr_saved_scan) |
1571 | unsigned long swap_cluster_max) | ||
1572 | { | 1608 | { |
1573 | unsigned long nr; | 1609 | unsigned long nr; |
1574 | 1610 | ||
1575 | *nr_saved_scan += nr_to_scan; | 1611 | *nr_saved_scan += nr_to_scan; |
1576 | nr = *nr_saved_scan; | 1612 | nr = *nr_saved_scan; |
1577 | 1613 | ||
1578 | if (nr >= swap_cluster_max) | 1614 | if (nr >= SWAP_CLUSTER_MAX) |
1579 | *nr_saved_scan = 0; | 1615 | *nr_saved_scan = 0; |
1580 | else | 1616 | else |
1581 | nr = 0; | 1617 | nr = 0; |
@@ -1594,7 +1630,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1594 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1630 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1595 | enum lru_list l; | 1631 | enum lru_list l; |
1596 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1632 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1597 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1633 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1598 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1634 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1599 | int noswap = 0; | 1635 | int noswap = 0; |
1600 | 1636 | ||
@@ -1616,15 +1652,15 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1616 | scan = (scan * percent[file]) / 100; | 1652 | scan = (scan * percent[file]) / 100; |
1617 | } | 1653 | } |
1618 | nr[l] = nr_scan_try_batch(scan, | 1654 | nr[l] = nr_scan_try_batch(scan, |
1619 | &reclaim_stat->nr_saved_scan[l], | 1655 | &reclaim_stat->nr_saved_scan[l]); |
1620 | swap_cluster_max); | ||
1621 | } | 1656 | } |
1622 | 1657 | ||
1623 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1658 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1624 | nr[LRU_INACTIVE_FILE]) { | 1659 | nr[LRU_INACTIVE_FILE]) { |
1625 | for_each_evictable_lru(l) { | 1660 | for_each_evictable_lru(l) { |
1626 | if (nr[l]) { | 1661 | if (nr[l]) { |
1627 | nr_to_scan = min(nr[l], swap_cluster_max); | 1662 | nr_to_scan = min_t(unsigned long, |
1663 | nr[l], SWAP_CLUSTER_MAX); | ||
1628 | nr[l] -= nr_to_scan; | 1664 | nr[l] -= nr_to_scan; |
1629 | 1665 | ||
1630 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1666 | nr_reclaimed += shrink_list(l, nr_to_scan, |
@@ -1639,8 +1675,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1639 | * with multiple processes reclaiming pages, the total | 1675 | * with multiple processes reclaiming pages, the total |
1640 | * freeing target can get unreasonably large. | 1676 | * freeing target can get unreasonably large. |
1641 | */ | 1677 | */ |
1642 | if (nr_reclaimed > swap_cluster_max && | 1678 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1643 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1644 | break; | 1679 | break; |
1645 | } | 1680 | } |
1646 | 1681 | ||
@@ -1693,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1693 | continue; | 1728 | continue; |
1694 | note_zone_scanning_priority(zone, priority); | 1729 | note_zone_scanning_priority(zone, priority); |
1695 | 1730 | ||
1696 | if (zone_is_all_unreclaimable(zone) && | 1731 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1697 | priority != DEF_PRIORITY) | ||
1698 | continue; /* Let kswapd poll it */ | 1732 | continue; /* Let kswapd poll it */ |
1699 | sc->all_unreclaimable = 0; | 1733 | sc->all_unreclaimable = 0; |
1700 | } else { | 1734 | } else { |
@@ -1738,6 +1772,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1738 | struct zoneref *z; | 1772 | struct zoneref *z; |
1739 | struct zone *zone; | 1773 | struct zone *zone; |
1740 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1774 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1775 | unsigned long writeback_threshold; | ||
1741 | 1776 | ||
1742 | delayacct_freepages_start(); | 1777 | delayacct_freepages_start(); |
1743 | 1778 | ||
@@ -1773,7 +1808,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1773 | } | 1808 | } |
1774 | } | 1809 | } |
1775 | total_scanned += sc->nr_scanned; | 1810 | total_scanned += sc->nr_scanned; |
1776 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { | 1811 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1777 | ret = sc->nr_reclaimed; | 1812 | ret = sc->nr_reclaimed; |
1778 | goto out; | 1813 | goto out; |
1779 | } | 1814 | } |
@@ -1785,14 +1820,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1785 | * that's undesirable in laptop mode, where we *want* lumpy | 1820 | * that's undesirable in laptop mode, where we *want* lumpy |
1786 | * writeout. So in laptop mode, write out the whole world. | 1821 | * writeout. So in laptop mode, write out the whole world. |
1787 | */ | 1822 | */ |
1788 | if (total_scanned > sc->swap_cluster_max + | 1823 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
1789 | sc->swap_cluster_max / 2) { | 1824 | if (total_scanned > writeback_threshold) { |
1790 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 1825 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
1791 | sc->may_writepage = 1; | 1826 | sc->may_writepage = 1; |
1792 | } | 1827 | } |
1793 | 1828 | ||
1794 | /* Take a nap, wait for some writeback to complete */ | 1829 | /* Take a nap, wait for some writeback to complete */ |
1795 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1830 | if (!sc->hibernation_mode && sc->nr_scanned && |
1831 | priority < DEF_PRIORITY - 2) | ||
1796 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1832 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1797 | } | 1833 | } |
1798 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1834 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
@@ -1831,7 +1867,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1831 | struct scan_control sc = { | 1867 | struct scan_control sc = { |
1832 | .gfp_mask = gfp_mask, | 1868 | .gfp_mask = gfp_mask, |
1833 | .may_writepage = !laptop_mode, | 1869 | .may_writepage = !laptop_mode, |
1834 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1870 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1835 | .may_unmap = 1, | 1871 | .may_unmap = 1, |
1836 | .may_swap = 1, | 1872 | .may_swap = 1, |
1837 | .swappiness = vm_swappiness, | 1873 | .swappiness = vm_swappiness, |
@@ -1855,7 +1891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
1855 | .may_writepage = !laptop_mode, | 1891 | .may_writepage = !laptop_mode, |
1856 | .may_unmap = 1, | 1892 | .may_unmap = 1, |
1857 | .may_swap = !noswap, | 1893 | .may_swap = !noswap, |
1858 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1859 | .swappiness = swappiness, | 1894 | .swappiness = swappiness, |
1860 | .order = 0, | 1895 | .order = 0, |
1861 | .mem_cgroup = mem, | 1896 | .mem_cgroup = mem, |
@@ -1889,7 +1924,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1889 | .may_writepage = !laptop_mode, | 1924 | .may_writepage = !laptop_mode, |
1890 | .may_unmap = 1, | 1925 | .may_unmap = 1, |
1891 | .may_swap = !noswap, | 1926 | .may_swap = !noswap, |
1892 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1927 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1893 | .swappiness = swappiness, | 1928 | .swappiness = swappiness, |
1894 | .order = 0, | 1929 | .order = 0, |
1895 | .mem_cgroup = mem_cont, | 1930 | .mem_cgroup = mem_cont, |
@@ -1904,6 +1939,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1904 | } | 1939 | } |
1905 | #endif | 1940 | #endif |
1906 | 1941 | ||
1942 | /* is kswapd sleeping prematurely? */ | ||
1943 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | ||
1944 | { | ||
1945 | int i; | ||
1946 | |||
1947 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | ||
1948 | if (remaining) | ||
1949 | return 1; | ||
1950 | |||
1951 | /* If after HZ/10, a zone is below the high mark, it's premature */ | ||
1952 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
1953 | struct zone *zone = pgdat->node_zones + i; | ||
1954 | |||
1955 | if (!populated_zone(zone)) | ||
1956 | continue; | ||
1957 | |||
1958 | if (zone->all_unreclaimable) | ||
1959 | continue; | ||
1960 | |||
1961 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | ||
1962 | 0, 0)) | ||
1963 | return 1; | ||
1964 | } | ||
1965 | |||
1966 | return 0; | ||
1967 | } | ||
1968 | |||
1907 | /* | 1969 | /* |
1908 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1970 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1909 | * they are all at high_wmark_pages(zone). | 1971 | * they are all at high_wmark_pages(zone). |
@@ -1936,7 +1998,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1936 | .gfp_mask = GFP_KERNEL, | 1998 | .gfp_mask = GFP_KERNEL, |
1937 | .may_unmap = 1, | 1999 | .may_unmap = 1, |
1938 | .may_swap = 1, | 2000 | .may_swap = 1, |
1939 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 2001 | /* |
2002 | * kswapd doesn't want to be bailed out while reclaiming, because | ||
2003 | * we want to put equal scanning pressure on each zone. | ||
2004 | */ | ||
2005 | .nr_to_reclaim = ULONG_MAX, | ||
1940 | .swappiness = vm_swappiness, | 2006 | .swappiness = vm_swappiness, |
1941 | .order = order, | 2007 | .order = order, |
1942 | .mem_cgroup = NULL, | 2008 | .mem_cgroup = NULL, |
@@ -1961,6 +2027,7 @@ loop_again: | |||
1961 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2027 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1962 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2028 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
1963 | unsigned long lru_pages = 0; | 2029 | unsigned long lru_pages = 0; |
2030 | int has_under_min_watermark_zone = 0; | ||
1964 | 2031 | ||
1965 | /* The swap token gets in the way of swapout... */ | 2032 | /* The swap token gets in the way of swapout... */ |
1966 | if (!priority) | 2033 | if (!priority) |
@@ -1978,8 +2045,7 @@ loop_again: | |||
1978 | if (!populated_zone(zone)) | 2045 | if (!populated_zone(zone)) |
1979 | continue; | 2046 | continue; |
1980 | 2047 | ||
1981 | if (zone_is_all_unreclaimable(zone) && | 2048 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1982 | priority != DEF_PRIORITY) | ||
1983 | continue; | 2049 | continue; |
1984 | 2050 | ||
1985 | /* | 2051 | /* |
@@ -2022,13 +2088,9 @@ loop_again: | |||
2022 | if (!populated_zone(zone)) | 2088 | if (!populated_zone(zone)) |
2023 | continue; | 2089 | continue; |
2024 | 2090 | ||
2025 | if (zone_is_all_unreclaimable(zone) && | 2091 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2026 | priority != DEF_PRIORITY) | ||
2027 | continue; | 2092 | continue; |
2028 | 2093 | ||
2029 | if (!zone_watermark_ok(zone, order, | ||
2030 | high_wmark_pages(zone), end_zone, 0)) | ||
2031 | all_zones_ok = 0; | ||
2032 | temp_priority[i] = priority; | 2094 | temp_priority[i] = priority; |
2033 | sc.nr_scanned = 0; | 2095 | sc.nr_scanned = 0; |
2034 | note_zone_scanning_priority(zone, priority); | 2096 | note_zone_scanning_priority(zone, priority); |
@@ -2053,12 +2115,11 @@ loop_again: | |||
2053 | lru_pages); | 2115 | lru_pages); |
2054 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2116 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2055 | total_scanned += sc.nr_scanned; | 2117 | total_scanned += sc.nr_scanned; |
2056 | if (zone_is_all_unreclaimable(zone)) | 2118 | if (zone->all_unreclaimable) |
2057 | continue; | 2119 | continue; |
2058 | if (nr_slab == 0 && zone->pages_scanned >= | 2120 | if (nr_slab == 0 && |
2059 | (zone_reclaimable_pages(zone) * 6)) | 2121 | zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) |
2060 | zone_set_flag(zone, | 2122 | zone->all_unreclaimable = 1; |
2061 | ZONE_ALL_UNRECLAIMABLE); | ||
2062 | /* | 2123 | /* |
2063 | * If we've done a decent amount of scanning and | 2124 | * If we've done a decent amount of scanning and |
2064 | * the reclaim ratio is low, start doing writepage | 2125 | * the reclaim ratio is low, start doing writepage |
@@ -2067,6 +2128,20 @@ loop_again: | |||
2067 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2128 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
2068 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2129 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2069 | sc.may_writepage = 1; | 2130 | sc.may_writepage = 1; |
2131 | |||
2132 | if (!zone_watermark_ok(zone, order, | ||
2133 | high_wmark_pages(zone), end_zone, 0)) { | ||
2134 | all_zones_ok = 0; | ||
2135 | /* | ||
2136 | * We are still under min water mark. This | ||
2137 | * means that we have a GFP_ATOMIC allocation | ||
2138 | * failure risk. Hurry up! | ||
2139 | */ | ||
2140 | if (!zone_watermark_ok(zone, order, | ||
2141 | min_wmark_pages(zone), end_zone, 0)) | ||
2142 | has_under_min_watermark_zone = 1; | ||
2143 | } | ||
2144 | |||
2070 | } | 2145 | } |
2071 | if (all_zones_ok) | 2146 | if (all_zones_ok) |
2072 | break; /* kswapd: all done */ | 2147 | break; /* kswapd: all done */ |
@@ -2074,8 +2149,12 @@ loop_again: | |||
2074 | * OK, kswapd is getting into trouble. Take a nap, then take | 2149 | * OK, kswapd is getting into trouble. Take a nap, then take |
2075 | * another pass across the zones. | 2150 | * another pass across the zones. |
2076 | */ | 2151 | */ |
2077 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2152 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
2078 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2153 | if (has_under_min_watermark_zone) |
2154 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2155 | else | ||
2156 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
2157 | } | ||
2079 | 2158 | ||
2080 | /* | 2159 | /* |
2081 | * We do this so kswapd doesn't build up large priorities for | 2160 | * We do this so kswapd doesn't build up large priorities for |
@@ -2173,6 +2252,7 @@ static int kswapd(void *p) | |||
2173 | order = 0; | 2252 | order = 0; |
2174 | for ( ; ; ) { | 2253 | for ( ; ; ) { |
2175 | unsigned long new_order; | 2254 | unsigned long new_order; |
2255 | int ret; | ||
2176 | 2256 | ||
2177 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2257 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2178 | new_order = pgdat->kswapd_max_order; | 2258 | new_order = pgdat->kswapd_max_order; |
@@ -2184,19 +2264,45 @@ static int kswapd(void *p) | |||
2184 | */ | 2264 | */ |
2185 | order = new_order; | 2265 | order = new_order; |
2186 | } else { | 2266 | } else { |
2187 | if (!freezing(current)) | 2267 | if (!freezing(current) && !kthread_should_stop()) { |
2188 | schedule(); | 2268 | long remaining = 0; |
2269 | |||
2270 | /* Try to sleep for a short interval */ | ||
2271 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2272 | remaining = schedule_timeout(HZ/10); | ||
2273 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2274 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2275 | } | ||
2276 | |||
2277 | /* | ||
2278 | * After a short sleep, check if it was a | ||
2279 | * premature sleep. If not, then go fully | ||
2280 | * to sleep until explicitly woken up | ||
2281 | */ | ||
2282 | if (!sleeping_prematurely(pgdat, order, remaining)) | ||
2283 | schedule(); | ||
2284 | else { | ||
2285 | if (remaining) | ||
2286 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2287 | else | ||
2288 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2289 | } | ||
2290 | } | ||
2189 | 2291 | ||
2190 | order = pgdat->kswapd_max_order; | 2292 | order = pgdat->kswapd_max_order; |
2191 | } | 2293 | } |
2192 | finish_wait(&pgdat->kswapd_wait, &wait); | 2294 | finish_wait(&pgdat->kswapd_wait, &wait); |
2193 | 2295 | ||
2194 | if (!try_to_freeze()) { | 2296 | ret = try_to_freeze(); |
2195 | /* We can speed up thawing tasks if we don't call | 2297 | if (kthread_should_stop()) |
2196 | * balance_pgdat after returning from the refrigerator | 2298 | break; |
2197 | */ | 2299 | |
2300 | /* | ||
2301 | * We can speed up thawing tasks if we don't call balance_pgdat | ||
2302 | * after returning from the refrigerator | ||
2303 | */ | ||
2304 | if (!ret) | ||
2198 | balance_pgdat(pgdat, order); | 2305 | balance_pgdat(pgdat, order); |
2199 | } | ||
2200 | } | 2306 | } |
2201 | return 0; | 2307 | return 0; |
2202 | } | 2308 | } |
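The reworked kswapd sleep path above condenses to a two-phase check; this is a restatement of the code, with the prepare_to_wait()/finish_wait() bookkeeping elided:

	long remaining = 0;

	/* phase 1: trial nap, only if watermarks look healthy right now */
	if (!sleeping_prematurely(pgdat, order, remaining))
		remaining = schedule_timeout(HZ/10);

	/* phase 2: still healthy after the nap? sleep fully until woken */
	if (!sleeping_prematurely(pgdat, order, remaining))
		schedule();
	else
		count_vm_event(remaining ? KSWAPD_LOW_WMARK_HIT_QUICKLY
					 : KSWAPD_HIGH_WMARK_HIT_QUICKLY);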
@@ -2260,148 +2366,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
2260 | 2366 | ||
2261 | #ifdef CONFIG_HIBERNATION | 2367 | #ifdef CONFIG_HIBERNATION |
2262 | /* | 2368 | /* |
2263 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2369 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
2264 | * from LRU lists system-wide, for given pass and priority. | ||
2265 | * | ||
2266 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
2267 | */ | ||
2268 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
2269 | int pass, struct scan_control *sc) | ||
2270 | { | ||
2271 | struct zone *zone; | ||
2272 | unsigned long nr_reclaimed = 0; | ||
2273 | struct zone_reclaim_stat *reclaim_stat; | ||
2274 | |||
2275 | for_each_populated_zone(zone) { | ||
2276 | enum lru_list l; | ||
2277 | |||
2278 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
2279 | continue; | ||
2280 | |||
2281 | for_each_evictable_lru(l) { | ||
2282 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
2283 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
2284 | |||
2285 | /* For pass = 0, we don't shrink the active list */ | ||
2286 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
2287 | l == LRU_ACTIVE_FILE)) | ||
2288 | continue; | ||
2289 | |||
2290 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
2291 | reclaim_stat->nr_saved_scan[l] += | ||
2292 | (lru_pages >> prio) + 1; | ||
2293 | if (reclaim_stat->nr_saved_scan[l] | ||
2294 | >= nr_pages || pass > 3) { | ||
2295 | unsigned long nr_to_scan; | ||
2296 | |||
2297 | reclaim_stat->nr_saved_scan[l] = 0; | ||
2298 | nr_to_scan = min(nr_pages, lru_pages); | ||
2299 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
2300 | sc, prio); | ||
2301 | if (nr_reclaimed >= nr_pages) { | ||
2302 | sc->nr_reclaimed += nr_reclaimed; | ||
2303 | return; | ||
2304 | } | ||
2305 | } | ||
2306 | } | ||
2307 | } | ||
2308 | sc->nr_reclaimed += nr_reclaimed; | ||
2309 | } | ||
2310 | |||
2311 | /* | ||
2312 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
2313 | * freed pages. | 2370 | * freed pages. |
2314 | * | 2371 | * |
2315 | * Rather than trying to age LRUs the aim is to preserve the overall | 2372 | * Rather than trying to age LRUs the aim is to preserve the overall |
2316 | * LRU order by reclaiming preferentially | 2373 | * LRU order by reclaiming preferentially |
2317 | * inactive > active > active referenced > active mapped | 2374 | * inactive > active > active referenced > active mapped |
2318 | */ | 2375 | */ |
2319 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2376 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
2320 | { | 2377 | { |
2321 | unsigned long lru_pages, nr_slab; | ||
2322 | int pass; | ||
2323 | struct reclaim_state reclaim_state; | 2378 | struct reclaim_state reclaim_state; |
2324 | struct scan_control sc = { | 2379 | struct scan_control sc = { |
2325 | .gfp_mask = GFP_KERNEL, | 2380 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
2326 | .may_unmap = 0, | 2381 | .may_swap = 1, |
2382 | .may_unmap = 1, | ||
2327 | .may_writepage = 1, | 2383 | .may_writepage = 1, |
2384 | .nr_to_reclaim = nr_to_reclaim, | ||
2385 | .hibernation_mode = 1, | ||
2386 | .swappiness = vm_swappiness, | ||
2387 | .order = 0, | ||
2328 | .isolate_pages = isolate_pages_global, | 2388 | .isolate_pages = isolate_pages_global, |
2329 | .nr_reclaimed = 0, | ||
2330 | }; | 2389 | }; |
2390 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2391 | struct task_struct *p = current; | ||
2392 | unsigned long nr_reclaimed; | ||
2331 | 2393 | ||
2332 | current->reclaim_state = &reclaim_state; | 2394 | p->flags |= PF_MEMALLOC; |
2333 | 2395 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
2334 | lru_pages = global_reclaimable_pages(); | 2396 | reclaim_state.reclaimed_slab = 0; |
2335 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2397 | p->reclaim_state = &reclaim_state; |
2336 | /* If slab caches are huge, it's better to hit them first */ | ||
2337 | while (nr_slab >= lru_pages) { | ||
2338 | reclaim_state.reclaimed_slab = 0; | ||
2339 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
2340 | if (!reclaim_state.reclaimed_slab) | ||
2341 | break; | ||
2342 | |||
2343 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2344 | if (sc.nr_reclaimed >= nr_pages) | ||
2345 | goto out; | ||
2346 | |||
2347 | nr_slab -= reclaim_state.reclaimed_slab; | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * We try to shrink LRUs in 5 passes: | ||
2352 | * 0 = Reclaim from inactive_list only | ||
2353 | * 1 = Reclaim from active list but don't reclaim mapped | ||
2354 | * 2 = 2nd pass of type 1 | ||
2355 | * 3 = Reclaim mapped (normal reclaim) | ||
2356 | * 4 = 2nd pass of type 3 | ||
2357 | */ | ||
2358 | for (pass = 0; pass < 5; pass++) { | ||
2359 | int prio; | ||
2360 | |||
2361 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
2362 | if (pass > 2) | ||
2363 | sc.may_unmap = 1; | ||
2364 | |||
2365 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
2366 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
2367 | |||
2368 | sc.nr_scanned = 0; | ||
2369 | sc.swap_cluster_max = nr_to_scan; | ||
2370 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
2371 | if (sc.nr_reclaimed >= nr_pages) | ||
2372 | goto out; | ||
2373 | |||
2374 | reclaim_state.reclaimed_slab = 0; | ||
2375 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
2376 | global_reclaimable_pages()); | ||
2377 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2378 | if (sc.nr_reclaimed >= nr_pages) | ||
2379 | goto out; | ||
2380 | |||
2381 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
2382 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
2383 | } | ||
2384 | } | ||
2385 | |||
2386 | /* | ||
2387 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
2388 | * something in slab caches | ||
2389 | */ | ||
2390 | if (!sc.nr_reclaimed) { | ||
2391 | do { | ||
2392 | reclaim_state.reclaimed_slab = 0; | ||
2393 | shrink_slab(nr_pages, sc.gfp_mask, | ||
2394 | global_reclaimable_pages()); | ||
2395 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2396 | } while (sc.nr_reclaimed < nr_pages && | ||
2397 | reclaim_state.reclaimed_slab > 0); | ||
2398 | } | ||
2399 | 2398 | ||
2399 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
2400 | 2400 | ||
2401 | out: | 2401 | p->reclaim_state = NULL; |
2402 | current->reclaim_state = NULL; | 2402 | lockdep_clear_current_reclaim_state(); |
2403 | p->flags &= ~PF_MEMALLOC; | ||
2403 | 2404 | ||
2404 | return sc.nr_reclaimed; | 2405 | return nr_reclaimed; |
2405 | } | 2406 | } |
2406 | #endif /* CONFIG_HIBERNATION */ | 2407 | #endif /* CONFIG_HIBERNATION */ |
2407 | 2408 | ||
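With this rewrite, shrink_all_memory() becomes a thin wrapper around do_try_to_free_pages(): hibernation_mode suppresses the congestion naps and nr_to_reclaim bounds the work. A hedged sketch of a hibernation-side caller; the target computation and helper name are illustrative, not kernel API:

	/* hypothetical caller: make room for the hibernation image */
	unsigned long to_free = pages_needed_for_image();	/* illustrative */
	unsigned long freed = shrink_all_memory(to_free);

	if (freed < to_free)
		printk(KERN_WARNING "hibernation: freed %lu of %lu pages\n",
		       freed, to_free);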
@@ -2451,6 +2452,17 @@ int kswapd_run(int nid) | |||
2451 | return ret; | 2452 | return ret; |
2452 | } | 2453 | } |
2453 | 2454 | ||
2455 | /* | ||
2456 | * Called by memory hotplug when all memory in a node is offlined. | ||
2457 | */ | ||
2458 | void kswapd_stop(int nid) | ||
2459 | { | ||
2460 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | ||
2461 | |||
2462 | if (kswapd) | ||
2463 | kthread_stop(kswapd); | ||
2464 | } | ||
2465 | |||
2454 | static int __init kswapd_init(void) | 2466 | static int __init kswapd_init(void) |
2455 | { | 2467 | { |
2456 | int nid; | 2468 | int nid; |
@@ -2553,8 +2565,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2553 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2565 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2554 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2566 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
2555 | .may_swap = 1, | 2567 | .may_swap = 1, |
2556 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2568 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
2557 | SWAP_CLUSTER_MAX), | 2569 | SWAP_CLUSTER_MAX), |
2558 | .gfp_mask = gfp_mask, | 2570 | .gfp_mask = gfp_mask, |
2559 | .swappiness = vm_swappiness, | 2571 | .swappiness = vm_swappiness, |
2560 | .order = order, | 2572 | .order = order, |
@@ -2570,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2570 | * and RECLAIM_SWAP. | 2582 | * and RECLAIM_SWAP. |
2571 | */ | 2583 | */ |
2572 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 2584 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
2585 | lockdep_set_current_reclaim_state(gfp_mask); | ||
2573 | reclaim_state.reclaimed_slab = 0; | 2586 | reclaim_state.reclaimed_slab = 0; |
2574 | p->reclaim_state = &reclaim_state; | 2587 | p->reclaim_state = &reclaim_state; |
2575 | 2588 | ||
@@ -2613,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2613 | 2626 | ||
2614 | p->reclaim_state = NULL; | 2627 | p->reclaim_state = NULL; |
2615 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2628 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2629 | lockdep_clear_current_reclaim_state(); | ||
2616 | return sc.nr_reclaimed >= nr_pages; | 2630 | return sc.nr_reclaimed >= nr_pages; |
2617 | } | 2631 | } |
2618 | 2632 | ||
@@ -2635,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2635 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 2649 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2636 | return ZONE_RECLAIM_FULL; | 2650 | return ZONE_RECLAIM_FULL; |
2637 | 2651 | ||
2638 | if (zone_is_all_unreclaimable(zone)) | 2652 | if (zone->all_unreclaimable) |
2639 | return ZONE_RECLAIM_FULL; | 2653 | return ZONE_RECLAIM_FULL; |
2640 | 2654 | ||
2641 | /* | 2655 | /* |
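[Editor's note] The all_unreclaimable state appears to move from a zone->flags bit read through the zone_is_all_unreclaimable() helper to a plain word in struct zone, so callers now read the field directly (the /proc/zoneinfo hunk further down makes the same switch). A hypothetical consumer of the flag, in the shape reclaim loops typically use:

	/* illustrative only: skip zones reclaim has given up on */
	if (zone->all_unreclaimable && priority != DEF_PRIORITY)
		continue;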
diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f9feec..fa12ea3051fb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/slab.h> | ||
15 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
16 | #include <linux/vmstat.h> | 17 | #include <linux/vmstat.h> |
17 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
@@ -139,7 +140,8 @@ static void refresh_zone_stat_thresholds(void) | |||
139 | threshold = calculate_threshold(zone); | 140 | threshold = calculate_threshold(zone); |
140 | 141 | ||
141 | for_each_online_cpu(cpu) | 142 | for_each_online_cpu(cpu) |
142 | zone_pcp(zone, cpu)->stat_threshold = threshold; | 143 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
144 | = threshold; | ||
143 | } | 145 | } |
144 | } | 146 | } |
145 | 147 | ||
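[Editor's note] Throughout this file, zone_pcp() indexing into a static per-zone array gives way to the dynamic percpu allocator: zone->pageset is now a percpu pointer, and a named CPU's instance is reached with per_cpu_ptr(). The resulting walk, extracted from the hunk above:

	int cpu;

	for_each_online_cpu(cpu)
		per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold;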
@@ -149,7 +151,8 @@ static void refresh_zone_stat_thresholds(void) | |||
149 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 151 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
150 | int delta) | 152 | int delta) |
151 | { | 153 | { |
152 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 154 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
155 | |||
153 | s8 *p = pcp->vm_stat_diff + item; | 156 | s8 *p = pcp->vm_stat_diff + item; |
154 | long x; | 157 | long x; |
155 | 158 | ||
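[Editor's note] For the local CPU the shorter this_cpu_ptr() form is used instead, as in this hunk and the __inc/__dec hunks below. A simplified sketch of the differential-counter scheme these __-prefixed helpers implement; the real code also oversteps by half a threshold to reduce folding frequency, and preemption must already be disabled by the caller:

	static inline void sketch_inc_zone_state(struct zone *zone,
						 enum zone_stat_item item)
	{
		struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
		s8 *p = pcp->vm_stat_diff + item;

		(*p)++;
		if (unlikely(*p > pcp->stat_threshold)) {
			/* fold the per-cpu delta into the global counter */
			zone_page_state_add(*p, zone, item);
			*p = 0;
		}
	}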
@@ -202,7 +205,7 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
202 | */ | 205 | */ |
203 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 206 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
204 | { | 207 | { |
205 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 208 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
206 | s8 *p = pcp->vm_stat_diff + item; | 209 | s8 *p = pcp->vm_stat_diff + item; |
207 | 210 | ||
208 | (*p)++; | 211 | (*p)++; |
@@ -223,7 +226,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
223 | 226 | ||
224 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 227 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
225 | { | 228 | { |
226 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 229 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
227 | s8 *p = pcp->vm_stat_diff + item; | 230 | s8 *p = pcp->vm_stat_diff + item; |
228 | 231 | ||
229 | (*p)--; | 232 | (*p)--; |
@@ -300,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
300 | for_each_populated_zone(zone) { | 303 | for_each_populated_zone(zone) { |
301 | struct per_cpu_pageset *p; | 304 | struct per_cpu_pageset *p; |
302 | 305 | ||
303 | p = zone_pcp(zone, cpu); | 306 | p = per_cpu_ptr(zone->pageset, cpu); |
304 | 307 | ||
305 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 308 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
306 | if (p->vm_stat_diff[i]) { | 309 | if (p->vm_stat_diff[i]) { |
@@ -683,6 +686,9 @@ static const char * const vmstat_text[] = { | |||
683 | "slabs_scanned", | 686 | "slabs_scanned", |
684 | "kswapd_steal", | 687 | "kswapd_steal", |
685 | "kswapd_inodesteal", | 688 | "kswapd_inodesteal", |
689 | "kswapd_low_wmark_hit_quickly", | ||
690 | "kswapd_high_wmark_hit_quickly", | ||
691 | "kswapd_skip_congestion_wait", | ||
686 | "pageoutrun", | 692 | "pageoutrun", |
687 | "allocstall", | 693 | "allocstall", |
688 | 694 | ||
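[Editor's note] vmstat_text entries must stay in lock-step with the corresponding enum in vmstat.h, so these three new strings imply matching upper-cased KSWAPD_* event items. A hypothetical bump site in kswapd, with the event name assumed from the string above:

	/* illustrative: the high watermark was already met on the
	 * first balance pass, so this wakeup cost almost nothing */
	count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);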
@@ -738,7 +744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
738 | for_each_online_cpu(i) { | 744 | for_each_online_cpu(i) { |
739 | struct per_cpu_pageset *pageset; | 745 | struct per_cpu_pageset *pageset; |
740 | 746 | ||
741 | pageset = zone_pcp(zone, i); | 747 | pageset = per_cpu_ptr(zone->pageset, i); |
742 | seq_printf(m, | 748 | seq_printf(m, |
743 | "\n cpu: %i" | 749 | "\n cpu: %i" |
744 | "\n count: %i" | 750 | "\n count: %i" |
@@ -758,7 +764,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
758 | "\n prev_priority: %i" | 764 | "\n prev_priority: %i" |
759 | "\n start_pfn: %lu" | 765 | "\n start_pfn: %lu" |
760 | "\n inactive_ratio: %u", | 766 | "\n inactive_ratio: %u", |
761 | zone_is_all_unreclaimable(zone), | 767 | zone->all_unreclaimable, |
762 | zone->prev_priority, | 768 | zone->prev_priority, |
763 | zone->zone_start_pfn, | 769 | zone->zone_start_pfn, |
764 | zone->inactive_ratio); | 770 | zone->inactive_ratio); |
@@ -883,11 +889,10 @@ static void vmstat_update(struct work_struct *w) | |||
883 | 889 | ||
884 | static void __cpuinit start_cpu_timer(int cpu) | 890 | static void __cpuinit start_cpu_timer(int cpu) |
885 | { | 891 | { |
886 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | 892 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
887 | 893 | ||
888 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | 894 | INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); |
889 | schedule_delayed_work_on(cpu, vmstat_work, | 895 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
890 | __round_jiffies_relative(HZ, cpu)); | ||
891 | } | 896 | } |
892 | 897 | ||
893 | /* | 898 | /* |
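[Editor's note] Behaviour is unchanged in the start_cpu_timer() hunk; the local is merely renamed so it no longer shares a name with the vmstat_work per-cpu variable it points into. The mechanism itself is worth noting: a deferrable work item never wakes an idle CPU just to fold statistics, and __round_jiffies_relative() staggers expiry per CPU so the timers batch with other wakeups. A hypothetical reuse of the same pattern for any once-a-second per-cpu poll:

	static void poll_fn(struct work_struct *w);
	static DEFINE_PER_CPU(struct delayed_work, poll_work);

	static void start_poll_timer(int cpu)
	{
		struct delayed_work *work = &per_cpu(poll_work, cpu);

		/* deferrable: an idle CPU is not woken just for this */
		INIT_DELAYED_WORK_DEFERRABLE(work, poll_fn);
		schedule_delayed_work_on(cpu, work,
					 __round_jiffies_relative(HZ, cpu));
	}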
@@ -904,6 +909,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
904 | case CPU_ONLINE: | 909 | case CPU_ONLINE: |
905 | case CPU_ONLINE_FROZEN: | 910 | case CPU_ONLINE_FROZEN: |
906 | start_cpu_timer(cpu); | 911 | start_cpu_timer(cpu); |
912 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
907 | break; | 913 | break; |
908 | case CPU_DOWN_PREPARE: | 914 | case CPU_DOWN_PREPARE: |
909 | case CPU_DOWN_PREPARE_FROZEN: | 915 | case CPU_DOWN_PREPARE_FROZEN: |
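[Editor's note] Setting N_CPU when a CPU comes online records in the node_states bitmap that the node has (or has had) an online CPU. A hypothetical consumer, showing the cheap test this enables compared to scanning per-node cpumasks:

	/* illustrative: distinguish memory-only nodes */
	static bool node_has_cpus(int nid)
	{
		return node_state(nid, N_CPU);
	}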