diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 20 | ||||
-rw-r--r-- | mm/Makefile | 5 | ||||
-rw-r--r-- | mm/allocpercpu.c | 177 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 64 | ||||
-rw-r--r-- | mm/hugetlb.c | 551 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 113 | ||||
-rw-r--r-- | mm/internal.h | 35 | ||||
-rw-r--r-- | mm/kmemleak.c | 188 | ||||
-rw-r--r-- | mm/ksm.c | 953 | ||||
-rw-r--r-- | mm/madvise.c | 21 | ||||
-rw-r--r-- | mm/memcontrol.c | 442 | ||||
-rw-r--r-- | mm/memory-failure.c | 562 | ||||
-rw-r--r-- | mm/memory.c | 35 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 69 | ||||
-rw-r--r-- | mm/migrate.c | 131 | ||||
-rw-r--r-- | mm/mincore.c | 37 | ||||
-rw-r--r-- | mm/mlock.c | 45 | ||||
-rw-r--r-- | mm/mmap.c | 92 | ||||
-rw-r--r-- | mm/mremap.c | 241 | ||||
-rw-r--r-- | mm/nommu.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 103 | ||||
-rw-r--r-- | mm/page_alloc.c | 47 | ||||
-rw-r--r-- | mm/page_io.c | 17 | ||||
-rw-r--r-- | mm/pagewalk.c | 32 | ||||
-rw-r--r-- | mm/percpu.c | 24 | ||||
-rw-r--r-- | mm/readahead.c | 12 | ||||
-rw-r--r-- | mm/rmap.c | 354 | ||||
-rw-r--r-- | mm/shmem.c | 84 | ||||
-rw-r--r-- | mm/shmem_acl.c | 171 | ||||
-rw-r--r-- | mm/slab.c | 160 | ||||
-rw-r--r-- | mm/slub.c | 24 | ||||
-rw-r--r-- | mm/swapfile.c | 847 | ||||
-rw-r--r-- | mm/truncate.c | 6 | ||||
-rw-r--r-- | mm/util.c | 44 | ||||
-rw-r--r-- | mm/vmalloc.c | 11 | ||||
-rw-r--r-- | mm/vmscan.c | 321 | ||||
-rw-r--r-- | mm/vmstat.c | 10 |
39 files changed, 3864 insertions, 2216 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 44cf6f0a3a6d..ee9f3e0f2b69 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -158,11 +158,13 @@ config PAGEFLAGS_EXTENDED | |||
158 | # Default to 4 for wider testing, though 8 might be more appropriate. | 158 | # Default to 4 for wider testing, though 8 might be more appropriate. |
159 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. | 159 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. |
160 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. | 160 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. |
161 | # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. | ||
161 | # | 162 | # |
162 | config SPLIT_PTLOCK_CPUS | 163 | config SPLIT_PTLOCK_CPUS |
163 | int | 164 | int |
164 | default "4096" if ARM && !CPU_CACHE_VIPT | 165 | default "999999" if ARM && !CPU_CACHE_VIPT |
165 | default "4096" if PARISC && !PA20 | 166 | default "999999" if PARISC && !PA20 |
167 | default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC | ||
166 | default "4" | 168 | default "4" |
167 | 169 | ||
168 | # | 170 | # |
@@ -200,14 +202,6 @@ config VIRT_TO_BUS | |||
200 | def_bool y | 202 | def_bool y |
201 | depends on !ARCH_NO_VIRT_TO_BUS | 203 | depends on !ARCH_NO_VIRT_TO_BUS |
202 | 204 | ||
203 | config HAVE_MLOCK | ||
204 | bool | ||
205 | default y if MMU=y | ||
206 | |||
207 | config HAVE_MLOCKED_PAGE_BIT | ||
208 | bool | ||
209 | default y if HAVE_MLOCK=y | ||
210 | |||
211 | config MMU_NOTIFIER | 205 | config MMU_NOTIFIER |
212 | bool | 206 | bool |
213 | 207 | ||
@@ -218,7 +212,7 @@ config KSM | |||
218 | Enable Kernel Samepage Merging: KSM periodically scans those areas | 212 | Enable Kernel Samepage Merging: KSM periodically scans those areas |
219 | of an application's address space that an app has advised may be | 213 | of an application's address space that an app has advised may be |
220 | mergeable. When it finds pages of identical content, it replaces | 214 | mergeable. When it finds pages of identical content, it replaces |
221 | the many instances by a single resident page with that content, so | 215 | the many instances by a single page with that content, so |
222 | saving memory until one or another app needs to modify the content. | 216 | saving memory until one or another app needs to modify the content. |
223 | Recommended for use with KVM, or with other duplicative applications. | 217 | Recommended for use with KVM, or with other duplicative applications. |
224 | See Documentation/vm/ksm.txt for more information: KSM is inactive | 218 | See Documentation/vm/ksm.txt for more information: KSM is inactive |
@@ -227,6 +221,7 @@ config KSM | |||
227 | 221 | ||
228 | config DEFAULT_MMAP_MIN_ADDR | 222 | config DEFAULT_MMAP_MIN_ADDR |
229 | int "Low address space to protect from user allocation" | 223 | int "Low address space to protect from user allocation" |
224 | depends on MMU | ||
230 | default 4096 | 225 | default 4096 |
231 | help | 226 | help |
232 | This is the portion of low virtual memory which should be protected | 227 | This is the portion of low virtual memory which should be protected |
@@ -257,8 +252,9 @@ config MEMORY_FAILURE | |||
257 | special hardware support and typically ECC memory. | 252 | special hardware support and typically ECC memory. |
258 | 253 | ||
259 | config HWPOISON_INJECT | 254 | config HWPOISON_INJECT |
260 | tristate "Poison pages injector" | 255 | tristate "HWPoison pages injector" |
261 | depends on MEMORY_FAILURE && DEBUG_KERNEL | 256 | depends on MEMORY_FAILURE && DEBUG_KERNEL |
257 | select PROC_PAGE_MONITOR | ||
262 | 258 | ||
263 | config NOMMU_INITIAL_TRIM_EXCESS | 259 | config NOMMU_INITIAL_TRIM_EXCESS |
264 | int "Turn on mmap() excess space trimming before booting" | 260 | int "Turn on mmap() excess space trimming before booting" |
diff --git a/mm/Makefile b/mm/Makefile index ebf849042ed3..7a68d2ab5560 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
22 | obj-$(CONFIG_NUMA) += mempolicy.o | 22 | obj-$(CONFIG_NUMA) += mempolicy.o |
23 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
26 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
28 | obj-$(CONFIG_KSM) += ksm.o | 27 | obj-$(CONFIG_KSM) += ksm.o |
@@ -34,11 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o | |||
34 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
35 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
36 | obj-$(CONFIG_MIGRATION) += migrate.o | 35 | obj-$(CONFIG_MIGRATION) += migrate.o |
37 | ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA | ||
38 | obj-$(CONFIG_SMP) += percpu.o | 36 | obj-$(CONFIG_SMP) += percpu.o |
39 | else | ||
40 | obj-$(CONFIG_SMP) += allocpercpu.o | ||
41 | endif | ||
42 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 37 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
43 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 38 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
44 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 39 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c deleted file mode 100644 index df34ceae0c67..000000000000 --- a/mm/allocpercpu.c +++ /dev/null | |||
@@ -1,177 +0,0 @@ | |||
1 | /* | ||
2 | * linux/mm/allocpercpu.c | ||
3 | * | ||
4 | * Separated from slab.c August 11, 2006 Christoph Lameter | ||
5 | */ | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/bootmem.h> | ||
9 | #include <asm/sections.h> | ||
10 | |||
11 | #ifndef cache_line_size | ||
12 | #define cache_line_size() L1_CACHE_BYTES | ||
13 | #endif | ||
14 | |||
15 | /** | ||
16 | * percpu_depopulate - depopulate per-cpu data for given cpu | ||
17 | * @__pdata: per-cpu data to depopulate | ||
18 | * @cpu: depopulate per-cpu data for this cpu | ||
19 | * | ||
20 | * Depopulating per-cpu data for a cpu going offline would be a typical | ||
21 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
22 | */ | ||
23 | static void percpu_depopulate(void *__pdata, int cpu) | ||
24 | { | ||
25 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
26 | |||
27 | kfree(pdata->ptrs[cpu]); | ||
28 | pdata->ptrs[cpu] = NULL; | ||
29 | } | ||
30 | |||
31 | /** | ||
32 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | ||
33 | * @__pdata: per-cpu data to depopulate | ||
34 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | ||
35 | */ | ||
36 | static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) | ||
37 | { | ||
38 | int cpu; | ||
39 | for_each_cpu_mask_nr(cpu, *mask) | ||
40 | percpu_depopulate(__pdata, cpu); | ||
41 | } | ||
42 | |||
43 | #define percpu_depopulate_mask(__pdata, mask) \ | ||
44 | __percpu_depopulate_mask((__pdata), &(mask)) | ||
45 | |||
46 | /** | ||
47 | * percpu_populate - populate per-cpu data for given cpu | ||
48 | * @__pdata: per-cpu data to populate further | ||
49 | * @size: size of per-cpu object | ||
50 | * @gfp: may sleep or not etc. | ||
51 | * @cpu: populate per-data for this cpu | ||
52 | * | ||
53 | * Populating per-cpu data for a cpu coming online would be a typical | ||
54 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
55 | * Per-cpu object is populated with zeroed buffer. | ||
56 | */ | ||
57 | static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | ||
58 | { | ||
59 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
60 | int node = cpu_to_node(cpu); | ||
61 | |||
62 | /* | ||
63 | * We should make sure each CPU gets private memory. | ||
64 | */ | ||
65 | size = roundup(size, cache_line_size()); | ||
66 | |||
67 | BUG_ON(pdata->ptrs[cpu]); | ||
68 | if (node_online(node)) | ||
69 | pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); | ||
70 | else | ||
71 | pdata->ptrs[cpu] = kzalloc(size, gfp); | ||
72 | return pdata->ptrs[cpu]; | ||
73 | } | ||
74 | |||
75 | /** | ||
76 | * percpu_populate_mask - populate per-cpu data for more cpu's | ||
77 | * @__pdata: per-cpu data to populate further | ||
78 | * @size: size of per-cpu object | ||
79 | * @gfp: may sleep or not etc. | ||
80 | * @mask: populate per-cpu data for cpu's selected through mask bits | ||
81 | * | ||
82 | * Per-cpu objects are populated with zeroed buffers. | ||
83 | */ | ||
84 | static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | ||
85 | cpumask_t *mask) | ||
86 | { | ||
87 | cpumask_t populated; | ||
88 | int cpu; | ||
89 | |||
90 | cpus_clear(populated); | ||
91 | for_each_cpu_mask_nr(cpu, *mask) | ||
92 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | ||
93 | __percpu_depopulate_mask(__pdata, &populated); | ||
94 | return -ENOMEM; | ||
95 | } else | ||
96 | cpu_set(cpu, populated); | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | #define percpu_populate_mask(__pdata, size, gfp, mask) \ | ||
101 | __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) | ||
102 | |||
103 | /** | ||
104 | * alloc_percpu - initial setup of per-cpu data | ||
105 | * @size: size of per-cpu object | ||
106 | * @align: alignment | ||
107 | * | ||
108 | * Allocate dynamic percpu area. Percpu objects are populated with | ||
109 | * zeroed buffers. | ||
110 | */ | ||
111 | void *__alloc_percpu(size_t size, size_t align) | ||
112 | { | ||
113 | /* | ||
114 | * We allocate whole cache lines to avoid false sharing | ||
115 | */ | ||
116 | size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); | ||
117 | void *pdata = kzalloc(sz, GFP_KERNEL); | ||
118 | void *__pdata = __percpu_disguise(pdata); | ||
119 | |||
120 | /* | ||
121 | * Can't easily make larger alignment work with kmalloc. WARN | ||
122 | * on it. Larger alignment should only be used for module | ||
123 | * percpu sections on SMP for which this path isn't used. | ||
124 | */ | ||
125 | WARN_ON_ONCE(align > SMP_CACHE_BYTES); | ||
126 | |||
127 | if (unlikely(!pdata)) | ||
128 | return NULL; | ||
129 | if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, | ||
130 | &cpu_possible_map))) | ||
131 | return __pdata; | ||
132 | kfree(pdata); | ||
133 | return NULL; | ||
134 | } | ||
135 | EXPORT_SYMBOL_GPL(__alloc_percpu); | ||
136 | |||
137 | /** | ||
138 | * free_percpu - final cleanup of per-cpu data | ||
139 | * @__pdata: object to clean up | ||
140 | * | ||
141 | * We simply clean up any per-cpu object left. No need for the client to | ||
142 | * track and specify through a bis mask which per-cpu objects are to free. | ||
143 | */ | ||
144 | void free_percpu(void *__pdata) | ||
145 | { | ||
146 | if (unlikely(!__pdata)) | ||
147 | return; | ||
148 | __percpu_depopulate_mask(__pdata, cpu_possible_mask); | ||
149 | kfree(__percpu_disguise(__pdata)); | ||
150 | } | ||
151 | EXPORT_SYMBOL_GPL(free_percpu); | ||
152 | |||
153 | /* | ||
154 | * Generic percpu area setup. | ||
155 | */ | ||
156 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA | ||
157 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | ||
158 | |||
159 | EXPORT_SYMBOL(__per_cpu_offset); | ||
160 | |||
161 | void __init setup_per_cpu_areas(void) | ||
162 | { | ||
163 | unsigned long size, i; | ||
164 | char *ptr; | ||
165 | unsigned long nr_possible_cpus = num_possible_cpus(); | ||
166 | |||
167 | /* Copy section for each CPU (we discard the original) */ | ||
168 | size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); | ||
169 | ptr = alloc_bootmem_pages(size * nr_possible_cpus); | ||
170 | |||
171 | for_each_possible_cpu(i) { | ||
172 | __per_cpu_offset[i] = ptr - __per_cpu_start; | ||
173 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
174 | ptr += size; | ||
175 | } | ||
176 | } | ||
177 | #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index d1dc23cc7f10..7d1486875e1c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -432,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
432 | return mark_bootmem(start, end, 1, flags); | 432 | return mark_bootmem(start, end, 1, flags); |
433 | } | 433 | } |
434 | 434 | ||
435 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | 435 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
436 | unsigned long step) | 436 | unsigned long idx, unsigned long step) |
437 | { | 437 | { |
438 | unsigned long base = bdata->node_min_pfn; | 438 | unsigned long base = bdata->node_min_pfn; |
439 | 439 | ||
@@ -445,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | |||
445 | return ALIGN(base + idx, step) - base; | 445 | return ALIGN(base + idx, step) - base; |
446 | } | 446 | } |
447 | 447 | ||
448 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, | 448 | static unsigned long __init align_off(struct bootmem_data *bdata, |
449 | unsigned long align) | 449 | unsigned long off, unsigned long align) |
450 | { | 450 | { |
451 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); | 451 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); |
452 | 452 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index c3d3506ecaba..96ac6b0eb6cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping) | |||
260 | EXPORT_SYMBOL(filemap_flush); | 260 | EXPORT_SYMBOL(filemap_flush); |
261 | 261 | ||
262 | /** | 262 | /** |
263 | * wait_on_page_writeback_range - wait for writeback to complete | 263 | * filemap_fdatawait_range - wait for writeback to complete |
264 | * @mapping: target address_space | 264 | * @mapping: address space structure to wait for |
265 | * @start: beginning page index | 265 | * @start_byte: offset in bytes where the range starts |
266 | * @end: ending page index | 266 | * @end_byte: offset in bytes where the range ends (inclusive) |
267 | * | 267 | * |
268 | * Wait for writeback to complete against pages indexed by start->end | 268 | * Walk the list of under-writeback pages of the given address space |
269 | * inclusive | 269 | * in the given range and wait for all of them. |
270 | */ | 270 | */ |
271 | int wait_on_page_writeback_range(struct address_space *mapping, | 271 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
272 | pgoff_t start, pgoff_t end) | 272 | loff_t end_byte) |
273 | { | 273 | { |
274 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; | ||
275 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; | ||
274 | struct pagevec pvec; | 276 | struct pagevec pvec; |
275 | int nr_pages; | 277 | int nr_pages; |
276 | int ret = 0; | 278 | int ret = 0; |
277 | pgoff_t index; | ||
278 | 279 | ||
279 | if (end < start) | 280 | if (end_byte < start_byte) |
280 | return 0; | 281 | return 0; |
281 | 282 | ||
282 | pagevec_init(&pvec, 0); | 283 | pagevec_init(&pvec, 0); |
283 | index = start; | ||
284 | while ((index <= end) && | 284 | while ((index <= end) && |
285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
286 | PAGECACHE_TAG_WRITEBACK, | 286 | PAGECACHE_TAG_WRITEBACK, |
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
310 | 310 | ||
311 | return ret; | 311 | return ret; |
312 | } | 312 | } |
313 | |||
314 | /** | ||
315 | * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range | ||
316 | * @mapping: address space structure to wait for | ||
317 | * @start: offset in bytes where the range starts | ||
318 | * @end: offset in bytes where the range ends (inclusive) | ||
319 | * | ||
320 | * Walk the list of under-writeback pages of the given address space | ||
321 | * in the given range and wait for all of them. | ||
322 | * | ||
323 | * This is just a simple wrapper so that callers don't have to convert offsets | ||
324 | * to page indexes themselves | ||
325 | */ | ||
326 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start, | ||
327 | loff_t end) | ||
328 | { | ||
329 | return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, | ||
330 | end >> PAGE_CACHE_SHIFT); | ||
331 | } | ||
332 | EXPORT_SYMBOL(filemap_fdatawait_range); | 313 | EXPORT_SYMBOL(filemap_fdatawait_range); |
333 | 314 | ||
334 | /** | 315 | /** |
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping) | |||
345 | if (i_size == 0) | 326 | if (i_size == 0) |
346 | return 0; | 327 | return 0; |
347 | 328 | ||
348 | return wait_on_page_writeback_range(mapping, 0, | 329 | return filemap_fdatawait_range(mapping, 0, i_size - 1); |
349 | (i_size - 1) >> PAGE_CACHE_SHIFT); | ||
350 | } | 330 | } |
351 | EXPORT_SYMBOL(filemap_fdatawait); | 331 | EXPORT_SYMBOL(filemap_fdatawait); |
352 | 332 | ||
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
393 | WB_SYNC_ALL); | 373 | WB_SYNC_ALL); |
394 | /* See comment of filemap_write_and_wait() */ | 374 | /* See comment of filemap_write_and_wait() */ |
395 | if (err != -EIO) { | 375 | if (err != -EIO) { |
396 | int err2 = wait_on_page_writeback_range(mapping, | 376 | int err2 = filemap_fdatawait_range(mapping, |
397 | lstart >> PAGE_CACHE_SHIFT, | 377 | lstart, lend); |
398 | lend >> PAGE_CACHE_SHIFT); | ||
399 | if (!err) | 378 | if (!err) |
400 | err = err2; | 379 | err = err2; |
401 | } | 380 | } |
@@ -2261,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2261 | size_t count, ssize_t written) | 2240 | size_t count, ssize_t written) |
2262 | { | 2241 | { |
2263 | struct file *file = iocb->ki_filp; | 2242 | struct file *file = iocb->ki_filp; |
2264 | struct address_space *mapping = file->f_mapping; | ||
2265 | ssize_t status; | 2243 | ssize_t status; |
2266 | struct iov_iter i; | 2244 | struct iov_iter i; |
2267 | 2245 | ||
@@ -2273,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2273 | *ppos = pos + status; | 2251 | *ppos = pos + status; |
2274 | } | 2252 | } |
2275 | 2253 | ||
2276 | /* | ||
2277 | * If we get here for O_DIRECT writes then we must have fallen through | ||
2278 | * to buffered writes (block instantiation inside i_size). So we sync | ||
2279 | * the file data here, to try to honour O_DIRECT expectations. | ||
2280 | */ | ||
2281 | if (unlikely(file->f_flags & O_DIRECT) && written) | ||
2282 | status = filemap_write_and_wait_range(mapping, | ||
2283 | pos, pos + written - 1); | ||
2284 | |||
2285 | return written ? written : status; | 2254 | return written ? written : status; |
2286 | } | 2255 | } |
2287 | EXPORT_SYMBOL(generic_file_buffered_write); | 2256 | EXPORT_SYMBOL(generic_file_buffered_write); |
@@ -2380,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2380 | * semantics. | 2349 | * semantics. |
2381 | */ | 2350 | */ |
2382 | endbyte = pos + written_buffered - written - 1; | 2351 | endbyte = pos + written_buffered - written - 1; |
2383 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, | 2352 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); |
2384 | SYNC_FILE_RANGE_WAIT_BEFORE| | ||
2385 | SYNC_FILE_RANGE_WRITE| | ||
2386 | SYNC_FILE_RANGE_WAIT_AFTER); | ||
2387 | if (err == 0) { | 2353 | if (err == 0) { |
2388 | written = written_buffered; | 2354 | written = written_buffered; |
2389 | invalidate_mapping_pages(mapping, | 2355 | invalidate_mapping_pages(mapping, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b02874..65f38c218207 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | 25 | ||
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/node.h> | ||
27 | #include "internal.h" | 28 | #include "internal.h" |
28 | 29 | ||
29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
622 | } | 623 | } |
623 | 624 | ||
624 | /* | 625 | /* |
625 | * Use a helper variable to find the next node and then | 626 | * common helper functions for hstate_next_node_to_{alloc|free}. |
626 | * copy it back to next_nid_to_alloc afterwards: | 627 | * We may have allocated or freed a huge page based on a different |
627 | * otherwise there's a window in which a racer might | 628 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 629 | * be outside of *nodes_allowed. Ensure that we use an allowed |
629 | * But we don't need to use a spin_lock here: it really | 630 | * node for alloc or free. |
630 | * doesn't matter if occasionally a racer chooses the | ||
631 | * same nid as we do. Move nid forward in the mask even | ||
632 | * if we just successfully allocated a hugepage so that | ||
633 | * the next caller gets hugepages on the next node. | ||
634 | */ | 631 | */ |
635 | static int hstate_next_node_to_alloc(struct hstate *h) | 632 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
636 | { | 633 | { |
637 | int next_nid; | 634 | nid = next_node(nid, *nodes_allowed); |
638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 635 | if (nid == MAX_NUMNODES) |
639 | if (next_nid == MAX_NUMNODES) | 636 | nid = first_node(*nodes_allowed); |
640 | next_nid = first_node(node_online_map); | 637 | VM_BUG_ON(nid >= MAX_NUMNODES); |
641 | h->next_nid_to_alloc = next_nid; | 638 | |
642 | return next_nid; | 639 | return nid; |
640 | } | ||
641 | |||
642 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
643 | { | ||
644 | if (!node_isset(nid, *nodes_allowed)) | ||
645 | nid = next_node_allowed(nid, nodes_allowed); | ||
646 | return nid; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * returns the previously saved node ["this node"] from which to | ||
651 | * allocate a persistent huge page for the pool and advance the | ||
652 | * next node from which to allocate, handling wrap at end of node | ||
653 | * mask. | ||
654 | */ | ||
655 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
656 | nodemask_t *nodes_allowed) | ||
657 | { | ||
658 | int nid; | ||
659 | |||
660 | VM_BUG_ON(!nodes_allowed); | ||
661 | |||
662 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
663 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
664 | |||
665 | return nid; | ||
643 | } | 666 | } |
644 | 667 | ||
645 | static int alloc_fresh_huge_page(struct hstate *h) | 668 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
646 | { | 669 | { |
647 | struct page *page; | 670 | struct page *page; |
648 | int start_nid; | 671 | int start_nid; |
649 | int next_nid; | 672 | int next_nid; |
650 | int ret = 0; | 673 | int ret = 0; |
651 | 674 | ||
652 | start_nid = h->next_nid_to_alloc; | 675 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
653 | next_nid = start_nid; | 676 | next_nid = start_nid; |
654 | 677 | ||
655 | do { | 678 | do { |
656 | page = alloc_fresh_huge_page_node(h, next_nid); | 679 | page = alloc_fresh_huge_page_node(h, next_nid); |
657 | if (page) | 680 | if (page) { |
658 | ret = 1; | 681 | ret = 1; |
659 | next_nid = hstate_next_node_to_alloc(h); | 682 | break; |
660 | } while (!page && next_nid != start_nid); | 683 | } |
684 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
685 | } while (next_nid != start_nid); | ||
661 | 686 | ||
662 | if (ret) | 687 | if (ret) |
663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
668 | } | 693 | } |
669 | 694 | ||
670 | /* | 695 | /* |
671 | * helper for free_pool_huge_page() - find next node | 696 | * helper for free_pool_huge_page() - return the previously saved |
672 | * from which to free a huge page | 697 | * node ["this node"] from which to free a huge page. Advance the |
698 | * next node id whether or not we find a free huge page to free so | ||
699 | * that the next attempt to free addresses the next node. | ||
673 | */ | 700 | */ |
674 | static int hstate_next_node_to_free(struct hstate *h) | 701 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
675 | { | 702 | { |
676 | int next_nid; | 703 | int nid; |
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 704 | |
678 | if (next_nid == MAX_NUMNODES) | 705 | VM_BUG_ON(!nodes_allowed); |
679 | next_nid = first_node(node_online_map); | 706 | |
680 | h->next_nid_to_free = next_nid; | 707 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
681 | return next_nid; | 708 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
709 | |||
710 | return nid; | ||
682 | } | 711 | } |
683 | 712 | ||
684 | /* | 713 | /* |
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
687 | * balanced over allowed nodes. | 716 | * balanced over allowed nodes. |
688 | * Called with hugetlb_lock locked. | 717 | * Called with hugetlb_lock locked. |
689 | */ | 718 | */ |
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 719 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
720 | bool acct_surplus) | ||
691 | { | 721 | { |
692 | int start_nid; | 722 | int start_nid; |
693 | int next_nid; | 723 | int next_nid; |
694 | int ret = 0; | 724 | int ret = 0; |
695 | 725 | ||
696 | start_nid = h->next_nid_to_free; | 726 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
697 | next_nid = start_nid; | 727 | next_nid = start_nid; |
698 | 728 | ||
699 | do { | 729 | do { |
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
715 | } | 745 | } |
716 | update_and_free_page(h, page); | 746 | update_and_free_page(h, page); |
717 | ret = 1; | 747 | ret = 1; |
748 | break; | ||
718 | } | 749 | } |
719 | next_nid = hstate_next_node_to_free(h); | 750 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
720 | } while (!ret && next_nid != start_nid); | 751 | } while (next_nid != start_nid); |
721 | 752 | ||
722 | return ret; | 753 | return ret; |
723 | } | 754 | } |
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
911 | 942 | ||
912 | /* | 943 | /* |
913 | * We want to release as many surplus pages as possible, spread | 944 | * We want to release as many surplus pages as possible, spread |
914 | * evenly across all nodes. Iterate across all nodes until we | 945 | * evenly across all nodes with memory. Iterate across these nodes |
915 | * can no longer free unreserved surplus pages. This occurs when | 946 | * until we can no longer free unreserved surplus pages. This occurs |
916 | * the nodes with surplus pages have no free pages. | 947 | * when the nodes with surplus pages have no free pages. |
917 | * free_pool_huge_page() will balance the the frees across the | 948 | * free_pool_huge_page() will balance the the freed pages across the |
918 | * on-line nodes for us and will handle the hstate accounting. | 949 | * on-line nodes with memory and will handle the hstate accounting. |
919 | */ | 950 | */ |
920 | while (nr_pages--) { | 951 | while (nr_pages--) { |
921 | if (!free_pool_huge_page(h, 1)) | 952 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
922 | break; | 953 | break; |
923 | } | 954 | } |
924 | } | 955 | } |
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1053 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1023 | { | 1054 | { |
1024 | struct huge_bootmem_page *m; | 1055 | struct huge_bootmem_page *m; |
1025 | int nr_nodes = nodes_weight(node_online_map); | 1056 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
1026 | 1057 | ||
1027 | while (nr_nodes) { | 1058 | while (nr_nodes) { |
1028 | void *addr; | 1059 | void *addr; |
1029 | 1060 | ||
1030 | addr = __alloc_bootmem_node_nopanic( | 1061 | addr = __alloc_bootmem_node_nopanic( |
1031 | NODE_DATA(h->next_nid_to_alloc), | 1062 | NODE_DATA(hstate_next_node_to_alloc(h, |
1063 | &node_states[N_HIGH_MEMORY])), | ||
1032 | huge_page_size(h), huge_page_size(h), 0); | 1064 | huge_page_size(h), huge_page_size(h), 0); |
1033 | 1065 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1035 | if (addr) { | 1066 | if (addr) { |
1036 | /* | 1067 | /* |
1037 | * Use the beginning of the huge page to store the | 1068 | * Use the beginning of the huge page to store the |
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1084 | if (h->order >= MAX_ORDER) { | 1115 | if (h->order >= MAX_ORDER) { |
1085 | if (!alloc_bootmem_huge_page(h)) | 1116 | if (!alloc_bootmem_huge_page(h)) |
1086 | break; | 1117 | break; |
1087 | } else if (!alloc_fresh_huge_page(h)) | 1118 | } else if (!alloc_fresh_huge_page(h, |
1119 | &node_states[N_HIGH_MEMORY])) | ||
1088 | break; | 1120 | break; |
1089 | } | 1121 | } |
1090 | h->max_huge_pages = i; | 1122 | h->max_huge_pages = i; |
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) | |||
1126 | } | 1158 | } |
1127 | 1159 | ||
1128 | #ifdef CONFIG_HIGHMEM | 1160 | #ifdef CONFIG_HIGHMEM |
1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1161 | static void try_to_free_low(struct hstate *h, unsigned long count, |
1162 | nodemask_t *nodes_allowed) | ||
1130 | { | 1163 | { |
1131 | int i; | 1164 | int i; |
1132 | 1165 | ||
1133 | if (h->order >= MAX_ORDER) | 1166 | if (h->order >= MAX_ORDER) |
1134 | return; | 1167 | return; |
1135 | 1168 | ||
1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1169 | for_each_node_mask(i, *nodes_allowed) { |
1137 | struct page *page, *next; | 1170 | struct page *page, *next; |
1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1171 | struct list_head *freel = &h->hugepage_freelists[i]; |
1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1172 | list_for_each_entry_safe(page, next, freel, lru) { |
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
1149 | } | 1182 | } |
1150 | } | 1183 | } |
1151 | #else | 1184 | #else |
1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1185 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
1186 | nodemask_t *nodes_allowed) | ||
1153 | { | 1187 | { |
1154 | } | 1188 | } |
1155 | #endif | 1189 | #endif |
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1159 | * balanced by operating on them in a round-robin fashion. | 1193 | * balanced by operating on them in a round-robin fashion. |
1160 | * Returns 1 if an adjustment was made. | 1194 | * Returns 1 if an adjustment was made. |
1161 | */ | 1195 | */ |
1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1196 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1197 | int delta) | ||
1163 | { | 1198 | { |
1164 | int start_nid, next_nid; | 1199 | int start_nid, next_nid; |
1165 | int ret = 0; | 1200 | int ret = 0; |
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1202 | VM_BUG_ON(delta != -1 && delta != 1); |
1168 | 1203 | ||
1169 | if (delta < 0) | 1204 | if (delta < 0) |
1170 | start_nid = h->next_nid_to_alloc; | 1205 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
1171 | else | 1206 | else |
1172 | start_nid = h->next_nid_to_free; | 1207 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
1173 | next_nid = start_nid; | 1208 | next_nid = start_nid; |
1174 | 1209 | ||
1175 | do { | 1210 | do { |
1176 | int nid = next_nid; | 1211 | int nid = next_nid; |
1177 | if (delta < 0) { | 1212 | if (delta < 0) { |
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | 1213 | /* |
1180 | * To shrink on this node, there must be a surplus page | 1214 | * To shrink on this node, there must be a surplus page |
1181 | */ | 1215 | */ |
1182 | if (!h->surplus_huge_pages_node[nid]) | 1216 | if (!h->surplus_huge_pages_node[nid]) { |
1217 | next_nid = hstate_next_node_to_alloc(h, | ||
1218 | nodes_allowed); | ||
1183 | continue; | 1219 | continue; |
1220 | } | ||
1184 | } | 1221 | } |
1185 | if (delta > 0) { | 1222 | if (delta > 0) { |
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | 1223 | /* |
1188 | * Surplus cannot exceed the total number of pages | 1224 | * Surplus cannot exceed the total number of pages |
1189 | */ | 1225 | */ |
1190 | if (h->surplus_huge_pages_node[nid] >= | 1226 | if (h->surplus_huge_pages_node[nid] >= |
1191 | h->nr_huge_pages_node[nid]) | 1227 | h->nr_huge_pages_node[nid]) { |
1228 | next_nid = hstate_next_node_to_free(h, | ||
1229 | nodes_allowed); | ||
1192 | continue; | 1230 | continue; |
1231 | } | ||
1193 | } | 1232 | } |
1194 | 1233 | ||
1195 | h->surplus_huge_pages += delta; | 1234 | h->surplus_huge_pages += delta; |
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1202 | } | 1241 | } |
1203 | 1242 | ||
1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1243 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1244 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
1245 | nodemask_t *nodes_allowed) | ||
1206 | { | 1246 | { |
1207 | unsigned long min_count, ret; | 1247 | unsigned long min_count, ret; |
1208 | 1248 | ||
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1222 | */ | 1262 | */ |
1223 | spin_lock(&hugetlb_lock); | 1263 | spin_lock(&hugetlb_lock); |
1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1264 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
1225 | if (!adjust_pool_surplus(h, -1)) | 1265 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
1226 | break; | 1266 | break; |
1227 | } | 1267 | } |
1228 | 1268 | ||
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1233 | * and reducing the surplus. | 1273 | * and reducing the surplus. |
1234 | */ | 1274 | */ |
1235 | spin_unlock(&hugetlb_lock); | 1275 | spin_unlock(&hugetlb_lock); |
1236 | ret = alloc_fresh_huge_page(h); | 1276 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
1237 | spin_lock(&hugetlb_lock); | 1277 | spin_lock(&hugetlb_lock); |
1238 | if (!ret) | 1278 | if (!ret) |
1239 | goto out; | 1279 | goto out; |
1240 | 1280 | ||
1281 | /* Bail for signals. Probably ctrl-c from user */ | ||
1282 | if (signal_pending(current)) | ||
1283 | goto out; | ||
1241 | } | 1284 | } |
1242 | 1285 | ||
1243 | /* | 1286 | /* |
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1257 | */ | 1300 | */ |
1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1301 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1259 | min_count = max(count, min_count); | 1302 | min_count = max(count, min_count); |
1260 | try_to_free_low(h, min_count); | 1303 | try_to_free_low(h, min_count, nodes_allowed); |
1261 | while (min_count < persistent_huge_pages(h)) { | 1304 | while (min_count < persistent_huge_pages(h)) { |
1262 | if (!free_pool_huge_page(h, 0)) | 1305 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1263 | break; | 1306 | break; |
1264 | } | 1307 | } |
1265 | while (count < persistent_huge_pages(h)) { | 1308 | while (count < persistent_huge_pages(h)) { |
1266 | if (!adjust_pool_surplus(h, 1)) | 1309 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
1267 | break; | 1310 | break; |
1268 | } | 1311 | } |
1269 | out: | 1312 | out: |
@@ -1282,43 +1325,117 @@ out: | |||
1282 | static struct kobject *hugepages_kobj; | 1325 | static struct kobject *hugepages_kobj; |
1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1326 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
1284 | 1327 | ||
1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1328 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
1329 | |||
1330 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
1286 | { | 1331 | { |
1287 | int i; | 1332 | int i; |
1333 | |||
1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1334 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
1289 | if (hstate_kobjs[i] == kobj) | 1335 | if (hstate_kobjs[i] == kobj) { |
1336 | if (nidp) | ||
1337 | *nidp = NUMA_NO_NODE; | ||
1290 | return &hstates[i]; | 1338 | return &hstates[i]; |
1291 | BUG(); | 1339 | } |
1292 | return NULL; | 1340 | |
1341 | return kobj_to_node_hstate(kobj, nidp); | ||
1293 | } | 1342 | } |
1294 | 1343 | ||
1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1344 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
1296 | struct kobj_attribute *attr, char *buf) | 1345 | struct kobj_attribute *attr, char *buf) |
1297 | { | 1346 | { |
1298 | struct hstate *h = kobj_to_hstate(kobj); | 1347 | struct hstate *h; |
1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1348 | unsigned long nr_huge_pages; |
1349 | int nid; | ||
1350 | |||
1351 | h = kobj_to_hstate(kobj, &nid); | ||
1352 | if (nid == NUMA_NO_NODE) | ||
1353 | nr_huge_pages = h->nr_huge_pages; | ||
1354 | else | ||
1355 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
1356 | |||
1357 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
1300 | } | 1358 | } |
1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1359 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1360 | struct kobject *kobj, struct kobj_attribute *attr, |
1361 | const char *buf, size_t len) | ||
1303 | { | 1362 | { |
1304 | int err; | 1363 | int err; |
1305 | unsigned long input; | 1364 | int nid; |
1306 | struct hstate *h = kobj_to_hstate(kobj); | 1365 | unsigned long count; |
1366 | struct hstate *h; | ||
1367 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
1307 | 1368 | ||
1308 | err = strict_strtoul(buf, 10, &input); | 1369 | err = strict_strtoul(buf, 10, &count); |
1309 | if (err) | 1370 | if (err) |
1310 | return 0; | 1371 | return 0; |
1311 | 1372 | ||
1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1373 | h = kobj_to_hstate(kobj, &nid); |
1374 | if (nid == NUMA_NO_NODE) { | ||
1375 | /* | ||
1376 | * global hstate attribute | ||
1377 | */ | ||
1378 | if (!(obey_mempolicy && | ||
1379 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1380 | NODEMASK_FREE(nodes_allowed); | ||
1381 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1382 | } | ||
1383 | } else if (nodes_allowed) { | ||
1384 | /* | ||
1385 | * per node hstate attribute: adjust count to global, | ||
1386 | * but restrict alloc/free to the specified node. | ||
1387 | */ | ||
1388 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
1389 | init_nodemask_of_node(nodes_allowed, nid); | ||
1390 | } else | ||
1391 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1392 | |||
1393 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | ||
1313 | 1394 | ||
1314 | return count; | 1395 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
1396 | NODEMASK_FREE(nodes_allowed); | ||
1397 | |||
1398 | return len; | ||
1399 | } | ||
1400 | |||
1401 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1402 | struct kobj_attribute *attr, char *buf) | ||
1403 | { | ||
1404 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1405 | } | ||
1406 | |||
1407 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1408 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1409 | { | ||
1410 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
1315 | } | 1411 | } |
1316 | HSTATE_ATTR(nr_hugepages); | 1412 | HSTATE_ATTR(nr_hugepages); |
1317 | 1413 | ||
1414 | #ifdef CONFIG_NUMA | ||
1415 | |||
1416 | /* | ||
1417 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
1418 | * huge page alloc/free. | ||
1419 | */ | ||
1420 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
1421 | struct kobj_attribute *attr, char *buf) | ||
1422 | { | ||
1423 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1424 | } | ||
1425 | |||
1426 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
1427 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1428 | { | ||
1429 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
1430 | } | ||
1431 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
1432 | #endif | ||
1433 | |||
1434 | |||
1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1435 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
1319 | struct kobj_attribute *attr, char *buf) | 1436 | struct kobj_attribute *attr, char *buf) |
1320 | { | 1437 | { |
1321 | struct hstate *h = kobj_to_hstate(kobj); | 1438 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1439 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1323 | } | 1440 | } |
1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1441 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1326 | { | 1443 | { |
1327 | int err; | 1444 | int err; |
1328 | unsigned long input; | 1445 | unsigned long input; |
1329 | struct hstate *h = kobj_to_hstate(kobj); | 1446 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1330 | 1447 | ||
1331 | err = strict_strtoul(buf, 10, &input); | 1448 | err = strict_strtoul(buf, 10, &input); |
1332 | if (err) | 1449 | if (err) |
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1460 | static ssize_t free_hugepages_show(struct kobject *kobj, |
1344 | struct kobj_attribute *attr, char *buf) | 1461 | struct kobj_attribute *attr, char *buf) |
1345 | { | 1462 | { |
1346 | struct hstate *h = kobj_to_hstate(kobj); | 1463 | struct hstate *h; |
1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1464 | unsigned long free_huge_pages; |
1465 | int nid; | ||
1466 | |||
1467 | h = kobj_to_hstate(kobj, &nid); | ||
1468 | if (nid == NUMA_NO_NODE) | ||
1469 | free_huge_pages = h->free_huge_pages; | ||
1470 | else | ||
1471 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
1472 | |||
1473 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
1348 | } | 1474 | } |
1349 | HSTATE_ATTR_RO(free_hugepages); | 1475 | HSTATE_ATTR_RO(free_hugepages); |
1350 | 1476 | ||
1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1477 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
1352 | struct kobj_attribute *attr, char *buf) | 1478 | struct kobj_attribute *attr, char *buf) |
1353 | { | 1479 | { |
1354 | struct hstate *h = kobj_to_hstate(kobj); | 1480 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1481 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
1356 | } | 1482 | } |
1357 | HSTATE_ATTR_RO(resv_hugepages); | 1483 | HSTATE_ATTR_RO(resv_hugepages); |
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1485 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
1360 | struct kobj_attribute *attr, char *buf) | 1486 | struct kobj_attribute *attr, char *buf) |
1361 | { | 1487 | { |
1362 | struct hstate *h = kobj_to_hstate(kobj); | 1488 | struct hstate *h; |
1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1489 | unsigned long surplus_huge_pages; |
1490 | int nid; | ||
1491 | |||
1492 | h = kobj_to_hstate(kobj, &nid); | ||
1493 | if (nid == NUMA_NO_NODE) | ||
1494 | surplus_huge_pages = h->surplus_huge_pages; | ||
1495 | else | ||
1496 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
1497 | |||
1498 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
1364 | } | 1499 | } |
1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1500 | HSTATE_ATTR_RO(surplus_hugepages); |
1366 | 1501 | ||
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { | |||
1370 | &free_hugepages_attr.attr, | 1505 | &free_hugepages_attr.attr, |
1371 | &resv_hugepages_attr.attr, | 1506 | &resv_hugepages_attr.attr, |
1372 | &surplus_hugepages_attr.attr, | 1507 | &surplus_hugepages_attr.attr, |
1508 | #ifdef CONFIG_NUMA | ||
1509 | &nr_hugepages_mempolicy_attr.attr, | ||
1510 | #endif | ||
1373 | NULL, | 1511 | NULL, |
1374 | }; | 1512 | }; |
1375 | 1513 | ||
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = { | |||
1377 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
1378 | }; | 1516 | }; |
1379 | 1517 | ||
1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1518 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h, |
1519 | struct kobject *parent, | ||
1520 | struct kobject **hstate_kobjs, | ||
1521 | struct attribute_group *hstate_attr_group) | ||
1381 | { | 1522 | { |
1382 | int retval; | 1523 | int retval; |
1524 | int hi = h - hstates; | ||
1383 | 1525 | ||
1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
1386 | if (!hstate_kobjs[h - hstates]) | ||
1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
1388 | 1529 | ||
1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
1390 | &hstate_attr_group); | ||
1391 | if (retval) | 1531 | if (retval) |
1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
1393 | 1533 | ||
1394 | return retval; | 1534 | return retval; |
1395 | } | 1535 | } |
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) | |||
1404 | return; | 1544 | return; |
1405 | 1545 | ||
1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1548 | hstate_kobjs, &hstate_attr_group); | ||
1408 | if (err) | 1549 | if (err) |
1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
1410 | h->name); | 1551 | h->name); |
1411 | } | 1552 | } |
1412 | } | 1553 | } |
1413 | 1554 | ||
1555 | #ifdef CONFIG_NUMA | ||
1556 | |||
1557 | /* | ||
1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
1560 | * index of a node sysdev or _hstate == node id. | ||
1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
1562 | * the base kernel, on the hugetlb module. | ||
1563 | */ | ||
1564 | struct node_hstate { | ||
1565 | struct kobject *hugepages_kobj; | ||
1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1567 | }; | ||
1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
1569 | |||
1570 | /* | ||
1571 | * A subset of global hstate attributes for node sysdevs | ||
1572 | */ | ||
1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
1574 | &nr_hugepages_attr.attr, | ||
1575 | &free_hugepages_attr.attr, | ||
1576 | &surplus_hugepages_attr.attr, | ||
1577 | NULL, | ||
1578 | }; | ||
1579 | |||
1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
1581 | .attrs = per_node_hstate_attrs, | ||
1582 | }; | ||
1583 | |||
1584 | /* | ||
1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
1586 | * Returns node id via non-NULL nidp. | ||
1587 | */ | ||
1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1589 | { | ||
1590 | int nid; | ||
1591 | |||
1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
1594 | int i; | ||
1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
1597 | if (nidp) | ||
1598 | *nidp = nid; | ||
1599 | return &hstates[i]; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | BUG(); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Unregister hstate attributes from a single node sysdev. | ||
1609 | * No-op if no hstate attributes attached. | ||
1610 | */ | ||
1611 | void hugetlb_unregister_node(struct node *node) | ||
1612 | { | ||
1613 | struct hstate *h; | ||
1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1615 | |||
1616 | if (!nhs->hugepages_kobj) | ||
1617 | return; /* no hstate attributes */ | ||
1618 | |||
1619 | for_each_hstate(h) | ||
1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
1623 | } | ||
1624 | |||
1625 | kobject_put(nhs->hugepages_kobj); | ||
1626 | nhs->hugepages_kobj = NULL; | ||
1627 | } | ||
1628 | |||
1629 | /* | ||
1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
1631 | * that have them. | ||
1632 | */ | ||
1633 | static void hugetlb_unregister_all_nodes(void) | ||
1634 | { | ||
1635 | int nid; | ||
1636 | |||
1637 | /* | ||
1638 | * disable node sysdev registrations. | ||
1639 | */ | ||
1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
1641 | |||
1642 | /* | ||
1643 | * remove hstate attributes from any nodes that have them. | ||
1644 | */ | ||
1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Register hstate attributes for a single node sysdev. | ||
1651 | * No-op if attributes already registered. | ||
1652 | */ | ||
1653 | void hugetlb_register_node(struct node *node) | ||
1654 | { | ||
1655 | struct hstate *h; | ||
1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1657 | int err; | ||
1658 | |||
1659 | if (nhs->hugepages_kobj) | ||
1660 | return; /* already allocated */ | ||
1661 | |||
1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
1663 | &node->sysdev.kobj); | ||
1664 | if (!nhs->hugepages_kobj) | ||
1665 | return; | ||
1666 | |||
1667 | for_each_hstate(h) { | ||
1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
1669 | nhs->hstate_kobjs, | ||
1670 | &per_node_hstate_attr_group); | ||
1671 | if (err) { | ||
1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
1673 | " for node %d\n", | ||
1674 | h->name, node->sysdev.id); | ||
1675 | hugetlb_unregister_node(node); | ||
1676 | break; | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * hugetlb init time: register hstate attributes for all registered node | ||
1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
1684 | * registered their associated sysdev by this time. | ||
1685 | */ | ||
1686 | static void hugetlb_register_all_nodes(void) | ||
1687 | { | ||
1688 | int nid; | ||
1689 | |||
1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1691 | struct node *node = &node_devices[nid]; | ||
1692 | if (node->sysdev.id == nid) | ||
1693 | hugetlb_register_node(node); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * Let the node sysdev driver know we're here so it can | ||
1698 | * [un]register hstate attributes on node hotplug. | ||
1699 | */ | ||
1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
1701 | hugetlb_unregister_node); | ||
1702 | } | ||
1703 | #else /* !CONFIG_NUMA */ | ||
1704 | |||
1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1706 | { | ||
1707 | BUG(); | ||
1708 | if (nidp) | ||
1709 | *nidp = -1; | ||
1710 | return NULL; | ||
1711 | } | ||
1712 | |||
1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
1714 | |||
1715 | static void hugetlb_register_all_nodes(void) { } | ||
1716 | |||
1717 | #endif | ||
1718 | |||
1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
1415 | { | 1720 | { |
1416 | struct hstate *h; | 1721 | struct hstate *h; |
1417 | 1722 | ||
1723 | hugetlb_unregister_all_nodes(); | ||
1724 | |||
1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
1420 | } | 1727 | } |
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) | |||
1449 | 1756 | ||
1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
1451 | 1758 | ||
1759 | hugetlb_register_all_nodes(); | ||
1760 | |||
1452 | return 0; | 1761 | return 0; |
1453 | } | 1762 | } |
1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
1479 | 1788 | ||
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1536 | } | 1845 | } |
1537 | 1846 | ||
1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
1542 | { | 1851 | { |
1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1861 | ||
1553 | if (write) | 1862 | if (write) { |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
1864 | GFP_KERNEL | __GFP_NORETRY); | ||
1865 | if (!(obey_mempolicy && | ||
1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1867 | NODEMASK_FREE(nodes_allowed); | ||
1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1869 | } | ||
1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
1871 | |||
1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1873 | NODEMASK_FREE(nodes_allowed); | ||
1874 | } | ||
1555 | 1875 | ||
1556 | return 0; | 1876 | return 0; |
1557 | } | 1877 | } |
1558 | 1878 | ||
1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | |||
1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
1884 | buffer, length, ppos); | ||
1885 | } | ||
1886 | |||
1887 | #ifdef CONFIG_NUMA | ||
1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1890 | { | ||
1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
1892 | buffer, length, ppos); | ||
1893 | } | ||
1894 | #endif /* CONFIG_NUMA */ | ||
1895 | |||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | void __user *buffer, | 1897 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
1905 | 2242 | ||
2243 | /* | ||
2244 | * Take the mapping lock for the duration of the table walk. As | ||
2245 | * this mapping should be shared between all the VMAs, | ||
2246 | * __unmap_hugepage_range() is called as the lock is already held | ||
2247 | */ | ||
2248 | spin_lock(&mapping->i_mmap_lock); | ||
1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
1917 | */ | 2260 | */ |
1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
1921 | page); | 2264 | page); |
1922 | } | 2265 | } |
2266 | spin_unlock(&mapping->i_mmap_lock); | ||
1923 | 2267 | ||
1924 | return 1; | 2268 | return 1; |
1925 | } | 2269 | } |
@@ -1959,6 +2303,9 @@ retry_avoidcopy: | |||
1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
1960 | 2304 | ||
1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
2306 | |||
2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
2308 | spin_unlock(&mm->page_table_lock); | ||
1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
1963 | 2310 | ||
1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
@@ -1976,19 +2323,25 @@ retry_avoidcopy: | |||
1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
2326 | spin_lock(&mm->page_table_lock); | ||
1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
1980 | } | 2328 | } |
1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
1982 | } | 2330 | } |
1983 | 2331 | ||
2332 | /* Caller expects lock to be held */ | ||
2333 | spin_lock(&mm->page_table_lock); | ||
1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
1985 | } | 2335 | } |
1986 | 2336 | ||
1987 | spin_unlock(&mm->page_table_lock); | ||
1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
1990 | spin_lock(&mm->page_table_lock); | ||
1991 | 2339 | ||
2340 | /* | ||
2341 | * Retake the page_table_lock to check for racing updates | ||
2342 | * before the page tables are altered | ||
2343 | */ | ||
2344 | spin_lock(&mm->page_table_lock); | ||
1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1994 | /* Break COW */ | 2347 | /* Break COW */ |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -3,18 +3,68 @@ | |||
3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/swap.h> | ||
7 | #include <linux/pagemap.h> | ||
8 | #include "internal.h" | ||
6 | 9 | ||
7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
8 | 11 | ||
9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
10 | { | 13 | { |
14 | unsigned long pfn = val; | ||
15 | struct page *p; | ||
16 | int err; | ||
17 | |||
18 | if (!capable(CAP_SYS_ADMIN)) | ||
19 | return -EPERM; | ||
20 | |||
21 | if (!hwpoison_filter_enable) | ||
22 | goto inject; | ||
23 | if (!pfn_valid(pfn)) | ||
24 | return -ENXIO; | ||
25 | |||
26 | p = pfn_to_page(pfn); | ||
27 | /* | ||
28 | * This implies unable to support free buddy pages. | ||
29 | */ | ||
30 | if (!get_page_unless_zero(p)) | ||
31 | return 0; | ||
32 | |||
33 | if (!PageLRU(p)) | ||
34 | shake_page(p, 0); | ||
35 | /* | ||
36 | * This implies unable to support non-LRU pages. | ||
37 | */ | ||
38 | if (!PageLRU(p)) | ||
39 | return 0; | ||
40 | |||
41 | /* | ||
42 | * do a racy check with elevated page count, to make sure PG_hwpoison | ||
43 | * will only be set for the targeted owner (or on a free page). | ||
44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
45 | * __memory_failure() will redo the check reliably inside page lock. | ||
46 | */ | ||
47 | lock_page(p); | ||
48 | err = hwpoison_filter(p); | ||
49 | unlock_page(p); | ||
50 | if (err) | ||
51 | return 0; | ||
52 | |||
53 | inject: | ||
54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
56 | } | ||
57 | |||
58 | static int hwpoison_unpoison(void *data, u64 val) | ||
59 | { | ||
11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
12 | return -EPERM; | 61 | return -EPERM; |
13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
15 | } | 64 | } |
16 | 65 | ||
17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
18 | 68 | ||
19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
20 | { | 70 | { |
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
24 | 74 | ||
25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
26 | { | 76 | { |
77 | struct dentry *dentry; | ||
78 | |||
27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
29 | return -ENOMEM; | 81 | return -ENOMEM; |
30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
83 | /* | ||
84 | * Note that the below poison/unpoison interfaces do not involve | ||
85 | * hardware status change, hence do not require hardware support. | ||
86 | * They are mainly for testing hwpoison in software level. | ||
87 | */ | ||
88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
33 | pfn_inject_exit(); | 91 | goto fail; |
34 | return -ENOMEM; | 92 | |
35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
94 | NULL, &unpoison_fops); | ||
95 | if (!dentry) | ||
96 | goto fail; | ||
97 | |||
98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
99 | hwpoison_dir, &hwpoison_filter_enable); | ||
100 | if (!dentry) | ||
101 | goto fail; | ||
102 | |||
103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
105 | if (!dentry) | ||
106 | goto fail; | ||
107 | |||
108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
110 | if (!dentry) | ||
111 | goto fail; | ||
112 | |||
113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
115 | if (!dentry) | ||
116 | goto fail; | ||
117 | |||
118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
120 | if (!dentry) | ||
121 | goto fail; | ||
122 | |||
123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
126 | if (!dentry) | ||
127 | goto fail; | ||
128 | #endif | ||
129 | |||
36 | return 0; | 130 | return 0; |
131 | fail: | ||
132 | pfn_inject_exit(); | ||
133 | return -ENOMEM; | ||
37 | } | 134 | } |
38 | 135 | ||
39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
50 | */ | 50 | */ |
51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
53 | #ifdef CONFIG_MEMORY_FAILURE | ||
54 | extern bool is_free_buddy_page(struct page *page); | ||
55 | #endif | ||
53 | 56 | ||
54 | 57 | ||
55 | /* | 58 | /* |
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page) | |||
63 | return page_private(page); | 66 | return page_private(page); |
64 | } | 67 | } |
65 | 68 | ||
66 | #ifdef CONFIG_HAVE_MLOCK | 69 | #ifdef CONFIG_MMU |
67 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 70 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
68 | unsigned long start, unsigned long end); | 71 | unsigned long start, unsigned long end); |
69 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 72 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
72 | { | 75 | { |
73 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | 76 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); |
74 | } | 77 | } |
75 | #endif | ||
76 | 78 | ||
77 | /* | 79 | /* |
78 | * unevictable_migrate_page() called only from migrate_page_copy() to | ||
79 | * migrate unevictable flag to new page. | ||
80 | * Note that the old page has been isolated from the LRU lists at this | ||
81 | * point so we don't need to worry about LRU statistics. | ||
82 | */ | ||
83 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
84 | { | ||
85 | if (TestClearPageUnevictable(old)) | ||
86 | SetPageUnevictable(new); | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
90 | /* | ||
91 | * Called only in fault path via page_evictable() for a new page | 80 | * Called only in fault path via page_evictable() for a new page |
92 | * to determine if it's being mapped into a LOCKED vma. | 81 | * to determine if it's being mapped into a LOCKED vma. |
93 | * If so, mark page as mlocked. | 82 | * If so, mark page as mlocked. |
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | |||
107 | } | 96 | } |
108 | 97 | ||
109 | /* | 98 | /* |
110 | * must be called with vma's mmap_sem held for read, and page locked. | 99 | * must be called with vma's mmap_sem held for read or write, and page locked. |
111 | */ | 100 | */ |
112 | extern void mlock_vma_page(struct page *page); | 101 | extern void mlock_vma_page(struct page *page); |
102 | extern void munlock_vma_page(struct page *page); | ||
113 | 103 | ||
114 | /* | 104 | /* |
115 | * Clear the page's PageMlocked(). This can be useful in a situation where | 105 | * Clear the page's PageMlocked(). This can be useful in a situation where |
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
144 | } | 134 | } |
145 | } | 135 | } |
146 | 136 | ||
147 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 137 | #else /* !CONFIG_MMU */ |
148 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
149 | { | 139 | { |
150 | return 0; | 140 | return 0; |
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } | |||
153 | static inline void mlock_vma_page(struct page *page) { } | 143 | static inline void mlock_vma_page(struct page *page) { } |
154 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 144 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
155 | 145 | ||
156 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #endif /* !CONFIG_MMU */ |
157 | 147 | ||
158 | /* | 148 | /* |
159 | * Return the mem_map entry representing the 'offset' subpage within | 149 | * Return the mem_map entry representing the 'offset' subpage within |
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
260 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
261 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
262 | #endif | 252 | #endif |
253 | |||
254 | extern int hwpoison_filter(struct page *p); | ||
255 | |||
256 | extern u32 hwpoison_filter_dev_major; | ||
257 | extern u32 hwpoison_filter_dev_minor; | ||
258 | extern u64 hwpoison_filter_flags_mask; | ||
259 | extern u64 hwpoison_filter_flags_value; | ||
260 | extern u64 hwpoison_filter_memcg; | ||
261 | extern u32 hwpoison_filter_enable; | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 13f33b3081ec..5b069e4f5e48 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -93,6 +93,7 @@ | |||
93 | #include <linux/nodemask.h> | 93 | #include <linux/nodemask.h> |
94 | #include <linux/mm.h> | 94 | #include <linux/mm.h> |
95 | #include <linux/workqueue.h> | 95 | #include <linux/workqueue.h> |
96 | #include <linux/crc32.h> | ||
96 | 97 | ||
97 | #include <asm/sections.h> | 98 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 99 | #include <asm/processor.h> |
@@ -108,7 +109,6 @@ | |||
108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | 109 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ |
109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | 110 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ |
110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | 111 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ |
111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ |
113 | 113 | ||
114 | #define BYTES_PER_POINTER sizeof(void *) | 114 | #define BYTES_PER_POINTER sizeof(void *) |
@@ -119,8 +119,8 @@ | |||
119 | /* scanning area inside a memory block */ | 119 | /* scanning area inside a memory block */ |
120 | struct kmemleak_scan_area { | 120 | struct kmemleak_scan_area { |
121 | struct hlist_node node; | 121 | struct hlist_node node; |
122 | unsigned long offset; | 122 | unsigned long start; |
123 | size_t length; | 123 | size_t size; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | #define KMEMLEAK_GREY 0 | 126 | #define KMEMLEAK_GREY 0 |
@@ -149,6 +149,8 @@ struct kmemleak_object { | |||
149 | int min_count; | 149 | int min_count; |
150 | /* the total number of pointers found pointing to this object */ | 150 | /* the total number of pointers found pointing to this object */ |
151 | int count; | 151 | int count; |
152 | /* checksum for detecting modified objects */ | ||
153 | u32 checksum; | ||
152 | /* memory ranges to be scanned inside an object (empty for all) */ | 154 | /* memory ranges to be scanned inside an object (empty for all) */ |
153 | struct hlist_head area_list; | 155 | struct hlist_head area_list; |
154 | unsigned long trace[MAX_TRACE]; | 156 | unsigned long trace[MAX_TRACE]; |
@@ -164,8 +166,6 @@ struct kmemleak_object { | |||
164 | #define OBJECT_REPORTED (1 << 1) | 166 | #define OBJECT_REPORTED (1 << 1) |
165 | /* flag set to not scan the object */ | 167 | /* flag set to not scan the object */ |
166 | #define OBJECT_NO_SCAN (1 << 2) | 168 | #define OBJECT_NO_SCAN (1 << 2) |
167 | /* flag set on newly allocated objects */ | ||
168 | #define OBJECT_NEW (1 << 3) | ||
169 | 169 | ||
170 | /* number of bytes to print per line; must be 16 or 32 */ | 170 | /* number of bytes to print per line; must be 16 or 32 */ |
171 | #define HEX_ROW_SIZE 16 | 171 | #define HEX_ROW_SIZE 16 |
@@ -241,8 +241,6 @@ struct early_log { | |||
241 | const void *ptr; /* allocated/freed memory block */ | 241 | const void *ptr; /* allocated/freed memory block */ |
242 | size_t size; /* memory block size */ | 242 | size_t size; /* memory block size */ |
243 | int min_count; /* minimum reference count */ | 243 | int min_count; /* minimum reference count */ |
244 | unsigned long offset; /* scan area offset */ | ||
245 | size_t length; /* scan area length */ | ||
246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | 244 | unsigned long trace[MAX_TRACE]; /* stack trace */ |
247 | unsigned int trace_len; /* stack trace length */ | 245 | unsigned int trace_len; /* stack trace length */ |
248 | }; | 246 | }; |
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object) | |||
323 | object->count >= object->min_count; | 321 | object->count >= object->min_count; |
324 | } | 322 | } |
325 | 323 | ||
326 | static bool color_black(const struct kmemleak_object *object) | ||
327 | { | ||
328 | return object->min_count == KMEMLEAK_BLACK; | ||
329 | } | ||
330 | |||
331 | /* | 324 | /* |
332 | * Objects are considered unreferenced only if their color is white, they have | 325 | * Objects are considered unreferenced only if their color is white, they have |
333 | * not be deleted and have a minimum age to avoid false positives caused by | 326 | * not be deleted and have a minimum age to avoid false positives caused by |
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object) | |||
335 | */ | 328 | */ |
336 | static bool unreferenced_object(struct kmemleak_object *object) | 329 | static bool unreferenced_object(struct kmemleak_object *object) |
337 | { | 330 | { |
338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | 331 | return (color_white(object) && object->flags & OBJECT_ALLOCATED) && |
339 | time_before_eq(object->jiffies + jiffies_min_age, | 332 | time_before_eq(object->jiffies + jiffies_min_age, |
340 | jiffies_last_scan); | 333 | jiffies_last_scan); |
341 | } | 334 | } |
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq, | |||
348 | struct kmemleak_object *object) | 341 | struct kmemleak_object *object) |
349 | { | 342 | { |
350 | int i; | 343 | int i; |
344 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | ||
351 | 345 | ||
352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 346 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
353 | object->pointer, object->size); | 347 | object->pointer, object->size); |
354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | 348 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
355 | object->comm, object->pid, object->jiffies); | 349 | object->comm, object->pid, object->jiffies, |
350 | msecs_age / 1000, msecs_age % 1000); | ||
356 | hex_dump_object(seq, object); | 351 | hex_dump_object(seq, object); |
357 | seq_printf(seq, " backtrace:\n"); | 352 | seq_printf(seq, " backtrace:\n"); |
358 | 353 | ||
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
381 | pr_notice(" min_count = %d\n", object->min_count); | 376 | pr_notice(" min_count = %d\n", object->min_count); |
382 | pr_notice(" count = %d\n", object->count); | 377 | pr_notice(" count = %d\n", object->count); |
383 | pr_notice(" flags = 0x%lx\n", object->flags); | 378 | pr_notice(" flags = 0x%lx\n", object->flags); |
379 | pr_notice(" checksum = %d\n", object->checksum); | ||
384 | pr_notice(" backtrace:\n"); | 380 | pr_notice(" backtrace:\n"); |
385 | print_stack_trace(&trace, 4); | 381 | print_stack_trace(&trace, 4); |
386 | } | 382 | } |
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
522 | INIT_HLIST_HEAD(&object->area_list); | 518 | INIT_HLIST_HEAD(&object->area_list); |
523 | spin_lock_init(&object->lock); | 519 | spin_lock_init(&object->lock); |
524 | atomic_set(&object->use_count, 1); | 520 | atomic_set(&object->use_count, 1); |
525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | 521 | object->flags = OBJECT_ALLOCATED; |
526 | object->pointer = ptr; | 522 | object->pointer = ptr; |
527 | object->size = size; | 523 | object->size = size; |
528 | object->min_count = min_count; | 524 | object->min_count = min_count; |
529 | object->count = -1; /* no color initially */ | 525 | object->count = 0; /* white color initially */ |
530 | object->jiffies = jiffies; | 526 | object->jiffies = jiffies; |
527 | object->checksum = 0; | ||
531 | 528 | ||
532 | /* task information */ | 529 | /* task information */ |
533 | if (in_irq()) { | 530 | if (in_irq()) { |
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr) | |||
720 | * Add a scanning area to the object. If at least one such area is added, | 717 | * Add a scanning area to the object. If at least one such area is added, |
721 | * kmemleak will only scan these ranges rather than the whole memory block. | 718 | * kmemleak will only scan these ranges rather than the whole memory block. |
722 | */ | 719 | */ |
723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | 720 | static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) |
724 | size_t length, gfp_t gfp) | ||
725 | { | 721 | { |
726 | unsigned long flags; | 722 | unsigned long flags; |
727 | struct kmemleak_object *object; | 723 | struct kmemleak_object *object; |
728 | struct kmemleak_scan_area *area; | 724 | struct kmemleak_scan_area *area; |
729 | 725 | ||
730 | object = find_and_get_object(ptr, 0); | 726 | object = find_and_get_object(ptr, 1); |
731 | if (!object) { | 727 | if (!object) { |
732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | 728 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", |
733 | ptr); | 729 | ptr); |
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
741 | } | 737 | } |
742 | 738 | ||
743 | spin_lock_irqsave(&object->lock, flags); | 739 | spin_lock_irqsave(&object->lock, flags); |
744 | if (offset + length > object->size) { | 740 | if (ptr + size > object->pointer + object->size) { |
745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 741 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
746 | dump_object_info(object); | 742 | dump_object_info(object); |
747 | kmem_cache_free(scan_area_cache, area); | 743 | kmem_cache_free(scan_area_cache, area); |
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
749 | } | 745 | } |
750 | 746 | ||
751 | INIT_HLIST_NODE(&area->node); | 747 | INIT_HLIST_NODE(&area->node); |
752 | area->offset = offset; | 748 | area->start = ptr; |
753 | area->length = length; | 749 | area->size = size; |
754 | 750 | ||
755 | hlist_add_head(&area->node, &object->area_list); | 751 | hlist_add_head(&area->node, &object->area_list); |
756 | out_unlock: | 752 | out_unlock: |
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr) | |||
786 | * processed later once kmemleak is fully initialized. | 782 | * processed later once kmemleak is fully initialized. |
787 | */ | 783 | */ |
788 | static void __init log_early(int op_type, const void *ptr, size_t size, | 784 | static void __init log_early(int op_type, const void *ptr, size_t size, |
789 | int min_count, unsigned long offset, size_t length) | 785 | int min_count) |
790 | { | 786 | { |
791 | unsigned long flags; | 787 | unsigned long flags; |
792 | struct early_log *log; | 788 | struct early_log *log; |
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
808 | log->ptr = ptr; | 804 | log->ptr = ptr; |
809 | log->size = size; | 805 | log->size = size; |
810 | log->min_count = min_count; | 806 | log->min_count = min_count; |
811 | log->offset = offset; | ||
812 | log->length = length; | ||
813 | if (op_type == KMEMLEAK_ALLOC) | 807 | if (op_type == KMEMLEAK_ALLOC) |
814 | log->trace_len = __save_stack_trace(log->trace); | 808 | log->trace_len = __save_stack_trace(log->trace); |
815 | crt_early_log++; | 809 | crt_early_log++; |
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
858 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 852 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
859 | create_object((unsigned long)ptr, size, min_count, gfp); | 853 | create_object((unsigned long)ptr, size, min_count, gfp); |
860 | else if (atomic_read(&kmemleak_early_log)) | 854 | else if (atomic_read(&kmemleak_early_log)) |
861 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | 855 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); |
862 | } | 856 | } |
863 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 857 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
864 | 858 | ||
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr) | |||
873 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 867 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
874 | delete_object_full((unsigned long)ptr); | 868 | delete_object_full((unsigned long)ptr); |
875 | else if (atomic_read(&kmemleak_early_log)) | 869 | else if (atomic_read(&kmemleak_early_log)) |
876 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 870 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
877 | } | 871 | } |
878 | EXPORT_SYMBOL_GPL(kmemleak_free); | 872 | EXPORT_SYMBOL_GPL(kmemleak_free); |
879 | 873 | ||
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
888 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 882 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
889 | delete_object_part((unsigned long)ptr, size); | 883 | delete_object_part((unsigned long)ptr, size); |
890 | else if (atomic_read(&kmemleak_early_log)) | 884 | else if (atomic_read(&kmemleak_early_log)) |
891 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | 885 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); |
892 | } | 886 | } |
893 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 887 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
894 | 888 | ||
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
903 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 897 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
904 | make_gray_object((unsigned long)ptr); | 898 | make_gray_object((unsigned long)ptr); |
905 | else if (atomic_read(&kmemleak_early_log)) | 899 | else if (atomic_read(&kmemleak_early_log)) |
906 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | 900 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); |
907 | } | 901 | } |
908 | EXPORT_SYMBOL(kmemleak_not_leak); | 902 | EXPORT_SYMBOL(kmemleak_not_leak); |
909 | 903 | ||
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr) | |||
919 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 913 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
920 | make_black_object((unsigned long)ptr); | 914 | make_black_object((unsigned long)ptr); |
921 | else if (atomic_read(&kmemleak_early_log)) | 915 | else if (atomic_read(&kmemleak_early_log)) |
922 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | 916 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); |
923 | } | 917 | } |
924 | EXPORT_SYMBOL(kmemleak_ignore); | 918 | EXPORT_SYMBOL(kmemleak_ignore); |
925 | 919 | ||
926 | /* | 920 | /* |
927 | * Limit the range to be scanned in an allocated memory block. | 921 | * Limit the range to be scanned in an allocated memory block. |
928 | */ | 922 | */ |
929 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | 923 | void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) |
930 | size_t length, gfp_t gfp) | ||
931 | { | 924 | { |
932 | pr_debug("%s(0x%p)\n", __func__, ptr); | 925 | pr_debug("%s(0x%p)\n", __func__, ptr); |
933 | 926 | ||
934 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 927 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
935 | add_scan_area((unsigned long)ptr, offset, length, gfp); | 928 | add_scan_area((unsigned long)ptr, size, gfp); |
936 | else if (atomic_read(&kmemleak_early_log)) | 929 | else if (atomic_read(&kmemleak_early_log)) |
937 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | 930 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
938 | } | 931 | } |
939 | EXPORT_SYMBOL(kmemleak_scan_area); | 932 | EXPORT_SYMBOL(kmemleak_scan_area); |
940 | 933 | ||
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
948 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 941 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
949 | object_no_scan((unsigned long)ptr); | 942 | object_no_scan((unsigned long)ptr); |
950 | else if (atomic_read(&kmemleak_early_log)) | 943 | else if (atomic_read(&kmemleak_early_log)) |
951 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | 944 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); |
952 | } | 945 | } |
953 | EXPORT_SYMBOL(kmemleak_no_scan); | 946 | EXPORT_SYMBOL(kmemleak_no_scan); |
954 | 947 | ||
955 | /* | 948 | /* |
949 | * Update an object's checksum and return true if it was modified. | ||
950 | */ | ||
951 | static bool update_checksum(struct kmemleak_object *object) | ||
952 | { | ||
953 | u32 old_csum = object->checksum; | ||
954 | |||
955 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
956 | return false; | ||
957 | |||
958 | object->checksum = crc32(0, (void *)object->pointer, object->size); | ||
959 | return object->checksum != old_csum; | ||
960 | } | ||
961 | |||
962 | /* | ||
956 | * Memory scanning is a long process and it needs to be interruptable. This | 963 | * Memory scanning is a long process and it needs to be interruptable. This |
957 | * function checks whether such interrupt condition occured. | 964 | * function checks whether such interrupt condition occured. |
958 | */ | 965 | */ |
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end, | |||
1031 | * added to the gray_list. | 1038 | * added to the gray_list. |
1032 | */ | 1039 | */ |
1033 | object->count++; | 1040 | object->count++; |
1034 | if (color_gray(object)) | 1041 | if (color_gray(object)) { |
1035 | list_add_tail(&object->gray_list, &gray_list); | 1042 | list_add_tail(&object->gray_list, &gray_list); |
1036 | else | 1043 | spin_unlock_irqrestore(&object->lock, flags); |
1037 | put_object(object); | 1044 | continue; |
1045 | } | ||
1046 | |||
1038 | spin_unlock_irqrestore(&object->lock, flags); | 1047 | spin_unlock_irqrestore(&object->lock, flags); |
1048 | put_object(object); | ||
1039 | } | 1049 | } |
1040 | } | 1050 | } |
1041 | 1051 | ||
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object) | |||
1075 | } | 1085 | } |
1076 | } else | 1086 | } else |
1077 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1087 | hlist_for_each_entry(area, elem, &object->area_list, node) |
1078 | scan_block((void *)(object->pointer + area->offset), | 1088 | scan_block((void *)area->start, |
1079 | (void *)(object->pointer + area->offset | 1089 | (void *)(area->start + area->size), |
1080 | + area->length), object, 0); | 1090 | object, 0); |
1081 | out: | 1091 | out: |
1082 | spin_unlock_irqrestore(&object->lock, flags); | 1092 | spin_unlock_irqrestore(&object->lock, flags); |
1083 | } | 1093 | } |
1084 | 1094 | ||
1085 | /* | 1095 | /* |
1096 | * Scan the objects already referenced (gray objects). More objects will be | ||
1097 | * referenced and, if there are no memory leaks, all the objects are scanned. | ||
1098 | */ | ||
1099 | static void scan_gray_list(void) | ||
1100 | { | ||
1101 | struct kmemleak_object *object, *tmp; | ||
1102 | |||
1103 | /* | ||
1104 | * The list traversal is safe for both tail additions and removals | ||
1105 | * from inside the loop. The kmemleak objects cannot be freed from | ||
1106 | * outside the loop because their use_count was incremented. | ||
1107 | */ | ||
1108 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1109 | while (&object->gray_list != &gray_list) { | ||
1110 | cond_resched(); | ||
1111 | |||
1112 | /* may add new objects to the list */ | ||
1113 | if (!scan_should_stop()) | ||
1114 | scan_object(object); | ||
1115 | |||
1116 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1117 | gray_list); | ||
1118 | |||
1119 | /* remove the object from the list and release it */ | ||
1120 | list_del(&object->gray_list); | ||
1121 | put_object(object); | ||
1122 | |||
1123 | object = tmp; | ||
1124 | } | ||
1125 | WARN_ON(!list_empty(&gray_list)); | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1086 | * Scan data sections and all the referenced memory blocks allocated via the | 1129 | * Scan data sections and all the referenced memory blocks allocated via the |
1087 | * kernel's standard allocators. This function must be called with the | 1130 | * kernel's standard allocators. This function must be called with the |
1088 | * scan_mutex held. | 1131 | * scan_mutex held. |
@@ -1090,10 +1133,9 @@ out: | |||
1090 | static void kmemleak_scan(void) | 1133 | static void kmemleak_scan(void) |
1091 | { | 1134 | { |
1092 | unsigned long flags; | 1135 | unsigned long flags; |
1093 | struct kmemleak_object *object, *tmp; | 1136 | struct kmemleak_object *object; |
1094 | int i; | 1137 | int i; |
1095 | int new_leaks = 0; | 1138 | int new_leaks = 0; |
1096 | int gray_list_pass = 0; | ||
1097 | 1139 | ||
1098 | jiffies_last_scan = jiffies; | 1140 | jiffies_last_scan = jiffies; |
1099 | 1141 | ||
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void) | |||
1114 | #endif | 1156 | #endif |
1115 | /* reset the reference count (whiten the object) */ | 1157 | /* reset the reference count (whiten the object) */ |
1116 | object->count = 0; | 1158 | object->count = 0; |
1117 | object->flags &= ~OBJECT_NEW; | ||
1118 | if (color_gray(object) && get_object(object)) | 1159 | if (color_gray(object) && get_object(object)) |
1119 | list_add_tail(&object->gray_list, &gray_list); | 1160 | list_add_tail(&object->gray_list, &gray_list); |
1120 | 1161 | ||
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void) | |||
1172 | 1213 | ||
1173 | /* | 1214 | /* |
1174 | * Scan the objects already referenced from the sections scanned | 1215 | * Scan the objects already referenced from the sections scanned |
1175 | * above. More objects will be referenced and, if there are no memory | 1216 | * above. |
1176 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1177 | * for both tail additions and removals from inside the loop. The | ||
1178 | * kmemleak objects cannot be freed from outside the loop because their | ||
1179 | * use_count was increased. | ||
1180 | */ | 1217 | */ |
1181 | repeat: | 1218 | scan_gray_list(); |
1182 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1183 | while (&object->gray_list != &gray_list) { | ||
1184 | cond_resched(); | ||
1185 | |||
1186 | /* may add new objects to the list */ | ||
1187 | if (!scan_should_stop()) | ||
1188 | scan_object(object); | ||
1189 | |||
1190 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1191 | gray_list); | ||
1192 | |||
1193 | /* remove the object from the list and release it */ | ||
1194 | list_del(&object->gray_list); | ||
1195 | put_object(object); | ||
1196 | |||
1197 | object = tmp; | ||
1198 | } | ||
1199 | |||
1200 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
1201 | goto scan_end; | ||
1202 | 1219 | ||
1203 | /* | 1220 | /* |
1204 | * Check for new objects allocated during this scanning and add them | 1221 | * Check for new or unreferenced objects modified since the previous |
1205 | * to the gray list. | 1222 | * scan and color them gray until the next scan. |
1206 | */ | 1223 | */ |
1207 | rcu_read_lock(); | 1224 | rcu_read_lock(); |
1208 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1225 | list_for_each_entry_rcu(object, &object_list, object_list) { |
1209 | spin_lock_irqsave(&object->lock, flags); | 1226 | spin_lock_irqsave(&object->lock, flags); |
1210 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | 1227 | if (color_white(object) && (object->flags & OBJECT_ALLOCATED) |
1211 | get_object(object)) { | 1228 | && update_checksum(object) && get_object(object)) { |
1212 | object->flags &= ~OBJECT_NEW; | 1229 | /* color it gray temporarily */ |
1230 | object->count = object->min_count; | ||
1213 | list_add_tail(&object->gray_list, &gray_list); | 1231 | list_add_tail(&object->gray_list, &gray_list); |
1214 | } | 1232 | } |
1215 | spin_unlock_irqrestore(&object->lock, flags); | 1233 | spin_unlock_irqrestore(&object->lock, flags); |
1216 | } | 1234 | } |
1217 | rcu_read_unlock(); | 1235 | rcu_read_unlock(); |
1218 | 1236 | ||
1219 | if (!list_empty(&gray_list)) | 1237 | /* |
1220 | goto repeat; | 1238 | * Re-scan the gray list for modified unreferenced objects. |
1221 | 1239 | */ | |
1222 | scan_end: | 1240 | scan_gray_list(); |
1223 | WARN_ON(!list_empty(&gray_list)); | ||
1224 | 1241 | ||
1225 | /* | 1242 | /* |
1226 | * If scanning was stopped or new objects were being allocated at a | 1243 | * If scanning was stopped do not report any new unreferenced objects. |
1227 | * higher rate than gray list scanning, do not report any new | ||
1228 | * unreferenced objects. | ||
1229 | */ | 1244 | */ |
1230 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | 1245 | if (scan_should_stop()) |
1231 | return; | 1246 | return; |
1232 | 1247 | ||
1233 | /* | 1248 | /* |
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void) | |||
1642 | kmemleak_ignore(log->ptr); | 1657 | kmemleak_ignore(log->ptr); |
1643 | break; | 1658 | break; |
1644 | case KMEMLEAK_SCAN_AREA: | 1659 | case KMEMLEAK_SCAN_AREA: |
1645 | kmemleak_scan_area(log->ptr, log->offset, log->length, | 1660 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); |
1646 | GFP_KERNEL); | ||
1647 | break; | 1661 | break; |
1648 | case KMEMLEAK_NO_SCAN: | 1662 | case KMEMLEAK_NO_SCAN: |
1649 | kmemleak_no_scan(log->ptr); | 1663 | kmemleak_no_scan(log->ptr); |
@@ -29,11 +29,13 @@ | |||
29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
32 | #include <linux/memory.h> | ||
32 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
35 | 36 | ||
36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
38 | #include "internal.h" | ||
37 | 39 | ||
38 | /* | 40 | /* |
39 | * A few notes about the KSM scanning process, | 41 | * A few notes about the KSM scanning process, |
@@ -79,13 +81,13 @@ | |||
79 | * struct mm_slot - ksm information per mm that is being scanned | 81 | * struct mm_slot - ksm information per mm that is being scanned |
80 | * @link: link to the mm_slots hash list | 82 | * @link: link to the mm_slots hash list |
81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | 83 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head |
82 | * @rmap_list: head for this mm_slot's list of rmap_items | 84 | * @rmap_list: head for this mm_slot's singly-linked list of rmap_items |
83 | * @mm: the mm that this information is valid for | 85 | * @mm: the mm that this information is valid for |
84 | */ | 86 | */ |
85 | struct mm_slot { | 87 | struct mm_slot { |
86 | struct hlist_node link; | 88 | struct hlist_node link; |
87 | struct list_head mm_list; | 89 | struct list_head mm_list; |
88 | struct list_head rmap_list; | 90 | struct rmap_item *rmap_list; |
89 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
90 | }; | 92 | }; |
91 | 93 | ||
@@ -93,7 +95,7 @@ struct mm_slot { | |||
93 | * struct ksm_scan - cursor for scanning | 95 | * struct ksm_scan - cursor for scanning |
94 | * @mm_slot: the current mm_slot we are scanning | 96 | * @mm_slot: the current mm_slot we are scanning |
95 | * @address: the next address inside that to be scanned | 97 | * @address: the next address inside that to be scanned |
96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | 98 | * @rmap_list: link to the next rmap to be scanned in the rmap_list |
97 | * @seqnr: count of completed full scans (needed when removing unstable node) | 99 | * @seqnr: count of completed full scans (needed when removing unstable node) |
98 | * | 100 | * |
99 | * There is only the one ksm_scan instance of this cursor structure. | 101 | * There is only the one ksm_scan instance of this cursor structure. |
@@ -101,37 +103,51 @@ struct mm_slot { | |||
101 | struct ksm_scan { | 103 | struct ksm_scan { |
102 | struct mm_slot *mm_slot; | 104 | struct mm_slot *mm_slot; |
103 | unsigned long address; | 105 | unsigned long address; |
104 | struct rmap_item *rmap_item; | 106 | struct rmap_item **rmap_list; |
105 | unsigned long seqnr; | 107 | unsigned long seqnr; |
106 | }; | 108 | }; |
107 | 109 | ||
108 | /** | 110 | /** |
111 | * struct stable_node - node of the stable rbtree | ||
112 | * @node: rb node of this ksm page in the stable tree | ||
113 | * @hlist: hlist head of rmap_items using this ksm page | ||
114 | * @kpfn: page frame number of this ksm page | ||
115 | */ | ||
116 | struct stable_node { | ||
117 | struct rb_node node; | ||
118 | struct hlist_head hlist; | ||
119 | unsigned long kpfn; | ||
120 | }; | ||
121 | |||
122 | /** | ||
109 | * struct rmap_item - reverse mapping item for virtual addresses | 123 | * struct rmap_item - reverse mapping item for virtual addresses |
110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | 124 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
125 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | ||
111 | * @mm: the memory structure this rmap_item is pointing into | 126 | * @mm: the memory structure this rmap_item is pointing into |
112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 127 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
113 | * @oldchecksum: previous checksum of the page at that virtual address | 128 | * @oldchecksum: previous checksum of the page at that virtual address |
114 | * @node: rb_node of this rmap_item in either unstable or stable tree | 129 | * @node: rb node of this rmap_item in the unstable tree |
115 | * @next: next rmap_item hanging off the same node of the stable tree | 130 | * @head: pointer to stable_node heading this list in the stable tree |
116 | * @prev: previous rmap_item hanging off the same node of the stable tree | 131 | * @hlist: link into hlist of rmap_items hanging off that stable_node |
117 | */ | 132 | */ |
118 | struct rmap_item { | 133 | struct rmap_item { |
119 | struct list_head link; | 134 | struct rmap_item *rmap_list; |
135 | struct anon_vma *anon_vma; /* when stable */ | ||
120 | struct mm_struct *mm; | 136 | struct mm_struct *mm; |
121 | unsigned long address; /* + low bits used for flags below */ | 137 | unsigned long address; /* + low bits used for flags below */ |
138 | unsigned int oldchecksum; /* when unstable */ | ||
122 | union { | 139 | union { |
123 | unsigned int oldchecksum; /* when unstable */ | 140 | struct rb_node node; /* when node of unstable tree */ |
124 | struct rmap_item *next; /* when stable */ | 141 | struct { /* when listed from stable tree */ |
125 | }; | 142 | struct stable_node *head; |
126 | union { | 143 | struct hlist_node hlist; |
127 | struct rb_node node; /* when tree node */ | 144 | }; |
128 | struct rmap_item *prev; /* in stable list */ | ||
129 | }; | 145 | }; |
130 | }; | 146 | }; |
131 | 147 | ||
132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | 148 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ |
133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | 149 | #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ |
134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | 150 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
135 | 151 | ||
136 | /* The stable and unstable tree heads */ | 152 | /* The stable and unstable tree heads */ |
137 | static struct rb_root root_stable_tree = RB_ROOT; | 153 | static struct rb_root root_stable_tree = RB_ROOT; |
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { | |||
148 | }; | 164 | }; |
149 | 165 | ||
150 | static struct kmem_cache *rmap_item_cache; | 166 | static struct kmem_cache *rmap_item_cache; |
167 | static struct kmem_cache *stable_node_cache; | ||
151 | static struct kmem_cache *mm_slot_cache; | 168 | static struct kmem_cache *mm_slot_cache; |
152 | 169 | ||
153 | /* The number of nodes in the stable tree */ | 170 | /* The number of nodes in the stable tree */ |
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; | |||
162 | /* The number of rmap_items in use: to calculate pages_volatile */ | 179 | /* The number of rmap_items in use: to calculate pages_volatile */ |
163 | static unsigned long ksm_rmap_items; | 180 | static unsigned long ksm_rmap_items; |
164 | 181 | ||
165 | /* Limit on the number of unswappable pages used */ | ||
166 | static unsigned long ksm_max_kernel_pages; | ||
167 | |||
168 | /* Number of pages ksmd should scan in one batch */ | 182 | /* Number of pages ksmd should scan in one batch */ |
169 | static unsigned int ksm_thread_pages_to_scan = 100; | 183 | static unsigned int ksm_thread_pages_to_scan = 100; |
170 | 184 | ||
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) | |||
190 | if (!rmap_item_cache) | 204 | if (!rmap_item_cache) |
191 | goto out; | 205 | goto out; |
192 | 206 | ||
207 | stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); | ||
208 | if (!stable_node_cache) | ||
209 | goto out_free1; | ||
210 | |||
193 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | 211 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); |
194 | if (!mm_slot_cache) | 212 | if (!mm_slot_cache) |
195 | goto out_free; | 213 | goto out_free2; |
196 | 214 | ||
197 | return 0; | 215 | return 0; |
198 | 216 | ||
199 | out_free: | 217 | out_free2: |
218 | kmem_cache_destroy(stable_node_cache); | ||
219 | out_free1: | ||
200 | kmem_cache_destroy(rmap_item_cache); | 220 | kmem_cache_destroy(rmap_item_cache); |
201 | out: | 221 | out: |
202 | return -ENOMEM; | 222 | return -ENOMEM; |
@@ -205,6 +225,7 @@ out: | |||
205 | static void __init ksm_slab_free(void) | 225 | static void __init ksm_slab_free(void) |
206 | { | 226 | { |
207 | kmem_cache_destroy(mm_slot_cache); | 227 | kmem_cache_destroy(mm_slot_cache); |
228 | kmem_cache_destroy(stable_node_cache); | ||
208 | kmem_cache_destroy(rmap_item_cache); | 229 | kmem_cache_destroy(rmap_item_cache); |
209 | mm_slot_cache = NULL; | 230 | mm_slot_cache = NULL; |
210 | } | 231 | } |
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) | |||
226 | kmem_cache_free(rmap_item_cache, rmap_item); | 247 | kmem_cache_free(rmap_item_cache, rmap_item); |
227 | } | 248 | } |
228 | 249 | ||
250 | static inline struct stable_node *alloc_stable_node(void) | ||
251 | { | ||
252 | return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); | ||
253 | } | ||
254 | |||
255 | static inline void free_stable_node(struct stable_node *stable_node) | ||
256 | { | ||
257 | kmem_cache_free(stable_node_cache, stable_node); | ||
258 | } | ||
259 | |||
229 | static inline struct mm_slot *alloc_mm_slot(void) | 260 | static inline struct mm_slot *alloc_mm_slot(void) |
230 | { | 261 | { |
231 | if (!mm_slot_cache) /* initialization failed */ | 262 | if (!mm_slot_cache) /* initialization failed */ |
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
275 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 306 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
276 | % MM_SLOTS_HASH_HEADS]; | 307 | % MM_SLOTS_HASH_HEADS]; |
277 | mm_slot->mm = mm; | 308 | mm_slot->mm = mm; |
278 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
279 | hlist_add_head(&mm_slot->link, bucket); | 309 | hlist_add_head(&mm_slot->link, bucket); |
280 | } | 310 | } |
281 | 311 | ||
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
284 | return rmap_item->address & STABLE_FLAG; | 314 | return rmap_item->address & STABLE_FLAG; |
285 | } | 315 | } |
286 | 316 | ||
317 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
318 | struct anon_vma *anon_vma) | ||
319 | { | ||
320 | rmap_item->anon_vma = anon_vma; | ||
321 | atomic_inc(&anon_vma->ksm_refcount); | ||
322 | } | ||
323 | |||
324 | static void drop_anon_vma(struct rmap_item *rmap_item) | ||
325 | { | ||
326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
327 | |||
328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | ||
329 | int empty = list_empty(&anon_vma->head); | ||
330 | spin_unlock(&anon_vma->lock); | ||
331 | if (empty) | ||
332 | anon_vma_free(anon_vma); | ||
333 | } | ||
334 | } | ||
335 | |||
287 | /* | 336 | /* |
288 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 337 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
289 | * page tables after it has passed through ksm_exit() - which, if necessary, | 338 | * page tables after it has passed through ksm_exit() - which, if necessary, |
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
356 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 405 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
357 | } | 406 | } |
358 | 407 | ||
359 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 408 | static void break_cow(struct rmap_item *rmap_item) |
360 | { | 409 | { |
410 | struct mm_struct *mm = rmap_item->mm; | ||
411 | unsigned long addr = rmap_item->address; | ||
361 | struct vm_area_struct *vma; | 412 | struct vm_area_struct *vma; |
362 | 413 | ||
414 | /* | ||
415 | * It is not an accident that whenever we want to break COW | ||
416 | * to undo, we also need to drop a reference to the anon_vma. | ||
417 | */ | ||
418 | drop_anon_vma(rmap_item); | ||
419 | |||
363 | down_read(&mm->mmap_sem); | 420 | down_read(&mm->mmap_sem); |
364 | if (ksm_test_exit(mm)) | 421 | if (ksm_test_exit(mm)) |
365 | goto out; | 422 | goto out; |
@@ -403,21 +460,77 @@ out: page = NULL; | |||
403 | return page; | 460 | return page; |
404 | } | 461 | } |
405 | 462 | ||
463 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | ||
464 | { | ||
465 | struct rmap_item *rmap_item; | ||
466 | struct hlist_node *hlist; | ||
467 | |||
468 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
469 | if (rmap_item->hlist.next) | ||
470 | ksm_pages_sharing--; | ||
471 | else | ||
472 | ksm_pages_shared--; | ||
473 | drop_anon_vma(rmap_item); | ||
474 | rmap_item->address &= PAGE_MASK; | ||
475 | cond_resched(); | ||
476 | } | ||
477 | |||
478 | rb_erase(&stable_node->node, &root_stable_tree); | ||
479 | free_stable_node(stable_node); | ||
480 | } | ||
481 | |||
406 | /* | 482 | /* |
407 | * get_ksm_page: checks if the page at the virtual address in rmap_item | 483 | * get_ksm_page: checks if the page indicated by the stable node |
408 | * is still PageKsm, in which case we can trust the content of the page, | 484 | * is still its ksm page, despite having held no reference to it. |
409 | * and it returns the gotten page; but NULL if the page has been zapped. | 485 | * In which case we can trust the content of the page, and it |
486 | * returns the gotten page; but if the page has now been zapped, | ||
487 | * remove the stale node from the stable tree and return NULL. | ||
488 | * | ||
489 | * You would expect the stable_node to hold a reference to the ksm page. | ||
490 | * But if it increments the page's count, swapping out has to wait for | ||
491 | * ksmd to come around again before it can free the page, which may take | ||
492 | * seconds or even minutes: much too unresponsive. So instead we use a | ||
493 | * "keyhole reference": access to the ksm page from the stable node peeps | ||
494 | * out through its keyhole to see if that page still holds the right key, | ||
495 | * pointing back to this stable node. This relies on freeing a PageAnon | ||
496 | * page to reset its page->mapping to NULL, and relies on no other use of | ||
497 | * a page to put something that might look like our key in page->mapping. | ||
498 | * | ||
499 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
500 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
501 | * interesting for assuming that no other use of the struct page could ever | ||
502 | * put our expected_mapping into page->mapping (or a field of the union which | ||
503 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
504 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
505 | * | ||
506 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
507 | * then page the next, if the page is in between page_freeze_refs() and | ||
508 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
509 | * is on its way to being freed; but it is an anomaly to bear in mind. | ||
410 | */ | 510 | */ |
411 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | 511 | static struct page *get_ksm_page(struct stable_node *stable_node) |
412 | { | 512 | { |
413 | struct page *page; | 513 | struct page *page; |
414 | 514 | void *expected_mapping; | |
415 | page = get_mergeable_page(rmap_item); | 515 | |
416 | if (page && !PageKsm(page)) { | 516 | page = pfn_to_page(stable_node->kpfn); |
517 | expected_mapping = (void *)stable_node + | ||
518 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
519 | rcu_read_lock(); | ||
520 | if (page->mapping != expected_mapping) | ||
521 | goto stale; | ||
522 | if (!get_page_unless_zero(page)) | ||
523 | goto stale; | ||
524 | if (page->mapping != expected_mapping) { | ||
417 | put_page(page); | 525 | put_page(page); |
418 | page = NULL; | 526 | goto stale; |
419 | } | 527 | } |
528 | rcu_read_unlock(); | ||
420 | return page; | 529 | return page; |
530 | stale: | ||
531 | rcu_read_unlock(); | ||
532 | remove_node_from_stable_tree(stable_node); | ||
533 | return NULL; | ||
421 | } | 534 | } |
422 | 535 | ||
423 | /* | 536 | /* |
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) | |||
426 | */ | 539 | */ |
427 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | 540 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) |
428 | { | 541 | { |
429 | if (in_stable_tree(rmap_item)) { | 542 | if (rmap_item->address & STABLE_FLAG) { |
430 | struct rmap_item *next_item = rmap_item->next; | 543 | struct stable_node *stable_node; |
431 | 544 | struct page *page; | |
432 | if (rmap_item->address & NODE_FLAG) { | ||
433 | if (next_item) { | ||
434 | rb_replace_node(&rmap_item->node, | ||
435 | &next_item->node, | ||
436 | &root_stable_tree); | ||
437 | next_item->address |= NODE_FLAG; | ||
438 | ksm_pages_sharing--; | ||
439 | } else { | ||
440 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
441 | ksm_pages_shared--; | ||
442 | } | ||
443 | } else { | ||
444 | struct rmap_item *prev_item = rmap_item->prev; | ||
445 | 545 | ||
446 | BUG_ON(prev_item->next != rmap_item); | 546 | stable_node = rmap_item->head; |
447 | prev_item->next = next_item; | 547 | page = get_ksm_page(stable_node); |
448 | if (next_item) { | 548 | if (!page) |
449 | BUG_ON(next_item->prev != rmap_item); | 549 | goto out; |
450 | next_item->prev = rmap_item->prev; | 550 | |
451 | } | 551 | lock_page(page); |
552 | hlist_del(&rmap_item->hlist); | ||
553 | unlock_page(page); | ||
554 | put_page(page); | ||
555 | |||
556 | if (stable_node->hlist.first) | ||
452 | ksm_pages_sharing--; | 557 | ksm_pages_sharing--; |
453 | } | 558 | else |
559 | ksm_pages_shared--; | ||
454 | 560 | ||
455 | rmap_item->next = NULL; | 561 | drop_anon_vma(rmap_item); |
562 | rmap_item->address &= PAGE_MASK; | ||
456 | 563 | ||
457 | } else if (rmap_item->address & NODE_FLAG) { | 564 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
458 | unsigned char age; | 565 | unsigned char age; |
459 | /* | 566 | /* |
460 | * Usually ksmd can and must skip the rb_erase, because | 567 | * Usually ksmd can and must skip the rb_erase, because |
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
467 | BUG_ON(age > 1); | 574 | BUG_ON(age > 1); |
468 | if (!age) | 575 | if (!age) |
469 | rb_erase(&rmap_item->node, &root_unstable_tree); | 576 | rb_erase(&rmap_item->node, &root_unstable_tree); |
577 | |||
470 | ksm_pages_unshared--; | 578 | ksm_pages_unshared--; |
579 | rmap_item->address &= PAGE_MASK; | ||
471 | } | 580 | } |
472 | 581 | out: | |
473 | rmap_item->address &= PAGE_MASK; | ||
474 | |||
475 | cond_resched(); /* we're called from many long loops */ | 582 | cond_resched(); /* we're called from many long loops */ |
476 | } | 583 | } |
477 | 584 | ||
478 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | 585 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, |
479 | struct list_head *cur) | 586 | struct rmap_item **rmap_list) |
480 | { | 587 | { |
481 | struct rmap_item *rmap_item; | 588 | while (*rmap_list) { |
482 | 589 | struct rmap_item *rmap_item = *rmap_list; | |
483 | while (cur != &mm_slot->rmap_list) { | 590 | *rmap_list = rmap_item->rmap_list; |
484 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
485 | cur = cur->next; | ||
486 | remove_rmap_item_from_tree(rmap_item); | 591 | remove_rmap_item_from_tree(rmap_item); |
487 | list_del(&rmap_item->link); | ||
488 | free_rmap_item(rmap_item); | 592 | free_rmap_item(rmap_item); |
489 | } | 593 | } |
490 | } | 594 | } |
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
550 | goto error; | 654 | goto error; |
551 | } | 655 | } |
552 | 656 | ||
553 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 657 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
554 | 658 | ||
555 | spin_lock(&ksm_mmlist_lock); | 659 | spin_lock(&ksm_mmlist_lock); |
556 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 660 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
@@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
646 | * Check that no O_DIRECT or similar I/O is in progress on the | 750 | * Check that no O_DIRECT or similar I/O is in progress on the |
647 | * page | 751 | * page |
648 | */ | 752 | */ |
649 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
650 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at_notify(mm, addr, ptep, entry); |
651 | goto out_unlock; | 755 | goto out_unlock; |
652 | } | 756 | } |
@@ -664,15 +768,15 @@ out: | |||
664 | 768 | ||
665 | /** | 769 | /** |
666 | * replace_page - replace page in vma by new ksm page | 770 | * replace_page - replace page in vma by new ksm page |
667 | * @vma: vma that holds the pte pointing to oldpage | 771 | * @vma: vma that holds the pte pointing to page |
668 | * @oldpage: the page we are replacing by newpage | 772 | * @page: the page we are replacing by kpage |
669 | * @newpage: the ksm page we replace oldpage by | 773 | * @kpage: the ksm page we replace page by |
670 | * @orig_pte: the original value of the pte | 774 | * @orig_pte: the original value of the pte |
671 | * | 775 | * |
672 | * Returns 0 on success, -EFAULT on failure. | 776 | * Returns 0 on success, -EFAULT on failure. |
673 | */ | 777 | */ |
674 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | 778 | static int replace_page(struct vm_area_struct *vma, struct page *page, |
675 | struct page *newpage, pte_t orig_pte) | 779 | struct page *kpage, pte_t orig_pte) |
676 | { | 780 | { |
677 | struct mm_struct *mm = vma->vm_mm; | 781 | struct mm_struct *mm = vma->vm_mm; |
678 | pgd_t *pgd; | 782 | pgd_t *pgd; |
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
681 | pte_t *ptep; | 785 | pte_t *ptep; |
682 | spinlock_t *ptl; | 786 | spinlock_t *ptl; |
683 | unsigned long addr; | 787 | unsigned long addr; |
684 | pgprot_t prot; | ||
685 | int err = -EFAULT; | 788 | int err = -EFAULT; |
686 | 789 | ||
687 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | 790 | addr = page_address_in_vma(page, vma); |
688 | |||
689 | addr = page_address_in_vma(oldpage, vma); | ||
690 | if (addr == -EFAULT) | 791 | if (addr == -EFAULT) |
691 | goto out; | 792 | goto out; |
692 | 793 | ||
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
708 | goto out; | 809 | goto out; |
709 | } | 810 | } |
710 | 811 | ||
711 | get_page(newpage); | 812 | get_page(kpage); |
712 | page_add_ksm_rmap(newpage); | 813 | page_add_anon_rmap(kpage, vma, addr); |
713 | 814 | ||
714 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 815 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
715 | ptep_clear_flush(vma, addr, ptep); | 816 | ptep_clear_flush(vma, addr, ptep); |
716 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
717 | 818 | ||
718 | page_remove_rmap(oldpage); | 819 | page_remove_rmap(page); |
719 | put_page(oldpage); | 820 | put_page(page); |
720 | 821 | ||
721 | pte_unmap_unlock(ptep, ptl); | 822 | pte_unmap_unlock(ptep, ptl); |
722 | err = 0; | 823 | err = 0; |
@@ -726,32 +827,27 @@ out: | |||
726 | 827 | ||
727 | /* | 828 | /* |
728 | * try_to_merge_one_page - take two pages and merge them into one | 829 | * try_to_merge_one_page - take two pages and merge them into one |
729 | * @vma: the vma that hold the pte pointing into oldpage | 830 | * @vma: the vma that holds the pte pointing to page |
730 | * @oldpage: the page that we want to replace with newpage | 831 | * @page: the PageAnon page that we want to replace with kpage |
731 | * @newpage: the page that we want to map instead of oldpage | 832 | * @kpage: the PageKsm page that we want to map instead of page, |
732 | * | 833 | * or NULL the first time when we want to use page as kpage. |
733 | * Note: | ||
734 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
735 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
736 | * | 834 | * |
737 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 835 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
738 | */ | 836 | */ |
739 | static int try_to_merge_one_page(struct vm_area_struct *vma, | 837 | static int try_to_merge_one_page(struct vm_area_struct *vma, |
740 | struct page *oldpage, | 838 | struct page *page, struct page *kpage) |
741 | struct page *newpage) | ||
742 | { | 839 | { |
743 | pte_t orig_pte = __pte(0); | 840 | pte_t orig_pte = __pte(0); |
744 | int err = -EFAULT; | 841 | int err = -EFAULT; |
745 | 842 | ||
843 | if (page == kpage) /* ksm page forked */ | ||
844 | return 0; | ||
845 | |||
746 | if (!(vma->vm_flags & VM_MERGEABLE)) | 846 | if (!(vma->vm_flags & VM_MERGEABLE)) |
747 | goto out; | 847 | goto out; |
748 | 848 | if (!PageAnon(page)) | |
749 | if (!PageAnon(oldpage)) | ||
750 | goto out; | 849 | goto out; |
751 | 850 | ||
752 | get_page(newpage); | ||
753 | get_page(oldpage); | ||
754 | |||
755 | /* | 851 | /* |
756 | * We need the page lock to read a stable PageSwapCache in | 852 | * We need the page lock to read a stable PageSwapCache in |
757 | * write_protect_page(). We use trylock_page() instead of | 853 | * write_protect_page(). We use trylock_page() instead of |
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
759 | * prefer to continue scanning and merging different pages, | 855 | * prefer to continue scanning and merging different pages, |
760 | * then come back to this page when it is unlocked. | 856 | * then come back to this page when it is unlocked. |
761 | */ | 857 | */ |
762 | if (!trylock_page(oldpage)) | 858 | if (!trylock_page(page)) |
763 | goto out_putpage; | 859 | goto out; |
764 | /* | 860 | /* |
765 | * If this anonymous page is mapped only here, its pte may need | 861 | * If this anonymous page is mapped only here, its pte may need |
766 | * to be write-protected. If it's mapped elsewhere, all of its | 862 | * to be write-protected. If it's mapped elsewhere, all of its |
767 | * ptes are necessarily already write-protected. But in either | 863 | * ptes are necessarily already write-protected. But in either |
768 | * case, we need to lock and check page_count is not raised. | 864 | * case, we need to lock and check page_count is not raised. |
769 | */ | 865 | */ |
770 | if (write_protect_page(vma, oldpage, &orig_pte)) { | 866 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
771 | unlock_page(oldpage); | 867 | if (!kpage) { |
772 | goto out_putpage; | 868 | /* |
869 | * While we hold page lock, upgrade page from | ||
870 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
871 | * stable_tree_insert() will update stable_node. | ||
872 | */ | ||
873 | set_page_stable_node(page, NULL); | ||
874 | mark_page_accessed(page); | ||
875 | err = 0; | ||
876 | } else if (pages_identical(page, kpage)) | ||
877 | err = replace_page(vma, page, kpage, orig_pte); | ||
773 | } | 878 | } |
774 | unlock_page(oldpage); | ||
775 | 879 | ||
776 | if (pages_identical(oldpage, newpage)) | 880 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
777 | err = replace_page(vma, oldpage, newpage, orig_pte); | 881 | munlock_vma_page(page); |
882 | if (!PageMlocked(kpage)) { | ||
883 | unlock_page(page); | ||
884 | lock_page(kpage); | ||
885 | mlock_vma_page(kpage); | ||
886 | page = kpage; /* for final unlock */ | ||
887 | } | ||
888 | } | ||
778 | 889 | ||
779 | out_putpage: | 890 | unlock_page(page); |
780 | put_page(oldpage); | ||
781 | put_page(newpage); | ||
782 | out: | 891 | out: |
783 | return err; | 892 | return err; |
784 | } | 893 | } |
@@ -786,26 +895,31 @@ out: | |||
786 | /* | 895 | /* |
787 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | 896 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, |
788 | * but no new kernel page is allocated: kpage must already be a ksm page. | 897 | * but no new kernel page is allocated: kpage must already be a ksm page. |
898 | * | ||
899 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
789 | */ | 900 | */ |
790 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | 901 | static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, |
791 | unsigned long addr1, | 902 | struct page *page, struct page *kpage) |
792 | struct page *page1, | ||
793 | struct page *kpage) | ||
794 | { | 903 | { |
904 | struct mm_struct *mm = rmap_item->mm; | ||
795 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
796 | int err = -EFAULT; | 906 | int err = -EFAULT; |
797 | 907 | ||
798 | down_read(&mm1->mmap_sem); | 908 | down_read(&mm->mmap_sem); |
799 | if (ksm_test_exit(mm1)) | 909 | if (ksm_test_exit(mm)) |
910 | goto out; | ||
911 | vma = find_vma(mm, rmap_item->address); | ||
912 | if (!vma || vma->vm_start > rmap_item->address) | ||
800 | goto out; | 913 | goto out; |
801 | 914 | ||
802 | vma = find_vma(mm1, addr1); | 915 | err = try_to_merge_one_page(vma, page, kpage); |
803 | if (!vma || vma->vm_start > addr1) | 916 | if (err) |
804 | goto out; | 917 | goto out; |
805 | 918 | ||
806 | err = try_to_merge_one_page(vma, page1, kpage); | 919 | /* Must get reference to anon_vma while still holding mmap_sem */ |
920 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
807 | out: | 921 | out: |
808 | up_read(&mm1->mmap_sem); | 922 | up_read(&mm->mmap_sem); |
809 | return err; | 923 | return err; |
810 | } | 924 | } |
811 | 925 | ||
@@ -813,109 +927,73 @@ out: | |||
813 | * try_to_merge_two_pages - take two identical pages and prepare them | 927 | * try_to_merge_two_pages - take two identical pages and prepare them |
814 | * to be merged into one page. | 928 | * to be merged into one page. |
815 | * | 929 | * |
816 | * This function returns 0 if we successfully mapped two identical pages | 930 | * This function returns the kpage if we successfully merged two identical |
817 | * into one page, -EFAULT otherwise. | 931 | * pages into one ksm page, NULL otherwise. |
818 | * | 932 | * |
819 | * Note that this function allocates a new kernel page: if one of the pages | 933 | * Note that this function upgrades page to ksm page: if one of the pages |
820 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 934 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
821 | */ | 935 | */ |
822 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | 936 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
823 | struct page *page1, struct mm_struct *mm2, | 937 | struct page *page, |
824 | unsigned long addr2, struct page *page2) | 938 | struct rmap_item *tree_rmap_item, |
939 | struct page *tree_page) | ||
825 | { | 940 | { |
826 | struct vm_area_struct *vma; | 941 | int err; |
827 | struct page *kpage; | ||
828 | int err = -EFAULT; | ||
829 | |||
830 | /* | ||
831 | * The number of nodes in the stable tree | ||
832 | * is the number of kernel pages that we hold. | ||
833 | */ | ||
834 | if (ksm_max_kernel_pages && | ||
835 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
836 | return err; | ||
837 | |||
838 | kpage = alloc_page(GFP_HIGHUSER); | ||
839 | if (!kpage) | ||
840 | return err; | ||
841 | |||
842 | down_read(&mm1->mmap_sem); | ||
843 | if (ksm_test_exit(mm1)) { | ||
844 | up_read(&mm1->mmap_sem); | ||
845 | goto out; | ||
846 | } | ||
847 | vma = find_vma(mm1, addr1); | ||
848 | if (!vma || vma->vm_start > addr1) { | ||
849 | up_read(&mm1->mmap_sem); | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | copy_user_highpage(kpage, page1, addr1, vma); | ||
854 | err = try_to_merge_one_page(vma, page1, kpage); | ||
855 | up_read(&mm1->mmap_sem); | ||
856 | 942 | ||
943 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); | ||
857 | if (!err) { | 944 | if (!err) { |
858 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | 945 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
946 | tree_page, page); | ||
859 | /* | 947 | /* |
860 | * If that fails, we have a ksm page with only one pte | 948 | * If that fails, we have a ksm page with only one pte |
861 | * pointing to it: so break it. | 949 | * pointing to it: so break it. |
862 | */ | 950 | */ |
863 | if (err) | 951 | if (err) |
864 | break_cow(mm1, addr1); | 952 | break_cow(rmap_item); |
865 | } | 953 | } |
866 | out: | 954 | return err ? NULL : page; |
867 | put_page(kpage); | ||
868 | return err; | ||
869 | } | 955 | } |
870 | 956 | ||
871 | /* | 957 | /* |
872 | * stable_tree_search - search page inside the stable tree | 958 | * stable_tree_search - search for page inside the stable tree |
873 | * @page: the page that we are searching identical pages to. | ||
874 | * @page2: pointer into identical page that we are holding inside the stable | ||
875 | * tree that we have found. | ||
876 | * @rmap_item: the reverse mapping item | ||
877 | * | 959 | * |
878 | * This function checks if there is a page inside the stable tree | 960 | * This function checks if there is a page inside the stable tree |
879 | * with identical content to the page that we are scanning right now. | 961 | * with identical content to the page that we are scanning right now. |
880 | * | 962 | * |
881 | * This function return rmap_item pointer to the identical item if found, | 963 | * This function returns the stable tree node of identical content if found, |
882 | * NULL otherwise. | 964 | * NULL otherwise. |
883 | */ | 965 | */ |
884 | static struct rmap_item *stable_tree_search(struct page *page, | 966 | static struct page *stable_tree_search(struct page *page) |
885 | struct page **page2, | ||
886 | struct rmap_item *rmap_item) | ||
887 | { | 967 | { |
888 | struct rb_node *node = root_stable_tree.rb_node; | 968 | struct rb_node *node = root_stable_tree.rb_node; |
969 | struct stable_node *stable_node; | ||
970 | |||
971 | stable_node = page_stable_node(page); | ||
972 | if (stable_node) { /* ksm page forked */ | ||
973 | get_page(page); | ||
974 | return page; | ||
975 | } | ||
889 | 976 | ||
890 | while (node) { | 977 | while (node) { |
891 | struct rmap_item *tree_rmap_item, *next_rmap_item; | 978 | struct page *tree_page; |
892 | int ret; | 979 | int ret; |
893 | 980 | ||
894 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | 981 | cond_resched(); |
895 | while (tree_rmap_item) { | 982 | stable_node = rb_entry(node, struct stable_node, node); |
896 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 983 | tree_page = get_ksm_page(stable_node); |
897 | cond_resched(); | 984 | if (!tree_page) |
898 | page2[0] = get_ksm_page(tree_rmap_item); | ||
899 | if (page2[0]) | ||
900 | break; | ||
901 | next_rmap_item = tree_rmap_item->next; | ||
902 | remove_rmap_item_from_tree(tree_rmap_item); | ||
903 | tree_rmap_item = next_rmap_item; | ||
904 | } | ||
905 | if (!tree_rmap_item) | ||
906 | return NULL; | 985 | return NULL; |
907 | 986 | ||
908 | ret = memcmp_pages(page, page2[0]); | 987 | ret = memcmp_pages(page, tree_page); |
909 | 988 | ||
910 | if (ret < 0) { | 989 | if (ret < 0) { |
911 | put_page(page2[0]); | 990 | put_page(tree_page); |
912 | node = node->rb_left; | 991 | node = node->rb_left; |
913 | } else if (ret > 0) { | 992 | } else if (ret > 0) { |
914 | put_page(page2[0]); | 993 | put_page(tree_page); |
915 | node = node->rb_right; | 994 | node = node->rb_right; |
916 | } else { | 995 | } else |
917 | return tree_rmap_item; | 996 | return tree_page; |
918 | } | ||
919 | } | 997 | } |
920 | 998 | ||
921 | return NULL; | 999 | return NULL; |
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, | |||
925 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1003 | * stable_tree_insert - insert rmap_item pointing to new ksm page |
926 | * into the stable tree. | 1004 | * into the stable tree. |
927 | * | 1005 | * |
928 | * @page: the page that we are searching identical page to inside the stable | 1006 | * This function returns the stable tree node just allocated on success, |
929 | * tree. | 1007 | * NULL otherwise. |
930 | * @rmap_item: pointer to the reverse mapping item. | ||
931 | * | ||
932 | * This function returns rmap_item if success, NULL otherwise. | ||
933 | */ | 1008 | */ |
934 | static struct rmap_item *stable_tree_insert(struct page *page, | 1009 | static struct stable_node *stable_tree_insert(struct page *kpage) |
935 | struct rmap_item *rmap_item) | ||
936 | { | 1010 | { |
937 | struct rb_node **new = &root_stable_tree.rb_node; | 1011 | struct rb_node **new = &root_stable_tree.rb_node; |
938 | struct rb_node *parent = NULL; | 1012 | struct rb_node *parent = NULL; |
1013 | struct stable_node *stable_node; | ||
939 | 1014 | ||
940 | while (*new) { | 1015 | while (*new) { |
941 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
942 | struct page *tree_page; | 1016 | struct page *tree_page; |
943 | int ret; | 1017 | int ret; |
944 | 1018 | ||
945 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1019 | cond_resched(); |
946 | while (tree_rmap_item) { | 1020 | stable_node = rb_entry(*new, struct stable_node, node); |
947 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 1021 | tree_page = get_ksm_page(stable_node); |
948 | cond_resched(); | 1022 | if (!tree_page) |
949 | tree_page = get_ksm_page(tree_rmap_item); | ||
950 | if (tree_page) | ||
951 | break; | ||
952 | next_rmap_item = tree_rmap_item->next; | ||
953 | remove_rmap_item_from_tree(tree_rmap_item); | ||
954 | tree_rmap_item = next_rmap_item; | ||
955 | } | ||
956 | if (!tree_rmap_item) | ||
957 | return NULL; | 1023 | return NULL; |
958 | 1024 | ||
959 | ret = memcmp_pages(page, tree_page); | 1025 | ret = memcmp_pages(kpage, tree_page); |
960 | put_page(tree_page); | 1026 | put_page(tree_page); |
961 | 1027 | ||
962 | parent = *new; | 1028 | parent = *new; |
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
974 | } | 1040 | } |
975 | } | 1041 | } |
976 | 1042 | ||
977 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | 1043 | stable_node = alloc_stable_node(); |
978 | rmap_item->next = NULL; | 1044 | if (!stable_node) |
979 | rb_link_node(&rmap_item->node, parent, new); | 1045 | return NULL; |
980 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
981 | 1046 | ||
982 | ksm_pages_shared++; | 1047 | rb_link_node(&stable_node->node, parent, new); |
983 | return rmap_item; | 1048 | rb_insert_color(&stable_node->node, &root_stable_tree); |
1049 | |||
1050 | INIT_HLIST_HEAD(&stable_node->hlist); | ||
1051 | |||
1052 | stable_node->kpfn = page_to_pfn(kpage); | ||
1053 | set_page_stable_node(kpage, stable_node); | ||
1054 | |||
1055 | return stable_node; | ||
984 | } | 1056 | } |
985 | 1057 | ||
986 | /* | 1058 | /* |
987 | * unstable_tree_search_insert - search and insert items into the unstable tree. | 1059 | * unstable_tree_search_insert - search for identical page, |
988 | * | 1060 | * else insert rmap_item into the unstable tree. |
989 | * @page: the page that we are going to search for identical page or to insert | ||
990 | * into the unstable tree | ||
991 | * @page2: pointer into identical page that was found inside the unstable tree | ||
992 | * @rmap_item: the reverse mapping item of page | ||
993 | * | 1061 | * |
994 | * This function searches for a page in the unstable tree identical to the | 1062 | * This function searches for a page in the unstable tree identical to the |
995 | * page currently being scanned; and if no identical page is found in the | 1063 | * page currently being scanned; and if no identical page is found in the |
@@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
1001 | * This function does both searching and inserting, because they share | 1069 | * This function does both searching and inserting, because they share |
1002 | * the same walking algorithm in an rbtree. | 1070 | * the same walking algorithm in an rbtree. |
1003 | */ | 1071 | */ |
1004 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | 1072 | static |
1005 | struct page **page2, | 1073 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1006 | struct rmap_item *rmap_item) | 1074 | struct page *page, |
1075 | struct page **tree_pagep) | ||
1076 | |||
1007 | { | 1077 | { |
1008 | struct rb_node **new = &root_unstable_tree.rb_node; | 1078 | struct rb_node **new = &root_unstable_tree.rb_node; |
1009 | struct rb_node *parent = NULL; | 1079 | struct rb_node *parent = NULL; |
1010 | 1080 | ||
1011 | while (*new) { | 1081 | while (*new) { |
1012 | struct rmap_item *tree_rmap_item; | 1082 | struct rmap_item *tree_rmap_item; |
1083 | struct page *tree_page; | ||
1013 | int ret; | 1084 | int ret; |
1014 | 1085 | ||
1015 | cond_resched(); | 1086 | cond_resched(); |
1016 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1017 | page2[0] = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
1018 | if (!page2[0]) | 1089 | if (!tree_page) |
1019 | return NULL; | 1090 | return NULL; |
1020 | 1091 | ||
1021 | /* | 1092 | /* |
1022 | * Don't substitute an unswappable ksm page | 1093 | * Don't substitute a ksm page for a forked page. |
1023 | * just for one good swappable forked page. | ||
1024 | */ | 1094 | */ |
1025 | if (page == page2[0]) { | 1095 | if (page == tree_page) { |
1026 | put_page(page2[0]); | 1096 | put_page(tree_page); |
1027 | return NULL; | 1097 | return NULL; |
1028 | } | 1098 | } |
1029 | 1099 | ||
1030 | ret = memcmp_pages(page, page2[0]); | 1100 | ret = memcmp_pages(page, tree_page); |
1031 | 1101 | ||
1032 | parent = *new; | 1102 | parent = *new; |
1033 | if (ret < 0) { | 1103 | if (ret < 0) { |
1034 | put_page(page2[0]); | 1104 | put_page(tree_page); |
1035 | new = &parent->rb_left; | 1105 | new = &parent->rb_left; |
1036 | } else if (ret > 0) { | 1106 | } else if (ret > 0) { |
1037 | put_page(page2[0]); | 1107 | put_page(tree_page); |
1038 | new = &parent->rb_right; | 1108 | new = &parent->rb_right; |
1039 | } else { | 1109 | } else { |
1110 | *tree_pagep = tree_page; | ||
1040 | return tree_rmap_item; | 1111 | return tree_rmap_item; |
1041 | } | 1112 | } |
1042 | } | 1113 | } |
1043 | 1114 | ||
1044 | rmap_item->address |= NODE_FLAG; | 1115 | rmap_item->address |= UNSTABLE_FLAG; |
1045 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1116 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1046 | rb_link_node(&rmap_item->node, parent, new); | 1117 | rb_link_node(&rmap_item->node, parent, new); |
1047 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1118 | rb_insert_color(&rmap_item->node, &root_unstable_tree); |
@@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, | |||
1056 | * the same ksm page. | 1127 | * the same ksm page. |
1057 | */ | 1128 | */ |
1058 | static void stable_tree_append(struct rmap_item *rmap_item, | 1129 | static void stable_tree_append(struct rmap_item *rmap_item, |
1059 | struct rmap_item *tree_rmap_item) | 1130 | struct stable_node *stable_node) |
1060 | { | 1131 | { |
1061 | rmap_item->next = tree_rmap_item->next; | 1132 | rmap_item->head = stable_node; |
1062 | rmap_item->prev = tree_rmap_item; | ||
1063 | |||
1064 | if (tree_rmap_item->next) | ||
1065 | tree_rmap_item->next->prev = rmap_item; | ||
1066 | |||
1067 | tree_rmap_item->next = rmap_item; | ||
1068 | rmap_item->address |= STABLE_FLAG; | 1133 | rmap_item->address |= STABLE_FLAG; |
1134 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | ||
1069 | 1135 | ||
1070 | ksm_pages_sharing++; | 1136 | if (rmap_item->hlist.next) |
1137 | ksm_pages_sharing++; | ||
1138 | else | ||
1139 | ksm_pages_shared++; | ||
1071 | } | 1140 | } |
1072 | 1141 | ||
1073 | /* | 1142 | /* |
@@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, | |||
1081 | */ | 1150 | */ |
1082 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | 1151 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) |
1083 | { | 1152 | { |
1084 | struct page *page2[1]; | ||
1085 | struct rmap_item *tree_rmap_item; | 1153 | struct rmap_item *tree_rmap_item; |
1154 | struct page *tree_page = NULL; | ||
1155 | struct stable_node *stable_node; | ||
1156 | struct page *kpage; | ||
1086 | unsigned int checksum; | 1157 | unsigned int checksum; |
1087 | int err; | 1158 | int err; |
1088 | 1159 | ||
1089 | if (in_stable_tree(rmap_item)) | 1160 | remove_rmap_item_from_tree(rmap_item); |
1090 | remove_rmap_item_from_tree(rmap_item); | ||
1091 | 1161 | ||
1092 | /* We first start with searching the page inside the stable tree */ | 1162 | /* We first start with searching the page inside the stable tree */ |
1093 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | 1163 | kpage = stable_tree_search(page); |
1094 | if (tree_rmap_item) { | 1164 | if (kpage) { |
1095 | if (page == page2[0]) /* forked */ | 1165 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1096 | err = 0; | ||
1097 | else | ||
1098 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
1099 | rmap_item->address, | ||
1100 | page, page2[0]); | ||
1101 | put_page(page2[0]); | ||
1102 | |||
1103 | if (!err) { | 1166 | if (!err) { |
1104 | /* | 1167 | /* |
1105 | * The page was successfully merged: | 1168 | * The page was successfully merged: |
1106 | * add its rmap_item to the stable tree. | 1169 | * add its rmap_item to the stable tree. |
1107 | */ | 1170 | */ |
1108 | stable_tree_append(rmap_item, tree_rmap_item); | 1171 | lock_page(kpage); |
1172 | stable_tree_append(rmap_item, page_stable_node(kpage)); | ||
1173 | unlock_page(kpage); | ||
1109 | } | 1174 | } |
1175 | put_page(kpage); | ||
1110 | return; | 1176 | return; |
1111 | } | 1177 | } |
1112 | 1178 | ||
1113 | /* | 1179 | /* |
1114 | * A ksm page might have got here by fork, but its other | 1180 | * If the hash value of the page has changed from the last time |
1115 | * references have already been removed from the stable tree. | 1181 | * we calculated it, this page is changing frequently: therefore we |
1116 | * Or it might be left over from a break_ksm which failed | 1182 | * don't want to insert it in the unstable tree, and we don't want |
1117 | * when the mem_cgroup had reached its limit: try again now. | 1183 | * to waste our time searching for something identical to it there. |
1118 | */ | ||
1119 | if (PageKsm(page)) | ||
1120 | break_cow(rmap_item->mm, rmap_item->address); | ||
1121 | |||
1122 | /* | ||
1123 | * In case the hash value of the page was changed from the last time we | ||
1124 | * have calculated it, this page to be changed frequely, therefore we | ||
1125 | * don't want to insert it to the unstable tree, and we don't want to | ||
1126 | * waste our time to search if there is something identical to it there. | ||
1127 | */ | 1184 | */ |
1128 | checksum = calc_checksum(page); | 1185 | checksum = calc_checksum(page); |
1129 | if (rmap_item->oldchecksum != checksum) { | 1186 | if (rmap_item->oldchecksum != checksum) { |
@@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1131 | return; | 1188 | return; |
1132 | } | 1189 | } |
1133 | 1190 | ||
1134 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | 1191 | tree_rmap_item = |
1192 | unstable_tree_search_insert(rmap_item, page, &tree_page); | ||
1135 | if (tree_rmap_item) { | 1193 | if (tree_rmap_item) { |
1136 | err = try_to_merge_two_pages(rmap_item->mm, | 1194 | kpage = try_to_merge_two_pages(rmap_item, page, |
1137 | rmap_item->address, page, | 1195 | tree_rmap_item, tree_page); |
1138 | tree_rmap_item->mm, | 1196 | put_page(tree_page); |
1139 | tree_rmap_item->address, page2[0]); | ||
1140 | /* | 1197 | /* |
1141 | * As soon as we merge this page, we want to remove the | 1198 | * As soon as we merge this page, we want to remove the |
1142 | * rmap_item of the page we have merged with from the unstable | 1199 | * rmap_item of the page we have merged with from the unstable |
1143 | * tree, and insert it instead as new node in the stable tree. | 1200 | * tree, and insert it instead as new node in the stable tree. |
1144 | */ | 1201 | */ |
1145 | if (!err) { | 1202 | if (kpage) { |
1146 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | 1203 | remove_rmap_item_from_tree(tree_rmap_item); |
1147 | tree_rmap_item->address &= ~NODE_FLAG; | 1204 | |
1148 | ksm_pages_unshared--; | 1205 | lock_page(kpage); |
1206 | stable_node = stable_tree_insert(kpage); | ||
1207 | if (stable_node) { | ||
1208 | stable_tree_append(tree_rmap_item, stable_node); | ||
1209 | stable_tree_append(rmap_item, stable_node); | ||
1210 | } | ||
1211 | unlock_page(kpage); | ||
1149 | 1212 | ||
1150 | /* | 1213 | /* |
1151 | * If we fail to insert the page into the stable tree, | 1214 | * If we fail to insert the page into the stable tree, |
@@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1153 | * to a ksm page left outside the stable tree, | 1216 | * to a ksm page left outside the stable tree, |
1154 | * in which case we need to break_cow on both. | 1217 | * in which case we need to break_cow on both. |
1155 | */ | 1218 | */ |
1156 | if (stable_tree_insert(page2[0], tree_rmap_item)) | 1219 | if (!stable_node) { |
1157 | stable_tree_append(rmap_item, tree_rmap_item); | 1220 | break_cow(tree_rmap_item); |
1158 | else { | 1221 | break_cow(rmap_item); |
1159 | break_cow(tree_rmap_item->mm, | ||
1160 | tree_rmap_item->address); | ||
1161 | break_cow(rmap_item->mm, rmap_item->address); | ||
1162 | } | 1222 | } |
1163 | } | 1223 | } |
1164 | |||
1165 | put_page(page2[0]); | ||
1166 | } | 1224 | } |
1167 | } | 1225 | } |
1168 | 1226 | ||
1169 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | 1227 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, |
1170 | struct list_head *cur, | 1228 | struct rmap_item **rmap_list, |
1171 | unsigned long addr) | 1229 | unsigned long addr) |
1172 | { | 1230 | { |
1173 | struct rmap_item *rmap_item; | 1231 | struct rmap_item *rmap_item; |
1174 | 1232 | ||
1175 | while (cur != &mm_slot->rmap_list) { | 1233 | while (*rmap_list) { |
1176 | rmap_item = list_entry(cur, struct rmap_item, link); | 1234 | rmap_item = *rmap_list; |
1177 | if ((rmap_item->address & PAGE_MASK) == addr) { | 1235 | if ((rmap_item->address & PAGE_MASK) == addr) |
1178 | if (!in_stable_tree(rmap_item)) | ||
1179 | remove_rmap_item_from_tree(rmap_item); | ||
1180 | return rmap_item; | 1236 | return rmap_item; |
1181 | } | ||
1182 | if (rmap_item->address > addr) | 1237 | if (rmap_item->address > addr) |
1183 | break; | 1238 | break; |
1184 | cur = cur->next; | 1239 | *rmap_list = rmap_item->rmap_list; |
1185 | remove_rmap_item_from_tree(rmap_item); | 1240 | remove_rmap_item_from_tree(rmap_item); |
1186 | list_del(&rmap_item->link); | ||
1187 | free_rmap_item(rmap_item); | 1241 | free_rmap_item(rmap_item); |
1188 | } | 1242 | } |
1189 | 1243 | ||
@@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | |||
1192 | /* It has already been zeroed */ | 1246 | /* It has already been zeroed */ |
1193 | rmap_item->mm = mm_slot->mm; | 1247 | rmap_item->mm = mm_slot->mm; |
1194 | rmap_item->address = addr; | 1248 | rmap_item->address = addr; |
1195 | list_add_tail(&rmap_item->link, cur); | 1249 | rmap_item->rmap_list = *rmap_list; |
1250 | *rmap_list = rmap_item; | ||
1196 | } | 1251 | } |
1197 | return rmap_item; | 1252 | return rmap_item; |
1198 | } | 1253 | } |
@@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1217 | spin_unlock(&ksm_mmlist_lock); | 1272 | spin_unlock(&ksm_mmlist_lock); |
1218 | next_mm: | 1273 | next_mm: |
1219 | ksm_scan.address = 0; | 1274 | ksm_scan.address = 0; |
1220 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1275 | ksm_scan.rmap_list = &slot->rmap_list; |
1221 | struct rmap_item, link); | ||
1222 | } | 1276 | } |
1223 | 1277 | ||
1224 | mm = slot->mm; | 1278 | mm = slot->mm; |
@@ -1244,10 +1298,10 @@ next_mm: | |||
1244 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
1245 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
1246 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
1247 | ksm_scan.rmap_item->link.next, | 1301 | ksm_scan.rmap_list, ksm_scan.address); |
1248 | ksm_scan.address); | ||
1249 | if (rmap_item) { | 1302 | if (rmap_item) { |
1250 | ksm_scan.rmap_item = rmap_item; | 1303 | ksm_scan.rmap_list = |
1304 | &rmap_item->rmap_list; | ||
1251 | ksm_scan.address += PAGE_SIZE; | 1305 | ksm_scan.address += PAGE_SIZE; |
1252 | } else | 1306 | } else |
1253 | put_page(*page); | 1307 | put_page(*page); |
@@ -1263,14 +1317,13 @@ next_mm: | |||
1263 | 1317 | ||
1264 | if (ksm_test_exit(mm)) { | 1318 | if (ksm_test_exit(mm)) { |
1265 | ksm_scan.address = 0; | 1319 | ksm_scan.address = 0; |
1266 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1320 | ksm_scan.rmap_list = &slot->rmap_list; |
1267 | struct rmap_item, link); | ||
1268 | } | 1321 | } |
1269 | /* | 1322 | /* |
1270 | * Nuke all the rmap_items that are above this current rmap: | 1323 | * Nuke all the rmap_items that are above this current rmap: |
1271 | * because there were no VM_MERGEABLE vmas with such addresses. | 1324 | * because there were no VM_MERGEABLE vmas with such addresses. |
1272 | */ | 1325 | */ |
1273 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | 1326 | remove_trailing_rmap_items(slot, ksm_scan.rmap_list); |
1274 | 1327 | ||
1275 | spin_lock(&ksm_mmlist_lock); | 1328 | spin_lock(&ksm_mmlist_lock); |
1276 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | 1329 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, |
@@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1323 | return; | 1376 | return; |
1324 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1377 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) |
1325 | cmp_and_merge_page(page, rmap_item); | 1378 | cmp_and_merge_page(page, rmap_item); |
1326 | else if (page_mapcount(page) == 1) { | ||
1327 | /* | ||
1328 | * Replace now-unshared ksm page by ordinary page. | ||
1329 | */ | ||
1330 | break_cow(rmap_item->mm, rmap_item->address); | ||
1331 | remove_rmap_item_from_tree(rmap_item); | ||
1332 | rmap_item->oldchecksum = calc_checksum(page); | ||
1333 | } | ||
1334 | put_page(page); | 1379 | put_page(page); |
1335 | } | 1380 | } |
1336 | } | 1381 | } |
@@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1375 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1420 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1376 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1377 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | |
1378 | VM_MIXEDMAP | VM_SAO)) | 1423 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) |
1379 | return 0; /* just ignore the advice */ | 1424 | return 0; /* just ignore the advice */ |
1380 | 1425 | ||
1381 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1426 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
@@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1452 | spin_lock(&ksm_mmlist_lock); | 1497 | spin_lock(&ksm_mmlist_lock); |
1453 | mm_slot = get_mm_slot(mm); | 1498 | mm_slot = get_mm_slot(mm); |
1454 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1499 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1455 | if (list_empty(&mm_slot->rmap_list)) { | 1500 | if (!mm_slot->rmap_list) { |
1456 | hlist_del(&mm_slot->link); | 1501 | hlist_del(&mm_slot->link); |
1457 | list_del(&mm_slot->mm_list); | 1502 | list_del(&mm_slot->mm_list); |
1458 | easy_to_free = 1; | 1503 | easy_to_free = 1; |
@@ -1473,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm) | |||
1473 | } | 1518 | } |
1474 | } | 1519 | } |
1475 | 1520 | ||
1521 | struct page *ksm_does_need_to_copy(struct page *page, | ||
1522 | struct vm_area_struct *vma, unsigned long address) | ||
1523 | { | ||
1524 | struct page *new_page; | ||
1525 | |||
1526 | unlock_page(page); /* any racers will COW it, not modify it */ | ||
1527 | |||
1528 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1529 | if (new_page) { | ||
1530 | copy_user_highpage(new_page, page, address, vma); | ||
1531 | |||
1532 | SetPageDirty(new_page); | ||
1533 | __SetPageUptodate(new_page); | ||
1534 | SetPageSwapBacked(new_page); | ||
1535 | __set_page_locked(new_page); | ||
1536 | |||
1537 | if (page_evictable(new_page, vma)) | ||
1538 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1539 | else | ||
1540 | add_page_to_unevictable_list(new_page); | ||
1541 | } | ||
1542 | |||
1543 | page_cache_release(page); | ||
1544 | return new_page; | ||
1545 | } | ||
1546 | |||
1547 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | ||
1548 | unsigned long *vm_flags) | ||
1549 | { | ||
1550 | struct stable_node *stable_node; | ||
1551 | struct rmap_item *rmap_item; | ||
1552 | struct hlist_node *hlist; | ||
1553 | unsigned int mapcount = page_mapcount(page); | ||
1554 | int referenced = 0; | ||
1555 | int search_new_forks = 0; | ||
1556 | |||
1557 | VM_BUG_ON(!PageKsm(page)); | ||
1558 | VM_BUG_ON(!PageLocked(page)); | ||
1559 | |||
1560 | stable_node = page_stable_node(page); | ||
1561 | if (!stable_node) | ||
1562 | return 0; | ||
1563 | again: | ||
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1566 | struct vm_area_struct *vma; | ||
1567 | |||
1568 | spin_lock(&anon_vma->lock); | ||
1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1570 | if (rmap_item->address < vma->vm_start || | ||
1571 | rmap_item->address >= vma->vm_end) | ||
1572 | continue; | ||
1573 | /* | ||
1574 | * Initially we examine only the vma which covers this | ||
1575 | * rmap_item; but later, if there is still work to do, | ||
1576 | * we examine covering vmas in other mms: in case they | ||
1577 | * were forked from the original since ksmd passed. | ||
1578 | */ | ||
1579 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1580 | continue; | ||
1581 | |||
1582 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
1583 | continue; | ||
1584 | |||
1585 | referenced += page_referenced_one(page, vma, | ||
1586 | rmap_item->address, &mapcount, vm_flags); | ||
1587 | if (!search_new_forks || !mapcount) | ||
1588 | break; | ||
1589 | } | ||
1590 | spin_unlock(&anon_vma->lock); | ||
1591 | if (!mapcount) | ||
1592 | goto out; | ||
1593 | } | ||
1594 | if (!search_new_forks++) | ||
1595 | goto again; | ||
1596 | out: | ||
1597 | return referenced; | ||
1598 | } | ||
1599 | |||
1600 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1601 | { | ||
1602 | struct stable_node *stable_node; | ||
1603 | struct hlist_node *hlist; | ||
1604 | struct rmap_item *rmap_item; | ||
1605 | int ret = SWAP_AGAIN; | ||
1606 | int search_new_forks = 0; | ||
1607 | |||
1608 | VM_BUG_ON(!PageKsm(page)); | ||
1609 | VM_BUG_ON(!PageLocked(page)); | ||
1610 | |||
1611 | stable_node = page_stable_node(page); | ||
1612 | if (!stable_node) | ||
1613 | return SWAP_FAIL; | ||
1614 | again: | ||
1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1617 | struct vm_area_struct *vma; | ||
1618 | |||
1619 | spin_lock(&anon_vma->lock); | ||
1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1621 | if (rmap_item->address < vma->vm_start || | ||
1622 | rmap_item->address >= vma->vm_end) | ||
1623 | continue; | ||
1624 | /* | ||
1625 | * Initially we examine only the vma which covers this | ||
1626 | * rmap_item; but later, if there is still work to do, | ||
1627 | * we examine covering vmas in other mms: in case they | ||
1628 | * were forked from the original since ksmd passed. | ||
1629 | */ | ||
1630 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1631 | continue; | ||
1632 | |||
1633 | ret = try_to_unmap_one(page, vma, | ||
1634 | rmap_item->address, flags); | ||
1635 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | ||
1636 | spin_unlock(&anon_vma->lock); | ||
1637 | goto out; | ||
1638 | } | ||
1639 | } | ||
1640 | spin_unlock(&anon_vma->lock); | ||
1641 | } | ||
1642 | if (!search_new_forks++) | ||
1643 | goto again; | ||
1644 | out: | ||
1645 | return ret; | ||
1646 | } | ||
1647 | |||
1648 | #ifdef CONFIG_MIGRATION | ||
1649 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
1650 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1651 | { | ||
1652 | struct stable_node *stable_node; | ||
1653 | struct hlist_node *hlist; | ||
1654 | struct rmap_item *rmap_item; | ||
1655 | int ret = SWAP_AGAIN; | ||
1656 | int search_new_forks = 0; | ||
1657 | |||
1658 | VM_BUG_ON(!PageKsm(page)); | ||
1659 | VM_BUG_ON(!PageLocked(page)); | ||
1660 | |||
1661 | stable_node = page_stable_node(page); | ||
1662 | if (!stable_node) | ||
1663 | return ret; | ||
1664 | again: | ||
1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1667 | struct vm_area_struct *vma; | ||
1668 | |||
1669 | spin_lock(&anon_vma->lock); | ||
1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1671 | if (rmap_item->address < vma->vm_start || | ||
1672 | rmap_item->address >= vma->vm_end) | ||
1673 | continue; | ||
1674 | /* | ||
1675 | * Initially we examine only the vma which covers this | ||
1676 | * rmap_item; but later, if there is still work to do, | ||
1677 | * we examine covering vmas in other mms: in case they | ||
1678 | * were forked from the original since ksmd passed. | ||
1679 | */ | ||
1680 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1681 | continue; | ||
1682 | |||
1683 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
1684 | if (ret != SWAP_AGAIN) { | ||
1685 | spin_unlock(&anon_vma->lock); | ||
1686 | goto out; | ||
1687 | } | ||
1688 | } | ||
1689 | spin_unlock(&anon_vma->lock); | ||
1690 | } | ||
1691 | if (!search_new_forks++) | ||
1692 | goto again; | ||
1693 | out: | ||
1694 | return ret; | ||
1695 | } | ||
1696 | |||
1697 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | ||
1698 | { | ||
1699 | struct stable_node *stable_node; | ||
1700 | |||
1701 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1702 | VM_BUG_ON(!PageLocked(newpage)); | ||
1703 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | ||
1704 | |||
1705 | stable_node = page_stable_node(newpage); | ||
1706 | if (stable_node) { | ||
1707 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | ||
1708 | stable_node->kpfn = page_to_pfn(newpage); | ||
1709 | } | ||
1710 | } | ||
1711 | #endif /* CONFIG_MIGRATION */ | ||
1712 | |||
1713 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1714 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | ||
1715 | unsigned long end_pfn) | ||
1716 | { | ||
1717 | struct rb_node *node; | ||
1718 | |||
1719 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | ||
1720 | struct stable_node *stable_node; | ||
1721 | |||
1722 | stable_node = rb_entry(node, struct stable_node, node); | ||
1723 | if (stable_node->kpfn >= start_pfn && | ||
1724 | stable_node->kpfn < end_pfn) | ||
1725 | return stable_node; | ||
1726 | } | ||
1727 | return NULL; | ||
1728 | } | ||
1729 | |||
1730 | static int ksm_memory_callback(struct notifier_block *self, | ||
1731 | unsigned long action, void *arg) | ||
1732 | { | ||
1733 | struct memory_notify *mn = arg; | ||
1734 | struct stable_node *stable_node; | ||
1735 | |||
1736 | switch (action) { | ||
1737 | case MEM_GOING_OFFLINE: | ||
1738 | /* | ||
1739 | * Keep it very simple for now: just lock out ksmd and | ||
1740 | * MADV_UNMERGEABLE while any memory is going offline. | ||
1741 | */ | ||
1742 | mutex_lock(&ksm_thread_mutex); | ||
1743 | break; | ||
1744 | |||
1745 | case MEM_OFFLINE: | ||
1746 | /* | ||
1747 | * Most of the work is done by page migration; but there might | ||
1748 | * be a few stable_nodes left over, still pointing to struct | ||
1749 | * pages which have been offlined: prune those from the tree. | ||
1750 | */ | ||
1751 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | ||
1752 | mn->start_pfn + mn->nr_pages)) != NULL) | ||
1753 | remove_node_from_stable_tree(stable_node); | ||
1754 | /* fallthrough */ | ||
1755 | |||
1756 | case MEM_CANCEL_OFFLINE: | ||
1757 | mutex_unlock(&ksm_thread_mutex); | ||
1758 | break; | ||
1759 | } | ||
1760 | return NOTIFY_OK; | ||
1761 | } | ||
1762 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1763 | |||
1476 | #ifdef CONFIG_SYSFS | 1764 | #ifdef CONFIG_SYSFS |
1477 | /* | 1765 | /* |
1478 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | 1766 | * This all compiles without CONFIG_SYSFS, but is a waste of space. |
@@ -1551,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1551 | /* | 1839 | /* |
1552 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | 1840 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. |
1553 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | 1841 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, |
1554 | * breaking COW to free the unswappable pages_shared (but leaves | 1842 | * breaking COW to free the pages_shared (but leaves mm_slots |
1555 | * mm_slots on the list for when ksmd may be set running again). | 1843 | * on the list for when ksmd may be set running again). |
1556 | */ | 1844 | */ |
1557 | 1845 | ||
1558 | mutex_lock(&ksm_thread_mutex); | 1846 | mutex_lock(&ksm_thread_mutex); |
@@ -1577,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1577 | } | 1865 | } |
1578 | KSM_ATTR(run); | 1866 | KSM_ATTR(run); |
1579 | 1867 | ||
1580 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
1581 | struct kobj_attribute *attr, | ||
1582 | const char *buf, size_t count) | ||
1583 | { | ||
1584 | int err; | ||
1585 | unsigned long nr_pages; | ||
1586 | |||
1587 | err = strict_strtoul(buf, 10, &nr_pages); | ||
1588 | if (err) | ||
1589 | return -EINVAL; | ||
1590 | |||
1591 | ksm_max_kernel_pages = nr_pages; | ||
1592 | |||
1593 | return count; | ||
1594 | } | ||
1595 | |||
1596 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
1597 | struct kobj_attribute *attr, char *buf) | ||
1598 | { | ||
1599 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
1600 | } | ||
1601 | KSM_ATTR(max_kernel_pages); | ||
1602 | |||
1603 | static ssize_t pages_shared_show(struct kobject *kobj, | 1868 | static ssize_t pages_shared_show(struct kobject *kobj, |
1604 | struct kobj_attribute *attr, char *buf) | 1869 | struct kobj_attribute *attr, char *buf) |
1605 | { | 1870 | { |
@@ -1649,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { | |||
1649 | &sleep_millisecs_attr.attr, | 1914 | &sleep_millisecs_attr.attr, |
1650 | &pages_to_scan_attr.attr, | 1915 | &pages_to_scan_attr.attr, |
1651 | &run_attr.attr, | 1916 | &run_attr.attr, |
1652 | &max_kernel_pages_attr.attr, | ||
1653 | &pages_shared_attr.attr, | 1917 | &pages_shared_attr.attr, |
1654 | &pages_sharing_attr.attr, | 1918 | &pages_sharing_attr.attr, |
1655 | &pages_unshared_attr.attr, | 1919 | &pages_unshared_attr.attr, |
@@ -1669,8 +1933,6 @@ static int __init ksm_init(void) | |||
1669 | struct task_struct *ksm_thread; | 1933 | struct task_struct *ksm_thread; |
1670 | int err; | 1934 | int err; |
1671 | 1935 | ||
1672 | ksm_max_kernel_pages = totalram_pages / 4; | ||
1673 | |||
1674 | err = ksm_slab_init(); | 1936 | err = ksm_slab_init(); |
1675 | if (err) | 1937 | if (err) |
1676 | goto out; | 1938 | goto out; |
@@ -1698,6 +1960,13 @@ static int __init ksm_init(void) | |||
1698 | 1960 | ||
1699 | #endif /* CONFIG_SYSFS */ | 1961 | #endif /* CONFIG_SYSFS */ |
1700 | 1962 | ||
1963 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1964 | /* | ||
1965 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
1966 | * later callbacks could only be taking locks which nest within that. | ||
1967 | */ | ||
1968 | hotplug_memory_notifier(ksm_memory_callback, 100); | ||
1969 | #endif | ||
1701 | return 0; | 1970 | return 0; |
1702 | 1971 | ||
1703 | out_free2: | 1972 | out_free2: |
diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | ||
12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
222 | /* | 223 | /* |
223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
224 | */ | 225 | */ |
225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
226 | { | 227 | { |
227 | int ret = 0; | 228 | int ret = 0; |
228 | 229 | ||
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
230 | return -EPERM; | 231 | return -EPERM; |
231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
232 | struct page *p; | 233 | struct page *p; |
233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
234 | 0, 0, &p, NULL); | ||
235 | if (ret != 1) | 235 | if (ret != 1) |
236 | return ret; | 236 | return ret; |
237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
239 | page_to_pfn(p), start); | ||
240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
241 | if (ret) | ||
242 | break; | ||
243 | continue; | ||
244 | } | ||
237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
241 | put_page(p); | ||
242 | } | 249 | } |
243 | return ret; | 250 | return ret; |
244 | } | 251 | } |
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
335 | size_t len; | 342 | size_t len; |
336 | 343 | ||
337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
340 | #endif | 347 | #endif |
341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
342 | return error; | 349 | return error; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c31a310aa146..488b644e0e8e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 39 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 40 | #include <linux/page_cgroup.h> |
41 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 55 | #define do_swap_account (0) |
55 | #endif | 56 | #endif |
56 | 57 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) |
59 | 59 | ||
60 | /* | 60 | /* |
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 66 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ |
@@ -275,6 +275,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
278 | static void drain_all_stock_async(void); | ||
278 | 279 | ||
279 | static struct mem_cgroup_per_zone * | 280 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 281 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
283 | } | 284 | } |
284 | 285 | ||
286 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
287 | { | ||
288 | return &mem->css; | ||
289 | } | ||
290 | |||
285 | static struct mem_cgroup_per_zone * | 291 | static struct mem_cgroup_per_zone * |
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 292 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
287 | { | 293 | { |
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 764 | task_unlock(task); |
759 | if (!curr) | 765 | if (!curr) |
760 | return 0; | 766 | return 0; |
761 | if (curr->use_hierarchy) | 767 | /* |
768 | * We should check use_hierarchy of "mem" not "curr". Because checking | ||
769 | * use_hierarchy of "curr" here make this function true if hierarchy is | ||
770 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | ||
771 | * hierarchy(even if use_hierarchy is disabled in "mem"). | ||
772 | */ | ||
773 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 774 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 775 | else |
764 | ret = (curr == mem); | 776 | ret = (curr == mem); |
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1019 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1020 | int ret; |
1009 | 1021 | ||
1010 | if (!memcg) | 1022 | if (!memcg || !p) |
1011 | return; | 1023 | return; |
1012 | 1024 | ||
1013 | 1025 | ||
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1149 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1150 | if (victim == root_mem) { |
1139 | loop++; | 1151 | loop++; |
1152 | if (loop >= 1) | ||
1153 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1154 | if (loop >= 2) { |
1141 | /* | 1155 | /* |
1142 | * If we have not been able to reclaim | 1156 | * If we have not been able to reclaim |
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1223 | * Currently used to update mapped file statistics, but the routine can be | 1237 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1238 | * generalized to update other statistics as well. |
1225 | */ | 1239 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1241 | { |
1228 | struct mem_cgroup *mem; | 1242 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | 1243 | struct mem_cgroup_stat *stat; |
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1231 | int cpu; | 1245 | int cpu; |
1232 | struct page_cgroup *pc; | 1246 | struct page_cgroup *pc; |
1233 | 1247 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1248 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1249 | if (unlikely(!pc)) |
1239 | return; | 1250 | return; |
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
1253 | stat = &mem->stat; | 1264 | stat = &mem->stat; |
1254 | cpustat = &stat->cpustat[cpu]; | 1265 | cpustat = &stat->cpustat[cpu]; |
1255 | 1266 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | 1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); |
1257 | done: | 1268 | done: |
1258 | unlock_page_cgroup(pc); | 1269 | unlock_page_cgroup(pc); |
1259 | } | 1270 | } |
1260 | 1271 | ||
1261 | /* | 1272 | /* |
1273 | * size of first charge trial. "32" comes from vmscan.c's magic value. | ||
1274 | * TODO: maybe necessary to use big numbers in big irons. | ||
1275 | */ | ||
1276 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1277 | struct memcg_stock_pcp { | ||
1278 | struct mem_cgroup *cached; /* this never be root cgroup */ | ||
1279 | int charge; | ||
1280 | struct work_struct work; | ||
1281 | }; | ||
1282 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1283 | static atomic_t memcg_drain_count; | ||
1284 | |||
1285 | /* | ||
1286 | * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed | ||
1287 | * from local stock and true is returned. If the stock is 0 or charges from a | ||
1288 | * cgroup which is not current target, returns false. This stock will be | ||
1289 | * refilled. | ||
1290 | */ | ||
1291 | static bool consume_stock(struct mem_cgroup *mem) | ||
1292 | { | ||
1293 | struct memcg_stock_pcp *stock; | ||
1294 | bool ret = true; | ||
1295 | |||
1296 | stock = &get_cpu_var(memcg_stock); | ||
1297 | if (mem == stock->cached && stock->charge) | ||
1298 | stock->charge -= PAGE_SIZE; | ||
1299 | else /* need to call res_counter_charge */ | ||
1300 | ret = false; | ||
1301 | put_cpu_var(memcg_stock); | ||
1302 | return ret; | ||
1303 | } | ||
1304 | |||
1305 | /* | ||
1306 | * Returns stocks cached in percpu to res_counter and reset cached information. | ||
1307 | */ | ||
1308 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1309 | { | ||
1310 | struct mem_cgroup *old = stock->cached; | ||
1311 | |||
1312 | if (stock->charge) { | ||
1313 | res_counter_uncharge(&old->res, stock->charge); | ||
1314 | if (do_swap_account) | ||
1315 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1316 | } | ||
1317 | stock->cached = NULL; | ||
1318 | stock->charge = 0; | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1322 | * This must be called under preempt disabled or must be called by | ||
1323 | * a thread which is pinned to local cpu. | ||
1324 | */ | ||
1325 | static void drain_local_stock(struct work_struct *dummy) | ||
1326 | { | ||
1327 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1328 | drain_stock(stock); | ||
1329 | } | ||
1330 | |||
1331 | /* | ||
1332 | * Cache charges(val) which is from res_counter, to local per_cpu area. | ||
1333 | * This will be consumed by consumt_stock() function, later. | ||
1334 | */ | ||
1335 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1336 | { | ||
1337 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1338 | |||
1339 | if (stock->cached != mem) { /* reset if necessary */ | ||
1340 | drain_stock(stock); | ||
1341 | stock->cached = mem; | ||
1342 | } | ||
1343 | stock->charge += val; | ||
1344 | put_cpu_var(memcg_stock); | ||
1345 | } | ||
1346 | |||
1347 | /* | ||
1348 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
1349 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
1350 | * expects some charges will be back to res_counter later but cannot wait for | ||
1351 | * it. | ||
1352 | */ | ||
1353 | static void drain_all_stock_async(void) | ||
1354 | { | ||
1355 | int cpu; | ||
1356 | /* This function is for scheduling "drain" in asynchronous way. | ||
1357 | * The result of "drain" is not directly handled by callers. Then, | ||
1358 | * if someone is calling drain, we don't have to call drain more. | ||
1359 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
1360 | * there is a race. We just do loose check here. | ||
1361 | */ | ||
1362 | if (atomic_read(&memcg_drain_count)) | ||
1363 | return; | ||
1364 | /* Notify other cpus that system-wide "drain" is running */ | ||
1365 | atomic_inc(&memcg_drain_count); | ||
1366 | get_online_cpus(); | ||
1367 | for_each_online_cpu(cpu) { | ||
1368 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1369 | schedule_work_on(cpu, &stock->work); | ||
1370 | } | ||
1371 | put_online_cpus(); | ||
1372 | atomic_dec(&memcg_drain_count); | ||
1373 | /* We don't wait for flush_work */ | ||
1374 | } | ||
1375 | |||
1376 | /* This is a synchronous drain interface. */ | ||
1377 | static void drain_all_stock_sync(void) | ||
1378 | { | ||
1379 | /* called when force_empty is called */ | ||
1380 | atomic_inc(&memcg_drain_count); | ||
1381 | schedule_on_each_cpu(drain_local_stock); | ||
1382 | atomic_dec(&memcg_drain_count); | ||
1383 | } | ||
1384 | |||
1385 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1386 | unsigned long action, | ||
1387 | void *hcpu) | ||
1388 | { | ||
1389 | int cpu = (unsigned long)hcpu; | ||
1390 | struct memcg_stock_pcp *stock; | ||
1391 | |||
1392 | if (action != CPU_DEAD) | ||
1393 | return NOTIFY_OK; | ||
1394 | stock = &per_cpu(memcg_stock, cpu); | ||
1395 | drain_stock(stock); | ||
1396 | return NOTIFY_OK; | ||
1397 | } | ||
1398 | |||
1399 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1400 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1401 | * oom-killer can be invoked. |
1264 | */ | 1402 | */ |
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1407 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1409 | struct res_counter *fail_res; |
1410 | int csize = CHARGE_SIZE; | ||
1272 | 1411 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
1274 | /* Don't account this! */ | 1413 | /* Don't account this! */ |
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1432 | return 0; |
1294 | 1433 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1434 | VM_BUG_ON(css_is_removed(&mem->css)); |
1435 | if (mem_cgroup_is_root(mem)) | ||
1436 | goto done; | ||
1296 | 1437 | ||
1297 | while (1) { | 1438 | while (1) { |
1298 | int ret = 0; | 1439 | int ret = 0; |
1299 | unsigned long flags = 0; | 1440 | unsigned long flags = 0; |
1300 | 1441 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1442 | if (consume_stock(mem)) |
1302 | goto done; | 1443 | goto charged; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1444 | |
1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1446 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1447 | if (!do_swap_account) |
1306 | break; | 1448 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1449 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1450 | if (likely(!ret)) |
1310 | break; | 1451 | break; |
1311 | /* mem+swap counter fails */ | 1452 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1453 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1454 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1455 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1456 | memsw); |
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1459 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1460 | res); |
1320 | 1461 | ||
1462 | /* reduce request size and retry */ | ||
1463 | if (csize > PAGE_SIZE) { | ||
1464 | csize = PAGE_SIZE; | ||
1465 | continue; | ||
1466 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1467 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1468 | goto nomem; |
1323 | 1469 | ||
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1339 | 1485 | ||
1340 | if (!nr_retries--) { | 1486 | if (!nr_retries--) { |
1341 | if (oom) { | 1487 | if (oom) { |
1342 | mutex_lock(&memcg_tasklist); | ||
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
1344 | mutex_unlock(&memcg_tasklist); | ||
1345 | record_last_oom(mem_over_limit); | 1489 | record_last_oom(mem_over_limit); |
1346 | } | 1490 | } |
1347 | goto nomem; | 1491 | goto nomem; |
1348 | } | 1492 | } |
1349 | } | 1493 | } |
1494 | if (csize > PAGE_SIZE) | ||
1495 | refill_stock(mem, csize - PAGE_SIZE); | ||
1496 | charged: | ||
1350 | /* | 1497 | /* |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
1352 | * if they exceeds softlimit. | 1499 | * if they exceeds softlimit. |
@@ -1361,6 +1508,21 @@ nomem: | |||
1361 | } | 1508 | } |
1362 | 1509 | ||
1363 | /* | 1510 | /* |
1511 | * Somemtimes we have to undo a charge we got by try_charge(). | ||
1512 | * This function is for that and do uncharge, put css's refcnt. | ||
1513 | * gotten by try_charge(). | ||
1514 | */ | ||
1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1516 | { | ||
1517 | if (!mem_cgroup_is_root(mem)) { | ||
1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1519 | if (do_swap_account) | ||
1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1521 | } | ||
1522 | css_put(&mem->css); | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1364 | * A helper function to get mem_cgroup from ID. must be called under | 1526 | * A helper function to get mem_cgroup from ID. must be called under |
1365 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1527 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
1366 | * it's concern. (dropping refcnt from swap can be called against removed | 1528 | * it's concern. (dropping refcnt from swap can be called against removed |
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1379 | return container_of(css, struct mem_cgroup, css); | 1541 | return container_of(css, struct mem_cgroup, css); |
1380 | } | 1542 | } |
1381 | 1543 | ||
1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1544 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1383 | { | 1545 | { |
1384 | struct mem_cgroup *mem; | 1546 | struct mem_cgroup *mem = NULL; |
1385 | struct page_cgroup *pc; | 1547 | struct page_cgroup *pc; |
1386 | unsigned short id; | 1548 | unsigned short id; |
1387 | swp_entry_t ent; | 1549 | swp_entry_t ent; |
1388 | 1550 | ||
1389 | VM_BUG_ON(!PageLocked(page)); | 1551 | VM_BUG_ON(!PageLocked(page)); |
1390 | 1552 | ||
1391 | if (!PageSwapCache(page)) | ||
1392 | return NULL; | ||
1393 | |||
1394 | pc = lookup_page_cgroup(page); | 1553 | pc = lookup_page_cgroup(page); |
1395 | lock_page_cgroup(pc); | 1554 | lock_page_cgroup(pc); |
1396 | if (PageCgroupUsed(pc)) { | 1555 | if (PageCgroupUsed(pc)) { |
1397 | mem = pc->mem_cgroup; | 1556 | mem = pc->mem_cgroup; |
1398 | if (mem && !css_tryget(&mem->css)) | 1557 | if (mem && !css_tryget(&mem->css)) |
1399 | mem = NULL; | 1558 | mem = NULL; |
1400 | } else { | 1559 | } else if (PageSwapCache(page)) { |
1401 | ent.val = page_private(page); | 1560 | ent.val = page_private(page); |
1402 | id = lookup_swap_cgroup(ent); | 1561 | id = lookup_swap_cgroup(ent); |
1403 | rcu_read_lock(); | 1562 | rcu_read_lock(); |
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1585 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1586 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1587 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1588 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1589 | return; |
1436 | } | 1590 | } |
1437 | 1591 | ||
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1464 | } | 1618 | } |
1465 | 1619 | ||
1466 | /** | 1620 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1621 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1622 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1623 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1624 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1471 | * | 1625 | * |
1472 | * The caller must confirm following. | 1626 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1627 | * - page is not on LRU (isolate_page() is useful.) |
1474 | * | 1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1475 | * returns 0 at success, | ||
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
1477 | * | 1629 | * |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
1479 | * new cgroup. It should be done by a caller. | 1631 | * new cgroup. It should be done by a caller. |
1480 | */ | 1632 | */ |
1481 | 1633 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1635 | struct mem_cgroup *from, struct mem_cgroup *to) |
1484 | { | 1636 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | 1637 | struct page *page; |
1489 | int cpu; | 1638 | int cpu; |
1490 | struct mem_cgroup_stat *stat; | 1639 | struct mem_cgroup_stat *stat; |
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1492 | 1641 | ||
1493 | VM_BUG_ON(from == to); | 1642 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1643 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1644 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1645 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1646 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
1500 | |||
1501 | if (!trylock_page_cgroup(pc)) | ||
1502 | return ret; | ||
1503 | |||
1504 | if (!PageCgroupUsed(pc)) | ||
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | 1647 | ||
1510 | if (!mem_cgroup_is_root(from)) | 1648 | if (!mem_cgroup_is_root(from)) |
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1649 | res_counter_uncharge(&from->res, PAGE_SIZE); |
1512 | mem_cgroup_charge_statistics(from, pc, false); | 1650 | mem_cgroup_charge_statistics(from, pc, false); |
1513 | 1651 | ||
1514 | page = pc->page; | 1652 | page = pc->page; |
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | 1653 | if (page_mapped(page) && !PageAnon(page)) { |
1516 | cpu = smp_processor_id(); | 1654 | cpu = smp_processor_id(); |
1517 | /* Update mapped_file data for mem_cgroup "from" */ | 1655 | /* Update mapped_file data for mem_cgroup "from" */ |
1518 | stat = &from->stat; | 1656 | stat = &from->stat; |
1519 | cpustat = &stat->cpustat[cpu]; | 1657 | cpustat = &stat->cpustat[cpu]; |
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1521 | -1); | 1659 | -1); |
1522 | 1660 | ||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | 1661 | /* Update mapped_file data for mem_cgroup "to" */ |
1524 | stat = &to->stat; | 1662 | stat = &to->stat; |
1525 | cpustat = &stat->cpustat[cpu]; | 1663 | cpustat = &stat->cpustat[cpu]; |
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
1527 | 1); | 1665 | 1); |
1528 | } | 1666 | } |
1529 | 1667 | ||
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1534 | css_get(&to->css); | 1672 | css_get(&to->css); |
1535 | pc->mem_cgroup = to; | 1673 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1674 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1675 | /* |
1541 | * We charges against "to" which may not have any tasks. Then, "to" | 1676 | * We charges against "to" which may not have any tasks. Then, "to" |
1542 | * can be under rmdir(). But in current implementation, caller of | 1677 | * can be under rmdir(). But in current implementation, caller of |
1543 | * this function is just force_empty() and it's garanteed that | 1678 | * this function is just force_empty() and it's garanteed that |
1544 | * "to" is never removed. So, we don't check rmdir status here. | 1679 | * "to" is never removed. So, we don't check rmdir status here. |
1545 | */ | 1680 | */ |
1681 | } | ||
1682 | |||
1683 | /* | ||
1684 | * check whether the @pc is valid for moving account and call | ||
1685 | * __mem_cgroup_move_account() | ||
1686 | */ | ||
1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1688 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
1689 | { | ||
1690 | int ret = -EINVAL; | ||
1691 | lock_page_cgroup(pc); | ||
1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1693 | __mem_cgroup_move_account(pc, from, to); | ||
1694 | ret = 0; | ||
1695 | } | ||
1696 | unlock_page_cgroup(pc); | ||
1546 | return ret; | 1697 | return ret; |
1547 | } | 1698 | } |
1548 | 1699 | ||
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1715 | if (!pcg) |
1565 | return -EINVAL; | 1716 | return -EINVAL; |
1566 | 1717 | ||
1718 | ret = -EBUSY; | ||
1719 | if (!get_page_unless_zero(page)) | ||
1720 | goto out; | ||
1721 | if (isolate_lru_page(page)) | ||
1722 | goto put; | ||
1567 | 1723 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1724 | parent = mem_cgroup_from_cont(pcg); |
1569 | |||
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1572 | if (ret || !parent) | 1726 | if (ret || !parent) |
1573 | return ret; | 1727 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | |||
1582 | if (ret) | ||
1583 | goto cancel; | ||
1584 | 1728 | ||
1585 | ret = mem_cgroup_move_account(pc, child, parent); | 1729 | ret = mem_cgroup_move_account(pc, child, parent); |
1586 | 1730 | if (!ret) | |
1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | ||
1732 | else | ||
1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1734 | put_back: | ||
1587 | putback_lru_page(page); | 1735 | putback_lru_page(page); |
1588 | if (!ret) { | 1736 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1737 | put_page(page); |
1597 | uncharge: | 1738 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1739 | return ret; |
1607 | } | 1740 | } |
1608 | 1741 | ||
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1737 | goto charge_cur_mm; | 1870 | goto charge_cur_mm; |
1738 | /* | 1871 | /* |
1739 | * A racing thread's fault, or swapoff, may have already updated | 1872 | * A racing thread's fault, or swapoff, may have already updated |
1740 | * the pte, and even removed page from swap cache: return success | 1873 | * the pte, and even removed page from swap cache: in those cases |
1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 1874 | * do_swap_page()'s pte_same() test will fail; but there's also a |
1875 | * KSM case which does need to charge the page. | ||
1742 | */ | 1876 | */ |
1743 | if (!PageSwapCache(page)) | 1877 | if (!PageSwapCache(page)) |
1744 | return 0; | 1878 | goto charge_cur_mm; |
1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 1879 | mem = try_get_mem_cgroup_from_page(page); |
1746 | if (!mem) | 1880 | if (!mem) |
1747 | goto charge_cur_mm; | 1881 | goto charge_cur_mm; |
1748 | *ptr = mem; | 1882 | *ptr = mem; |
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1818 | return; | 1952 | return; |
1819 | if (!mem) | 1953 | if (!mem) |
1820 | return; | 1954 | return; |
1821 | if (!mem_cgroup_is_root(mem)) { | 1955 | mem_cgroup_cancel_charge(mem); |
1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1823 | if (do_swap_account) | ||
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1826 | css_put(&mem->css); | ||
1827 | } | 1956 | } |
1828 | 1957 | ||
1958 | static void | ||
1959 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
1960 | { | ||
1961 | struct memcg_batch_info *batch = NULL; | ||
1962 | bool uncharge_memsw = true; | ||
1963 | /* If swapout, usage of swap doesn't decrease */ | ||
1964 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1965 | uncharge_memsw = false; | ||
1966 | /* | ||
1967 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
1968 | * In those cases, all pages freed continously can be expected to be in | ||
1969 | * the same cgroup and we have chance to coalesce uncharges. | ||
1970 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
1971 | * because we want to do uncharge as soon as possible. | ||
1972 | */ | ||
1973 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
1974 | goto direct_uncharge; | ||
1975 | |||
1976 | batch = &current->memcg_batch; | ||
1977 | /* | ||
1978 | * In usual, we do css_get() when we remember memcg pointer. | ||
1979 | * But in this case, we keep res->usage until end of a series of | ||
1980 | * uncharges. Then, it's ok to ignore memcg's refcnt. | ||
1981 | */ | ||
1982 | if (!batch->memcg) | ||
1983 | batch->memcg = mem; | ||
1984 | /* | ||
1985 | * In typical case, batch->memcg == mem. This means we can | ||
1986 | * merge a series of uncharges to an uncharge of res_counter. | ||
1987 | * If not, we uncharge res_counter ony by one. | ||
1988 | */ | ||
1989 | if (batch->memcg != mem) | ||
1990 | goto direct_uncharge; | ||
1991 | /* remember freed charge and uncharge it later */ | ||
1992 | batch->bytes += PAGE_SIZE; | ||
1993 | if (uncharge_memsw) | ||
1994 | batch->memsw_bytes += PAGE_SIZE; | ||
1995 | return; | ||
1996 | direct_uncharge: | ||
1997 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1998 | if (uncharge_memsw) | ||
1999 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
2000 | return; | ||
2001 | } | ||
1829 | 2002 | ||
1830 | /* | 2003 | /* |
1831 | * uncharge if !page_mapped(page) | 2004 | * uncharge if !page_mapped(page) |
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1874 | break; | 2047 | break; |
1875 | } | 2048 | } |
1876 | 2049 | ||
1877 | if (!mem_cgroup_is_root(mem)) { | 2050 | if (!mem_cgroup_is_root(mem)) |
1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2051 | __do_uncharge(mem, ctype); |
1879 | if (do_swap_account && | ||
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2052 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1884 | mem_cgroup_swap_statistics(mem, true); | 2053 | mem_cgroup_swap_statistics(mem, true); |
1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2054 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2094 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1926 | } | 2095 | } |
1927 | 2096 | ||
2097 | /* | ||
2098 | * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. | ||
2099 | * In that cases, pages are freed continuously and we can expect pages | ||
2100 | * are in the same memcg. All these calls itself limits the number of | ||
2101 | * pages freed at once, then uncharge_start/end() is called properly. | ||
2102 | * This may be called prural(2) times in a context, | ||
2103 | */ | ||
2104 | |||
2105 | void mem_cgroup_uncharge_start(void) | ||
2106 | { | ||
2107 | current->memcg_batch.do_batch++; | ||
2108 | /* We can do nest. */ | ||
2109 | if (current->memcg_batch.do_batch == 1) { | ||
2110 | current->memcg_batch.memcg = NULL; | ||
2111 | current->memcg_batch.bytes = 0; | ||
2112 | current->memcg_batch.memsw_bytes = 0; | ||
2113 | } | ||
2114 | } | ||
2115 | |||
2116 | void mem_cgroup_uncharge_end(void) | ||
2117 | { | ||
2118 | struct memcg_batch_info *batch = &current->memcg_batch; | ||
2119 | |||
2120 | if (!batch->do_batch) | ||
2121 | return; | ||
2122 | |||
2123 | batch->do_batch--; | ||
2124 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2125 | return; | ||
2126 | |||
2127 | if (!batch->memcg) | ||
2128 | return; | ||
2129 | /* | ||
2130 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2131 | * bacause we hide charges behind us. | ||
2132 | */ | ||
2133 | if (batch->bytes) | ||
2134 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2135 | if (batch->memsw_bytes) | ||
2136 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2137 | /* forget this pointer (for sanity check) */ | ||
2138 | batch->memcg = NULL; | ||
2139 | } | ||
2140 | |||
1928 | #ifdef CONFIG_SWAP | 2141 | #ifdef CONFIG_SWAP |
1929 | /* | 2142 | /* |
1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2143 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2100 | unsigned long long val) | 2313 | unsigned long long val) |
2101 | { | 2314 | { |
2102 | int retry_count; | 2315 | int retry_count; |
2103 | int progress; | ||
2104 | u64 memswlimit; | 2316 | u64 memswlimit; |
2105 | int ret = 0; | 2317 | int ret = 0; |
2106 | int children = mem_cgroup_count_children(memcg); | 2318 | int children = mem_cgroup_count_children(memcg); |
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2144 | if (!ret) | 2356 | if (!ret) |
2145 | break; | 2357 | break; |
2146 | 2358 | ||
2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2359 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2148 | GFP_KERNEL, | ||
2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2360 | MEM_CGROUP_RECLAIM_SHRINK); |
2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2361 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2151 | /* Usage is reduced ? */ | 2362 | /* Usage is reduced ? */ |
@@ -2384,6 +2595,7 @@ move_account: | |||
2384 | goto out; | 2595 | goto out; |
2385 | /* This is for making all *used* pages to be on LRU. */ | 2596 | /* This is for making all *used* pages to be on LRU. */ |
2386 | lru_add_drain_all(); | 2597 | lru_add_drain_all(); |
2598 | drain_all_stock_sync(); | ||
2387 | ret = 0; | 2599 | ret = 0; |
2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2600 | for_each_node_state(node, N_HIGH_MEMORY) { |
2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2601 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2541,6 +2753,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
2541 | val += idx_val; | 2753 | val += idx_val; |
2542 | mem_cgroup_get_recursive_idx_stat(mem, | 2754 | mem_cgroup_get_recursive_idx_stat(mem, |
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | 2755 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); |
2756 | val += idx_val; | ||
2544 | val <<= PAGE_SHIFT; | 2757 | val <<= PAGE_SHIFT; |
2545 | } else | 2758 | } else |
2546 | val = res_counter_read_u64(&mem->memsw, name); | 2759 | val = res_counter_read_u64(&mem->memsw, name); |
@@ -2660,7 +2873,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2660 | enum { | 2873 | enum { |
2661 | MCS_CACHE, | 2874 | MCS_CACHE, |
2662 | MCS_RSS, | 2875 | MCS_RSS, |
2663 | MCS_MAPPED_FILE, | 2876 | MCS_FILE_MAPPED, |
2664 | MCS_PGPGIN, | 2877 | MCS_PGPGIN, |
2665 | MCS_PGPGOUT, | 2878 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | 2879 | MCS_SWAP, |
@@ -2704,8 +2917,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 2917 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 2918 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 2919 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 2920 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); |
2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 2921 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 2922 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2710 | s->stat[MCS_PGPGIN] += val; | 2923 | s->stat[MCS_PGPGIN] += val; |
2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
@@ -3097,11 +3310,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3097 | 3310 | ||
3098 | /* root ? */ | 3311 | /* root ? */ |
3099 | if (cont->parent == NULL) { | 3312 | if (cont->parent == NULL) { |
3313 | int cpu; | ||
3100 | enable_swap_cgroup(); | 3314 | enable_swap_cgroup(); |
3101 | parent = NULL; | 3315 | parent = NULL; |
3102 | root_mem_cgroup = mem; | 3316 | root_mem_cgroup = mem; |
3103 | if (mem_cgroup_soft_limit_tree_init()) | 3317 | if (mem_cgroup_soft_limit_tree_init()) |
3104 | goto free_out; | 3318 | goto free_out; |
3319 | for_each_possible_cpu(cpu) { | ||
3320 | struct memcg_stock_pcp *stock = | ||
3321 | &per_cpu(memcg_stock, cpu); | ||
3322 | INIT_WORK(&stock->work, drain_local_stock); | ||
3323 | } | ||
3324 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3105 | 3325 | ||
3106 | } else { | 3326 | } else { |
3107 | parent = mem_cgroup_from_cont(cont->parent); | 3327 | parent = mem_cgroup_from_cont(cont->parent); |
@@ -3170,12 +3390,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
3170 | struct task_struct *p, | 3390 | struct task_struct *p, |
3171 | bool threadgroup) | 3391 | bool threadgroup) |
3172 | { | 3392 | { |
3173 | mutex_lock(&memcg_tasklist); | ||
3174 | /* | 3393 | /* |
3175 | * FIXME: It's better to move charges of this process from old | 3394 | * FIXME: It's better to move charges of this process from old |
3176 | * memcg to new memcg. But it's just on TODO-List now. | 3395 | * memcg to new memcg. But it's just on TODO-List now. |
3177 | */ | 3396 | */ |
3178 | mutex_unlock(&memcg_tasklist); | ||
3179 | } | 3397 | } |
3180 | 3398 | ||
3181 | struct cgroup_subsys mem_cgroup_subsys = { | 3399 | struct cgroup_subsys mem_cgroup_subsys = { |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1ac49fef95ab..6a0466ed5bfd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -34,12 +34,16 @@ | |||
34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
37 | #include <linux/kernel-page-flags.h> | ||
37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
38 | #include <linux/ksm.h> | 39 | #include <linux/ksm.h> |
39 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
40 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
41 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/migrate.h> | ||
45 | #include <linux/page-isolation.h> | ||
46 | #include <linux/suspend.h> | ||
43 | #include "internal.h" | 47 | #include "internal.h" |
44 | 48 | ||
45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -48,6 +52,120 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
48 | 52 | ||
49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 53 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
50 | 54 | ||
55 | u32 hwpoison_filter_enable = 0; | ||
56 | u32 hwpoison_filter_dev_major = ~0U; | ||
57 | u32 hwpoison_filter_dev_minor = ~0U; | ||
58 | u64 hwpoison_filter_flags_mask; | ||
59 | u64 hwpoison_filter_flags_value; | ||
60 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
61 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
62 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
63 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
64 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
65 | |||
66 | static int hwpoison_filter_dev(struct page *p) | ||
67 | { | ||
68 | struct address_space *mapping; | ||
69 | dev_t dev; | ||
70 | |||
71 | if (hwpoison_filter_dev_major == ~0U && | ||
72 | hwpoison_filter_dev_minor == ~0U) | ||
73 | return 0; | ||
74 | |||
75 | /* | ||
76 | * page_mapping() does not accept slab page | ||
77 | */ | ||
78 | if (PageSlab(p)) | ||
79 | return -EINVAL; | ||
80 | |||
81 | mapping = page_mapping(p); | ||
82 | if (mapping == NULL || mapping->host == NULL) | ||
83 | return -EINVAL; | ||
84 | |||
85 | dev = mapping->host->i_sb->s_dev; | ||
86 | if (hwpoison_filter_dev_major != ~0U && | ||
87 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
88 | return -EINVAL; | ||
89 | if (hwpoison_filter_dev_minor != ~0U && | ||
90 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
91 | return -EINVAL; | ||
92 | |||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int hwpoison_filter_flags(struct page *p) | ||
97 | { | ||
98 | if (!hwpoison_filter_flags_mask) | ||
99 | return 0; | ||
100 | |||
101 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
102 | hwpoison_filter_flags_value) | ||
103 | return 0; | ||
104 | else | ||
105 | return -EINVAL; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * This allows stress tests to limit test scope to a collection of tasks | ||
110 | * by putting them under some memcg. This prevents killing unrelated/important | ||
111 | * processes such as /sbin/init. Note that the target task may share clean | ||
112 | * pages with init (eg. libc text), which is harmless. If the target task | ||
113 | * share _dirty_ pages with another task B, the test scheme must make sure B | ||
114 | * is also included in the memcg. At last, due to race conditions this filter | ||
115 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
116 | * a freed page. | ||
117 | */ | ||
118 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
119 | u64 hwpoison_filter_memcg; | ||
120 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
121 | static int hwpoison_filter_task(struct page *p) | ||
122 | { | ||
123 | struct mem_cgroup *mem; | ||
124 | struct cgroup_subsys_state *css; | ||
125 | unsigned long ino; | ||
126 | |||
127 | if (!hwpoison_filter_memcg) | ||
128 | return 0; | ||
129 | |||
130 | mem = try_get_mem_cgroup_from_page(p); | ||
131 | if (!mem) | ||
132 | return -EINVAL; | ||
133 | |||
134 | css = mem_cgroup_css(mem); | ||
135 | /* root_mem_cgroup has NULL dentries */ | ||
136 | if (!css->cgroup->dentry) | ||
137 | return -EINVAL; | ||
138 | |||
139 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
140 | css_put(css); | ||
141 | |||
142 | if (ino != hwpoison_filter_memcg) | ||
143 | return -EINVAL; | ||
144 | |||
145 | return 0; | ||
146 | } | ||
147 | #else | ||
148 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
149 | #endif | ||
150 | |||
151 | int hwpoison_filter(struct page *p) | ||
152 | { | ||
153 | if (!hwpoison_filter_enable) | ||
154 | return 0; | ||
155 | |||
156 | if (hwpoison_filter_dev(p)) | ||
157 | return -EINVAL; | ||
158 | |||
159 | if (hwpoison_filter_flags(p)) | ||
160 | return -EINVAL; | ||
161 | |||
162 | if (hwpoison_filter_task(p)) | ||
163 | return -EINVAL; | ||
164 | |||
165 | return 0; | ||
166 | } | ||
167 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
168 | |||
51 | /* | 169 | /* |
52 | * Send all the processes who have the page mapped an ``action optional'' | 170 | * Send all the processes who have the page mapped an ``action optional'' |
53 | * signal. | 171 | * signal. |
@@ -83,6 +201,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
83 | } | 201 | } |
84 | 202 | ||
85 | /* | 203 | /* |
204 | * When a unknown page type is encountered drain as many buffers as possible | ||
205 | * in the hope to turn the page into a LRU or free page, which we can handle. | ||
206 | */ | ||
207 | void shake_page(struct page *p, int access) | ||
208 | { | ||
209 | if (!PageSlab(p)) { | ||
210 | lru_add_drain_all(); | ||
211 | if (PageLRU(p)) | ||
212 | return; | ||
213 | drain_all_pages(); | ||
214 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
215 | return; | ||
216 | } | ||
217 | |||
218 | /* | ||
219 | * Only all shrink_slab here (which would also | ||
220 | * shrink other caches) if access is not potentially fatal. | ||
221 | */ | ||
222 | if (access) { | ||
223 | int nr; | ||
224 | do { | ||
225 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
226 | if (page_count(p) == 0) | ||
227 | break; | ||
228 | } while (nr > 10); | ||
229 | } | ||
230 | } | ||
231 | EXPORT_SYMBOL_GPL(shake_page); | ||
232 | |||
233 | /* | ||
86 | * Kill all processes that have a poisoned page mapped and then isolate | 234 | * Kill all processes that have a poisoned page mapped and then isolate |
87 | * the page. | 235 | * the page. |
88 | * | 236 | * |
@@ -177,7 +325,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
177 | * In case something went wrong with munmapping | 325 | * In case something went wrong with munmapping |
178 | * make sure the process doesn't catch the | 326 | * make sure the process doesn't catch the |
179 | * signal and then access the memory. Just kill it. | 327 | * signal and then access the memory. Just kill it. |
180 | * the signal handlers | ||
181 | */ | 328 | */ |
182 | if (fail || tk->addr_valid == 0) { | 329 | if (fail || tk->addr_valid == 0) { |
183 | printk(KERN_ERR | 330 | printk(KERN_ERR |
@@ -314,33 +461,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
314 | */ | 461 | */ |
315 | 462 | ||
316 | enum outcome { | 463 | enum outcome { |
317 | FAILED, /* Error handling failed */ | 464 | IGNORED, /* Error: cannot be handled */ |
465 | FAILED, /* Error: handling failed */ | ||
318 | DELAYED, /* Will be handled later */ | 466 | DELAYED, /* Will be handled later */ |
319 | IGNORED, /* Error safely ignored */ | ||
320 | RECOVERED, /* Successfully recovered */ | 467 | RECOVERED, /* Successfully recovered */ |
321 | }; | 468 | }; |
322 | 469 | ||
323 | static const char *action_name[] = { | 470 | static const char *action_name[] = { |
471 | [IGNORED] = "Ignored", | ||
324 | [FAILED] = "Failed", | 472 | [FAILED] = "Failed", |
325 | [DELAYED] = "Delayed", | 473 | [DELAYED] = "Delayed", |
326 | [IGNORED] = "Ignored", | ||
327 | [RECOVERED] = "Recovered", | 474 | [RECOVERED] = "Recovered", |
328 | }; | 475 | }; |
329 | 476 | ||
330 | /* | 477 | /* |
331 | * Error hit kernel page. | 478 | * XXX: It is possible that a page is isolated from LRU cache, |
332 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 479 | * and then kept in swap cache or failed to remove from page cache. |
333 | * could be more sophisticated. | 480 | * The page count will stop it from being freed by unpoison. |
481 | * Stress tests should be aware of this memory leak problem. | ||
334 | */ | 482 | */ |
335 | static int me_kernel(struct page *p, unsigned long pfn) | 483 | static int delete_from_lru_cache(struct page *p) |
336 | { | 484 | { |
337 | return DELAYED; | 485 | if (!isolate_lru_page(p)) { |
486 | /* | ||
487 | * Clear sensible page flags, so that the buddy system won't | ||
488 | * complain when the page is unpoison-and-freed. | ||
489 | */ | ||
490 | ClearPageActive(p); | ||
491 | ClearPageUnevictable(p); | ||
492 | /* | ||
493 | * drop the page count elevated by isolate_lru_page() | ||
494 | */ | ||
495 | page_cache_release(p); | ||
496 | return 0; | ||
497 | } | ||
498 | return -EIO; | ||
338 | } | 499 | } |
339 | 500 | ||
340 | /* | 501 | /* |
341 | * Already poisoned page. | 502 | * Error hit kernel page. |
503 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
504 | * could be more sophisticated. | ||
342 | */ | 505 | */ |
343 | static int me_ignore(struct page *p, unsigned long pfn) | 506 | static int me_kernel(struct page *p, unsigned long pfn) |
344 | { | 507 | { |
345 | return IGNORED; | 508 | return IGNORED; |
346 | } | 509 | } |
@@ -355,14 +518,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
355 | } | 518 | } |
356 | 519 | ||
357 | /* | 520 | /* |
358 | * Free memory | ||
359 | */ | ||
360 | static int me_free(struct page *p, unsigned long pfn) | ||
361 | { | ||
362 | return DELAYED; | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * Clean (or cleaned) page cache page. | 521 | * Clean (or cleaned) page cache page. |
367 | */ | 522 | */ |
368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 523 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
@@ -371,6 +526,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
371 | int ret = FAILED; | 526 | int ret = FAILED; |
372 | struct address_space *mapping; | 527 | struct address_space *mapping; |
373 | 528 | ||
529 | delete_from_lru_cache(p); | ||
530 | |||
374 | /* | 531 | /* |
375 | * For anonymous pages we're done the only reference left | 532 | * For anonymous pages we're done the only reference left |
376 | * should be the one m_f() holds. | 533 | * should be the one m_f() holds. |
@@ -500,14 +657,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
500 | /* Trigger EIO in shmem: */ | 657 | /* Trigger EIO in shmem: */ |
501 | ClearPageUptodate(p); | 658 | ClearPageUptodate(p); |
502 | 659 | ||
503 | return DELAYED; | 660 | if (!delete_from_lru_cache(p)) |
661 | return DELAYED; | ||
662 | else | ||
663 | return FAILED; | ||
504 | } | 664 | } |
505 | 665 | ||
506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 666 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
507 | { | 667 | { |
508 | delete_from_swap_cache(p); | 668 | delete_from_swap_cache(p); |
509 | 669 | ||
510 | return RECOVERED; | 670 | if (!delete_from_lru_cache(p)) |
671 | return RECOVERED; | ||
672 | else | ||
673 | return FAILED; | ||
511 | } | 674 | } |
512 | 675 | ||
513 | /* | 676 | /* |
@@ -550,7 +713,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
550 | #define tail (1UL << PG_tail) | 713 | #define tail (1UL << PG_tail) |
551 | #define compound (1UL << PG_compound) | 714 | #define compound (1UL << PG_compound) |
552 | #define slab (1UL << PG_slab) | 715 | #define slab (1UL << PG_slab) |
553 | #define buddy (1UL << PG_buddy) | ||
554 | #define reserved (1UL << PG_reserved) | 716 | #define reserved (1UL << PG_reserved) |
555 | 717 | ||
556 | static struct page_state { | 718 | static struct page_state { |
@@ -559,8 +721,11 @@ static struct page_state { | |||
559 | char *msg; | 721 | char *msg; |
560 | int (*action)(struct page *p, unsigned long pfn); | 722 | int (*action)(struct page *p, unsigned long pfn); |
561 | } error_states[] = { | 723 | } error_states[] = { |
562 | { reserved, reserved, "reserved kernel", me_ignore }, | 724 | { reserved, reserved, "reserved kernel", me_kernel }, |
563 | { buddy, buddy, "free kernel", me_free }, | 725 | /* |
726 | * free pages are specially detected outside this table: | ||
727 | * PG_buddy pages only make a small fraction of all free pages. | ||
728 | */ | ||
564 | 729 | ||
565 | /* | 730 | /* |
566 | * Could in theory check if slab page is free or if we can drop | 731 | * Could in theory check if slab page is free or if we can drop |
@@ -582,14 +747,11 @@ static struct page_state { | |||
582 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 747 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, |
583 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 748 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, |
584 | 749 | ||
585 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
586 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 750 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, |
587 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 751 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, |
588 | #endif | ||
589 | 752 | ||
590 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 753 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
591 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 754 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
592 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
593 | 755 | ||
594 | /* | 756 | /* |
595 | * Catchall entry: must be at end. | 757 | * Catchall entry: must be at end. |
@@ -597,20 +759,31 @@ static struct page_state { | |||
597 | { 0, 0, "unknown page state", me_unknown }, | 759 | { 0, 0, "unknown page state", me_unknown }, |
598 | }; | 760 | }; |
599 | 761 | ||
762 | #undef dirty | ||
763 | #undef sc | ||
764 | #undef unevict | ||
765 | #undef mlock | ||
766 | #undef writeback | ||
767 | #undef lru | ||
768 | #undef swapbacked | ||
769 | #undef head | ||
770 | #undef tail | ||
771 | #undef compound | ||
772 | #undef slab | ||
773 | #undef reserved | ||
774 | |||
600 | static void action_result(unsigned long pfn, char *msg, int result) | 775 | static void action_result(unsigned long pfn, char *msg, int result) |
601 | { | 776 | { |
602 | struct page *page = NULL; | 777 | struct page *page = pfn_to_page(pfn); |
603 | if (pfn_valid(pfn)) | ||
604 | page = pfn_to_page(pfn); | ||
605 | 778 | ||
606 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 779 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
607 | pfn, | 780 | pfn, |
608 | page && PageDirty(page) ? "dirty " : "", | 781 | PageDirty(page) ? "dirty " : "", |
609 | msg, action_name[result]); | 782 | msg, action_name[result]); |
610 | } | 783 | } |
611 | 784 | ||
612 | static int page_action(struct page_state *ps, struct page *p, | 785 | static int page_action(struct page_state *ps, struct page *p, |
613 | unsigned long pfn, int ref) | 786 | unsigned long pfn) |
614 | { | 787 | { |
615 | int result; | 788 | int result; |
616 | int count; | 789 | int count; |
@@ -618,18 +791,22 @@ static int page_action(struct page_state *ps, struct page *p, | |||
618 | result = ps->action(p, pfn); | 791 | result = ps->action(p, pfn); |
619 | action_result(pfn, ps->msg, result); | 792 | action_result(pfn, ps->msg, result); |
620 | 793 | ||
621 | count = page_count(p) - 1 - ref; | 794 | count = page_count(p) - 1; |
622 | if (count != 0) | 795 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
796 | count--; | ||
797 | if (count != 0) { | ||
623 | printk(KERN_ERR | 798 | printk(KERN_ERR |
624 | "MCE %#lx: %s page still referenced by %d users\n", | 799 | "MCE %#lx: %s page still referenced by %d users\n", |
625 | pfn, ps->msg, count); | 800 | pfn, ps->msg, count); |
801 | result = FAILED; | ||
802 | } | ||
626 | 803 | ||
627 | /* Could do more checks here if page looks ok */ | 804 | /* Could do more checks here if page looks ok */ |
628 | /* | 805 | /* |
629 | * Could adjust zone counters here to correct for the missing page. | 806 | * Could adjust zone counters here to correct for the missing page. |
630 | */ | 807 | */ |
631 | 808 | ||
632 | return result == RECOVERED ? 0 : -EBUSY; | 809 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
633 | } | 810 | } |
634 | 811 | ||
635 | #define N_UNMAP_TRIES 5 | 812 | #define N_UNMAP_TRIES 5 |
@@ -638,7 +815,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
638 | * Do all that is necessary to remove user space mappings. Unmap | 815 | * Do all that is necessary to remove user space mappings. Unmap |
639 | * the pages and send SIGBUS to the processes if the data was dirty. | 816 | * the pages and send SIGBUS to the processes if the data was dirty. |
640 | */ | 817 | */ |
641 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 818 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
642 | int trapno) | 819 | int trapno) |
643 | { | 820 | { |
644 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 821 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
@@ -648,15 +825,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
648 | int i; | 825 | int i; |
649 | int kill = 1; | 826 | int kill = 1; |
650 | 827 | ||
651 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | 828 | if (PageReserved(p) || PageSlab(p)) |
652 | return; | 829 | return SWAP_SUCCESS; |
653 | 830 | ||
654 | /* | 831 | /* |
655 | * This check implies we don't kill processes if their pages | 832 | * This check implies we don't kill processes if their pages |
656 | * are in the swap cache early. Those are always late kills. | 833 | * are in the swap cache early. Those are always late kills. |
657 | */ | 834 | */ |
658 | if (!page_mapped(p)) | 835 | if (!page_mapped(p)) |
659 | return; | 836 | return SWAP_SUCCESS; |
837 | |||
838 | if (PageCompound(p) || PageKsm(p)) | ||
839 | return SWAP_FAIL; | ||
660 | 840 | ||
661 | if (PageSwapCache(p)) { | 841 | if (PageSwapCache(p)) { |
662 | printk(KERN_ERR | 842 | printk(KERN_ERR |
@@ -667,6 +847,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
667 | /* | 847 | /* |
668 | * Propagate the dirty bit from PTEs to struct page first, because we | 848 | * Propagate the dirty bit from PTEs to struct page first, because we |
669 | * need this to decide if we should kill or just drop the page. | 849 | * need this to decide if we should kill or just drop the page. |
850 | * XXX: the dirty test could be racy: set_page_dirty() may not always | ||
851 | * be called inside page lock (it's recommended but not enforced). | ||
670 | */ | 852 | */ |
671 | mapping = page_mapping(p); | 853 | mapping = page_mapping(p); |
672 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 854 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
@@ -718,11 +900,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
718 | */ | 900 | */ |
719 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 901 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
720 | ret != SWAP_SUCCESS, pfn); | 902 | ret != SWAP_SUCCESS, pfn); |
903 | |||
904 | return ret; | ||
721 | } | 905 | } |
722 | 906 | ||
723 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 907 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
724 | { | 908 | { |
725 | unsigned long lru_flag; | ||
726 | struct page_state *ps; | 909 | struct page_state *ps; |
727 | struct page *p; | 910 | struct page *p; |
728 | int res; | 911 | int res; |
@@ -731,13 +914,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
731 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 914 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
732 | 915 | ||
733 | if (!pfn_valid(pfn)) { | 916 | if (!pfn_valid(pfn)) { |
734 | action_result(pfn, "memory outside kernel control", IGNORED); | 917 | printk(KERN_ERR |
735 | return -EIO; | 918 | "MCE %#lx: memory outside kernel control\n", |
919 | pfn); | ||
920 | return -ENXIO; | ||
736 | } | 921 | } |
737 | 922 | ||
738 | p = pfn_to_page(pfn); | 923 | p = pfn_to_page(pfn); |
739 | if (TestSetPageHWPoison(p)) { | 924 | if (TestSetPageHWPoison(p)) { |
740 | action_result(pfn, "already hardware poisoned", IGNORED); | 925 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
741 | return 0; | 926 | return 0; |
742 | } | 927 | } |
743 | 928 | ||
@@ -754,9 +939,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
754 | * In fact it's dangerous to directly bump up page count from 0, | 939 | * In fact it's dangerous to directly bump up page count from 0, |
755 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 940 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
756 | */ | 941 | */ |
757 | if (!get_page_unless_zero(compound_head(p))) { | 942 | if (!(flags & MF_COUNT_INCREASED) && |
758 | action_result(pfn, "free or high order kernel", IGNORED); | 943 | !get_page_unless_zero(compound_head(p))) { |
759 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 944 | if (is_free_buddy_page(p)) { |
945 | action_result(pfn, "free buddy", DELAYED); | ||
946 | return 0; | ||
947 | } else { | ||
948 | action_result(pfn, "high order kernel", IGNORED); | ||
949 | return -EBUSY; | ||
950 | } | ||
760 | } | 951 | } |
761 | 952 | ||
762 | /* | 953 | /* |
@@ -768,14 +959,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
768 | * walked by the page reclaim code, however that's not a big loss. | 959 | * walked by the page reclaim code, however that's not a big loss. |
769 | */ | 960 | */ |
770 | if (!PageLRU(p)) | 961 | if (!PageLRU(p)) |
771 | lru_add_drain_all(); | 962 | shake_page(p, 0); |
772 | lru_flag = p->flags & lru; | 963 | if (!PageLRU(p)) { |
773 | if (isolate_lru_page(p)) { | 964 | /* |
965 | * shake_page could have turned it free. | ||
966 | */ | ||
967 | if (is_free_buddy_page(p)) { | ||
968 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
969 | return 0; | ||
970 | } | ||
774 | action_result(pfn, "non LRU", IGNORED); | 971 | action_result(pfn, "non LRU", IGNORED); |
775 | put_page(p); | 972 | put_page(p); |
776 | return -EBUSY; | 973 | return -EBUSY; |
777 | } | 974 | } |
778 | page_cache_release(p); | ||
779 | 975 | ||
780 | /* | 976 | /* |
781 | * Lock the page and wait for writeback to finish. | 977 | * Lock the page and wait for writeback to finish. |
@@ -783,26 +979,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
783 | * and in many cases impossible, so we just avoid it here. | 979 | * and in many cases impossible, so we just avoid it here. |
784 | */ | 980 | */ |
785 | lock_page_nosync(p); | 981 | lock_page_nosync(p); |
982 | |||
983 | /* | ||
984 | * unpoison always clear PG_hwpoison inside page lock | ||
985 | */ | ||
986 | if (!PageHWPoison(p)) { | ||
987 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
988 | res = 0; | ||
989 | goto out; | ||
990 | } | ||
991 | if (hwpoison_filter(p)) { | ||
992 | if (TestClearPageHWPoison(p)) | ||
993 | atomic_long_dec(&mce_bad_pages); | ||
994 | unlock_page(p); | ||
995 | put_page(p); | ||
996 | return 0; | ||
997 | } | ||
998 | |||
786 | wait_on_page_writeback(p); | 999 | wait_on_page_writeback(p); |
787 | 1000 | ||
788 | /* | 1001 | /* |
789 | * Now take care of user space mappings. | 1002 | * Now take care of user space mappings. |
1003 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | ||
790 | */ | 1004 | */ |
791 | hwpoison_user_mappings(p, pfn, trapno); | 1005 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
1006 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
1007 | res = -EBUSY; | ||
1008 | goto out; | ||
1009 | } | ||
792 | 1010 | ||
793 | /* | 1011 | /* |
794 | * Torn down by someone else? | 1012 | * Torn down by someone else? |
795 | */ | 1013 | */ |
796 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | 1014 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
797 | action_result(pfn, "already truncated LRU", IGNORED); | 1015 | action_result(pfn, "already truncated LRU", IGNORED); |
798 | res = 0; | 1016 | res = -EBUSY; |
799 | goto out; | 1017 | goto out; |
800 | } | 1018 | } |
801 | 1019 | ||
802 | res = -EBUSY; | 1020 | res = -EBUSY; |
803 | for (ps = error_states;; ps++) { | 1021 | for (ps = error_states;; ps++) { |
804 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | 1022 | if ((p->flags & ps->mask) == ps->res) { |
805 | res = page_action(ps, p, pfn, ref); | 1023 | res = page_action(ps, p, pfn); |
806 | break; | 1024 | break; |
807 | } | 1025 | } |
808 | } | 1026 | } |
@@ -833,3 +1051,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
833 | { | 1051 | { |
834 | __memory_failure(pfn, trapno, 0); | 1052 | __memory_failure(pfn, trapno, 0); |
835 | } | 1053 | } |
1054 | |||
1055 | /** | ||
1056 | * unpoison_memory - Unpoison a previously poisoned page | ||
1057 | * @pfn: Page number of the to be unpoisoned page | ||
1058 | * | ||
1059 | * Software-unpoison a page that has been poisoned by | ||
1060 | * memory_failure() earlier. | ||
1061 | * | ||
1062 | * This is only done on the software-level, so it only works | ||
1063 | * for linux injected failures, not real hardware failures | ||
1064 | * | ||
1065 | * Returns 0 for success, otherwise -errno. | ||
1066 | */ | ||
1067 | int unpoison_memory(unsigned long pfn) | ||
1068 | { | ||
1069 | struct page *page; | ||
1070 | struct page *p; | ||
1071 | int freeit = 0; | ||
1072 | |||
1073 | if (!pfn_valid(pfn)) | ||
1074 | return -ENXIO; | ||
1075 | |||
1076 | p = pfn_to_page(pfn); | ||
1077 | page = compound_head(p); | ||
1078 | |||
1079 | if (!PageHWPoison(p)) { | ||
1080 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
1081 | return 0; | ||
1082 | } | ||
1083 | |||
1084 | if (!get_page_unless_zero(page)) { | ||
1085 | if (TestClearPageHWPoison(p)) | ||
1086 | atomic_long_dec(&mce_bad_pages); | ||
1087 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
1088 | return 0; | ||
1089 | } | ||
1090 | |||
1091 | lock_page_nosync(page); | ||
1092 | /* | ||
1093 | * This test is racy because PG_hwpoison is set outside of page lock. | ||
1094 | * That's acceptable because that won't trigger kernel panic. Instead, | ||
1095 | * the PG_hwpoison page will be caught and isolated on the entrance to | ||
1096 | * the free buddy page pool. | ||
1097 | */ | ||
1098 | if (TestClearPageHWPoison(p)) { | ||
1099 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
1100 | atomic_long_dec(&mce_bad_pages); | ||
1101 | freeit = 1; | ||
1102 | } | ||
1103 | unlock_page(page); | ||
1104 | |||
1105 | put_page(page); | ||
1106 | if (freeit) | ||
1107 | put_page(page); | ||
1108 | |||
1109 | return 0; | ||
1110 | } | ||
1111 | EXPORT_SYMBOL(unpoison_memory); | ||
1112 | |||
1113 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
1114 | { | ||
1115 | int nid = page_to_nid(p); | ||
1116 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1117 | } | ||
1118 | |||
1119 | /* | ||
1120 | * Safely get reference count of an arbitrary page. | ||
1121 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
1122 | * that is not free, and 1 for any other page type. | ||
1123 | * For 1 the page is returned with increased page count, otherwise not. | ||
1124 | */ | ||
1125 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
1126 | { | ||
1127 | int ret; | ||
1128 | |||
1129 | if (flags & MF_COUNT_INCREASED) | ||
1130 | return 1; | ||
1131 | |||
1132 | /* | ||
1133 | * The lock_system_sleep prevents a race with memory hotplug, | ||
1134 | * because the isolation assumes there's only a single user. | ||
1135 | * This is a big hammer, a better would be nicer. | ||
1136 | */ | ||
1137 | lock_system_sleep(); | ||
1138 | |||
1139 | /* | ||
1140 | * Isolate the page, so that it doesn't get reallocated if it | ||
1141 | * was free. | ||
1142 | */ | ||
1143 | set_migratetype_isolate(p); | ||
1144 | if (!get_page_unless_zero(compound_head(p))) { | ||
1145 | if (is_free_buddy_page(p)) { | ||
1146 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
1147 | /* Set hwpoison bit while page is still isolated */ | ||
1148 | SetPageHWPoison(p); | ||
1149 | ret = 0; | ||
1150 | } else { | ||
1151 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
1152 | pfn, p->flags); | ||
1153 | ret = -EIO; | ||
1154 | } | ||
1155 | } else { | ||
1156 | /* Not a free page */ | ||
1157 | ret = 1; | ||
1158 | } | ||
1159 | unset_migratetype_isolate(p); | ||
1160 | unlock_system_sleep(); | ||
1161 | return ret; | ||
1162 | } | ||
1163 | |||
1164 | /** | ||
1165 | * soft_offline_page - Soft offline a page. | ||
1166 | * @page: page to offline | ||
1167 | * @flags: flags. Same as memory_failure(). | ||
1168 | * | ||
1169 | * Returns 0 on success, otherwise negated errno. | ||
1170 | * | ||
1171 | * Soft offline a page, by migration or invalidation, | ||
1172 | * without killing anything. This is for the case when | ||
1173 | * a page is not corrupted yet (so it's still valid to access), | ||
1174 | * but has had a number of corrected errors and is better taken | ||
1175 | * out. | ||
1176 | * | ||
1177 | * The actual policy on when to do that is maintained by | ||
1178 | * user space. | ||
1179 | * | ||
1180 | * This should never impact any application or cause data loss, | ||
1181 | * however it might take some time. | ||
1182 | * | ||
1183 | * This is not a 100% solution for all memory, but tries to be | ||
1184 | * ``good enough'' for the majority of memory. | ||
1185 | */ | ||
1186 | int soft_offline_page(struct page *page, int flags) | ||
1187 | { | ||
1188 | int ret; | ||
1189 | unsigned long pfn = page_to_pfn(page); | ||
1190 | |||
1191 | ret = get_any_page(page, pfn, flags); | ||
1192 | if (ret < 0) | ||
1193 | return ret; | ||
1194 | if (ret == 0) | ||
1195 | goto done; | ||
1196 | |||
1197 | /* | ||
1198 | * Page cache page we can handle? | ||
1199 | */ | ||
1200 | if (!PageLRU(page)) { | ||
1201 | /* | ||
1202 | * Try to free it. | ||
1203 | */ | ||
1204 | put_page(page); | ||
1205 | shake_page(page, 1); | ||
1206 | |||
1207 | /* | ||
1208 | * Did it turn free? | ||
1209 | */ | ||
1210 | ret = get_any_page(page, pfn, 0); | ||
1211 | if (ret < 0) | ||
1212 | return ret; | ||
1213 | if (ret == 0) | ||
1214 | goto done; | ||
1215 | } | ||
1216 | if (!PageLRU(page)) { | ||
1217 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1218 | pfn, page->flags); | ||
1219 | return -EIO; | ||
1220 | } | ||
1221 | |||
1222 | lock_page(page); | ||
1223 | wait_on_page_writeback(page); | ||
1224 | |||
1225 | /* | ||
1226 | * Synchronized using the page lock with memory_failure() | ||
1227 | */ | ||
1228 | if (PageHWPoison(page)) { | ||
1229 | unlock_page(page); | ||
1230 | put_page(page); | ||
1231 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
1232 | return -EBUSY; | ||
1233 | } | ||
1234 | |||
1235 | /* | ||
1236 | * Try to invalidate first. This should work for | ||
1237 | * non dirty unmapped page cache pages. | ||
1238 | */ | ||
1239 | ret = invalidate_inode_page(page); | ||
1240 | unlock_page(page); | ||
1241 | |||
1242 | /* | ||
1243 | * Drop count because page migration doesn't like raised | ||
1244 | * counts. The page could get re-allocated, but if it becomes | ||
1245 | * LRU the isolation will just fail. | ||
1246 | * RED-PEN would be better to keep it isolated here, but we | ||
1247 | * would need to fix isolation locking first. | ||
1248 | */ | ||
1249 | put_page(page); | ||
1250 | if (ret == 1) { | ||
1251 | ret = 0; | ||
1252 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
1253 | goto done; | ||
1254 | } | ||
1255 | |||
1256 | /* | ||
1257 | * Simple invalidation didn't work. | ||
1258 | * Try to migrate to a new page instead. migrate.c | ||
1259 | * handles a large number of cases for us. | ||
1260 | */ | ||
1261 | ret = isolate_lru_page(page); | ||
1262 | if (!ret) { | ||
1263 | LIST_HEAD(pagelist); | ||
1264 | |||
1265 | list_add(&page->lru, &pagelist); | ||
1266 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1267 | if (ret) { | ||
1268 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1269 | pfn, ret, page->flags); | ||
1270 | if (ret > 0) | ||
1271 | ret = -EIO; | ||
1272 | } | ||
1273 | } else { | ||
1274 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
1275 | pfn, ret, page_count(page), page->flags); | ||
1276 | } | ||
1277 | if (ret) | ||
1278 | return ret; | ||
1279 | |||
1280 | done: | ||
1281 | atomic_long_add(1, &mce_bad_pages); | ||
1282 | SetPageHWPoison(page); | ||
1283 | /* keep elevated page count for bad page */ | ||
1284 | return ret; | ||
1285 | } | ||
diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..09e4b1be7b67 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -572,7 +572,7 @@ out: | |||
572 | * covered by this vma. | 572 | * covered by this vma. |
573 | */ | 573 | */ |
574 | 574 | ||
575 | static inline void | 575 | static inline unsigned long |
576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
578 | unsigned long addr, int *rss) | 578 | unsigned long addr, int *rss) |
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
586 | if (!pte_file(pte)) { | 586 | if (!pte_file(pte)) { |
587 | swp_entry_t entry = pte_to_swp_entry(pte); | 587 | swp_entry_t entry = pte_to_swp_entry(pte); |
588 | 588 | ||
589 | swap_duplicate(entry); | 589 | if (swap_duplicate(entry) < 0) |
590 | return entry.val; | ||
591 | |||
590 | /* make sure dst_mm is on swapoff's mmlist. */ | 592 | /* make sure dst_mm is on swapoff's mmlist. */ |
591 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 593 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
592 | spin_lock(&mmlist_lock); | 594 | spin_lock(&mmlist_lock); |
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
635 | 637 | ||
636 | out_set_pte: | 638 | out_set_pte: |
637 | set_pte_at(dst_mm, addr, dst_pte, pte); | 639 | set_pte_at(dst_mm, addr, dst_pte, pte); |
640 | return 0; | ||
638 | } | 641 | } |
639 | 642 | ||
640 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 643 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
646 | spinlock_t *src_ptl, *dst_ptl; | 649 | spinlock_t *src_ptl, *dst_ptl; |
647 | int progress = 0; | 650 | int progress = 0; |
648 | int rss[2]; | 651 | int rss[2]; |
652 | swp_entry_t entry = (swp_entry_t){0}; | ||
649 | 653 | ||
650 | again: | 654 | again: |
651 | rss[1] = rss[0] = 0; | 655 | rss[1] = rss[0] = 0; |
@@ -674,7 +678,10 @@ again: | |||
674 | progress++; | 678 | progress++; |
675 | continue; | 679 | continue; |
676 | } | 680 | } |
677 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); | 681 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, |
682 | vma, addr, rss); | ||
683 | if (entry.val) | ||
684 | break; | ||
678 | progress += 8; | 685 | progress += 8; |
679 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 686 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
680 | 687 | ||
@@ -684,6 +691,12 @@ again: | |||
684 | add_mm_rss(dst_mm, rss[0], rss[1]); | 691 | add_mm_rss(dst_mm, rss[0], rss[1]); |
685 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 692 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
686 | cond_resched(); | 693 | cond_resched(); |
694 | |||
695 | if (entry.val) { | ||
696 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | ||
697 | return -ENOMEM; | ||
698 | progress = 0; | ||
699 | } | ||
687 | if (addr != end) | 700 | if (addr != end) |
688 | goto again; | 701 | goto again; |
689 | return 0; | 702 | return 0; |
@@ -943,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
943 | details = NULL; | 956 | details = NULL; |
944 | 957 | ||
945 | BUG_ON(addr >= end); | 958 | BUG_ON(addr >= end); |
959 | mem_cgroup_uncharge_start(); | ||
946 | tlb_start_vma(tlb, vma); | 960 | tlb_start_vma(tlb, vma); |
947 | pgd = pgd_offset(vma->vm_mm, addr); | 961 | pgd = pgd_offset(vma->vm_mm, addr); |
948 | do { | 962 | do { |
@@ -955,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
955 | zap_work, details); | 969 | zap_work, details); |
956 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 970 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
957 | tlb_end_vma(tlb, vma); | 971 | tlb_end_vma(tlb, vma); |
972 | mem_cgroup_uncharge_end(); | ||
958 | 973 | ||
959 | return addr; | 974 | return addr; |
960 | } | 975 | } |
@@ -2514,7 +2529,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2514 | ret = VM_FAULT_HWPOISON; | 2529 | ret = VM_FAULT_HWPOISON; |
2515 | } else { | 2530 | } else { |
2516 | print_bad_pte(vma, address, orig_pte, NULL); | 2531 | print_bad_pte(vma, address, orig_pte, NULL); |
2517 | ret = VM_FAULT_OOM; | 2532 | ret = VM_FAULT_SIGBUS; |
2518 | } | 2533 | } |
2519 | goto out; | 2534 | goto out; |
2520 | } | 2535 | } |
@@ -2540,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2540 | ret = VM_FAULT_MAJOR; | 2555 | ret = VM_FAULT_MAJOR; |
2541 | count_vm_event(PGMAJFAULT); | 2556 | count_vm_event(PGMAJFAULT); |
2542 | } else if (PageHWPoison(page)) { | 2557 | } else if (PageHWPoison(page)) { |
2558 | /* | ||
2559 | * hwpoisoned dirty swapcache pages are kept for killing | ||
2560 | * owner processes (which may be unknown at hwpoison time) | ||
2561 | */ | ||
2543 | ret = VM_FAULT_HWPOISON; | 2562 | ret = VM_FAULT_HWPOISON; |
2544 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2563 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2545 | goto out_release; | 2564 | goto out_release; |
@@ -2548,6 +2567,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2548 | lock_page(page); | 2567 | lock_page(page); |
2549 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2568 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2550 | 2569 | ||
2570 | page = ksm_might_need_to_copy(page, vma, address); | ||
2571 | if (!page) { | ||
2572 | ret = VM_FAULT_OOM; | ||
2573 | goto out; | ||
2574 | } | ||
2575 | |||
2551 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 2576 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
2552 | ret = VM_FAULT_OOM; | 2577 | ret = VM_FAULT_OOM; |
2553 | goto out_page; | 2578 | goto out_page; |
@@ -2910,7 +2935,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2910 | * Page table corrupted: show pte and kill process. | 2935 | * Page table corrupted: show pte and kill process. |
2911 | */ | 2936 | */ |
2912 | print_bad_pte(vma, address, orig_pte, NULL); | 2937 | print_bad_pte(vma, address, orig_pte, NULL); |
2913 | return VM_FAULT_OOM; | 2938 | return VM_FAULT_SIGBUS; |
2914 | } | 2939 | } |
2915 | 2940 | ||
2916 | pgoff = pte_to_pgoff(orig_pte); | 2941 | pgoff = pte_to_pgoff(orig_pte); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465cd27c..030ce8a5bb0e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/page-isolation.h> | 27 | #include <linux/page-isolation.h> |
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | ||
30 | 31 | ||
31 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
32 | 33 | ||
@@ -71,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
71 | atomic_inc(&page->_count); | 72 | atomic_inc(&page->_count); |
72 | } | 73 | } |
73 | 74 | ||
74 | void put_page_bootmem(struct page *page) | 75 | /* reference to __meminit __free_pages_bootmem is valid |
76 | * so use __ref to tell modpost not to generate a warning */ | ||
77 | void __ref put_page_bootmem(struct page *page) | ||
75 | { | 78 | { |
76 | int type; | 79 | int type; |
77 | 80 | ||
@@ -672,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
672 | if (!ret) { /* Success */ | 675 | if (!ret) { /* Success */ |
673 | list_add_tail(&page->lru, &source); | 676 | list_add_tail(&page->lru, &source); |
674 | move_pages--; | 677 | move_pages--; |
678 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
679 | page_is_file_cache(page)); | ||
680 | |||
675 | } else { | 681 | } else { |
676 | /* Becasue we don't have big zone->lock. we should | 682 | /* Becasue we don't have big zone->lock. we should |
677 | check this again here. */ | 683 | check this again here. */ |
@@ -694,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
694 | if (list_empty(&source)) | 700 | if (list_empty(&source)) |
695 | goto out; | 701 | goto out; |
696 | /* this function returns # of failed pages */ | 702 | /* this function returns # of failed pages */ |
697 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | 703 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); |
698 | 704 | ||
699 | out: | 705 | out: |
700 | return ret; | 706 | return ret; |
@@ -747,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
747 | return offlined; | 753 | return offlined; |
748 | } | 754 | } |
749 | 755 | ||
750 | int offline_pages(unsigned long start_pfn, | 756 | static int offline_pages(unsigned long start_pfn, |
751 | unsigned long end_pfn, unsigned long timeout) | 757 | unsigned long end_pfn, unsigned long timeout) |
752 | { | 758 | { |
753 | unsigned long pfn, nr_pages, expire; | 759 | unsigned long pfn, nr_pages, expire; |
@@ -849,6 +855,10 @@ repeat: | |||
849 | 855 | ||
850 | setup_per_zone_wmarks(); | 856 | setup_per_zone_wmarks(); |
851 | calculate_zone_inactive_ratio(zone); | 857 | calculate_zone_inactive_ratio(zone); |
858 | if (!node_present_pages(node)) { | ||
859 | node_clear_state(node, N_HIGH_MEMORY); | ||
860 | kswapd_stop(node); | ||
861 | } | ||
852 | 862 | ||
853 | vm_total_pages = nr_free_pagecache_pages(); | 863 | vm_total_pages = nr_free_pagecache_pages(); |
854 | writeback_set_ratelimit(); | 864 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d5944243..290fb5bf0440 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -85,10 +85,12 @@ | |||
85 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
86 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
87 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
88 | #include <linux/ksm.h> | ||
88 | #include <linux/rmap.h> | 89 | #include <linux/rmap.h> |
89 | #include <linux/security.h> | 90 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | 91 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 92 | #include <linux/ctype.h> |
93 | #include <linux/mm_inline.h> | ||
92 | 94 | ||
93 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
412 | if (!page) | 414 | if (!page) |
413 | continue; | 415 | continue; |
414 | /* | 416 | /* |
415 | * The check for PageReserved here is important to avoid | 417 | * vm_normal_page() filters out zero pages, but there might |
416 | * handling zero pages and other pages that may have been | 418 | * still be PageReserved pages to skip, perhaps in a VDSO. |
417 | * marked special by the system. | 419 | * And we cannot move PageKsm pages sensibly or safely yet. |
418 | * | ||
419 | * If the PageReserved would not be checked here then f.e. | ||
420 | * the location of the zero page could have an influence | ||
421 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
422 | * the per node stats, and there would be useless attempts | ||
423 | * to put zero pages on the migration list. | ||
424 | */ | 420 | */ |
425 | if (PageReserved(page)) | 421 | if (PageReserved(page) || PageKsm(page)) |
426 | continue; | 422 | continue; |
427 | nid = page_to_nid(page); | 423 | nid = page_to_nid(page); |
428 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 424 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
809 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 805 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
810 | if (!isolate_lru_page(page)) { | 806 | if (!isolate_lru_page(page)) { |
811 | list_add_tail(&page->lru, pagelist); | 807 | list_add_tail(&page->lru, pagelist); |
808 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
809 | page_is_file_cache(page)); | ||
812 | } | 810 | } |
813 | } | 811 | } |
814 | } | 812 | } |
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
836 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 834 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
837 | 835 | ||
838 | if (!list_empty(&pagelist)) | 836 | if (!list_empty(&pagelist)) |
839 | err = migrate_pages(&pagelist, new_node_page, dest); | 837 | err = migrate_pages(&pagelist, new_node_page, dest, 0); |
840 | 838 | ||
841 | return err; | 839 | return err; |
842 | } | 840 | } |
@@ -1053,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1053 | 1051 | ||
1054 | if (!list_empty(&pagelist)) | 1052 | if (!list_empty(&pagelist)) |
1055 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1053 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1056 | (unsigned long)vma); | 1054 | (unsigned long)vma, 0); |
1057 | 1055 | ||
1058 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1056 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
1059 | err = -EIO; | 1057 | err = -EIO; |
@@ -1565,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1565 | } | 1563 | } |
1566 | return zl; | 1564 | return zl; |
1567 | } | 1565 | } |
1566 | |||
1567 | /* | ||
1568 | * init_nodemask_of_mempolicy | ||
1569 | * | ||
1570 | * If the current task's mempolicy is "default" [NULL], return 'false' | ||
1571 | * to indicate default policy. Otherwise, extract the policy nodemask | ||
1572 | * for 'bind' or 'interleave' policy into the argument nodemask, or | ||
1573 | * initialize the argument nodemask to contain the single node for | ||
1574 | * 'preferred' or 'local' policy and return 'true' to indicate presence | ||
1575 | * of non-default mempolicy. | ||
1576 | * | ||
1577 | * We don't bother with reference counting the mempolicy [mpol_get/put] | ||
1578 | * because the current task is examining it's own mempolicy and a task's | ||
1579 | * mempolicy is only ever changed by the task itself. | ||
1580 | * | ||
1581 | * N.B., it is the caller's responsibility to free a returned nodemask. | ||
1582 | */ | ||
1583 | bool init_nodemask_of_mempolicy(nodemask_t *mask) | ||
1584 | { | ||
1585 | struct mempolicy *mempolicy; | ||
1586 | int nid; | ||
1587 | |||
1588 | if (!(mask && current->mempolicy)) | ||
1589 | return false; | ||
1590 | |||
1591 | mempolicy = current->mempolicy; | ||
1592 | switch (mempolicy->mode) { | ||
1593 | case MPOL_PREFERRED: | ||
1594 | if (mempolicy->flags & MPOL_F_LOCAL) | ||
1595 | nid = numa_node_id(); | ||
1596 | else | ||
1597 | nid = mempolicy->v.preferred_node; | ||
1598 | init_nodemask_of_node(mask, nid); | ||
1599 | break; | ||
1600 | |||
1601 | case MPOL_BIND: | ||
1602 | /* Fall through */ | ||
1603 | case MPOL_INTERLEAVE: | ||
1604 | *mask = mempolicy->v.nodes; | ||
1605 | break; | ||
1606 | |||
1607 | default: | ||
1608 | BUG(); | ||
1609 | } | ||
1610 | |||
1611 | return true; | ||
1612 | } | ||
1568 | #endif | 1613 | #endif |
1569 | 1614 | ||
1570 | /* Allocate a page in interleaved policy. | 1615 | /* Allocate a page in interleaved policy. |
diff --git a/mm/migrate.c b/mm/migrate.c index 0bc640fd68fa..efddbf0926b2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
22 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
23 | #include <linux/pagevec.h> | 23 | #include <linux/pagevec.h> |
24 | #include <linux/ksm.h> | ||
24 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
25 | #include <linux/topology.h> | 26 | #include <linux/topology.h> |
26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l) | |||
78 | /* | 79 | /* |
79 | * Restore a potential migration pte to a working pte entry | 80 | * Restore a potential migration pte to a working pte entry |
80 | */ | 81 | */ |
81 | static void remove_migration_pte(struct vm_area_struct *vma, | 82 | static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, |
82 | struct page *old, struct page *new) | 83 | unsigned long addr, void *old) |
83 | { | 84 | { |
84 | struct mm_struct *mm = vma->vm_mm; | 85 | struct mm_struct *mm = vma->vm_mm; |
85 | swp_entry_t entry; | 86 | swp_entry_t entry; |
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
88 | pmd_t *pmd; | 89 | pmd_t *pmd; |
89 | pte_t *ptep, pte; | 90 | pte_t *ptep, pte; |
90 | spinlock_t *ptl; | 91 | spinlock_t *ptl; |
91 | unsigned long addr = page_address_in_vma(new, vma); | ||
92 | |||
93 | if (addr == -EFAULT) | ||
94 | return; | ||
95 | 92 | ||
96 | pgd = pgd_offset(mm, addr); | 93 | pgd = pgd_offset(mm, addr); |
97 | if (!pgd_present(*pgd)) | 94 | if (!pgd_present(*pgd)) |
98 | return; | 95 | goto out; |
99 | 96 | ||
100 | pud = pud_offset(pgd, addr); | 97 | pud = pud_offset(pgd, addr); |
101 | if (!pud_present(*pud)) | 98 | if (!pud_present(*pud)) |
102 | return; | 99 | goto out; |
103 | 100 | ||
104 | pmd = pmd_offset(pud, addr); | 101 | pmd = pmd_offset(pud, addr); |
105 | if (!pmd_present(*pmd)) | 102 | if (!pmd_present(*pmd)) |
106 | return; | 103 | goto out; |
107 | 104 | ||
108 | ptep = pte_offset_map(pmd, addr); | 105 | ptep = pte_offset_map(pmd, addr); |
109 | 106 | ||
110 | if (!is_swap_pte(*ptep)) { | 107 | if (!is_swap_pte(*ptep)) { |
111 | pte_unmap(ptep); | 108 | pte_unmap(ptep); |
112 | return; | 109 | goto out; |
113 | } | 110 | } |
114 | 111 | ||
115 | ptl = pte_lockptr(mm, pmd); | 112 | ptl = pte_lockptr(mm, pmd); |
116 | spin_lock(ptl); | 113 | spin_lock(ptl); |
117 | pte = *ptep; | 114 | pte = *ptep; |
118 | if (!is_swap_pte(pte)) | 115 | if (!is_swap_pte(pte)) |
119 | goto out; | 116 | goto unlock; |
120 | 117 | ||
121 | entry = pte_to_swp_entry(pte); | 118 | entry = pte_to_swp_entry(pte); |
122 | 119 | ||
123 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 120 | if (!is_migration_entry(entry) || |
124 | goto out; | 121 | migration_entry_to_page(entry) != old) |
122 | goto unlock; | ||
125 | 123 | ||
126 | get_page(new); | 124 | get_page(new); |
127 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 125 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
137 | 135 | ||
138 | /* No need to invalidate - it was non-present before */ | 136 | /* No need to invalidate - it was non-present before */ |
139 | update_mmu_cache(vma, addr, pte); | 137 | update_mmu_cache(vma, addr, pte); |
140 | 138 | unlock: | |
141 | out: | ||
142 | pte_unmap_unlock(ptep, ptl); | 139 | pte_unmap_unlock(ptep, ptl); |
143 | } | 140 | out: |
144 | 141 | return SWAP_AGAIN; | |
145 | /* | ||
146 | * Note that remove_file_migration_ptes will only work on regular mappings, | ||
147 | * Nonlinear mappings do not use migration entries. | ||
148 | */ | ||
149 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
150 | { | ||
151 | struct vm_area_struct *vma; | ||
152 | struct address_space *mapping = new->mapping; | ||
153 | struct prio_tree_iter iter; | ||
154 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
155 | |||
156 | if (!mapping) | ||
157 | return; | ||
158 | |||
159 | spin_lock(&mapping->i_mmap_lock); | ||
160 | |||
161 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
162 | remove_migration_pte(vma, old, new); | ||
163 | |||
164 | spin_unlock(&mapping->i_mmap_lock); | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Must hold mmap_sem lock on at least one of the vmas containing | ||
169 | * the page so that the anon_vma cannot vanish. | ||
170 | */ | ||
171 | static void remove_anon_migration_ptes(struct page *old, struct page *new) | ||
172 | { | ||
173 | struct anon_vma *anon_vma; | ||
174 | struct vm_area_struct *vma; | ||
175 | unsigned long mapping; | ||
176 | |||
177 | mapping = (unsigned long)new->mapping; | ||
178 | |||
179 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
180 | return; | ||
181 | |||
182 | /* | ||
183 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
184 | */ | ||
185 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
186 | spin_lock(&anon_vma->lock); | ||
187 | |||
188 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
189 | remove_migration_pte(vma, old, new); | ||
190 | |||
191 | spin_unlock(&anon_vma->lock); | ||
192 | } | 142 | } |
193 | 143 | ||
194 | /* | 144 | /* |
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) | |||
197 | */ | 147 | */ |
198 | static void remove_migration_ptes(struct page *old, struct page *new) | 148 | static void remove_migration_ptes(struct page *old, struct page *new) |
199 | { | 149 | { |
200 | if (PageAnon(new)) | 150 | rmap_walk(new, remove_migration_pte, old); |
201 | remove_anon_migration_ptes(old, new); | ||
202 | else | ||
203 | remove_file_migration_ptes(old, new); | ||
204 | } | 151 | } |
205 | 152 | ||
206 | /* | 153 | /* |
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
341 | if (TestClearPageActive(page)) { | 288 | if (TestClearPageActive(page)) { |
342 | VM_BUG_ON(PageUnevictable(page)); | 289 | VM_BUG_ON(PageUnevictable(page)); |
343 | SetPageActive(newpage); | 290 | SetPageActive(newpage); |
344 | } else | 291 | } else if (TestClearPageUnevictable(page)) |
345 | unevictable_migrate_page(newpage, page); | 292 | SetPageUnevictable(newpage); |
346 | if (PageChecked(page)) | 293 | if (PageChecked(page)) |
347 | SetPageChecked(newpage); | 294 | SetPageChecked(newpage); |
348 | if (PageMappedToDisk(page)) | 295 | if (PageMappedToDisk(page)) |
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
361 | } | 308 | } |
362 | 309 | ||
363 | mlock_migrate_page(newpage, page); | 310 | mlock_migrate_page(newpage, page); |
311 | ksm_migrate_page(newpage, page); | ||
364 | 312 | ||
365 | ClearPageSwapCache(page); | 313 | ClearPageSwapCache(page); |
366 | ClearPagePrivate(page); | 314 | ClearPagePrivate(page); |
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
580 | else | 528 | else |
581 | rc = fallback_migrate_page(mapping, newpage, page); | 529 | rc = fallback_migrate_page(mapping, newpage, page); |
582 | 530 | ||
583 | if (!rc) { | 531 | if (!rc) |
584 | remove_migration_ptes(page, newpage); | 532 | remove_migration_ptes(page, newpage); |
585 | } else | 533 | else |
586 | newpage->mapping = NULL; | 534 | newpage->mapping = NULL; |
587 | 535 | ||
588 | unlock_page(newpage); | 536 | unlock_page(newpage); |
@@ -595,7 +543,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
595 | * to the newly allocated page in newpage. | 543 | * to the newly allocated page in newpage. |
596 | */ | 544 | */ |
597 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 545 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
598 | struct page *page, int force) | 546 | struct page *page, int force, int offlining) |
599 | { | 547 | { |
600 | int rc = 0; | 548 | int rc = 0; |
601 | int *result = NULL; | 549 | int *result = NULL; |
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
621 | lock_page(page); | 569 | lock_page(page); |
622 | } | 570 | } |
623 | 571 | ||
572 | /* | ||
573 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
574 | * and can safely migrate a KSM page. The other cases have skipped | ||
575 | * PageKsm along with PageReserved - but it is only now when we have | ||
576 | * the page lock that we can be certain it will not go KSM beneath us | ||
577 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
578 | * its pagecount raised, but only here do we take the page lock which | ||
579 | * serializes that). | ||
580 | */ | ||
581 | if (PageKsm(page) && !offlining) { | ||
582 | rc = -EBUSY; | ||
583 | goto unlock; | ||
584 | } | ||
585 | |||
624 | /* charge against new page */ | 586 | /* charge against new page */ |
625 | charge = mem_cgroup_prepare_migration(page, &mem); | 587 | charge = mem_cgroup_prepare_migration(page, &mem); |
626 | if (charge == -ENOMEM) { | 588 | if (charge == -ENOMEM) { |
@@ -737,7 +699,7 @@ move_newpage: | |||
737 | * Return: Number of pages not migrated or error code. | 699 | * Return: Number of pages not migrated or error code. |
738 | */ | 700 | */ |
739 | int migrate_pages(struct list_head *from, | 701 | int migrate_pages(struct list_head *from, |
740 | new_page_t get_new_page, unsigned long private) | 702 | new_page_t get_new_page, unsigned long private, int offlining) |
741 | { | 703 | { |
742 | int retry = 1; | 704 | int retry = 1; |
743 | int nr_failed = 0; | 705 | int nr_failed = 0; |
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from, | |||
746 | struct page *page2; | 708 | struct page *page2; |
747 | int swapwrite = current->flags & PF_SWAPWRITE; | 709 | int swapwrite = current->flags & PF_SWAPWRITE; |
748 | int rc; | 710 | int rc; |
749 | unsigned long flags; | ||
750 | |||
751 | local_irq_save(flags); | ||
752 | list_for_each_entry(page, from, lru) | ||
753 | __inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
754 | page_is_file_cache(page)); | ||
755 | local_irq_restore(flags); | ||
756 | 711 | ||
757 | if (!swapwrite) | 712 | if (!swapwrite) |
758 | current->flags |= PF_SWAPWRITE; | 713 | current->flags |= PF_SWAPWRITE; |
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from, | |||
764 | cond_resched(); | 719 | cond_resched(); |
765 | 720 | ||
766 | rc = unmap_and_move(get_new_page, private, | 721 | rc = unmap_and_move(get_new_page, private, |
767 | page, pass > 2); | 722 | page, pass > 2, offlining); |
768 | 723 | ||
769 | switch(rc) { | 724 | switch(rc) { |
770 | case -ENOMEM: | 725 | case -ENOMEM: |
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
860 | if (!page) | 815 | if (!page) |
861 | goto set_status; | 816 | goto set_status; |
862 | 817 | ||
863 | if (PageReserved(page)) /* Check for zero page */ | 818 | /* Use PageReserved to check for zero page */ |
819 | if (PageReserved(page) || PageKsm(page)) | ||
864 | goto put_and_set; | 820 | goto put_and_set; |
865 | 821 | ||
866 | pp->page = page; | 822 | pp->page = page; |
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
878 | goto put_and_set; | 834 | goto put_and_set; |
879 | 835 | ||
880 | err = isolate_lru_page(page); | 836 | err = isolate_lru_page(page); |
881 | if (!err) | 837 | if (!err) { |
882 | list_add_tail(&page->lru, &pagelist); | 838 | list_add_tail(&page->lru, &pagelist); |
839 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
840 | page_is_file_cache(page)); | ||
841 | } | ||
883 | put_and_set: | 842 | put_and_set: |
884 | /* | 843 | /* |
885 | * Either remove the duplicate refcount from | 844 | * Either remove the duplicate refcount from |
@@ -894,7 +853,7 @@ set_status: | |||
894 | err = 0; | 853 | err = 0; |
895 | if (!list_empty(&pagelist)) | 854 | if (!list_empty(&pagelist)) |
896 | err = migrate_pages(&pagelist, new_page_node, | 855 | err = migrate_pages(&pagelist, new_page_node, |
897 | (unsigned long)pm); | 856 | (unsigned long)pm, 0); |
898 | 857 | ||
899 | up_read(&mm->mmap_sem); | 858 | up_read(&mm->mmap_sem); |
900 | return err; | 859 | return err; |
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1015 | 974 | ||
1016 | err = -ENOENT; | 975 | err = -ENOENT; |
1017 | /* Use PageReserved to check for zero page */ | 976 | /* Use PageReserved to check for zero page */ |
1018 | if (!page || PageReserved(page)) | 977 | if (!page || PageReserved(page) || PageKsm(page)) |
1019 | goto set_status; | 978 | goto set_status; |
1020 | 979 | ||
1021 | err = page_to_nid(page); | 980 | err = page_to_nid(page); |
diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..7a3436ef39eb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/swapops.h> | 16 | #include <linux/swapops.h> |
17 | #include <linux/hugetlb.h> | ||
17 | 18 | ||
18 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag | |||
72 | if (!vma || addr < vma->vm_start) | 73 | if (!vma || addr < vma->vm_start) |
73 | return -ENOMEM; | 74 | return -ENOMEM; |
74 | 75 | ||
76 | #ifdef CONFIG_HUGETLB_PAGE | ||
77 | if (is_vm_hugetlb_page(vma)) { | ||
78 | struct hstate *h; | ||
79 | unsigned long nr_huge; | ||
80 | unsigned char present; | ||
81 | |||
82 | i = 0; | ||
83 | nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); | ||
84 | h = hstate_vma(vma); | ||
85 | nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) | ||
86 | - (addr >> huge_page_shift(h)) + 1; | ||
87 | nr_huge = min(nr_huge, | ||
88 | (vma->vm_end - addr) >> huge_page_shift(h)); | ||
89 | while (1) { | ||
90 | /* hugepage always in RAM for now, | ||
91 | * but generally it needs to be checked */ | ||
92 | ptep = huge_pte_offset(current->mm, | ||
93 | addr & huge_page_mask(h)); | ||
94 | present = !!(ptep && | ||
95 | !huge_pte_none(huge_ptep_get(ptep))); | ||
96 | while (1) { | ||
97 | vec[i++] = present; | ||
98 | addr += PAGE_SIZE; | ||
99 | /* reach buffer limit */ | ||
100 | if (i == nr) | ||
101 | return nr; | ||
102 | /* check hugepage border */ | ||
103 | if (!((addr & ~huge_page_mask(h)) | ||
104 | >> PAGE_SHIFT)) | ||
105 | break; | ||
106 | } | ||
107 | } | ||
108 | return nr; | ||
109 | } | ||
110 | #endif | ||
111 | |||
75 | /* | 112 | /* |
76 | * Calculate how many pages there are left in the last level of the | 113 | * Calculate how many pages there are left in the last level of the |
77 | * PTE array for our address. | 114 | * PTE array for our address. |
diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..2b8335a89400 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) | |||
88 | } | 88 | } |
89 | } | 89 | } |
90 | 90 | ||
91 | /* | 91 | /** |
92 | * called from munlock()/munmap() path with page supposedly on the LRU. | 92 | * munlock_vma_page - munlock a vma page |
93 | * @page: page to be munlocked | ||
93 | * | 94 | * |
94 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | 95 | * called from munlock()/munmap() path with page supposedly on the LRU. |
95 | * [in try_to_munlock()] and then attempt to isolate the page. We must | 96 | * When we munlock a page, because the vma where we found the page is being |
96 | * isolate the page to keep others from messing with its unevictable | 97 | * munlock()ed or munmap()ed, we want to check whether other vmas hold the |
97 | * and mlocked state while trying to munlock. However, we pre-clear the | 98 | * page locked so that we can leave it on the unevictable lru list and not |
98 | * mlocked state anyway as we might lose the isolation race and we might | 99 | * bother vmscan with it. However, to walk the page's rmap list in |
99 | * not get another chance to clear PageMlocked. If we successfully | 100 | * try_to_munlock() we must isolate the page from the LRU. If some other |
100 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | 101 | * task has removed the page from the LRU, we won't be able to do that. |
101 | * mapping the page, it will restore the PageMlocked state, unless the page | 102 | * So we clear the PageMlocked as we might not get another chance. If we |
102 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | 103 | * can't isolate the page, we leave it for putback_lru_page() and vmscan |
103 | * perhaps redundantly. | 104 | * [page_referenced()/try_to_unmap()] to deal with. |
104 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
105 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
106 | * either of which will restore the PageMlocked state by calling | ||
107 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
108 | */ | 105 | */ |
109 | static void munlock_vma_page(struct page *page) | 106 | void munlock_vma_page(struct page *page) |
110 | { | 107 | { |
111 | BUG_ON(!PageLocked(page)); | 108 | BUG_ON(!PageLocked(page)); |
112 | 109 | ||
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) | |||
117 | /* | 114 | /* |
118 | * did try_to_munlock() succeed or punt? | 115 | * did try_to_munlock() succeed or punt? |
119 | */ | 116 | */ |
120 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | 117 | if (ret != SWAP_MLOCK) |
121 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 118 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
122 | 119 | ||
123 | putback_lru_page(page); | 120 | putback_lru_page(page); |
124 | } else { | 121 | } else { |
125 | /* | 122 | /* |
126 | * We lost the race. let try_to_unmap() deal | 123 | * Some other task has removed the page from the LRU. |
127 | * with it. At least we get the page state and | 124 | * putback_lru_page() will take care of removing the |
128 | * mlock stats right. However, page is still on | 125 | * page from the unevictable list, if necessary. |
129 | * the noreclaim list. We'll fix that up when | 126 | * vmscan [page_referenced()] will move the page back |
130 | * the page is eventually freed or we scan the | 127 | * to the unevictable list if some other vma has it |
131 | * noreclaim list. | 128 | * mlocked. |
132 | */ | 129 | */ |
133 | if (PageUnevictable(page)) | 130 | if (PageUnevictable(page)) |
134 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 131 | count_vm_event(UNEVICTABLE_PGSTRANDED); |
@@ -931,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
931 | if (!(flags & MAP_FIXED)) | 931 | if (!(flags & MAP_FIXED)) |
932 | addr = round_hint_to_min(addr); | 932 | addr = round_hint_to_min(addr); |
933 | 933 | ||
934 | error = arch_mmap_check(addr, len, flags); | ||
935 | if (error) | ||
936 | return error; | ||
937 | |||
938 | /* Careful about overflows.. */ | 934 | /* Careful about overflows.. */ |
939 | len = PAGE_ALIGN(len); | 935 | len = PAGE_ALIGN(len); |
940 | if (!len || len > TASK_SIZE) | 936 | if (!len) |
941 | return -ENOMEM; | 937 | return -ENOMEM; |
942 | 938 | ||
943 | /* offset overflow? */ | 939 | /* offset overflow? */ |
@@ -948,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
948 | if (mm->map_count > sysctl_max_map_count) | 944 | if (mm->map_count > sysctl_max_map_count) |
949 | return -ENOMEM; | 945 | return -ENOMEM; |
950 | 946 | ||
951 | if (flags & MAP_HUGETLB) { | ||
952 | struct user_struct *user = NULL; | ||
953 | if (file) | ||
954 | return -EINVAL; | ||
955 | |||
956 | /* | ||
957 | * VM_NORESERVE is used because the reservations will be | ||
958 | * taken when vm_ops->mmap() is called | ||
959 | * A dummy user value is used because we are not locking | ||
960 | * memory so no accounting is necessary | ||
961 | */ | ||
962 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
963 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
964 | &user, HUGETLB_ANONHUGE_INODE); | ||
965 | if (IS_ERR(file)) | ||
966 | return PTR_ERR(file); | ||
967 | } | ||
968 | |||
969 | /* Obtain the address to map to. we verify (or select) it and ensure | 947 | /* Obtain the address to map to. we verify (or select) it and ensure |
970 | * that it represents a valid section of the address space. | 948 | * that it represents a valid section of the address space. |
971 | */ | 949 | */ |
@@ -1220,8 +1198,20 @@ munmap_back: | |||
1220 | goto free_vma; | 1198 | goto free_vma; |
1221 | } | 1199 | } |
1222 | 1200 | ||
1223 | if (vma_wants_writenotify(vma)) | 1201 | if (vma_wants_writenotify(vma)) { |
1202 | pgprot_t pprot = vma->vm_page_prot; | ||
1203 | |||
1204 | /* Can vma->vm_page_prot have changed?? | ||
1205 | * | ||
1206 | * Answer: Yes, drivers may have changed it in their | ||
1207 | * f_op->mmap method. | ||
1208 | * | ||
1209 | * Ensures that vmas marked as uncached stay that way. | ||
1210 | */ | ||
1224 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1211 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1212 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) | ||
1213 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
1214 | } | ||
1225 | 1215 | ||
1226 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1216 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1227 | file = vma->vm_file; | 1217 | file = vma->vm_file; |
@@ -1455,6 +1445,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1455 | unsigned long (*get_area)(struct file *, unsigned long, | 1445 | unsigned long (*get_area)(struct file *, unsigned long, |
1456 | unsigned long, unsigned long, unsigned long); | 1446 | unsigned long, unsigned long, unsigned long); |
1457 | 1447 | ||
1448 | unsigned long error = arch_mmap_check(addr, len, flags); | ||
1449 | if (error) | ||
1450 | return error; | ||
1451 | |||
1452 | /* Careful about overflows.. */ | ||
1453 | if (len > TASK_SIZE) | ||
1454 | return -ENOMEM; | ||
1455 | |||
1458 | get_area = current->mm->get_unmapped_area; | 1456 | get_area = current->mm->get_unmapped_area; |
1459 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1457 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1460 | get_area = file->f_op->get_unmapped_area; | 1458 | get_area = file->f_op->get_unmapped_area; |
@@ -1825,10 +1823,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1825 | } | 1823 | } |
1826 | 1824 | ||
1827 | /* | 1825 | /* |
1828 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1826 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
1829 | * either for the first part or the tail. | 1827 | * munmap path where it doesn't make sense to fail. |
1830 | */ | 1828 | */ |
1831 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1829 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
1832 | unsigned long addr, int new_below) | 1830 | unsigned long addr, int new_below) |
1833 | { | 1831 | { |
1834 | struct mempolicy *pol; | 1832 | struct mempolicy *pol; |
@@ -1838,9 +1836,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1838 | ~(huge_page_mask(hstate_vma(vma))))) | 1836 | ~(huge_page_mask(hstate_vma(vma))))) |
1839 | return -EINVAL; | 1837 | return -EINVAL; |
1840 | 1838 | ||
1841 | if (mm->map_count >= sysctl_max_map_count) | ||
1842 | return -ENOMEM; | ||
1843 | |||
1844 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1839 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1845 | if (!new) | 1840 | if (!new) |
1846 | return -ENOMEM; | 1841 | return -ENOMEM; |
@@ -1880,6 +1875,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1880 | return 0; | 1875 | return 0; |
1881 | } | 1876 | } |
1882 | 1877 | ||
1878 | /* | ||
1879 | * Split a vma into two pieces at address 'addr', a new vma is allocated | ||
1880 | * either for the first part or the tail. | ||
1881 | */ | ||
1882 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1883 | unsigned long addr, int new_below) | ||
1884 | { | ||
1885 | if (mm->map_count >= sysctl_max_map_count) | ||
1886 | return -ENOMEM; | ||
1887 | |||
1888 | return __split_vma(mm, vma, addr, new_below); | ||
1889 | } | ||
1890 | |||
1883 | /* Munmap is split into 2 main parts -- this part which finds | 1891 | /* Munmap is split into 2 main parts -- this part which finds |
1884 | * what needs doing, and the areas themselves, which do the | 1892 | * what needs doing, and the areas themselves, which do the |
1885 | * work. This now handles partial unmappings. | 1893 | * work. This now handles partial unmappings. |
@@ -1915,7 +1923,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1915 | * places tmp vma above, and higher split_vma places tmp vma below. | 1923 | * places tmp vma above, and higher split_vma places tmp vma below. |
1916 | */ | 1924 | */ |
1917 | if (start > vma->vm_start) { | 1925 | if (start > vma->vm_start) { |
1918 | int error = split_vma(mm, vma, start, 0); | 1926 | int error; |
1927 | |||
1928 | /* | ||
1929 | * Make sure that map_count on return from munmap() will | ||
1930 | * not exceed its limit; but let map_count go just above | ||
1931 | * its limit temporarily, to help free resources as expected. | ||
1932 | */ | ||
1933 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) | ||
1934 | return -ENOMEM; | ||
1935 | |||
1936 | error = __split_vma(mm, vma, start, 0); | ||
1919 | if (error) | 1937 | if (error) |
1920 | return error; | 1938 | return error; |
1921 | prev = vma; | 1939 | prev = vma; |
@@ -1924,7 +1942,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1924 | /* Does it split the last one? */ | 1942 | /* Does it split the last one? */ |
1925 | last = find_vma(mm, end); | 1943 | last = find_vma(mm, end); |
1926 | if (last && end > last->vm_start) { | 1944 | if (last && end > last->vm_start) { |
1927 | int error = split_vma(mm, last, end, 1); | 1945 | int error = __split_vma(mm, last, end, 1); |
1928 | if (error) | 1946 | if (error) |
1929 | return error; | 1947 | return error; |
1930 | } | 1948 | } |
@@ -1999,20 +2017,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1999 | if (!len) | 2017 | if (!len) |
2000 | return addr; | 2018 | return addr; |
2001 | 2019 | ||
2002 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | ||
2003 | return -EINVAL; | ||
2004 | |||
2005 | if (is_hugepage_only_range(mm, addr, len)) | ||
2006 | return -EINVAL; | ||
2007 | |||
2008 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | 2020 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); |
2009 | if (error) | 2021 | if (error) |
2010 | return error; | 2022 | return error; |
2011 | 2023 | ||
2012 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2024 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2013 | 2025 | ||
2014 | error = arch_mmap_check(addr, len, flags); | 2026 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
2015 | if (error) | 2027 | if (error & ~PAGE_MASK) |
2016 | return error; | 2028 | return error; |
2017 | 2029 | ||
2018 | /* | 2030 | /* |
diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..845190898d59 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
261 | return new_addr; | 261 | return new_addr; |
262 | } | 262 | } |
263 | 263 | ||
264 | static struct vm_area_struct *vma_to_resize(unsigned long addr, | ||
265 | unsigned long old_len, unsigned long new_len, unsigned long *p) | ||
266 | { | ||
267 | struct mm_struct *mm = current->mm; | ||
268 | struct vm_area_struct *vma = find_vma(mm, addr); | ||
269 | |||
270 | if (!vma || vma->vm_start > addr) | ||
271 | goto Efault; | ||
272 | |||
273 | if (is_vm_hugetlb_page(vma)) | ||
274 | goto Einval; | ||
275 | |||
276 | /* We can't remap across vm area boundaries */ | ||
277 | if (old_len > vma->vm_end - addr) | ||
278 | goto Efault; | ||
279 | |||
280 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
281 | if (new_len > old_len) | ||
282 | goto Efault; | ||
283 | } | ||
284 | |||
285 | if (vma->vm_flags & VM_LOCKED) { | ||
286 | unsigned long locked, lock_limit; | ||
287 | locked = mm->locked_vm << PAGE_SHIFT; | ||
288 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
289 | locked += new_len - old_len; | ||
290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
291 | goto Eagain; | ||
292 | } | ||
293 | |||
294 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | ||
295 | goto Enomem; | ||
296 | |||
297 | if (vma->vm_flags & VM_ACCOUNT) { | ||
298 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | ||
299 | if (security_vm_enough_memory(charged)) | ||
300 | goto Efault; | ||
301 | *p = charged; | ||
302 | } | ||
303 | |||
304 | return vma; | ||
305 | |||
306 | Efault: /* very odd choice for most of the cases, but... */ | ||
307 | return ERR_PTR(-EFAULT); | ||
308 | Einval: | ||
309 | return ERR_PTR(-EINVAL); | ||
310 | Enomem: | ||
311 | return ERR_PTR(-ENOMEM); | ||
312 | Eagain: | ||
313 | return ERR_PTR(-EAGAIN); | ||
314 | } | ||
315 | |||
316 | static unsigned long mremap_to(unsigned long addr, | ||
317 | unsigned long old_len, unsigned long new_addr, | ||
318 | unsigned long new_len) | ||
319 | { | ||
320 | struct mm_struct *mm = current->mm; | ||
321 | struct vm_area_struct *vma; | ||
322 | unsigned long ret = -EINVAL; | ||
323 | unsigned long charged = 0; | ||
324 | unsigned long map_flags; | ||
325 | |||
326 | if (new_addr & ~PAGE_MASK) | ||
327 | goto out; | ||
328 | |||
329 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
330 | goto out; | ||
331 | |||
332 | /* Check if the location we're moving into overlaps the | ||
333 | * old location at all, and fail if it does. | ||
334 | */ | ||
335 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
336 | goto out; | ||
337 | |||
338 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
339 | goto out; | ||
340 | |||
341 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
342 | if (ret) | ||
343 | goto out; | ||
344 | |||
345 | ret = do_munmap(mm, new_addr, new_len); | ||
346 | if (ret) | ||
347 | goto out; | ||
348 | |||
349 | if (old_len >= new_len) { | ||
350 | ret = do_munmap(mm, addr+new_len, old_len - new_len); | ||
351 | if (ret && old_len != new_len) | ||
352 | goto out; | ||
353 | old_len = new_len; | ||
354 | } | ||
355 | |||
356 | vma = vma_to_resize(addr, old_len, new_len, &charged); | ||
357 | if (IS_ERR(vma)) { | ||
358 | ret = PTR_ERR(vma); | ||
359 | goto out; | ||
360 | } | ||
361 | |||
362 | map_flags = MAP_FIXED; | ||
363 | if (vma->vm_flags & VM_MAYSHARE) | ||
364 | map_flags |= MAP_SHARED; | ||
365 | |||
366 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + | ||
367 | ((addr - vma->vm_start) >> PAGE_SHIFT), | ||
368 | map_flags); | ||
369 | if (ret & ~PAGE_MASK) | ||
370 | goto out1; | ||
371 | |||
372 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | ||
373 | if (!(ret & ~PAGE_MASK)) | ||
374 | goto out; | ||
375 | out1: | ||
376 | vm_unacct_memory(charged); | ||
377 | |||
378 | out: | ||
379 | return ret; | ||
380 | } | ||
381 | |||
382 | static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | ||
383 | { | ||
384 | unsigned long end = vma->vm_end + delta; | ||
385 | if (end < vma->vm_end) /* overflow */ | ||
386 | return 0; | ||
387 | if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ | ||
388 | return 0; | ||
389 | if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, | ||
390 | 0, MAP_FIXED) & ~PAGE_MASK) | ||
391 | return 0; | ||
392 | return 1; | ||
393 | } | ||
394 | |||
264 | /* | 395 | /* |
265 | * Expand (or shrink) an existing mapping, potentially moving it at the | 396 | * Expand (or shrink) an existing mapping, potentially moving it at the |
266 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 397 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr, | |||
294 | if (!new_len) | 425 | if (!new_len) |
295 | goto out; | 426 | goto out; |
296 | 427 | ||
297 | /* new_addr is only valid if MREMAP_FIXED is specified */ | ||
298 | if (flags & MREMAP_FIXED) { | 428 | if (flags & MREMAP_FIXED) { |
299 | if (new_addr & ~PAGE_MASK) | 429 | if (flags & MREMAP_MAYMOVE) |
300 | goto out; | 430 | ret = mremap_to(addr, old_len, new_addr, new_len); |
301 | if (!(flags & MREMAP_MAYMOVE)) | 431 | goto out; |
302 | goto out; | ||
303 | |||
304 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
305 | goto out; | ||
306 | |||
307 | /* Check if the location we're moving into overlaps the | ||
308 | * old location at all, and fail if it does. | ||
309 | */ | ||
310 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
311 | goto out; | ||
312 | |||
313 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
314 | goto out; | ||
315 | |||
316 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
317 | if (ret) | ||
318 | goto out; | ||
319 | |||
320 | ret = do_munmap(mm, new_addr, new_len); | ||
321 | if (ret) | ||
322 | goto out; | ||
323 | } | 432 | } |
324 | 433 | ||
325 | /* | 434 | /* |
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr, | |||
332 | if (ret && old_len != new_len) | 441 | if (ret && old_len != new_len) |
333 | goto out; | 442 | goto out; |
334 | ret = addr; | 443 | ret = addr; |
335 | if (!(flags & MREMAP_FIXED) || (new_addr == addr)) | 444 | goto out; |
336 | goto out; | ||
337 | old_len = new_len; | ||
338 | } | 445 | } |
339 | 446 | ||
340 | /* | 447 | /* |
341 | * Ok, we need to grow.. or relocate. | 448 | * Ok, we need to grow.. |
342 | */ | 449 | */ |
343 | ret = -EFAULT; | 450 | vma = vma_to_resize(addr, old_len, new_len, &charged); |
344 | vma = find_vma(mm, addr); | 451 | if (IS_ERR(vma)) { |
345 | if (!vma || vma->vm_start > addr) | 452 | ret = PTR_ERR(vma); |
346 | goto out; | ||
347 | if (is_vm_hugetlb_page(vma)) { | ||
348 | ret = -EINVAL; | ||
349 | goto out; | ||
350 | } | ||
351 | /* We can't remap across vm area boundaries */ | ||
352 | if (old_len > vma->vm_end - addr) | ||
353 | goto out; | ||
354 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
355 | if (new_len > old_len) | ||
356 | goto out; | ||
357 | } | ||
358 | if (vma->vm_flags & VM_LOCKED) { | ||
359 | unsigned long locked, lock_limit; | ||
360 | locked = mm->locked_vm << PAGE_SHIFT; | ||
361 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
362 | locked += new_len - old_len; | ||
363 | ret = -EAGAIN; | ||
364 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
365 | goto out; | ||
366 | } | ||
367 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { | ||
368 | ret = -ENOMEM; | ||
369 | goto out; | 453 | goto out; |
370 | } | 454 | } |
371 | 455 | ||
372 | if (vma->vm_flags & VM_ACCOUNT) { | ||
373 | charged = (new_len - old_len) >> PAGE_SHIFT; | ||
374 | if (security_vm_enough_memory(charged)) | ||
375 | goto out_nc; | ||
376 | } | ||
377 | |||
378 | /* old_len exactly to the end of the area.. | 456 | /* old_len exactly to the end of the area.. |
379 | * And we're not relocating the area. | ||
380 | */ | 457 | */ |
381 | if (old_len == vma->vm_end - addr && | 458 | if (old_len == vma->vm_end - addr) { |
382 | !((flags & MREMAP_FIXED) && (addr != new_addr)) && | ||
383 | (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { | ||
384 | unsigned long max_addr = TASK_SIZE; | ||
385 | if (vma->vm_next) | ||
386 | max_addr = vma->vm_next->vm_start; | ||
387 | /* can we just expand the current mapping? */ | 459 | /* can we just expand the current mapping? */ |
388 | if (max_addr - addr >= new_len) { | 460 | if (vma_expandable(vma, new_len - old_len)) { |
389 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
390 | 462 | ||
391 | vma_adjust(vma, vma->vm_start, | 463 | vma_adjust(vma, vma->vm_start, |
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr, | |||
409 | */ | 481 | */ |
410 | ret = -ENOMEM; | 482 | ret = -ENOMEM; |
411 | if (flags & MREMAP_MAYMOVE) { | 483 | if (flags & MREMAP_MAYMOVE) { |
412 | if (!(flags & MREMAP_FIXED)) { | 484 | unsigned long map_flags = 0; |
413 | unsigned long map_flags = 0; | 485 | if (vma->vm_flags & VM_MAYSHARE) |
414 | if (vma->vm_flags & VM_MAYSHARE) | 486 | map_flags |= MAP_SHARED; |
415 | map_flags |= MAP_SHARED; | 487 | |
416 | 488 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | |
417 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | 489 | vma->vm_pgoff + |
418 | vma->vm_pgoff, map_flags); | 490 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
419 | if (new_addr & ~PAGE_MASK) { | 491 | map_flags); |
420 | ret = new_addr; | 492 | if (new_addr & ~PAGE_MASK) { |
421 | goto out; | 493 | ret = new_addr; |
422 | } | 494 | goto out; |
423 | |||
424 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
425 | if (ret) | ||
426 | goto out; | ||
427 | } | 495 | } |
496 | |||
497 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
498 | if (ret) | ||
499 | goto out; | ||
428 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 500 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
429 | } | 501 | } |
430 | out: | 502 | out: |
431 | if (ret & ~PAGE_MASK) | 503 | if (ret & ~PAGE_MASK) |
432 | vm_unacct_memory(charged); | 504 | vm_unacct_memory(charged); |
433 | out_nc: | ||
434 | return ret; | 505 | return ret; |
435 | } | 506 | } |
436 | 507 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index 9876fa0c3ad3..8687973462bb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1143 | if (ret < rlen) | 1143 | if (ret < rlen) |
1144 | memset(base + ret, 0, rlen - ret); | 1144 | memset(base + ret, 0, rlen - ret); |
1145 | 1145 | ||
1146 | } else { | ||
1147 | /* if it's an anonymous mapping, then just clear it */ | ||
1148 | memset(base, 0, rlen); | ||
1149 | } | 1146 | } |
1150 | 1147 | ||
1151 | return 0; | 1148 | return 0; |
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1343 | goto error_just_free; | 1340 | goto error_just_free; |
1344 | add_nommu_region(region); | 1341 | add_nommu_region(region); |
1345 | 1342 | ||
1343 | /* clear anonymous mappings that don't ask for uninitialized data */ | ||
1344 | if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) | ||
1345 | memset((void *)region->vm_start, 0, | ||
1346 | region->vm_end - region->vm_start); | ||
1347 | |||
1346 | /* okay... we have a mapping; now we have to register it */ | 1348 | /* okay... we have a mapping; now we have to register it */ |
1347 | result = vma->vm_start; | 1349 | result = vma->vm_start; |
1348 | 1350 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..f52481b1c1e5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
196 | /* | 196 | /* |
197 | * Determine the type of allocation constraint. | 197 | * Determine the type of allocation constraint. |
198 | */ | 198 | */ |
199 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
200 | gfp_t gfp_mask) | ||
201 | { | ||
202 | #ifdef CONFIG_NUMA | 199 | #ifdef CONFIG_NUMA |
200 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
201 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
202 | { | ||
203 | struct zone *zone; | 203 | struct zone *zone; |
204 | struct zoneref *z; | 204 | struct zoneref *z; |
205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
206 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; | ||
207 | 206 | ||
208 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 207 | /* |
209 | if (cpuset_zone_allowed_softwall(zone, gfp_mask)) | 208 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid |
210 | node_clear(zone_to_nid(zone), nodes); | 209 | * to kill current.We have to random task kill in this case. |
211 | else | 210 | * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. |
212 | return CONSTRAINT_CPUSET; | 211 | */ |
212 | if (gfp_mask & __GFP_THISNODE) | ||
213 | return CONSTRAINT_NONE; | ||
213 | 214 | ||
214 | if (!nodes_empty(nodes)) | 215 | /* |
216 | * The nodemask here is a nodemask passed to alloc_pages(). Now, | ||
217 | * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy | ||
218 | * feature. mempolicy is an only user of nodemask here. | ||
219 | * check mempolicy's nodemask contains all N_HIGH_MEMORY | ||
220 | */ | ||
221 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) | ||
215 | return CONSTRAINT_MEMORY_POLICY; | 222 | return CONSTRAINT_MEMORY_POLICY; |
216 | #endif | ||
217 | 223 | ||
224 | /* Check this allocation failure is caused by cpuset's wall function */ | ||
225 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
226 | high_zoneidx, nodemask) | ||
227 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) | ||
228 | return CONSTRAINT_CPUSET; | ||
229 | |||
230 | return CONSTRAINT_NONE; | ||
231 | } | ||
232 | #else | ||
233 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
234 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
235 | { | ||
218 | return CONSTRAINT_NONE; | 236 | return CONSTRAINT_NONE; |
219 | } | 237 | } |
238 | #endif | ||
220 | 239 | ||
221 | /* | 240 | /* |
222 | * Simple selection loop. We chose the process with the highest | 241 | * Simple selection loop. We chose the process with the highest |
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
337 | } while_each_thread(g, p); | 356 | } while_each_thread(g, p); |
338 | } | 357 | } |
339 | 358 | ||
359 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | ||
360 | struct mem_cgroup *mem) | ||
361 | { | ||
362 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | ||
363 | "oom_adj=%d\n", | ||
364 | current->comm, gfp_mask, order, current->signal->oom_adj); | ||
365 | task_lock(current); | ||
366 | cpuset_print_task_mems_allowed(current); | ||
367 | task_unlock(current); | ||
368 | dump_stack(); | ||
369 | mem_cgroup_print_oom_info(mem, p); | ||
370 | show_mem(); | ||
371 | if (sysctl_oom_dump_tasks) | ||
372 | dump_tasks(mem); | ||
373 | } | ||
374 | |||
375 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
376 | |||
340 | /* | 377 | /* |
341 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 378 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
342 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 379 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
350 | return; | 387 | return; |
351 | } | 388 | } |
352 | 389 | ||
390 | task_lock(p); | ||
353 | if (!p->mm) { | 391 | if (!p->mm) { |
354 | WARN_ON(1); | 392 | WARN_ON(1); |
355 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 393 | printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", |
394 | task_pid_nr(p), p->comm); | ||
395 | task_unlock(p); | ||
356 | return; | 396 | return; |
357 | } | 397 | } |
358 | 398 | ||
359 | if (verbose) | 399 | if (verbose) |
360 | printk(KERN_ERR "Killed process %d (%s)\n", | 400 | printk(KERN_ERR "Killed process %d (%s) " |
361 | task_pid_nr(p), p->comm); | 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
402 | task_pid_nr(p), p->comm, | ||
403 | K(p->mm->total_vm), | ||
404 | K(get_mm_counter(p->mm, anon_rss)), | ||
405 | K(get_mm_counter(p->mm, file_rss))); | ||
406 | task_unlock(p); | ||
362 | 407 | ||
363 | /* | 408 | /* |
364 | * We give our sacrificial lamb high priority and access to | 409 | * We give our sacrificial lamb high priority and access to |
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
395 | { | 440 | { |
396 | struct task_struct *c; | 441 | struct task_struct *c; |
397 | 442 | ||
398 | if (printk_ratelimit()) { | 443 | if (printk_ratelimit()) |
399 | printk(KERN_WARNING "%s invoked oom-killer: " | 444 | dump_header(p, gfp_mask, order, mem); |
400 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
401 | current->comm, gfp_mask, order, | ||
402 | current->signal->oom_adj); | ||
403 | task_lock(current); | ||
404 | cpuset_print_task_mems_allowed(current); | ||
405 | task_unlock(current); | ||
406 | dump_stack(); | ||
407 | mem_cgroup_print_oom_info(mem, current); | ||
408 | show_mem(); | ||
409 | if (sysctl_oom_dump_tasks) | ||
410 | dump_tasks(mem); | ||
411 | } | ||
412 | 445 | ||
413 | /* | 446 | /* |
414 | * If the task is already exiting, don't alarm the sysadmin or kill | 447 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -544,6 +577,7 @@ retry: | |||
544 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 577 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
545 | if (!p) { | 578 | if (!p) { |
546 | read_unlock(&tasklist_lock); | 579 | read_unlock(&tasklist_lock); |
580 | dump_header(NULL, gfp_mask, order, NULL); | ||
547 | panic("Out of memory and no killable processes...\n"); | 581 | panic("Out of memory and no killable processes...\n"); |
548 | } | 582 | } |
549 | 583 | ||
@@ -599,7 +633,8 @@ rest_and_return: | |||
599 | * OR try to be smart about which process to kill. Note that we | 633 | * OR try to be smart about which process to kill. Note that we |
600 | * don't have to be perfect here, we just have to be good. | 634 | * don't have to be perfect here, we just have to be good. |
601 | */ | 635 | */ |
602 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 636 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
637 | int order, nodemask_t *nodemask) | ||
603 | { | 638 | { |
604 | unsigned long freed = 0; | 639 | unsigned long freed = 0; |
605 | enum oom_constraint constraint; | 640 | enum oom_constraint constraint; |
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
609 | /* Got some memory back in the last second. */ | 644 | /* Got some memory back in the last second. */ |
610 | return; | 645 | return; |
611 | 646 | ||
612 | if (sysctl_panic_on_oom == 2) | 647 | if (sysctl_panic_on_oom == 2) { |
648 | dump_header(NULL, gfp_mask, order, NULL); | ||
613 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 649 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
650 | } | ||
614 | 651 | ||
615 | /* | 652 | /* |
616 | * Check if there were limitations on the allocation (only relevant for | 653 | * Check if there were limitations on the allocation (only relevant for |
617 | * NUMA) that may require different handling. | 654 | * NUMA) that may require different handling. |
618 | */ | 655 | */ |
619 | constraint = constrained_alloc(zonelist, gfp_mask); | 656 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask); |
620 | read_lock(&tasklist_lock); | 657 | read_lock(&tasklist_lock); |
621 | 658 | ||
622 | switch (constraint) { | 659 | switch (constraint) { |
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
626 | break; | 663 | break; |
627 | 664 | ||
628 | case CONSTRAINT_NONE: | 665 | case CONSTRAINT_NONE: |
629 | if (sysctl_panic_on_oom) | 666 | if (sysctl_panic_on_oom) { |
667 | dump_header(NULL, gfp_mask, order, NULL); | ||
630 | panic("out of memory. panic_on_oom is selected\n"); | 668 | panic("out of memory. panic_on_oom is selected\n"); |
669 | } | ||
631 | /* Fall-through */ | 670 | /* Fall-through */ |
632 | case CONSTRAINT_CPUSET: | 671 | case CONSTRAINT_CPUSET: |
633 | __out_of_memory(gfp_mask, order); | 672 | __out_of_memory(gfp_mask, order); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 873c86308b4e..4e869657cb51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -486,7 +486,6 @@ static inline void __free_one_page(struct page *page, | |||
486 | zone->free_area[order].nr_free++; | 486 | zone->free_area[order].nr_free++; |
487 | } | 487 | } |
488 | 488 | ||
489 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
490 | /* | 489 | /* |
491 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | 490 | * free_page_mlock() -- clean up attempts to free and mlocked() page. |
492 | * Page should not be on lru, so no need to fix that up. | 491 | * Page should not be on lru, so no need to fix that up. |
@@ -497,9 +496,6 @@ static inline void free_page_mlock(struct page *page) | |||
497 | __dec_zone_page_state(page, NR_MLOCK); | 496 | __dec_zone_page_state(page, NR_MLOCK); |
498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 497 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
499 | } | 498 | } |
500 | #else | ||
501 | static void free_page_mlock(struct page *page) { } | ||
502 | #endif | ||
503 | 499 | ||
504 | static inline int free_pages_check(struct page *page) | 500 | static inline int free_pages_check(struct page *page) |
505 | { | 501 | { |
@@ -1658,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1658 | if (page) | 1654 | if (page) |
1659 | goto out; | 1655 | goto out; |
1660 | 1656 | ||
1661 | /* The OOM killer will not help higher order allocs */ | 1657 | if (!(gfp_mask & __GFP_NOFAIL)) { |
1662 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | 1658 | /* The OOM killer will not help higher order allocs */ |
1663 | goto out; | 1659 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1664 | 1660 | goto out; | |
1661 | /* | ||
1662 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
1663 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
1664 | * The caller should handle page allocation failure by itself if | ||
1665 | * it specifies __GFP_THISNODE. | ||
1666 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
1667 | */ | ||
1668 | if (gfp_mask & __GFP_THISNODE) | ||
1669 | goto out; | ||
1670 | } | ||
1665 | /* Exhausted what can be done so it's blamo time */ | 1671 | /* Exhausted what can be done so it's blamo time */ |
1666 | out_of_memory(zonelist, gfp_mask, order); | 1672 | out_of_memory(zonelist, gfp_mask, order, nodemask); |
1667 | 1673 | ||
1668 | out: | 1674 | out: |
1669 | clear_zonelist_oom(zonelist, gfp_mask); | 1675 | clear_zonelist_oom(zonelist, gfp_mask); |
@@ -3127,7 +3133,7 @@ static int __cpuinit process_zones(int cpu) | |||
3127 | 3133 | ||
3128 | if (percpu_pagelist_fraction) | 3134 | if (percpu_pagelist_fraction) |
3129 | setup_pagelist_highmark(zone_pcp(zone, cpu), | 3135 | setup_pagelist_highmark(zone_pcp(zone, cpu), |
3130 | (zone->present_pages / percpu_pagelist_fraction)); | 3136 | (zone->present_pages / percpu_pagelist_fraction)); |
3131 | } | 3137 | } |
3132 | 3138 | ||
3133 | return 0; | 3139 | return 0; |
@@ -5085,3 +5091,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5085 | spin_unlock_irqrestore(&zone->lock, flags); | 5091 | spin_unlock_irqrestore(&zone->lock, flags); |
5086 | } | 5092 | } |
5087 | #endif | 5093 | #endif |
5094 | |||
5095 | #ifdef CONFIG_MEMORY_FAILURE | ||
5096 | bool is_free_buddy_page(struct page *page) | ||
5097 | { | ||
5098 | struct zone *zone = page_zone(page); | ||
5099 | unsigned long pfn = page_to_pfn(page); | ||
5100 | unsigned long flags; | ||
5101 | int order; | ||
5102 | |||
5103 | spin_lock_irqsave(&zone->lock, flags); | ||
5104 | for (order = 0; order < MAX_ORDER; order++) { | ||
5105 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
5106 | |||
5107 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
5108 | break; | ||
5109 | } | ||
5110 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5111 | |||
5112 | return order < MAX_ORDER; | ||
5113 | } | ||
5114 | #endif | ||
diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e5071de3..a19af956ee1b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -19,20 +19,15 @@ | |||
19 | #include <linux/writeback.h> | 19 | #include <linux/writeback.h> |
20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
21 | 21 | ||
22 | static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, | 22 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
23 | struct page *page, bio_end_io_t end_io) | 23 | struct page *page, bio_end_io_t end_io) |
24 | { | 24 | { |
25 | struct bio *bio; | 25 | struct bio *bio; |
26 | 26 | ||
27 | bio = bio_alloc(gfp_flags, 1); | 27 | bio = bio_alloc(gfp_flags, 1); |
28 | if (bio) { | 28 | if (bio) { |
29 | struct swap_info_struct *sis; | 29 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); |
30 | swp_entry_t entry = { .val = index, }; | 30 | bio->bi_sector <<= PAGE_SHIFT - 9; |
31 | |||
32 | sis = get_swap_info_struct(swp_type(entry)); | ||
33 | bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * | ||
34 | (PAGE_SIZE >> 9); | ||
35 | bio->bi_bdev = sis->bdev; | ||
36 | bio->bi_io_vec[0].bv_page = page; | 31 | bio->bi_io_vec[0].bv_page = page; |
37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 32 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
38 | bio->bi_io_vec[0].bv_offset = 0; | 33 | bio->bi_io_vec[0].bv_offset = 0; |
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
102 | unlock_page(page); | 97 | unlock_page(page); |
103 | goto out; | 98 | goto out; |
104 | } | 99 | } |
105 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, | 100 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
106 | end_swap_bio_write); | ||
107 | if (bio == NULL) { | 101 | if (bio == NULL) { |
108 | set_page_dirty(page); | 102 | set_page_dirty(page); |
109 | unlock_page(page); | 103 | unlock_page(page); |
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page) | |||
127 | 121 | ||
128 | VM_BUG_ON(!PageLocked(page)); | 122 | VM_BUG_ON(!PageLocked(page)); |
129 | VM_BUG_ON(PageUptodate(page)); | 123 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 124 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
131 | end_swap_bio_read); | ||
132 | if (bio == NULL) { | 125 | if (bio == NULL) { |
133 | unlock_page(page); | 126 | unlock_page(page); |
134 | ret = -ENOMEM; | 127 | ret = -ENOMEM; |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..7b47a57b6646 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/highmem.h> | 2 | #include <linux/highmem.h> |
3 | #include <linux/sched.h> | 3 | #include <linux/sched.h> |
4 | #include <linux/hugetlb.h> | ||
4 | 5 | ||
5 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 6 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
6 | struct mm_walk *walk) | 7 | struct mm_walk *walk) |
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
107 | pgd_t *pgd; | 108 | pgd_t *pgd; |
108 | unsigned long next; | 109 | unsigned long next; |
109 | int err = 0; | 110 | int err = 0; |
111 | struct vm_area_struct *vma; | ||
110 | 112 | ||
111 | if (addr >= end) | 113 | if (addr >= end) |
112 | return err; | 114 | return err; |
@@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
117 | pgd = pgd_offset(walk->mm, addr); | 119 | pgd = pgd_offset(walk->mm, addr); |
118 | do { | 120 | do { |
119 | next = pgd_addr_end(addr, end); | 121 | next = pgd_addr_end(addr, end); |
122 | |||
123 | /* | ||
124 | * handle hugetlb vma individually because pagetable walk for | ||
125 | * the hugetlb page is dependent on the architecture and | ||
126 | * we can't handled it in the same manner as non-huge pages. | ||
127 | */ | ||
128 | vma = find_vma(walk->mm, addr); | ||
129 | #ifdef CONFIG_HUGETLB_PAGE | ||
130 | if (vma && is_vm_hugetlb_page(vma)) { | ||
131 | pte_t *pte; | ||
132 | struct hstate *hs; | ||
133 | |||
134 | if (vma->vm_end < next) | ||
135 | next = vma->vm_end; | ||
136 | hs = hstate_vma(vma); | ||
137 | pte = huge_pte_offset(walk->mm, | ||
138 | addr & huge_page_mask(hs)); | ||
139 | if (pte && !huge_pte_none(huge_ptep_get(pte)) | ||
140 | && walk->hugetlb_entry) | ||
141 | err = walk->hugetlb_entry(pte, addr, | ||
142 | next, walk); | ||
143 | if (err) | ||
144 | break; | ||
145 | continue; | ||
146 | } | ||
147 | #endif | ||
120 | if (pgd_none_or_clear_bad(pgd)) { | 148 | if (pgd_none_or_clear_bad(pgd)) { |
121 | if (walk->pte_hole) | 149 | if (walk->pte_hole) |
122 | err = walk->pte_hole(addr, next, walk); | 150 | err = walk->pte_hole(addr, next, walk); |
123 | if (err) | 151 | if (err) |
124 | break; | 152 | break; |
153 | pgd++; | ||
125 | continue; | 154 | continue; |
126 | } | 155 | } |
127 | if (walk->pgd_entry) | 156 | if (walk->pgd_entry) |
@@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
131 | err = walk_pud_range(pgd, addr, next, walk); | 160 | err = walk_pud_range(pgd, addr, next, walk); |
132 | if (err) | 161 | if (err) |
133 | break; | 162 | break; |
134 | } while (pgd++, addr = next, addr != end); | 163 | pgd++; |
164 | } while (addr = next, addr != end); | ||
135 | 165 | ||
136 | return err; | 166 | return err; |
137 | } | 167 | } |
diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..442010cc91c6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -46,8 +46,6 @@ | |||
46 | * | 46 | * |
47 | * To use this allocator, arch code should do the followings. | 47 | * To use this allocator, arch code should do the followings. |
48 | * | 48 | * |
49 | * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA | ||
50 | * | ||
51 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate | 49 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate |
52 | * regular address to percpu pointer and back if they need to be | 50 | * regular address to percpu pointer and back if they need to be |
53 | * different from the default | 51 | * different from the default |
@@ -74,6 +72,7 @@ | |||
74 | #include <asm/cacheflush.h> | 72 | #include <asm/cacheflush.h> |
75 | #include <asm/sections.h> | 73 | #include <asm/sections.h> |
76 | #include <asm/tlbflush.h> | 74 | #include <asm/tlbflush.h> |
75 | #include <asm/io.h> | ||
77 | 76 | ||
78 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
79 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
@@ -1302,6 +1301,27 @@ void free_percpu(void *ptr) | |||
1302 | } | 1301 | } |
1303 | EXPORT_SYMBOL_GPL(free_percpu); | 1302 | EXPORT_SYMBOL_GPL(free_percpu); |
1304 | 1303 | ||
1304 | /** | ||
1305 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | ||
1306 | * @addr: the address to be converted to physical address | ||
1307 | * | ||
1308 | * Given @addr which is dereferenceable address obtained via one of | ||
1309 | * percpu access macros, this function translates it into its physical | ||
1310 | * address. The caller is responsible for ensuring @addr stays valid | ||
1311 | * until this function finishes. | ||
1312 | * | ||
1313 | * RETURNS: | ||
1314 | * The physical address for @addr. | ||
1315 | */ | ||
1316 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
1317 | { | ||
1318 | if ((unsigned long)addr < VMALLOC_START || | ||
1319 | (unsigned long)addr >= VMALLOC_END) | ||
1320 | return __pa(addr); | ||
1321 | else | ||
1322 | return page_to_phys(vmalloc_to_page(addr)); | ||
1323 | } | ||
1324 | |||
1305 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | 1325 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, |
1306 | size_t reserved_size, | 1326 | size_t reserved_size, |
1307 | ssize_t *dyn_sizep) | 1327 | ssize_t *dyn_sizep) |
diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..033bc135a41f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping, | |||
547 | 547 | ||
548 | /* do read-ahead */ | 548 | /* do read-ahead */ |
549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
550 | |||
551 | #ifdef CONFIG_BLOCK | ||
552 | /* | ||
553 | * Normally the current page is !uptodate and lock_page() will be | ||
554 | * immediately called to implicitly unplug the device. However this | ||
555 | * is not always true for RAID conifgurations, where data arrives | ||
556 | * not strictly in their submission order. In this case we need to | ||
557 | * explicitly kick off the IO. | ||
558 | */ | ||
559 | if (PageUptodate(page)) | ||
560 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
561 | #endif | ||
550 | } | 562 | } |
551 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 563 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/ksm.h> | ||
52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
@@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void) | |||
67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 68 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
68 | } | 69 | } |
69 | 70 | ||
70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 71 | void anon_vma_free(struct anon_vma *anon_vma) |
71 | { | 72 | { |
72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 73 | kmem_cache_free(anon_vma_cachep, anon_vma); |
73 | } | 74 | } |
@@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
171 | list_del(&vma->anon_vma_node); | 172 | list_del(&vma->anon_vma_node); |
172 | 173 | ||
173 | /* We must garbage collect the anon_vma if it's empty */ | 174 | /* We must garbage collect the anon_vma if it's empty */ |
174 | empty = list_empty(&anon_vma->head); | 175 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
175 | spin_unlock(&anon_vma->lock); | 176 | spin_unlock(&anon_vma->lock); |
176 | 177 | ||
177 | if (empty) | 178 | if (empty) |
@@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data) | |||
183 | struct anon_vma *anon_vma = data; | 184 | struct anon_vma *anon_vma = data; |
184 | 185 | ||
185 | spin_lock_init(&anon_vma->lock); | 186 | spin_lock_init(&anon_vma->lock); |
187 | ksm_refcount_init(anon_vma); | ||
186 | INIT_LIST_HEAD(&anon_vma->head); | 188 | INIT_LIST_HEAD(&anon_vma->head); |
187 | } | 189 | } |
188 | 190 | ||
@@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
202 | unsigned long anon_mapping; | 204 | unsigned long anon_mapping; |
203 | 205 | ||
204 | rcu_read_lock(); | 206 | rcu_read_lock(); |
205 | anon_mapping = (unsigned long) page->mapping; | 207 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 208 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
207 | goto out; | 209 | goto out; |
208 | if (!page_mapped(page)) | 210 | if (!page_mapped(page)) |
209 | goto out; | 211 | goto out; |
@@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 250 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 251 | { |
250 | if (PageAnon(page)) { | 252 | if (PageAnon(page)) { |
251 | if ((void *)vma->anon_vma != | 253 | if (vma->anon_vma != page_anon_vma(page)) |
252 | (void *)page->mapping - PAGE_MAPPING_ANON) | ||
253 | return -EFAULT; | 254 | return -EFAULT; |
254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 255 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
255 | if (!vma->vm_file || | 256 | if (!vma->vm_file || |
@@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
337 | * Subfunctions of page_referenced: page_referenced_one called | 338 | * Subfunctions of page_referenced: page_referenced_one called |
338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 339 | * repeatedly from either page_referenced_anon or page_referenced_file. |
339 | */ | 340 | */ |
340 | static int page_referenced_one(struct page *page, | 341 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
341 | struct vm_area_struct *vma, | 342 | unsigned long address, unsigned int *mapcount, |
342 | unsigned int *mapcount, | 343 | unsigned long *vm_flags) |
343 | unsigned long *vm_flags) | ||
344 | { | 344 | { |
345 | struct mm_struct *mm = vma->vm_mm; | 345 | struct mm_struct *mm = vma->vm_mm; |
346 | unsigned long address; | ||
347 | pte_t *pte; | 346 | pte_t *pte; |
348 | spinlock_t *ptl; | 347 | spinlock_t *ptl; |
349 | int referenced = 0; | 348 | int referenced = 0; |
350 | 349 | ||
351 | address = vma_address(page, vma); | ||
352 | if (address == -EFAULT) | ||
353 | goto out; | ||
354 | |||
355 | pte = page_check_address(page, mm, address, &ptl, 0); | 350 | pte = page_check_address(page, mm, address, &ptl, 0); |
356 | if (!pte) | 351 | if (!pte) |
357 | goto out; | 352 | goto out; |
@@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page, | |||
388 | out_unmap: | 383 | out_unmap: |
389 | (*mapcount)--; | 384 | (*mapcount)--; |
390 | pte_unmap_unlock(pte, ptl); | 385 | pte_unmap_unlock(pte, ptl); |
391 | out: | 386 | |
392 | if (referenced) | 387 | if (referenced) |
393 | *vm_flags |= vma->vm_flags; | 388 | *vm_flags |= vma->vm_flags; |
389 | out: | ||
394 | return referenced; | 390 | return referenced; |
395 | } | 391 | } |
396 | 392 | ||
@@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page, | |||
409 | 405 | ||
410 | mapcount = page_mapcount(page); | 406 | mapcount = page_mapcount(page); |
411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 407 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
408 | unsigned long address = vma_address(page, vma); | ||
409 | if (address == -EFAULT) | ||
410 | continue; | ||
412 | /* | 411 | /* |
413 | * If we are reclaiming on behalf of a cgroup, skip | 412 | * If we are reclaiming on behalf of a cgroup, skip |
414 | * counting on behalf of references from different | 413 | * counting on behalf of references from different |
@@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page, | |||
416 | */ | 415 | */ |
417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 416 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
418 | continue; | 417 | continue; |
419 | referenced += page_referenced_one(page, vma, | 418 | referenced += page_referenced_one(page, vma, address, |
420 | &mapcount, vm_flags); | 419 | &mapcount, vm_flags); |
421 | if (!mapcount) | 420 | if (!mapcount) |
422 | break; | 421 | break; |
@@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page, | |||
474 | mapcount = page_mapcount(page); | 473 | mapcount = page_mapcount(page); |
475 | 474 | ||
476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 475 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
476 | unsigned long address = vma_address(page, vma); | ||
477 | if (address == -EFAULT) | ||
478 | continue; | ||
477 | /* | 479 | /* |
478 | * If we are reclaiming on behalf of a cgroup, skip | 480 | * If we are reclaiming on behalf of a cgroup, skip |
479 | * counting on behalf of references from different | 481 | * counting on behalf of references from different |
@@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page, | |||
481 | */ | 483 | */ |
482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 484 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
483 | continue; | 485 | continue; |
484 | referenced += page_referenced_one(page, vma, | 486 | referenced += page_referenced_one(page, vma, address, |
485 | &mapcount, vm_flags); | 487 | &mapcount, vm_flags); |
486 | if (!mapcount) | 488 | if (!mapcount) |
487 | break; | 489 | break; |
@@ -507,46 +509,47 @@ int page_referenced(struct page *page, | |||
507 | unsigned long *vm_flags) | 509 | unsigned long *vm_flags) |
508 | { | 510 | { |
509 | int referenced = 0; | 511 | int referenced = 0; |
512 | int we_locked = 0; | ||
510 | 513 | ||
511 | if (TestClearPageReferenced(page)) | 514 | if (TestClearPageReferenced(page)) |
512 | referenced++; | 515 | referenced++; |
513 | 516 | ||
514 | *vm_flags = 0; | 517 | *vm_flags = 0; |
515 | if (page_mapped(page) && page->mapping) { | 518 | if (page_mapped(page) && page_rmapping(page)) { |
516 | if (PageAnon(page)) | 519 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
520 | we_locked = trylock_page(page); | ||
521 | if (!we_locked) { | ||
522 | referenced++; | ||
523 | goto out; | ||
524 | } | ||
525 | } | ||
526 | if (unlikely(PageKsm(page))) | ||
527 | referenced += page_referenced_ksm(page, mem_cont, | ||
528 | vm_flags); | ||
529 | else if (PageAnon(page)) | ||
517 | referenced += page_referenced_anon(page, mem_cont, | 530 | referenced += page_referenced_anon(page, mem_cont, |
518 | vm_flags); | 531 | vm_flags); |
519 | else if (is_locked) | 532 | else if (page->mapping) |
520 | referenced += page_referenced_file(page, mem_cont, | 533 | referenced += page_referenced_file(page, mem_cont, |
521 | vm_flags); | 534 | vm_flags); |
522 | else if (!trylock_page(page)) | 535 | if (we_locked) |
523 | referenced++; | ||
524 | else { | ||
525 | if (page->mapping) | ||
526 | referenced += page_referenced_file(page, | ||
527 | mem_cont, vm_flags); | ||
528 | unlock_page(page); | 536 | unlock_page(page); |
529 | } | ||
530 | } | 537 | } |
531 | 538 | out: | |
532 | if (page_test_and_clear_young(page)) | 539 | if (page_test_and_clear_young(page)) |
533 | referenced++; | 540 | referenced++; |
534 | 541 | ||
535 | return referenced; | 542 | return referenced; |
536 | } | 543 | } |
537 | 544 | ||
538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 545 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
546 | unsigned long address) | ||
539 | { | 547 | { |
540 | struct mm_struct *mm = vma->vm_mm; | 548 | struct mm_struct *mm = vma->vm_mm; |
541 | unsigned long address; | ||
542 | pte_t *pte; | 549 | pte_t *pte; |
543 | spinlock_t *ptl; | 550 | spinlock_t *ptl; |
544 | int ret = 0; | 551 | int ret = 0; |
545 | 552 | ||
546 | address = vma_address(page, vma); | ||
547 | if (address == -EFAULT) | ||
548 | goto out; | ||
549 | |||
550 | pte = page_check_address(page, mm, address, &ptl, 1); | 553 | pte = page_check_address(page, mm, address, &ptl, 1); |
551 | if (!pte) | 554 | if (!pte) |
552 | goto out; | 555 | goto out; |
@@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
578 | 581 | ||
579 | spin_lock(&mapping->i_mmap_lock); | 582 | spin_lock(&mapping->i_mmap_lock); |
580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 583 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
581 | if (vma->vm_flags & VM_SHARED) | 584 | if (vma->vm_flags & VM_SHARED) { |
582 | ret += page_mkclean_one(page, vma); | 585 | unsigned long address = vma_address(page, vma); |
586 | if (address == -EFAULT) | ||
587 | continue; | ||
588 | ret += page_mkclean_one(page, vma, address); | ||
589 | } | ||
583 | } | 590 | } |
584 | spin_unlock(&mapping->i_mmap_lock); | 591 | spin_unlock(&mapping->i_mmap_lock); |
585 | return ret; | 592 | return ret; |
@@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
620 | BUG_ON(!anon_vma); | 627 | BUG_ON(!anon_vma); |
621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 628 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
622 | page->mapping = (struct address_space *) anon_vma; | 629 | page->mapping = (struct address_space *) anon_vma; |
623 | |||
624 | page->index = linear_page_index(vma, address); | 630 | page->index = linear_page_index(vma, address); |
625 | |||
626 | /* | ||
627 | * nr_mapped state can be updated without turning off | ||
628 | * interrupts because it is not modified via interrupt. | ||
629 | */ | ||
630 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
631 | } | 631 | } |
632 | 632 | ||
633 | /** | 633 | /** |
@@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page, | |||
665 | * @vma: the vm area in which the mapping is added | 665 | * @vma: the vm area in which the mapping is added |
666 | * @address: the user virtual address mapped | 666 | * @address: the user virtual address mapped |
667 | * | 667 | * |
668 | * The caller needs to hold the pte lock and the page must be locked. | 668 | * The caller needs to hold the pte lock, and the page must be locked in |
669 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
670 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
671 | * (but PageKsm is never downgraded to PageAnon). | ||
669 | */ | 672 | */ |
670 | void page_add_anon_rmap(struct page *page, | 673 | void page_add_anon_rmap(struct page *page, |
671 | struct vm_area_struct *vma, unsigned long address) | 674 | struct vm_area_struct *vma, unsigned long address) |
672 | { | 675 | { |
676 | int first = atomic_inc_and_test(&page->_mapcount); | ||
677 | if (first) | ||
678 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
679 | if (unlikely(PageKsm(page))) | ||
680 | return; | ||
681 | |||
673 | VM_BUG_ON(!PageLocked(page)); | 682 | VM_BUG_ON(!PageLocked(page)); |
674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 683 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
675 | if (atomic_inc_and_test(&page->_mapcount)) | 684 | if (first) |
676 | __page_set_anon_rmap(page, vma, address); | 685 | __page_set_anon_rmap(page, vma, address); |
677 | else | 686 | else |
678 | __page_check_anon_rmap(page, vma, address); | 687 | __page_check_anon_rmap(page, vma, address); |
@@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 703 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
695 | SetPageSwapBacked(page); | 704 | SetPageSwapBacked(page); |
696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 705 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
706 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
697 | __page_set_anon_rmap(page, vma, address); | 707 | __page_set_anon_rmap(page, vma, address); |
698 | if (page_evictable(page, vma)) | 708 | if (page_evictable(page, vma)) |
699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 709 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
@@ -711,7 +721,7 @@ void page_add_file_rmap(struct page *page) | |||
711 | { | 721 | { |
712 | if (atomic_inc_and_test(&page->_mapcount)) { | 722 | if (atomic_inc_and_test(&page->_mapcount)) { |
713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 723 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
714 | mem_cgroup_update_mapped_file_stat(page, 1); | 724 | mem_cgroup_update_file_mapped(page, 1); |
715 | } | 725 | } |
716 | } | 726 | } |
717 | 727 | ||
@@ -743,8 +753,8 @@ void page_remove_rmap(struct page *page) | |||
743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 753 | __dec_zone_page_state(page, NR_ANON_PAGES); |
744 | } else { | 754 | } else { |
745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 755 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
756 | mem_cgroup_update_file_mapped(page, -1); | ||
746 | } | 757 | } |
747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
748 | /* | 758 | /* |
749 | * It would be tidy to reset the PageAnon mapping here, | 759 | * It would be tidy to reset the PageAnon mapping here, |
750 | * but that might overwrite a racing page_add_anon_rmap | 760 | * but that might overwrite a racing page_add_anon_rmap |
@@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page) | |||
760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 770 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 771 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
762 | */ | 772 | */ |
763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 773 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
764 | enum ttu_flags flags) | 774 | unsigned long address, enum ttu_flags flags) |
765 | { | 775 | { |
766 | struct mm_struct *mm = vma->vm_mm; | 776 | struct mm_struct *mm = vma->vm_mm; |
767 | unsigned long address; | ||
768 | pte_t *pte; | 777 | pte_t *pte; |
769 | pte_t pteval; | 778 | pte_t pteval; |
770 | spinlock_t *ptl; | 779 | spinlock_t *ptl; |
771 | int ret = SWAP_AGAIN; | 780 | int ret = SWAP_AGAIN; |
772 | 781 | ||
773 | address = vma_address(page, vma); | ||
774 | if (address == -EFAULT) | ||
775 | goto out; | ||
776 | |||
777 | pte = page_check_address(page, mm, address, &ptl, 0); | 782 | pte = page_check_address(page, mm, address, &ptl, 0); |
778 | if (!pte) | 783 | if (!pte) |
779 | goto out; | 784 | goto out; |
@@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
784 | * skipped over this mm) then we should reactivate it. | 789 | * skipped over this mm) then we should reactivate it. |
785 | */ | 790 | */ |
786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 791 | if (!(flags & TTU_IGNORE_MLOCK)) { |
787 | if (vma->vm_flags & VM_LOCKED) { | 792 | if (vma->vm_flags & VM_LOCKED) |
788 | ret = SWAP_MLOCK; | 793 | goto out_mlock; |
794 | |||
795 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
789 | goto out_unmap; | 796 | goto out_unmap; |
790 | } | ||
791 | } | 797 | } |
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 798 | if (!(flags & TTU_IGNORE_ACCESS)) { |
793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 799 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
822 | * Store the swap location in the pte. | 828 | * Store the swap location in the pte. |
823 | * See handle_pte_fault() ... | 829 | * See handle_pte_fault() ... |
824 | */ | 830 | */ |
825 | swap_duplicate(entry); | 831 | if (swap_duplicate(entry) < 0) { |
832 | set_pte_at(mm, address, pte, pteval); | ||
833 | ret = SWAP_FAIL; | ||
834 | goto out_unmap; | ||
835 | } | ||
826 | if (list_empty(&mm->mmlist)) { | 836 | if (list_empty(&mm->mmlist)) { |
827 | spin_lock(&mmlist_lock); | 837 | spin_lock(&mmlist_lock); |
828 | if (list_empty(&mm->mmlist)) | 838 | if (list_empty(&mm->mmlist)) |
@@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
849 | } else | 859 | } else |
850 | dec_mm_counter(mm, file_rss); | 860 | dec_mm_counter(mm, file_rss); |
851 | 861 | ||
852 | |||
853 | page_remove_rmap(page); | 862 | page_remove_rmap(page); |
854 | page_cache_release(page); | 863 | page_cache_release(page); |
855 | 864 | ||
@@ -857,6 +866,27 @@ out_unmap: | |||
857 | pte_unmap_unlock(pte, ptl); | 866 | pte_unmap_unlock(pte, ptl); |
858 | out: | 867 | out: |
859 | return ret; | 868 | return ret; |
869 | |||
870 | out_mlock: | ||
871 | pte_unmap_unlock(pte, ptl); | ||
872 | |||
873 | |||
874 | /* | ||
875 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | ||
876 | * unstable result and race. Plus, We can't wait here because | ||
877 | * we now hold anon_vma->lock or mapping->i_mmap_lock. | ||
878 | * if trylock failed, the page remain in evictable lru and later | ||
879 | * vmscan could retry to move the page to unevictable lru if the | ||
880 | * page is actually mlocked. | ||
881 | */ | ||
882 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
883 | if (vma->vm_flags & VM_LOCKED) { | ||
884 | mlock_vma_page(page); | ||
885 | ret = SWAP_MLOCK; | ||
886 | } | ||
887 | up_read(&vma->vm_mm->mmap_sem); | ||
888 | } | ||
889 | return ret; | ||
860 | } | 890 | } |
861 | 891 | ||
862 | /* | 892 | /* |
@@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
922 | return ret; | 952 | return ret; |
923 | 953 | ||
924 | /* | 954 | /* |
925 | * MLOCK_PAGES => feature is configured. | 955 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
927 | * keep the sem while scanning the cluster for mlocking pages. | 956 | * keep the sem while scanning the cluster for mlocking pages. |
928 | */ | 957 | */ |
929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 958 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 959 | locked_vma = (vma->vm_flags & VM_LOCKED); |
931 | if (!locked_vma) | 960 | if (!locked_vma) |
932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 961 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
@@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
976 | return ret; | 1005 | return ret; |
977 | } | 1006 | } |
978 | 1007 | ||
979 | /* | ||
980 | * common handling for pages mapped in VM_LOCKED vmas | ||
981 | */ | ||
982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
983 | { | ||
984 | int mlocked = 0; | ||
985 | |||
986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
987 | if (vma->vm_flags & VM_LOCKED) { | ||
988 | mlock_vma_page(page); | ||
989 | mlocked++; /* really mlocked the page */ | ||
990 | } | ||
991 | up_read(&vma->vm_mm->mmap_sem); | ||
992 | } | ||
993 | return mlocked; | ||
994 | } | ||
995 | |||
996 | /** | 1008 | /** |
997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1009 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
998 | * rmap method | 1010 | * rmap method |
999 | * @page: the page to unmap/unlock | 1011 | * @page: the page to unmap/unlock |
1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1012 | * @flags: action and flags |
1001 | * @migration: unmapping for migration - ignored if @unlock | ||
1002 | * | 1013 | * |
1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1014 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1004 | * contained in the anon_vma struct it points to. | 1015 | * contained in the anon_vma struct it points to. |
@@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1014 | { | 1025 | { |
1015 | struct anon_vma *anon_vma; | 1026 | struct anon_vma *anon_vma; |
1016 | struct vm_area_struct *vma; | 1027 | struct vm_area_struct *vma; |
1017 | unsigned int mlocked = 0; | ||
1018 | int ret = SWAP_AGAIN; | 1028 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1020 | |||
1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1023 | 1029 | ||
1024 | anon_vma = page_lock_anon_vma(page); | 1030 | anon_vma = page_lock_anon_vma(page); |
1025 | if (!anon_vma) | 1031 | if (!anon_vma) |
1026 | return ret; | 1032 | return ret; |
1027 | 1033 | ||
1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1034 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1035 | unsigned long address = vma_address(page, vma); |
1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1036 | if (address == -EFAULT) |
1031 | page_mapped_in_vma(page, vma))) | 1037 | continue; |
1032 | continue; /* must visit all unlocked vmas */ | 1038 | ret = try_to_unmap_one(page, vma, address, flags); |
1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1039 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1034 | } else { | 1040 | break; |
1035 | ret = try_to_unmap_one(page, vma, flags); | ||
1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1037 | break; | ||
1038 | } | ||
1039 | if (ret == SWAP_MLOCK) { | ||
1040 | mlocked = try_to_mlock_page(page, vma); | ||
1041 | if (mlocked) | ||
1042 | break; /* stop if actually mlocked page */ | ||
1043 | } | ||
1044 | } | 1041 | } |
1045 | 1042 | ||
1046 | page_unlock_anon_vma(anon_vma); | 1043 | page_unlock_anon_vma(anon_vma); |
1047 | |||
1048 | if (mlocked) | ||
1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1050 | else if (ret == SWAP_MLOCK) | ||
1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1052 | |||
1053 | return ret; | 1044 | return ret; |
1054 | } | 1045 | } |
1055 | 1046 | ||
@@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1079 | unsigned long max_nl_cursor = 0; | 1070 | unsigned long max_nl_cursor = 0; |
1080 | unsigned long max_nl_size = 0; | 1071 | unsigned long max_nl_size = 0; |
1081 | unsigned int mapcount; | 1072 | unsigned int mapcount; |
1082 | unsigned int mlocked = 0; | ||
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1084 | |||
1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1087 | 1073 | ||
1088 | spin_lock(&mapping->i_mmap_lock); | 1074 | spin_lock(&mapping->i_mmap_lock); |
1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1075 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1076 | unsigned long address = vma_address(page, vma); |
1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1077 | if (address == -EFAULT) |
1092 | page_mapped_in_vma(page, vma))) | 1078 | continue; |
1093 | continue; /* must visit all vmas */ | 1079 | ret = try_to_unmap_one(page, vma, address, flags); |
1094 | ret = SWAP_MLOCK; | 1080 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1095 | } else { | 1081 | goto out; |
1096 | ret = try_to_unmap_one(page, vma, flags); | ||
1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1098 | goto out; | ||
1099 | } | ||
1100 | if (ret == SWAP_MLOCK) { | ||
1101 | mlocked = try_to_mlock_page(page, vma); | ||
1102 | if (mlocked) | ||
1103 | break; /* stop if actually mlocked page */ | ||
1104 | } | ||
1105 | } | 1082 | } |
1106 | 1083 | ||
1107 | if (mlocked) | 1084 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1108 | goto out; | 1085 | goto out; |
1109 | 1086 | ||
1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1087 | /* |
1088 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1089 | * It's costly. Instead, later, page reclaim logic may call | ||
1090 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1091 | */ | ||
1092 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1111 | goto out; | 1093 | goto out; |
1112 | 1094 | ||
1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1095 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1114 | shared.vm_set.list) { | 1096 | shared.vm_set.list) { |
1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1117 | continue; /* must visit all vmas */ | ||
1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1119 | goto out; /* no need to look further */ | ||
1120 | } | ||
1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1123 | continue; | ||
1124 | cursor = (unsigned long) vma->vm_private_data; | 1097 | cursor = (unsigned long) vma->vm_private_data; |
1125 | if (cursor > max_nl_cursor) | 1098 | if (cursor > max_nl_cursor) |
1126 | max_nl_cursor = cursor; | 1099 | max_nl_cursor = cursor; |
@@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1153 | do { | 1126 | do { |
1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1127 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1155 | shared.vm_set.list) { | 1128 | shared.vm_set.list) { |
1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1157 | (vma->vm_flags & VM_LOCKED)) | ||
1158 | continue; | ||
1159 | cursor = (unsigned long) vma->vm_private_data; | 1129 | cursor = (unsigned long) vma->vm_private_data; |
1160 | while ( cursor < max_nl_cursor && | 1130 | while ( cursor < max_nl_cursor && |
1161 | cursor < vma->vm_end - vma->vm_start) { | 1131 | cursor < vma->vm_end - vma->vm_start) { |
1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1132 | if (try_to_unmap_cluster(cursor, &mapcount, |
1163 | vma, page); | 1133 | vma, page) == SWAP_MLOCK) |
1164 | if (ret == SWAP_MLOCK) | 1134 | ret = SWAP_MLOCK; |
1165 | mlocked = 2; /* to return below */ | ||
1166 | cursor += CLUSTER_SIZE; | 1135 | cursor += CLUSTER_SIZE; |
1167 | vma->vm_private_data = (void *) cursor; | 1136 | vma->vm_private_data = (void *) cursor; |
1168 | if ((int)mapcount <= 0) | 1137 | if ((int)mapcount <= 0) |
@@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1183 | vma->vm_private_data = NULL; | 1152 | vma->vm_private_data = NULL; |
1184 | out: | 1153 | out: |
1185 | spin_unlock(&mapping->i_mmap_lock); | 1154 | spin_unlock(&mapping->i_mmap_lock); |
1186 | if (mlocked) | ||
1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1188 | else if (ret == SWAP_MLOCK) | ||
1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1190 | return ret; | 1155 | return ret; |
1191 | } | 1156 | } |
1192 | 1157 | ||
@@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1210 | 1175 | ||
1211 | BUG_ON(!PageLocked(page)); | 1176 | BUG_ON(!PageLocked(page)); |
1212 | 1177 | ||
1213 | if (PageAnon(page)) | 1178 | if (unlikely(PageKsm(page))) |
1179 | ret = try_to_unmap_ksm(page, flags); | ||
1180 | else if (PageAnon(page)) | ||
1214 | ret = try_to_unmap_anon(page, flags); | 1181 | ret = try_to_unmap_anon(page, flags); |
1215 | else | 1182 | else |
1216 | ret = try_to_unmap_file(page, flags); | 1183 | ret = try_to_unmap_file(page, flags); |
@@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1229 | * | 1196 | * |
1230 | * Return values are: | 1197 | * Return values are: |
1231 | * | 1198 | * |
1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1199 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1200 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
1201 | * SWAP_FAIL - page cannot be located at present | ||
1234 | * SWAP_MLOCK - page is now mlocked. | 1202 | * SWAP_MLOCK - page is now mlocked. |
1235 | */ | 1203 | */ |
1236 | int try_to_munlock(struct page *page) | 1204 | int try_to_munlock(struct page *page) |
1237 | { | 1205 | { |
1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1206 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1239 | 1207 | ||
1240 | if (PageAnon(page)) | 1208 | if (unlikely(PageKsm(page))) |
1209 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
1210 | else if (PageAnon(page)) | ||
1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1211 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1242 | else | 1212 | else |
1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1213 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1244 | } | 1214 | } |
1245 | 1215 | ||
1216 | #ifdef CONFIG_MIGRATION | ||
1217 | /* | ||
1218 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1219 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1220 | */ | ||
1221 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1222 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1223 | { | ||
1224 | struct anon_vma *anon_vma; | ||
1225 | struct vm_area_struct *vma; | ||
1226 | int ret = SWAP_AGAIN; | ||
1227 | |||
1228 | /* | ||
1229 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
1230 | * because that depends on page_mapped(); but not all its usages | ||
1231 | * are holding mmap_sem, which also gave the necessary guarantee | ||
1232 | * (that this anon_vma's slab has not already been destroyed). | ||
1233 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
1234 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
1235 | */ | ||
1236 | anon_vma = page_anon_vma(page); | ||
1237 | if (!anon_vma) | ||
1238 | return ret; | ||
1239 | spin_lock(&anon_vma->lock); | ||
1240 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
1241 | unsigned long address = vma_address(page, vma); | ||
1242 | if (address == -EFAULT) | ||
1243 | continue; | ||
1244 | ret = rmap_one(page, vma, address, arg); | ||
1245 | if (ret != SWAP_AGAIN) | ||
1246 | break; | ||
1247 | } | ||
1248 | spin_unlock(&anon_vma->lock); | ||
1249 | return ret; | ||
1250 | } | ||
1251 | |||
1252 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
1253 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1254 | { | ||
1255 | struct address_space *mapping = page->mapping; | ||
1256 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1257 | struct vm_area_struct *vma; | ||
1258 | struct prio_tree_iter iter; | ||
1259 | int ret = SWAP_AGAIN; | ||
1260 | |||
1261 | if (!mapping) | ||
1262 | return ret; | ||
1263 | spin_lock(&mapping->i_mmap_lock); | ||
1264 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1265 | unsigned long address = vma_address(page, vma); | ||
1266 | if (address == -EFAULT) | ||
1267 | continue; | ||
1268 | ret = rmap_one(page, vma, address, arg); | ||
1269 | if (ret != SWAP_AGAIN) | ||
1270 | break; | ||
1271 | } | ||
1272 | /* | ||
1273 | * No nonlinear handling: being always shared, nonlinear vmas | ||
1274 | * never contain migration ptes. Decide what to do about this | ||
1275 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
1276 | */ | ||
1277 | spin_unlock(&mapping->i_mmap_lock); | ||
1278 | return ret; | ||
1279 | } | ||
1280 | |||
1281 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
1282 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1283 | { | ||
1284 | VM_BUG_ON(!PageLocked(page)); | ||
1285 | |||
1286 | if (unlikely(PageKsm(page))) | ||
1287 | return rmap_walk_ksm(page, rmap_one, arg); | ||
1288 | else if (PageAnon(page)) | ||
1289 | return rmap_walk_anon(page, rmap_one, arg); | ||
1290 | else | ||
1291 | return rmap_walk_file(page, rmap_one, arg); | ||
1292 | } | ||
1293 | #endif /* CONFIG_MIGRATION */ | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 356dd99566ec..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/swap.h> | 31 | #include <linux/swap.h> |
32 | #include <linux/ima.h> | ||
33 | 32 | ||
34 | static struct vfsmount *shm_mnt; | 33 | static struct vfsmount *shm_mnt; |
35 | 34 | ||
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; | |||
42 | 41 | ||
43 | #include <linux/xattr.h> | 42 | #include <linux/xattr.h> |
44 | #include <linux/exportfs.h> | 43 | #include <linux/exportfs.h> |
44 | #include <linux/posix_acl.h> | ||
45 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
46 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | error = inode_setattr(inode, attr); | 810 | error = inode_setattr(inode, attr); |
811 | #ifdef CONFIG_TMPFS_POSIX_ACL | 811 | #ifdef CONFIG_TMPFS_POSIX_ACL |
812 | if (!error && (attr->ia_valid & ATTR_MODE)) | 812 | if (!error && (attr->ia_valid & ATTR_MODE)) |
813 | error = generic_acl_chmod(inode, &shmem_acl_ops); | 813 | error = generic_acl_chmod(inode); |
814 | #endif | 814 | #endif |
815 | if (page) | 815 | if (page) |
816 | page_cache_release(page); | 816 | page_cache_release(page); |
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1017 | goto out; | 1017 | goto out; |
1018 | } | 1018 | } |
1019 | mutex_unlock(&shmem_swaplist_mutex); | 1019 | mutex_unlock(&shmem_swaplist_mutex); |
1020 | out: return found; /* 0 or 1 or -ENOMEM */ | 1020 | /* |
1021 | * Can some race bring us here? We've been holding page lock, | ||
1022 | * so I think not; but would rather try again later than BUG() | ||
1023 | */ | ||
1024 | unlock_page(page); | ||
1025 | page_cache_release(page); | ||
1026 | out: | ||
1027 | return (found < 0) ? found : 0; | ||
1021 | } | 1028 | } |
1022 | 1029 | ||
1023 | /* | 1030 | /* |
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1080 | else | 1087 | else |
1081 | inode = NULL; | 1088 | inode = NULL; |
1082 | spin_unlock(&info->lock); | 1089 | spin_unlock(&info->lock); |
1083 | swap_duplicate(swap); | 1090 | swap_shmem_alloc(swap); |
1084 | BUG_ON(page_mapped(page)); | 1091 | BUG_ON(page_mapped(page)); |
1085 | page_cache_release(page); /* pagecache ref */ | 1092 | page_cache_release(page); /* pagecache ref */ |
1086 | swap_writepage(page, wbc); | 1093 | swap_writepage(page, wbc); |
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1817 | return error; | 1824 | return error; |
1818 | } | 1825 | } |
1819 | } | 1826 | } |
1820 | error = shmem_acl_init(inode, dir); | 1827 | #ifdef CONFIG_TMPFS_POSIX_ACL |
1828 | error = generic_acl_init(inode, dir); | ||
1821 | if (error) { | 1829 | if (error) { |
1822 | iput(inode); | 1830 | iput(inode); |
1823 | return error; | 1831 | return error; |
1824 | } | 1832 | } |
1833 | #else | ||
1834 | error = 0; | ||
1835 | #endif | ||
1825 | if (dir->i_mode & S_ISGID) { | 1836 | if (dir->i_mode & S_ISGID) { |
1826 | inode->i_gid = dir->i_gid; | 1837 | inode->i_gid = dir->i_gid; |
1827 | if (S_ISDIR(mode)) | 1838 | if (S_ISDIR(mode)) |
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
2036 | * filesystem level, though. | 2047 | * filesystem level, though. |
2037 | */ | 2048 | */ |
2038 | 2049 | ||
2039 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2050 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
2040 | size_t list_len, const char *name, | 2051 | size_t list_len, const char *name, |
2041 | size_t name_len) | 2052 | size_t name_len, int handler_flags) |
2042 | { | 2053 | { |
2043 | return security_inode_listsecurity(inode, list, list_len); | 2054 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
2044 | } | 2055 | } |
2045 | 2056 | ||
2046 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2057 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
2047 | void *buffer, size_t size) | 2058 | void *buffer, size_t size, int handler_flags) |
2048 | { | 2059 | { |
2049 | if (strcmp(name, "") == 0) | 2060 | if (strcmp(name, "") == 0) |
2050 | return -EINVAL; | 2061 | return -EINVAL; |
2051 | return xattr_getsecurity(inode, name, buffer, size); | 2062 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
2052 | } | 2063 | } |
2053 | 2064 | ||
2054 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2065 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
2055 | const void *value, size_t size, int flags) | 2066 | const void *value, size_t size, int flags, int handler_flags) |
2056 | { | 2067 | { |
2057 | if (strcmp(name, "") == 0) | 2068 | if (strcmp(name, "") == 0) |
2058 | return -EINVAL; | 2069 | return -EINVAL; |
2059 | return security_inode_setsecurity(inode, name, value, size, flags); | 2070 | return security_inode_setsecurity(dentry->d_inode, name, value, |
2071 | size, flags); | ||
2060 | } | 2072 | } |
2061 | 2073 | ||
2062 | static struct xattr_handler shmem_xattr_security_handler = { | 2074 | static struct xattr_handler shmem_xattr_security_handler = { |
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
2067 | }; | 2079 | }; |
2068 | 2080 | ||
2069 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2081 | static struct xattr_handler *shmem_xattr_handlers[] = { |
2070 | &shmem_xattr_acl_access_handler, | 2082 | &generic_acl_access_handler, |
2071 | &shmem_xattr_acl_default_handler, | 2083 | &generic_acl_default_handler, |
2072 | &shmem_xattr_security_handler, | 2084 | &shmem_xattr_security_handler, |
2073 | NULL | 2085 | NULL |
2074 | }; | 2086 | }; |
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2447 | .getxattr = generic_getxattr, | 2459 | .getxattr = generic_getxattr, |
2448 | .listxattr = generic_listxattr, | 2460 | .listxattr = generic_listxattr, |
2449 | .removexattr = generic_removexattr, | 2461 | .removexattr = generic_removexattr, |
2450 | .check_acl = shmem_check_acl, | 2462 | .check_acl = generic_check_acl, |
2451 | #endif | 2463 | #endif |
2452 | 2464 | ||
2453 | }; | 2465 | }; |
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2470 | .getxattr = generic_getxattr, | 2482 | .getxattr = generic_getxattr, |
2471 | .listxattr = generic_listxattr, | 2483 | .listxattr = generic_listxattr, |
2472 | .removexattr = generic_removexattr, | 2484 | .removexattr = generic_removexattr, |
2473 | .check_acl = shmem_check_acl, | 2485 | .check_acl = generic_check_acl, |
2474 | #endif | 2486 | #endif |
2475 | }; | 2487 | }; |
2476 | 2488 | ||
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2481 | .getxattr = generic_getxattr, | 2493 | .getxattr = generic_getxattr, |
2482 | .listxattr = generic_listxattr, | 2494 | .listxattr = generic_listxattr, |
2483 | .removexattr = generic_removexattr, | 2495 | .removexattr = generic_removexattr, |
2484 | .check_acl = shmem_check_acl, | 2496 | .check_acl = generic_check_acl, |
2485 | #endif | 2497 | #endif |
2486 | }; | 2498 | }; |
2487 | 2499 | ||
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2619 | int error; | 2631 | int error; |
2620 | struct file *file; | 2632 | struct file *file; |
2621 | struct inode *inode; | 2633 | struct inode *inode; |
2622 | struct dentry *dentry, *root; | 2634 | struct path path; |
2635 | struct dentry *root; | ||
2623 | struct qstr this; | 2636 | struct qstr this; |
2624 | 2637 | ||
2625 | if (IS_ERR(shm_mnt)) | 2638 | if (IS_ERR(shm_mnt)) |
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2636 | this.len = strlen(name); | 2649 | this.len = strlen(name); |
2637 | this.hash = 0; /* will go */ | 2650 | this.hash = 0; /* will go */ |
2638 | root = shm_mnt->mnt_root; | 2651 | root = shm_mnt->mnt_root; |
2639 | dentry = d_alloc(root, &this); | 2652 | path.dentry = d_alloc(root, &this); |
2640 | if (!dentry) | 2653 | if (!path.dentry) |
2641 | goto put_memory; | 2654 | goto put_memory; |
2642 | 2655 | path.mnt = mntget(shm_mnt); | |
2643 | error = -ENFILE; | ||
2644 | file = get_empty_filp(); | ||
2645 | if (!file) | ||
2646 | goto put_dentry; | ||
2647 | 2656 | ||
2648 | error = -ENOSPC; | 2657 | error = -ENOSPC; |
2649 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
2650 | if (!inode) | 2659 | if (!inode) |
2651 | goto close_file; | 2660 | goto put_dentry; |
2652 | 2661 | ||
2653 | d_instantiate(dentry, inode); | 2662 | d_instantiate(path.dentry, inode); |
2654 | inode->i_size = size; | 2663 | inode->i_size = size; |
2655 | inode->i_nlink = 0; /* It is unlinked */ | 2664 | inode->i_nlink = 0; /* It is unlinked */ |
2656 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
2657 | &shmem_file_operations); | ||
2658 | |||
2659 | #ifndef CONFIG_MMU | 2665 | #ifndef CONFIG_MMU |
2660 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2666 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2661 | if (error) | 2667 | if (error) |
2662 | goto close_file; | 2668 | goto put_dentry; |
2663 | #endif | 2669 | #endif |
2664 | ima_counts_get(file); | 2670 | |
2671 | error = -ENFILE; | ||
2672 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2673 | &shmem_file_operations); | ||
2674 | if (!file) | ||
2675 | goto put_dentry; | ||
2676 | |||
2665 | return file; | 2677 | return file; |
2666 | 2678 | ||
2667 | close_file: | ||
2668 | put_filp(file); | ||
2669 | put_dentry: | 2679 | put_dentry: |
2670 | dput(dentry); | 2680 | path_put(&path); |
2671 | put_memory: | 2681 | put_memory: |
2672 | shmem_unacct_size(flags, size); | 2682 | shmem_unacct_size(flags, size); |
2673 | return ERR_PTR(error); | 2683 | return ERR_PTR(error); |
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(inode->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(inode->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = inode->i_acl; | ||
49 | inode->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = inode->i_default_acl; | ||
54 | inode->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Inizialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
159 | */ | ||
160 | int | ||
161 | shmem_check_acl(struct inode *inode, int mask) | ||
162 | { | ||
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
164 | |||
165 | if (acl) { | ||
166 | int error = posix_acl_permission(inode, acl, mask); | ||
167 | posix_acl_release(acl); | ||
168 | return error; | ||
169 | } | ||
170 | return -EAGAIN; | ||
171 | } | ||
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
490 | 490 | ||
491 | #endif | 491 | #endif |
492 | 492 | ||
493 | #ifdef CONFIG_KMEMTRACE | 493 | #ifdef CONFIG_TRACING |
494 | size_t slab_buffer_size(struct kmem_cache *cachep) | 494 | size_t slab_buffer_size(struct kmem_cache *cachep) |
495 | { | 495 | { |
496 | return cachep->buffer_size; | 496 | return cachep->buffer_size; |
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { | |||
604 | 604 | ||
605 | #define BAD_ALIEN_MAGIC 0x01020304ul | 605 | #define BAD_ALIEN_MAGIC 0x01020304ul |
606 | 606 | ||
607 | /* | ||
608 | * chicken and egg problem: delay the per-cpu array allocation | ||
609 | * until the general caches are up. | ||
610 | */ | ||
611 | static enum { | ||
612 | NONE, | ||
613 | PARTIAL_AC, | ||
614 | PARTIAL_L3, | ||
615 | EARLY, | ||
616 | FULL | ||
617 | } g_cpucache_up; | ||
618 | |||
619 | /* | ||
620 | * used by boot code to determine if it can use slab based allocator | ||
621 | */ | ||
622 | int slab_is_available(void) | ||
623 | { | ||
624 | return g_cpucache_up >= EARLY; | ||
625 | } | ||
626 | |||
607 | #ifdef CONFIG_LOCKDEP | 627 | #ifdef CONFIG_LOCKDEP |
608 | 628 | ||
609 | /* | 629 | /* |
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { | |||
620 | static struct lock_class_key on_slab_l3_key; | 640 | static struct lock_class_key on_slab_l3_key; |
621 | static struct lock_class_key on_slab_alc_key; | 641 | static struct lock_class_key on_slab_alc_key; |
622 | 642 | ||
623 | static inline void init_lock_keys(void) | 643 | static void init_node_lock_keys(int q) |
624 | |||
625 | { | 644 | { |
626 | int q; | ||
627 | struct cache_sizes *s = malloc_sizes; | 645 | struct cache_sizes *s = malloc_sizes; |
628 | 646 | ||
629 | while (s->cs_size != ULONG_MAX) { | 647 | if (g_cpucache_up != FULL) |
630 | for_each_node(q) { | 648 | return; |
631 | struct array_cache **alc; | 649 | |
632 | int r; | 650 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
633 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | 651 | struct array_cache **alc; |
634 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 652 | struct kmem_list3 *l3; |
635 | continue; | 653 | int r; |
636 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 654 | |
637 | alc = l3->alien; | 655 | l3 = s->cs_cachep->nodelists[q]; |
638 | /* | 656 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
639 | * FIXME: This check for BAD_ALIEN_MAGIC | 657 | return; |
640 | * should go away when common slab code is taught to | 658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); |
641 | * work even without alien caches. | 659 | alc = l3->alien; |
642 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | 660 | /* |
643 | * for alloc_alien_cache, | 661 | * FIXME: This check for BAD_ALIEN_MAGIC |
644 | */ | 662 | * should go away when common slab code is taught to |
645 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | 663 | * work even without alien caches. |
646 | continue; | 664 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC |
647 | for_each_node(r) { | 665 | * for alloc_alien_cache, |
648 | if (alc[r]) | 666 | */ |
649 | lockdep_set_class(&alc[r]->lock, | 667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) |
650 | &on_slab_alc_key); | 668 | return; |
651 | } | 669 | for_each_node(r) { |
670 | if (alc[r]) | ||
671 | lockdep_set_class(&alc[r]->lock, | ||
672 | &on_slab_alc_key); | ||
652 | } | 673 | } |
653 | s++; | ||
654 | } | 674 | } |
655 | } | 675 | } |
676 | |||
677 | static inline void init_lock_keys(void) | ||
678 | { | ||
679 | int node; | ||
680 | |||
681 | for_each_node(node) | ||
682 | init_node_lock_keys(node); | ||
683 | } | ||
656 | #else | 684 | #else |
685 | static void init_node_lock_keys(int q) | ||
686 | { | ||
687 | } | ||
688 | |||
657 | static inline void init_lock_keys(void) | 689 | static inline void init_lock_keys(void) |
658 | { | 690 | { |
659 | } | 691 | } |
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void) | |||
665 | static DEFINE_MUTEX(cache_chain_mutex); | 697 | static DEFINE_MUTEX(cache_chain_mutex); |
666 | static struct list_head cache_chain; | 698 | static struct list_head cache_chain; |
667 | 699 | ||
668 | /* | 700 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
669 | * chicken and egg problem: delay the per-cpu array allocation | ||
670 | * until the general caches are up. | ||
671 | */ | ||
672 | static enum { | ||
673 | NONE, | ||
674 | PARTIAL_AC, | ||
675 | PARTIAL_L3, | ||
676 | EARLY, | ||
677 | FULL | ||
678 | } g_cpucache_up; | ||
679 | |||
680 | /* | ||
681 | * used by boot code to determine if it can use slab based allocator | ||
682 | */ | ||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return g_cpucache_up >= EARLY; | ||
686 | } | ||
687 | |||
688 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
689 | 701 | ||
690 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 702 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
691 | { | 703 | { |
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup); | |||
826 | * objects freed on different nodes from which they were allocated) and the | 838 | * objects freed on different nodes from which they were allocated) and the |
827 | * flushing of remote pcps by calling drain_node_pages. | 839 | * flushing of remote pcps by calling drain_node_pages. |
828 | */ | 840 | */ |
829 | static DEFINE_PER_CPU(unsigned long, reap_node); | 841 | static DEFINE_PER_CPU(unsigned long, slab_reap_node); |
830 | 842 | ||
831 | static void init_reap_node(int cpu) | 843 | static void init_reap_node(int cpu) |
832 | { | 844 | { |
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu) | |||
836 | if (node == MAX_NUMNODES) | 848 | if (node == MAX_NUMNODES) |
837 | node = first_node(node_online_map); | 849 | node = first_node(node_online_map); |
838 | 850 | ||
839 | per_cpu(reap_node, cpu) = node; | 851 | per_cpu(slab_reap_node, cpu) = node; |
840 | } | 852 | } |
841 | 853 | ||
842 | static void next_reap_node(void) | 854 | static void next_reap_node(void) |
843 | { | 855 | { |
844 | int node = __get_cpu_var(reap_node); | 856 | int node = __get_cpu_var(slab_reap_node); |
845 | 857 | ||
846 | node = next_node(node, node_online_map); | 858 | node = next_node(node, node_online_map); |
847 | if (unlikely(node >= MAX_NUMNODES)) | 859 | if (unlikely(node >= MAX_NUMNODES)) |
848 | node = first_node(node_online_map); | 860 | node = first_node(node_online_map); |
849 | __get_cpu_var(reap_node) = node; | 861 | __get_cpu_var(slab_reap_node) = node; |
850 | } | 862 | } |
851 | 863 | ||
852 | #else | 864 | #else |
@@ -863,7 +875,7 @@ static void next_reap_node(void) | |||
863 | */ | 875 | */ |
864 | static void __cpuinit start_cpu_timer(int cpu) | 876 | static void __cpuinit start_cpu_timer(int cpu) |
865 | { | 877 | { |
866 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | 878 | struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); |
867 | 879 | ||
868 | /* | 880 | /* |
869 | * When this gets called from do_initcalls via cpucache_init(), | 881 | * When this gets called from do_initcalls via cpucache_init(), |
@@ -1027,7 +1039,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
1027 | */ | 1039 | */ |
1028 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1040 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
1029 | { | 1041 | { |
1030 | int node = __get_cpu_var(reap_node); | 1042 | int node = __get_cpu_var(slab_reap_node); |
1031 | 1043 | ||
1032 | if (l3->alien) { | 1044 | if (l3->alien) { |
1033 | struct array_cache *ac = l3->alien[node]; | 1045 | struct array_cache *ac = l3->alien[node]; |
@@ -1120,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1120 | if (nc) | 1132 | if (nc) |
1121 | free_block(cachep, nc->entry, nc->avail, node); | 1133 | free_block(cachep, nc->entry, nc->avail, node); |
1122 | 1134 | ||
1123 | if (!cpus_empty(*mask)) { | 1135 | if (!cpumask_empty(mask)) { |
1124 | spin_unlock_irq(&l3->list_lock); | 1136 | spin_unlock_irq(&l3->list_lock); |
1125 | goto free_array_cache; | 1137 | goto free_array_cache; |
1126 | } | 1138 | } |
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1254 | kfree(shared); | 1266 | kfree(shared); |
1255 | free_alien_cache(alien); | 1267 | free_alien_cache(alien); |
1256 | } | 1268 | } |
1269 | init_node_lock_keys(node); | ||
1270 | |||
1257 | return 0; | 1271 | return 0; |
1258 | bad: | 1272 | bad: |
1259 | cpuup_canceled(cpu); | 1273 | cpuup_canceled(cpu); |
@@ -1286,9 +1300,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1286 | * anything expensive but will only modify reap_work | 1300 | * anything expensive but will only modify reap_work |
1287 | * and reschedule the timer. | 1301 | * and reschedule the timer. |
1288 | */ | 1302 | */ |
1289 | cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); | 1303 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); |
1290 | /* Now the cache_reaper is guaranteed to be not running. */ | 1304 | /* Now the cache_reaper is guaranteed to be not running. */ |
1291 | per_cpu(reap_work, cpu).work.func = NULL; | 1305 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
1292 | break; | 1306 | break; |
1293 | case CPU_DOWN_FAILED: | 1307 | case CPU_DOWN_FAILED: |
1294 | case CPU_DOWN_FAILED_FROZEN: | 1308 | case CPU_DOWN_FAILED_FROZEN: |
@@ -2261,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2261 | /* | 2275 | /* |
2262 | * Determine if the slab management is 'on' or 'off' slab. | 2276 | * Determine if the slab management is 'on' or 'off' slab. |
2263 | * (bootstrapping cannot cope with offslab caches so don't do | 2277 | * (bootstrapping cannot cope with offslab caches so don't do |
2264 | * it too early on.) | 2278 | * it too early on. Always use on-slab management when |
2279 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | ||
2265 | */ | 2280 | */ |
2266 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | 2281 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && |
2282 | !(flags & SLAB_NOLEAKTRACE)) | ||
2267 | /* | 2283 | /* |
2268 | * Size is large, assume best to place the slab management obj | 2284 | * Size is large, assume best to place the slab management obj |
2269 | * off-slab (should allow better packing of objs). | 2285 | * off-slab (should allow better packing of objs). |
@@ -2582,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2582 | * kmemleak does not treat the ->s_mem pointer as a reference | 2598 | * kmemleak does not treat the ->s_mem pointer as a reference |
2583 | * to the object. Otherwise we will not report the leak. | 2599 | * to the object. Otherwise we will not report the leak. |
2584 | */ | 2600 | */ |
2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | 2601 | kmemleak_scan_area(&slabp->list, sizeof(struct list_head), |
2586 | sizeof(struct list_head), local_flags); | 2602 | local_flags); |
2587 | if (!slabp) | 2603 | if (!slabp) |
2588 | return NULL; | 2604 | return NULL; |
2589 | } else { | 2605 | } else { |
@@ -3103,13 +3119,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3103 | } else { | 3119 | } else { |
3104 | STATS_INC_ALLOCMISS(cachep); | 3120 | STATS_INC_ALLOCMISS(cachep); |
3105 | objp = cache_alloc_refill(cachep, flags); | 3121 | objp = cache_alloc_refill(cachep, flags); |
3122 | /* | ||
3123 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3124 | * and kmemleak_erase() requires its correct value. | ||
3125 | */ | ||
3126 | ac = cpu_cache_get(cachep); | ||
3106 | } | 3127 | } |
3107 | /* | 3128 | /* |
3108 | * To avoid a false negative, if an object that is in one of the | 3129 | * To avoid a false negative, if an object that is in one of the |
3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3130 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
3110 | * treat the array pointers as a reference to the object. | 3131 | * treat the array pointers as a reference to the object. |
3111 | */ | 3132 | */ |
3112 | kmemleak_erase(&ac->entry[ac->avail]); | 3133 | if (objp) |
3134 | kmemleak_erase(&ac->entry[ac->avail]); | ||
3113 | return objp; | 3135 | return objp; |
3114 | } | 3136 | } |
3115 | 3137 | ||
@@ -3306,7 +3328,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3306 | cache_alloc_debugcheck_before(cachep, flags); | 3328 | cache_alloc_debugcheck_before(cachep, flags); |
3307 | local_irq_save(save_flags); | 3329 | local_irq_save(save_flags); |
3308 | 3330 | ||
3309 | if (unlikely(nodeid == -1)) | 3331 | if (nodeid == -1) |
3310 | nodeid = numa_node_id(); | 3332 | nodeid = numa_node_id(); |
3311 | 3333 | ||
3312 | if (unlikely(!cachep->nodelists[nodeid])) { | 3334 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3558,7 +3580,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3558 | } | 3580 | } |
3559 | EXPORT_SYMBOL(kmem_cache_alloc); | 3581 | EXPORT_SYMBOL(kmem_cache_alloc); |
3560 | 3582 | ||
3561 | #ifdef CONFIG_KMEMTRACE | 3583 | #ifdef CONFIG_TRACING |
3562 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3584 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) |
3563 | { | 3585 | { |
3564 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3586 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); |
@@ -3621,7 +3643,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3621 | } | 3643 | } |
3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3644 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3623 | 3645 | ||
3624 | #ifdef CONFIG_KMEMTRACE | 3646 | #ifdef CONFIG_TRACING |
3625 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3647 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, |
3626 | gfp_t flags, | 3648 | gfp_t flags, |
3627 | int nodeid) | 3649 | int nodeid) |
@@ -3649,7 +3671,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | |||
3649 | return ret; | 3671 | return ret; |
3650 | } | 3672 | } |
3651 | 3673 | ||
3652 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3674 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3653 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3675 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3654 | { | 3676 | { |
3655 | return __do_kmalloc_node(size, flags, node, | 3677 | return __do_kmalloc_node(size, flags, node, |
@@ -3669,7 +3691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3669 | return __do_kmalloc_node(size, flags, node, NULL); | 3691 | return __do_kmalloc_node(size, flags, node, NULL); |
3670 | } | 3692 | } |
3671 | EXPORT_SYMBOL(__kmalloc_node); | 3693 | EXPORT_SYMBOL(__kmalloc_node); |
3672 | #endif /* CONFIG_DEBUG_SLAB */ | 3694 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
3673 | #endif /* CONFIG_NUMA */ | 3695 | #endif /* CONFIG_NUMA */ |
3674 | 3696 | ||
3675 | /** | 3697 | /** |
@@ -3701,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3701 | } | 3723 | } |
3702 | 3724 | ||
3703 | 3725 | ||
3704 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3726 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3705 | void *__kmalloc(size_t size, gfp_t flags) | 3727 | void *__kmalloc(size_t size, gfp_t flags) |
3706 | { | 3728 | { |
3707 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3729 | return __do_kmalloc(size, flags, __builtin_return_address(0)); |
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1735 | } | 1735 | } |
1736 | local_irq_restore(flags); | 1736 | local_irq_restore(flags); |
1737 | 1737 | ||
1738 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1738 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1739 | memset(object, 0, objsize); | 1739 | memset(object, 0, objsize); |
1740 | 1740 | ||
1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); |
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
1754 | } | 1754 | } |
1755 | EXPORT_SYMBOL(kmem_cache_alloc); | 1755 | EXPORT_SYMBOL(kmem_cache_alloc); |
1756 | 1756 | ||
1757 | #ifdef CONFIG_KMEMTRACE | 1757 | #ifdef CONFIG_TRACING |
1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) |
1759 | { | 1759 | { |
1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | 1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); |
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
1776 | #endif | 1776 | #endif |
1777 | 1777 | ||
1778 | #ifdef CONFIG_KMEMTRACE | 1778 | #ifdef CONFIG_TRACING |
1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, |
1780 | gfp_t gfpflags, | 1780 | gfp_t gfpflags, |
1781 | int node) | 1781 | int node) |
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4371 | return len + sprintf(buf + len, "\n"); | 4371 | return len + sprintf(buf + len, "\n"); |
4372 | } | 4372 | } |
4373 | 4373 | ||
4374 | static void clear_stat(struct kmem_cache *s, enum stat_item si) | ||
4375 | { | ||
4376 | int cpu; | ||
4377 | |||
4378 | for_each_online_cpu(cpu) | ||
4379 | get_cpu_slab(s, cpu)->stat[si] = 0; | ||
4380 | } | ||
4381 | |||
4374 | #define STAT_ATTR(si, text) \ | 4382 | #define STAT_ATTR(si, text) \ |
4375 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ | 4383 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ |
4376 | { \ | 4384 | { \ |
4377 | return show_stat(s, buf, si); \ | 4385 | return show_stat(s, buf, si); \ |
4378 | } \ | 4386 | } \ |
4379 | SLAB_ATTR_RO(text); \ | 4387 | static ssize_t text##_store(struct kmem_cache *s, \ |
4388 | const char *buf, size_t length) \ | ||
4389 | { \ | ||
4390 | if (buf[0] != '0') \ | ||
4391 | return -EINVAL; \ | ||
4392 | clear_stat(s, si); \ | ||
4393 | return length; \ | ||
4394 | } \ | ||
4395 | SLAB_ATTR(text); \ | ||
4380 | 4396 | ||
4381 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); | 4397 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); |
4382 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); | 4398 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..6c0585b16418 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/ksm.h> | ||
25 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 27 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
@@ -35,11 +36,15 @@ | |||
35 | #include <linux/swapops.h> | 36 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | 37 | #include <linux/page_cgroup.h> |
37 | 38 | ||
39 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | ||
40 | unsigned char); | ||
41 | static void free_swap_count_continuations(struct swap_info_struct *); | ||
42 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | ||
43 | |||
38 | static DEFINE_SPINLOCK(swap_lock); | 44 | static DEFINE_SPINLOCK(swap_lock); |
39 | static unsigned int nr_swapfiles; | 45 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | 46 | long nr_swap_pages; |
41 | long total_swap_pages; | 47 | long total_swap_pages; |
42 | static int swap_overflow; | ||
43 | static int least_priority; | 48 | static int least_priority; |
44 | 49 | ||
45 | static const char Bad_file[] = "Bad swap file entry "; | 50 | static const char Bad_file[] = "Bad swap file entry "; |
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
49 | 54 | ||
50 | static struct swap_list_t swap_list = {-1, -1}; | 55 | static struct swap_list_t swap_list = {-1, -1}; |
51 | 56 | ||
52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 57 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
53 | 58 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 59 | static DEFINE_MUTEX(swapon_mutex); |
55 | 60 | ||
56 | /* For reference count accounting in swap_map */ | 61 | static inline unsigned char swap_count(unsigned char ent) |
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | 62 | { |
70 | return !!(ent & SWAP_HAS_CACHE); | 63 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
71 | } | 64 | } |
72 | 65 | ||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | 66 | /* returns 1 if swap entry is freed */ |
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | /* returnes 1 if swap entry is freed */ | ||
83 | static int | 67 | static int |
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 68 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
85 | { | 69 | { |
86 | int type = si - swap_info; | 70 | swp_entry_t entry = swp_entry(si->type, offset); |
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | 71 | struct page *page; |
89 | int ret = 0; | 72 | int ret = 0; |
90 | 73 | ||
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
120 | down_read(&swap_unplug_sem); | 103 | down_read(&swap_unplug_sem); |
121 | entry.val = page_private(page); | 104 | entry.val = page_private(page); |
122 | if (PageSwapCache(page)) { | 105 | if (PageSwapCache(page)) { |
123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 106 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; |
124 | struct backing_dev_info *bdi; | 107 | struct backing_dev_info *bdi; |
125 | 108 | ||
126 | /* | 109 | /* |
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
146 | static int discard_swap(struct swap_info_struct *si) | 129 | static int discard_swap(struct swap_info_struct *si) |
147 | { | 130 | { |
148 | struct swap_extent *se; | 131 | struct swap_extent *se; |
132 | sector_t start_block; | ||
133 | sector_t nr_blocks; | ||
149 | int err = 0; | 134 | int err = 0; |
150 | 135 | ||
151 | list_for_each_entry(se, &si->extent_list, list) { | 136 | /* Do not discard the swap header page! */ |
152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 137 | se = &si->first_swap_extent; |
153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 138 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | ||
140 | if (nr_blocks) { | ||
141 | err = blkdev_issue_discard(si->bdev, start_block, | ||
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | ||
143 | if (err) | ||
144 | return err; | ||
145 | cond_resched(); | ||
146 | } | ||
154 | 147 | ||
155 | if (se->start_page == 0) { | 148 | list_for_each_entry(se, &si->first_swap_extent.list, list) { |
156 | /* Do not discard the swap header page! */ | 149 | start_block = se->start_block << (PAGE_SHIFT - 9); |
157 | start_block += 1 << (PAGE_SHIFT - 9); | 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
159 | if (!nr_blocks) | ||
160 | continue; | ||
161 | } | ||
162 | 151 | ||
163 | err = blkdev_issue_discard(si->bdev, start_block, | 152 | err = blkdev_issue_discard(si->bdev, start_block, |
164 | nr_blocks, GFP_KERNEL, | 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); |
165 | DISCARD_FL_BARRIER); | ||
166 | if (err) | 154 | if (err) |
167 | break; | 155 | break; |
168 | 156 | ||
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
201 | start_block <<= PAGE_SHIFT - 9; | 189 | start_block <<= PAGE_SHIFT - 9; |
202 | nr_blocks <<= PAGE_SHIFT - 9; | 190 | nr_blocks <<= PAGE_SHIFT - 9; |
203 | if (blkdev_issue_discard(si->bdev, start_block, | 191 | if (blkdev_issue_discard(si->bdev, start_block, |
204 | nr_blocks, GFP_NOIO, | 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) |
205 | DISCARD_FL_BARRIER)) | ||
206 | break; | 193 | break; |
207 | } | 194 | } |
208 | 195 | ||
209 | lh = se->list.next; | 196 | lh = se->list.next; |
210 | if (lh == &si->extent_list) | ||
211 | lh = lh->next; | ||
212 | se = list_entry(lh, struct swap_extent, list); | 197 | se = list_entry(lh, struct swap_extent, list); |
213 | } | 198 | } |
214 | } | 199 | } |
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word) | |||
223 | #define LATENCY_LIMIT 256 | 208 | #define LATENCY_LIMIT 256 |
224 | 209 | ||
225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 210 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
226 | int cache) | 211 | unsigned char usage) |
227 | { | 212 | { |
228 | unsigned long offset; | 213 | unsigned long offset; |
229 | unsigned long scan_base; | 214 | unsigned long scan_base; |
@@ -354,10 +339,7 @@ checks: | |||
354 | si->lowest_bit = si->max; | 339 | si->lowest_bit = si->max; |
355 | si->highest_bit = 0; | 340 | si->highest_bit = 0; |
356 | } | 341 | } |
357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 342 | si->swap_map[offset] = usage; |
358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
359 | else /* at suspend */ | ||
360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
361 | si->cluster_next = offset + 1; | 343 | si->cluster_next = offset + 1; |
362 | si->flags -= SWP_SCANNING; | 344 | si->flags -= SWP_SCANNING; |
363 | 345 | ||
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) | |||
467 | nr_swap_pages--; | 449 | nr_swap_pages--; |
468 | 450 | ||
469 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 451 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
470 | si = swap_info + type; | 452 | si = swap_info[type]; |
471 | next = si->next; | 453 | next = si->next; |
472 | if (next < 0 || | 454 | if (next < 0 || |
473 | (!wrapped && si->prio != swap_info[next].prio)) { | 455 | (!wrapped && si->prio != swap_info[next]->prio)) { |
474 | next = swap_list.head; | 456 | next = swap_list.head; |
475 | wrapped++; | 457 | wrapped++; |
476 | } | 458 | } |
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) | |||
482 | 464 | ||
483 | swap_list.next = next; | 465 | swap_list.next = next; |
484 | /* This is called for allocating swap entry for cache */ | 466 | /* This is called for allocating swap entry for cache */ |
485 | offset = scan_swap_map(si, SWAP_CACHE); | 467 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
486 | if (offset) { | 468 | if (offset) { |
487 | spin_unlock(&swap_lock); | 469 | spin_unlock(&swap_lock); |
488 | return swp_entry(type, offset); | 470 | return swp_entry(type, offset); |
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) | |||
503 | pgoff_t offset; | 485 | pgoff_t offset; |
504 | 486 | ||
505 | spin_lock(&swap_lock); | 487 | spin_lock(&swap_lock); |
506 | si = swap_info + type; | 488 | si = swap_info[type]; |
507 | if (si->flags & SWP_WRITEOK) { | 489 | if (si && (si->flags & SWP_WRITEOK)) { |
508 | nr_swap_pages--; | 490 | nr_swap_pages--; |
509 | /* This is called for allocating swap entry, not cache */ | 491 | /* This is called for allocating swap entry, not cache */ |
510 | offset = scan_swap_map(si, SWAP_MAP); | 492 | offset = scan_swap_map(si, 1); |
511 | if (offset) { | 493 | if (offset) { |
512 | spin_unlock(&swap_lock); | 494 | spin_unlock(&swap_lock); |
513 | return swp_entry(type, offset); | 495 | return swp_entry(type, offset); |
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) | |||
518 | return (swp_entry_t) {0}; | 500 | return (swp_entry_t) {0}; |
519 | } | 501 | } |
520 | 502 | ||
521 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 503 | static struct swap_info_struct *swap_info_get(swp_entry_t entry) |
522 | { | 504 | { |
523 | struct swap_info_struct * p; | 505 | struct swap_info_struct *p; |
524 | unsigned long offset, type; | 506 | unsigned long offset, type; |
525 | 507 | ||
526 | if (!entry.val) | 508 | if (!entry.val) |
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) | |||
528 | type = swp_type(entry); | 510 | type = swp_type(entry); |
529 | if (type >= nr_swapfiles) | 511 | if (type >= nr_swapfiles) |
530 | goto bad_nofile; | 512 | goto bad_nofile; |
531 | p = & swap_info[type]; | 513 | p = swap_info[type]; |
532 | if (!(p->flags & SWP_USED)) | 514 | if (!(p->flags & SWP_USED)) |
533 | goto bad_device; | 515 | goto bad_device; |
534 | offset = swp_offset(entry); | 516 | offset = swp_offset(entry); |
@@ -554,41 +536,56 @@ out: | |||
554 | return NULL; | 536 | return NULL; |
555 | } | 537 | } |
556 | 538 | ||
557 | static int swap_entry_free(struct swap_info_struct *p, | 539 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
558 | swp_entry_t ent, int cache) | 540 | swp_entry_t entry, unsigned char usage) |
559 | { | 541 | { |
560 | unsigned long offset = swp_offset(ent); | 542 | unsigned long offset = swp_offset(entry); |
561 | int count = swap_count(p->swap_map[offset]); | 543 | unsigned char count; |
562 | bool has_cache; | 544 | unsigned char has_cache; |
563 | 545 | ||
564 | has_cache = swap_has_cache(p->swap_map[offset]); | 546 | count = p->swap_map[offset]; |
547 | has_cache = count & SWAP_HAS_CACHE; | ||
548 | count &= ~SWAP_HAS_CACHE; | ||
565 | 549 | ||
566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 550 | if (usage == SWAP_HAS_CACHE) { |
567 | if (count < SWAP_MAP_MAX) { | ||
568 | count--; | ||
569 | p->swap_map[offset] = encode_swapmap(count, has_cache); | ||
570 | } | ||
571 | } else { /* dropping swap cache flag */ | ||
572 | VM_BUG_ON(!has_cache); | 551 | VM_BUG_ON(!has_cache); |
573 | p->swap_map[offset] = encode_swapmap(count, false); | 552 | has_cache = 0; |
574 | 553 | } else if (count == SWAP_MAP_SHMEM) { | |
554 | /* | ||
555 | * Or we could insist on shmem.c using a special | ||
556 | * swap_shmem_free() and free_shmem_swap_and_cache()... | ||
557 | */ | ||
558 | count = 0; | ||
559 | } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { | ||
560 | if (count == COUNT_CONTINUED) { | ||
561 | if (swap_count_continued(p, offset, count)) | ||
562 | count = SWAP_MAP_MAX | COUNT_CONTINUED; | ||
563 | else | ||
564 | count = SWAP_MAP_MAX; | ||
565 | } else | ||
566 | count--; | ||
575 | } | 567 | } |
576 | /* return code. */ | 568 | |
577 | count = p->swap_map[offset]; | 569 | if (!count) |
570 | mem_cgroup_uncharge_swap(entry); | ||
571 | |||
572 | usage = count | has_cache; | ||
573 | p->swap_map[offset] = usage; | ||
574 | |||
578 | /* free if no reference */ | 575 | /* free if no reference */ |
579 | if (!count) { | 576 | if (!usage) { |
580 | if (offset < p->lowest_bit) | 577 | if (offset < p->lowest_bit) |
581 | p->lowest_bit = offset; | 578 | p->lowest_bit = offset; |
582 | if (offset > p->highest_bit) | 579 | if (offset > p->highest_bit) |
583 | p->highest_bit = offset; | 580 | p->highest_bit = offset; |
584 | if (p->prio > swap_info[swap_list.next].prio) | 581 | if (swap_list.next >= 0 && |
585 | swap_list.next = p - swap_info; | 582 | p->prio > swap_info[swap_list.next]->prio) |
583 | swap_list.next = p->type; | ||
586 | nr_swap_pages++; | 584 | nr_swap_pages++; |
587 | p->inuse_pages--; | 585 | p->inuse_pages--; |
588 | } | 586 | } |
589 | if (!swap_count(count)) | 587 | |
590 | mem_cgroup_uncharge_swap(ent); | 588 | return usage; |
591 | return count; | ||
592 | } | 589 | } |
593 | 590 | ||
594 | /* | 591 | /* |
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, | |||
597 | */ | 594 | */ |
598 | void swap_free(swp_entry_t entry) | 595 | void swap_free(swp_entry_t entry) |
599 | { | 596 | { |
600 | struct swap_info_struct * p; | 597 | struct swap_info_struct *p; |
601 | 598 | ||
602 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
603 | if (p) { | 600 | if (p) { |
604 | swap_entry_free(p, entry, SWAP_MAP); | 601 | swap_entry_free(p, entry, 1); |
605 | spin_unlock(&swap_lock); | 602 | spin_unlock(&swap_lock); |
606 | } | 603 | } |
607 | } | 604 | } |
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) | |||
612 | void swapcache_free(swp_entry_t entry, struct page *page) | 609 | void swapcache_free(swp_entry_t entry, struct page *page) |
613 | { | 610 | { |
614 | struct swap_info_struct *p; | 611 | struct swap_info_struct *p; |
615 | int ret; | 612 | unsigned char count; |
616 | 613 | ||
617 | p = swap_info_get(entry); | 614 | p = swap_info_get(entry); |
618 | if (p) { | 615 | if (p) { |
619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 616 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
620 | if (page) { | 617 | if (page) |
621 | bool swapout; | 618 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
622 | if (ret) | ||
623 | swapout = true; /* the end of swap out */ | ||
624 | else | ||
625 | swapout = false; /* no more swap users! */ | ||
626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
627 | } | ||
628 | spin_unlock(&swap_lock); | 619 | spin_unlock(&swap_lock); |
629 | } | 620 | } |
630 | return; | ||
631 | } | 621 | } |
632 | 622 | ||
633 | /* | 623 | /* |
634 | * How many references to page are currently swapped out? | 624 | * How many references to page are currently swapped out? |
625 | * This does not give an exact answer when swap count is continued, | ||
626 | * but does include the high COUNT_CONTINUED flag to allow for that. | ||
635 | */ | 627 | */ |
636 | static inline int page_swapcount(struct page *page) | 628 | static inline int page_swapcount(struct page *page) |
637 | { | 629 | { |
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) | |||
659 | int count; | 651 | int count; |
660 | 652 | ||
661 | VM_BUG_ON(!PageLocked(page)); | 653 | VM_BUG_ON(!PageLocked(page)); |
654 | if (unlikely(PageKsm(page))) | ||
655 | return 0; | ||
662 | count = page_mapcount(page); | 656 | count = page_mapcount(page); |
663 | if (count <= 1 && PageSwapCache(page)) { | 657 | if (count <= 1 && PageSwapCache(page)) { |
664 | count += page_swapcount(page); | 658 | count += page_swapcount(page); |
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) | |||
667 | SetPageDirty(page); | 661 | SetPageDirty(page); |
668 | } | 662 | } |
669 | } | 663 | } |
670 | return count == 1; | 664 | return count <= 1; |
671 | } | 665 | } |
672 | 666 | ||
673 | /* | 667 | /* |
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
704 | 698 | ||
705 | p = swap_info_get(entry); | 699 | p = swap_info_get(entry); |
706 | if (p) { | 700 | if (p) { |
707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
708 | page = find_get_page(&swapper_space, entry.val); | 702 | page = find_get_page(&swapper_space, entry.val); |
709 | if (page && !trylock_page(page)) { | 703 | if (page && !trylock_page(page)) { |
710 | page_cache_release(page); | 704 | page_cache_release(page); |
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
741 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 735 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
742 | { | 736 | { |
743 | struct block_device *bdev = NULL; | 737 | struct block_device *bdev = NULL; |
744 | int i; | 738 | int type; |
745 | 739 | ||
746 | if (device) | 740 | if (device) |
747 | bdev = bdget(device); | 741 | bdev = bdget(device); |
748 | 742 | ||
749 | spin_lock(&swap_lock); | 743 | spin_lock(&swap_lock); |
750 | for (i = 0; i < nr_swapfiles; i++) { | 744 | for (type = 0; type < nr_swapfiles; type++) { |
751 | struct swap_info_struct *sis = swap_info + i; | 745 | struct swap_info_struct *sis = swap_info[type]; |
752 | 746 | ||
753 | if (!(sis->flags & SWP_WRITEOK)) | 747 | if (!(sis->flags & SWP_WRITEOK)) |
754 | continue; | 748 | continue; |
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
758 | *bdev_p = bdgrab(sis->bdev); | 752 | *bdev_p = bdgrab(sis->bdev); |
759 | 753 | ||
760 | spin_unlock(&swap_lock); | 754 | spin_unlock(&swap_lock); |
761 | return i; | 755 | return type; |
762 | } | 756 | } |
763 | if (bdev == sis->bdev) { | 757 | if (bdev == sis->bdev) { |
764 | struct swap_extent *se; | 758 | struct swap_extent *se = &sis->first_swap_extent; |
765 | 759 | ||
766 | se = list_entry(sis->extent_list.next, | ||
767 | struct swap_extent, list); | ||
768 | if (se->start_block == offset) { | 760 | if (se->start_block == offset) { |
769 | if (bdev_p) | 761 | if (bdev_p) |
770 | *bdev_p = bdgrab(sis->bdev); | 762 | *bdev_p = bdgrab(sis->bdev); |
771 | 763 | ||
772 | spin_unlock(&swap_lock); | 764 | spin_unlock(&swap_lock); |
773 | bdput(bdev); | 765 | bdput(bdev); |
774 | return i; | 766 | return type; |
775 | } | 767 | } |
776 | } | 768 | } |
777 | } | 769 | } |
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
783 | } | 775 | } |
784 | 776 | ||
785 | /* | 777 | /* |
778 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
779 | * corresponding to given index in swap_info (swap type). | ||
780 | */ | ||
781 | sector_t swapdev_block(int type, pgoff_t offset) | ||
782 | { | ||
783 | struct block_device *bdev; | ||
784 | |||
785 | if ((unsigned int)type >= nr_swapfiles) | ||
786 | return 0; | ||
787 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
788 | return 0; | ||
789 | return map_swap_entry(swp_entry(type, offset), &bdev); | ||
790 | } | ||
791 | |||
792 | /* | ||
786 | * Return either the total number of swap pages of given type, or the number | 793 | * Return either the total number of swap pages of given type, or the number |
787 | * of free pages of that type (depending on @free) | 794 | * of free pages of that type (depending on @free) |
788 | * | 795 | * |
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free) | |||
792 | { | 799 | { |
793 | unsigned int n = 0; | 800 | unsigned int n = 0; |
794 | 801 | ||
795 | if (type < nr_swapfiles) { | 802 | spin_lock(&swap_lock); |
796 | spin_lock(&swap_lock); | 803 | if ((unsigned int)type < nr_swapfiles) { |
797 | if (swap_info[type].flags & SWP_WRITEOK) { | 804 | struct swap_info_struct *sis = swap_info[type]; |
798 | n = swap_info[type].pages; | 805 | |
806 | if (sis->flags & SWP_WRITEOK) { | ||
807 | n = sis->pages; | ||
799 | if (free) | 808 | if (free) |
800 | n -= swap_info[type].inuse_pages; | 809 | n -= sis->inuse_pages; |
801 | } | 810 | } |
802 | spin_unlock(&swap_lock); | ||
803 | } | 811 | } |
812 | spin_unlock(&swap_lock); | ||
804 | return n; | 813 | return n; |
805 | } | 814 | } |
806 | #endif | 815 | #endif /* CONFIG_HIBERNATION */ |
807 | 816 | ||
808 | /* | 817 | /* |
809 | * No need to decide whether this PTE shares the swap entry with others, | 818 | * No need to decide whether this PTE shares the swap entry with others, |
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
932 | unsigned long addr, end, next; | 941 | unsigned long addr, end, next; |
933 | int ret; | 942 | int ret; |
934 | 943 | ||
935 | if (page->mapping) { | 944 | if (page_anon_vma(page)) { |
936 | addr = page_address_in_vma(page, vma); | 945 | addr = page_address_in_vma(page, vma); |
937 | if (addr == -EFAULT) | 946 | if (addr == -EFAULT) |
938 | return 0; | 947 | return 0; |
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
988 | { | 997 | { |
989 | unsigned int max = si->max; | 998 | unsigned int max = si->max; |
990 | unsigned int i = prev; | 999 | unsigned int i = prev; |
991 | int count; | 1000 | unsigned char count; |
992 | 1001 | ||
993 | /* | 1002 | /* |
994 | * No need for swap_lock here: we're just looking | 1003 | * No need for swap_lock here: we're just looking |
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1024 | */ | 1033 | */ |
1025 | static int try_to_unuse(unsigned int type) | 1034 | static int try_to_unuse(unsigned int type) |
1026 | { | 1035 | { |
1027 | struct swap_info_struct * si = &swap_info[type]; | 1036 | struct swap_info_struct *si = swap_info[type]; |
1028 | struct mm_struct *start_mm; | 1037 | struct mm_struct *start_mm; |
1029 | unsigned short *swap_map; | 1038 | unsigned char *swap_map; |
1030 | unsigned short swcount; | 1039 | unsigned char swcount; |
1031 | struct page *page; | 1040 | struct page *page; |
1032 | swp_entry_t entry; | 1041 | swp_entry_t entry; |
1033 | unsigned int i = 0; | 1042 | unsigned int i = 0; |
1034 | int retval = 0; | 1043 | int retval = 0; |
1035 | int reset_overflow = 0; | ||
1036 | int shmem; | ||
1037 | 1044 | ||
1038 | /* | 1045 | /* |
1039 | * When searching mms for an entry, a good strategy is to | 1046 | * When searching mms for an entry, a good strategy is to |
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type) | |||
1047 | * together, child after parent. If we race with dup_mmap(), we | 1054 | * together, child after parent. If we race with dup_mmap(), we |
1048 | * prefer to resolve parent before child, lest we miss entries | 1055 | * prefer to resolve parent before child, lest we miss entries |
1049 | * duplicated after we scanned child: using last mm would invert | 1056 | * duplicated after we scanned child: using last mm would invert |
1050 | * that. Though it's only a serious concern when an overflowed | 1057 | * that. |
1051 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
1052 | */ | 1058 | */ |
1053 | start_mm = &init_mm; | 1059 | start_mm = &init_mm; |
1054 | atomic_inc(&init_mm.mm_users); | 1060 | atomic_inc(&init_mm.mm_users); |
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type) | |||
1110 | 1116 | ||
1111 | /* | 1117 | /* |
1112 | * Remove all references to entry. | 1118 | * Remove all references to entry. |
1113 | * Whenever we reach init_mm, there's no address space | ||
1114 | * to search, but use it as a reminder to search shmem. | ||
1115 | */ | 1119 | */ |
1116 | shmem = 0; | ||
1117 | swcount = *swap_map; | 1120 | swcount = *swap_map; |
1118 | if (swap_count(swcount)) { | 1121 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
1119 | if (start_mm == &init_mm) | 1122 | retval = shmem_unuse(entry, page); |
1120 | shmem = shmem_unuse(entry, page); | 1123 | /* page has already been unlocked and released */ |
1121 | else | 1124 | if (retval < 0) |
1122 | retval = unuse_mm(start_mm, entry, page); | 1125 | break; |
1126 | continue; | ||
1123 | } | 1127 | } |
1128 | if (swap_count(swcount) && start_mm != &init_mm) | ||
1129 | retval = unuse_mm(start_mm, entry, page); | ||
1130 | |||
1124 | if (swap_count(*swap_map)) { | 1131 | if (swap_count(*swap_map)) { |
1125 | int set_start_mm = (*swap_map >= swcount); | 1132 | int set_start_mm = (*swap_map >= swcount); |
1126 | struct list_head *p = &start_mm->mmlist; | 1133 | struct list_head *p = &start_mm->mmlist; |
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type) | |||
1131 | atomic_inc(&new_start_mm->mm_users); | 1138 | atomic_inc(&new_start_mm->mm_users); |
1132 | atomic_inc(&prev_mm->mm_users); | 1139 | atomic_inc(&prev_mm->mm_users); |
1133 | spin_lock(&mmlist_lock); | 1140 | spin_lock(&mmlist_lock); |
1134 | while (swap_count(*swap_map) && !retval && !shmem && | 1141 | while (swap_count(*swap_map) && !retval && |
1135 | (p = p->next) != &start_mm->mmlist) { | 1142 | (p = p->next) != &start_mm->mmlist) { |
1136 | mm = list_entry(p, struct mm_struct, mmlist); | 1143 | mm = list_entry(p, struct mm_struct, mmlist); |
1137 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1144 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1145,10 +1152,9 @@ static int try_to_unuse(unsigned int type) | |||
1145 | swcount = *swap_map; | 1152 | swcount = *swap_map; |
1146 | if (!swap_count(swcount)) /* any usage ? */ | 1153 | if (!swap_count(swcount)) /* any usage ? */ |
1147 | ; | 1154 | ; |
1148 | else if (mm == &init_mm) { | 1155 | else if (mm == &init_mm) |
1149 | set_start_mm = 1; | 1156 | set_start_mm = 1; |
1150 | shmem = shmem_unuse(entry, page); | 1157 | else |
1151 | } else | ||
1152 | retval = unuse_mm(mm, entry, page); | 1158 | retval = unuse_mm(mm, entry, page); |
1153 | 1159 | ||
1154 | if (set_start_mm && *swap_map < swcount) { | 1160 | if (set_start_mm && *swap_map < swcount) { |
@@ -1164,13 +1170,6 @@ static int try_to_unuse(unsigned int type) | |||
1164 | mmput(start_mm); | 1170 | mmput(start_mm); |
1165 | start_mm = new_start_mm; | 1171 | start_mm = new_start_mm; |
1166 | } | 1172 | } |
1167 | if (shmem) { | ||
1168 | /* page has already been unlocked and released */ | ||
1169 | if (shmem > 0) | ||
1170 | continue; | ||
1171 | retval = shmem; | ||
1172 | break; | ||
1173 | } | ||
1174 | if (retval) { | 1173 | if (retval) { |
1175 | unlock_page(page); | 1174 | unlock_page(page); |
1176 | page_cache_release(page); | 1175 | page_cache_release(page); |
@@ -1178,30 +1177,6 @@ static int try_to_unuse(unsigned int type) | |||
1178 | } | 1177 | } |
1179 | 1178 | ||
1180 | /* | 1179 | /* |
1181 | * How could swap count reach 0x7ffe ? | ||
1182 | * There's no way to repeat a swap page within an mm | ||
1183 | * (except in shmem, where it's the shared object which takes | ||
1184 | * the reference count)? | ||
1185 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned | ||
1186 | * short is too small....) | ||
1187 | * If that's wrong, then we should worry more about | ||
1188 | * exit_mmap() and do_munmap() cases described above: | ||
1189 | * we might be resetting SWAP_MAP_MAX too early here. | ||
1190 | * We know "Undead"s can happen, they're okay, so don't | ||
1191 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
1192 | */ | ||
1193 | /* We might release the lock_page() in unuse_mm(). */ | ||
1194 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1195 | goto retry; | ||
1196 | |||
1197 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1198 | spin_lock(&swap_lock); | ||
1199 | *swap_map = encode_swapmap(0, true); | ||
1200 | spin_unlock(&swap_lock); | ||
1201 | reset_overflow = 1; | ||
1202 | } | ||
1203 | |||
1204 | /* | ||
1205 | * If a reference remains (rare), we would like to leave | 1180 | * If a reference remains (rare), we would like to leave |
1206 | * the page in the swap cache; but try_to_unmap could | 1181 | * the page in the swap cache; but try_to_unmap could |
1207 | * then re-duplicate the entry once we drop page lock, | 1182 | * then re-duplicate the entry once we drop page lock, |
@@ -1213,6 +1188,12 @@ static int try_to_unuse(unsigned int type) | |||
1213 | * read from disk into another page. Splitting into two | 1188 | * read from disk into another page. Splitting into two |
1214 | * pages would be incorrect if swap supported "shared | 1189 | * pages would be incorrect if swap supported "shared |
1215 | * private" pages, but they are handled by tmpfs files. | 1190 | * private" pages, but they are handled by tmpfs files. |
1191 | * | ||
1192 | * Given how unuse_vma() targets one particular offset | ||
1193 | * in an anon_vma, once the anon_vma has been determined, | ||
1194 | * this splitting happens to be just what is needed to | ||
1195 | * handle where KSM pages have been swapped out: re-reading | ||
1196 | * is unnecessarily slow, but we can fix that later on. | ||
1216 | */ | 1197 | */ |
1217 | if (swap_count(*swap_map) && | 1198 | if (swap_count(*swap_map) && |
1218 | PageDirty(page) && PageSwapCache(page)) { | 1199 | PageDirty(page) && PageSwapCache(page)) { |
@@ -1242,7 +1223,6 @@ static int try_to_unuse(unsigned int type) | |||
1242 | * mark page dirty so shrink_page_list will preserve it. | 1223 | * mark page dirty so shrink_page_list will preserve it. |
1243 | */ | 1224 | */ |
1244 | SetPageDirty(page); | 1225 | SetPageDirty(page); |
1245 | retry: | ||
1246 | unlock_page(page); | 1226 | unlock_page(page); |
1247 | page_cache_release(page); | 1227 | page_cache_release(page); |
1248 | 1228 | ||
@@ -1254,10 +1234,6 @@ retry: | |||
1254 | } | 1234 | } |
1255 | 1235 | ||
1256 | mmput(start_mm); | 1236 | mmput(start_mm); |
1257 | if (reset_overflow) { | ||
1258 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
1259 | swap_overflow = 0; | ||
1260 | } | ||
1261 | return retval; | 1237 | return retval; |
1262 | } | 1238 | } |
1263 | 1239 | ||
@@ -1270,10 +1246,10 @@ retry: | |||
1270 | static void drain_mmlist(void) | 1246 | static void drain_mmlist(void) |
1271 | { | 1247 | { |
1272 | struct list_head *p, *next; | 1248 | struct list_head *p, *next; |
1273 | unsigned int i; | 1249 | unsigned int type; |
1274 | 1250 | ||
1275 | for (i = 0; i < nr_swapfiles; i++) | 1251 | for (type = 0; type < nr_swapfiles; type++) |
1276 | if (swap_info[i].inuse_pages) | 1252 | if (swap_info[type]->inuse_pages) |
1277 | return; | 1253 | return; |
1278 | spin_lock(&mmlist_lock); | 1254 | spin_lock(&mmlist_lock); |
1279 | list_for_each_safe(p, next, &init_mm.mmlist) | 1255 | list_for_each_safe(p, next, &init_mm.mmlist) |
@@ -1283,12 +1259,23 @@ static void drain_mmlist(void) | |||
1283 | 1259 | ||
1284 | /* | 1260 | /* |
1285 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1261 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
1286 | * corresponds to page offset `offset'. | 1262 | * corresponds to page offset for the specified swap entry. |
1263 | * Note that the type of this function is sector_t, but it returns page offset | ||
1264 | * into the bdev, not sector offset. | ||
1287 | */ | 1265 | */ |
1288 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1266 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
1289 | { | 1267 | { |
1290 | struct swap_extent *se = sis->curr_swap_extent; | 1268 | struct swap_info_struct *sis; |
1291 | struct swap_extent *start_se = se; | 1269 | struct swap_extent *start_se; |
1270 | struct swap_extent *se; | ||
1271 | pgoff_t offset; | ||
1272 | |||
1273 | sis = swap_info[swp_type(entry)]; | ||
1274 | *bdev = sis->bdev; | ||
1275 | |||
1276 | offset = swp_offset(entry); | ||
1277 | start_se = sis->curr_swap_extent; | ||
1278 | se = start_se; | ||
1292 | 1279 | ||
1293 | for ( ; ; ) { | 1280 | for ( ; ; ) { |
1294 | struct list_head *lh; | 1281 | struct list_head *lh; |
@@ -1298,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
1298 | return se->start_block + (offset - se->start_page); | 1285 | return se->start_block + (offset - se->start_page); |
1299 | } | 1286 | } |
1300 | lh = se->list.next; | 1287 | lh = se->list.next; |
1301 | if (lh == &sis->extent_list) | ||
1302 | lh = lh->next; | ||
1303 | se = list_entry(lh, struct swap_extent, list); | 1288 | se = list_entry(lh, struct swap_extent, list); |
1304 | sis->curr_swap_extent = se; | 1289 | sis->curr_swap_extent = se; |
1305 | BUG_ON(se == start_se); /* It *must* be present */ | 1290 | BUG_ON(se == start_se); /* It *must* be present */ |
1306 | } | 1291 | } |
1307 | } | 1292 | } |
1308 | 1293 | ||
1309 | #ifdef CONFIG_HIBERNATION | ||
1310 | /* | 1294 | /* |
1311 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1295 | * Returns the page offset into bdev for the specified page's swap entry. |
1312 | * corresponding to given index in swap_info (swap type). | ||
1313 | */ | 1296 | */ |
1314 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1297 | sector_t map_swap_page(struct page *page, struct block_device **bdev) |
1315 | { | 1298 | { |
1316 | struct swap_info_struct *sis; | 1299 | swp_entry_t entry; |
1317 | 1300 | entry.val = page_private(page); | |
1318 | if (swap_type >= nr_swapfiles) | 1301 | return map_swap_entry(entry, bdev); |
1319 | return 0; | ||
1320 | |||
1321 | sis = swap_info + swap_type; | ||
1322 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
1323 | } | 1302 | } |
1324 | #endif /* CONFIG_HIBERNATION */ | ||
1325 | 1303 | ||
1326 | /* | 1304 | /* |
1327 | * Free all of a swapdev's extent information | 1305 | * Free all of a swapdev's extent information |
1328 | */ | 1306 | */ |
1329 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1307 | static void destroy_swap_extents(struct swap_info_struct *sis) |
1330 | { | 1308 | { |
1331 | while (!list_empty(&sis->extent_list)) { | 1309 | while (!list_empty(&sis->first_swap_extent.list)) { |
1332 | struct swap_extent *se; | 1310 | struct swap_extent *se; |
1333 | 1311 | ||
1334 | se = list_entry(sis->extent_list.next, | 1312 | se = list_entry(sis->first_swap_extent.list.next, |
1335 | struct swap_extent, list); | 1313 | struct swap_extent, list); |
1336 | list_del(&se->list); | 1314 | list_del(&se->list); |
1337 | kfree(se); | 1315 | kfree(se); |
@@ -1352,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1352 | struct swap_extent *new_se; | 1330 | struct swap_extent *new_se; |
1353 | struct list_head *lh; | 1331 | struct list_head *lh; |
1354 | 1332 | ||
1355 | lh = sis->extent_list.prev; /* The highest page extent */ | 1333 | if (start_page == 0) { |
1356 | if (lh != &sis->extent_list) { | 1334 | se = &sis->first_swap_extent; |
1335 | sis->curr_swap_extent = se; | ||
1336 | se->start_page = 0; | ||
1337 | se->nr_pages = nr_pages; | ||
1338 | se->start_block = start_block; | ||
1339 | return 1; | ||
1340 | } else { | ||
1341 | lh = sis->first_swap_extent.list.prev; /* Highest extent */ | ||
1357 | se = list_entry(lh, struct swap_extent, list); | 1342 | se = list_entry(lh, struct swap_extent, list); |
1358 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1343 | BUG_ON(se->start_page + se->nr_pages != start_page); |
1359 | if (se->start_block + se->nr_pages == start_block) { | 1344 | if (se->start_block + se->nr_pages == start_block) { |
@@ -1373,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1373 | new_se->nr_pages = nr_pages; | 1358 | new_se->nr_pages = nr_pages; |
1374 | new_se->start_block = start_block; | 1359 | new_se->start_block = start_block; |
1375 | 1360 | ||
1376 | list_add_tail(&new_se->list, &sis->extent_list); | 1361 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
1377 | return 1; | 1362 | return 1; |
1378 | } | 1363 | } |
1379 | 1364 | ||
@@ -1425,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1425 | if (S_ISBLK(inode->i_mode)) { | 1410 | if (S_ISBLK(inode->i_mode)) { |
1426 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1411 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1427 | *span = sis->pages; | 1412 | *span = sis->pages; |
1428 | goto done; | 1413 | goto out; |
1429 | } | 1414 | } |
1430 | 1415 | ||
1431 | blkbits = inode->i_blkbits; | 1416 | blkbits = inode->i_blkbits; |
@@ -1496,25 +1481,22 @@ reprobe: | |||
1496 | sis->max = page_no; | 1481 | sis->max = page_no; |
1497 | sis->pages = page_no - 1; | 1482 | sis->pages = page_no - 1; |
1498 | sis->highest_bit = page_no - 1; | 1483 | sis->highest_bit = page_no - 1; |
1499 | done: | 1484 | out: |
1500 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1485 | return ret; |
1501 | struct swap_extent, list); | ||
1502 | goto out; | ||
1503 | bad_bmap: | 1486 | bad_bmap: |
1504 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1487 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
1505 | ret = -EINVAL; | 1488 | ret = -EINVAL; |
1506 | out: | 1489 | goto out; |
1507 | return ret; | ||
1508 | } | 1490 | } |
1509 | 1491 | ||
1510 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1492 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1511 | { | 1493 | { |
1512 | struct swap_info_struct * p = NULL; | 1494 | struct swap_info_struct *p = NULL; |
1513 | unsigned short *swap_map; | 1495 | unsigned char *swap_map; |
1514 | struct file *swap_file, *victim; | 1496 | struct file *swap_file, *victim; |
1515 | struct address_space *mapping; | 1497 | struct address_space *mapping; |
1516 | struct inode *inode; | 1498 | struct inode *inode; |
1517 | char * pathname; | 1499 | char *pathname; |
1518 | int i, type, prev; | 1500 | int i, type, prev; |
1519 | int err; | 1501 | int err; |
1520 | 1502 | ||
@@ -1535,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1535 | mapping = victim->f_mapping; | 1517 | mapping = victim->f_mapping; |
1536 | prev = -1; | 1518 | prev = -1; |
1537 | spin_lock(&swap_lock); | 1519 | spin_lock(&swap_lock); |
1538 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1520 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
1539 | p = swap_info + type; | 1521 | p = swap_info[type]; |
1540 | if (p->flags & SWP_WRITEOK) { | 1522 | if (p->flags & SWP_WRITEOK) { |
1541 | if (p->swap_file->f_mapping == mapping) | 1523 | if (p->swap_file->f_mapping == mapping) |
1542 | break; | 1524 | break; |
@@ -1555,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1555 | spin_unlock(&swap_lock); | 1537 | spin_unlock(&swap_lock); |
1556 | goto out_dput; | 1538 | goto out_dput; |
1557 | } | 1539 | } |
1558 | if (prev < 0) { | 1540 | if (prev < 0) |
1559 | swap_list.head = p->next; | 1541 | swap_list.head = p->next; |
1560 | } else { | 1542 | else |
1561 | swap_info[prev].next = p->next; | 1543 | swap_info[prev]->next = p->next; |
1562 | } | ||
1563 | if (type == swap_list.next) { | 1544 | if (type == swap_list.next) { |
1564 | /* just pick something that's safe... */ | 1545 | /* just pick something that's safe... */ |
1565 | swap_list.next = swap_list.head; | 1546 | swap_list.next = swap_list.head; |
1566 | } | 1547 | } |
1567 | if (p->prio < 0) { | 1548 | if (p->prio < 0) { |
1568 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1549 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1569 | swap_info[i].prio = p->prio--; | 1550 | swap_info[i]->prio = p->prio--; |
1570 | least_priority++; | 1551 | least_priority++; |
1571 | } | 1552 | } |
1572 | nr_swap_pages -= p->pages; | 1553 | nr_swap_pages -= p->pages; |
@@ -1584,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1584 | if (p->prio < 0) | 1565 | if (p->prio < 0) |
1585 | p->prio = --least_priority; | 1566 | p->prio = --least_priority; |
1586 | prev = -1; | 1567 | prev = -1; |
1587 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1568 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
1588 | if (p->prio >= swap_info[i].prio) | 1569 | if (p->prio >= swap_info[i]->prio) |
1589 | break; | 1570 | break; |
1590 | prev = i; | 1571 | prev = i; |
1591 | } | 1572 | } |
1592 | p->next = i; | 1573 | p->next = i; |
1593 | if (prev < 0) | 1574 | if (prev < 0) |
1594 | swap_list.head = swap_list.next = p - swap_info; | 1575 | swap_list.head = swap_list.next = type; |
1595 | else | 1576 | else |
1596 | swap_info[prev].next = p - swap_info; | 1577 | swap_info[prev]->next = type; |
1597 | nr_swap_pages += p->pages; | 1578 | nr_swap_pages += p->pages; |
1598 | total_swap_pages += p->pages; | 1579 | total_swap_pages += p->pages; |
1599 | p->flags |= SWP_WRITEOK; | 1580 | p->flags |= SWP_WRITEOK; |
@@ -1606,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1606 | up_write(&swap_unplug_sem); | 1587 | up_write(&swap_unplug_sem); |
1607 | 1588 | ||
1608 | destroy_swap_extents(p); | 1589 | destroy_swap_extents(p); |
1590 | if (p->flags & SWP_CONTINUED) | ||
1591 | free_swap_count_continuations(p); | ||
1592 | |||
1609 | mutex_lock(&swapon_mutex); | 1593 | mutex_lock(&swapon_mutex); |
1610 | spin_lock(&swap_lock); | 1594 | spin_lock(&swap_lock); |
1611 | drain_mmlist(); | 1595 | drain_mmlist(); |
@@ -1653,8 +1637,8 @@ out: | |||
1653 | /* iterator */ | 1637 | /* iterator */ |
1654 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1638 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
1655 | { | 1639 | { |
1656 | struct swap_info_struct *ptr = swap_info; | 1640 | struct swap_info_struct *si; |
1657 | int i; | 1641 | int type; |
1658 | loff_t l = *pos; | 1642 | loff_t l = *pos; |
1659 | 1643 | ||
1660 | mutex_lock(&swapon_mutex); | 1644 | mutex_lock(&swapon_mutex); |
@@ -1662,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1662 | if (!l) | 1646 | if (!l) |
1663 | return SEQ_START_TOKEN; | 1647 | return SEQ_START_TOKEN; |
1664 | 1648 | ||
1665 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1649 | for (type = 0; type < nr_swapfiles; type++) { |
1666 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1650 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1651 | si = swap_info[type]; | ||
1652 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1667 | continue; | 1653 | continue; |
1668 | if (!--l) | 1654 | if (!--l) |
1669 | return ptr; | 1655 | return si; |
1670 | } | 1656 | } |
1671 | 1657 | ||
1672 | return NULL; | 1658 | return NULL; |
@@ -1674,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1674 | 1660 | ||
1675 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1661 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1676 | { | 1662 | { |
1677 | struct swap_info_struct *ptr; | 1663 | struct swap_info_struct *si = v; |
1678 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1664 | int type; |
1679 | 1665 | ||
1680 | if (v == SEQ_START_TOKEN) | 1666 | if (v == SEQ_START_TOKEN) |
1681 | ptr = swap_info; | 1667 | type = 0; |
1682 | else { | 1668 | else |
1683 | ptr = v; | 1669 | type = si->type + 1; |
1684 | ptr++; | ||
1685 | } | ||
1686 | 1670 | ||
1687 | for (; ptr < endptr; ptr++) { | 1671 | for (; type < nr_swapfiles; type++) { |
1688 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1672 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
1673 | si = swap_info[type]; | ||
1674 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
1689 | continue; | 1675 | continue; |
1690 | ++*pos; | 1676 | ++*pos; |
1691 | return ptr; | 1677 | return si; |
1692 | } | 1678 | } |
1693 | 1679 | ||
1694 | return NULL; | 1680 | return NULL; |
@@ -1701,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v) | |||
1701 | 1687 | ||
1702 | static int swap_show(struct seq_file *swap, void *v) | 1688 | static int swap_show(struct seq_file *swap, void *v) |
1703 | { | 1689 | { |
1704 | struct swap_info_struct *ptr = v; | 1690 | struct swap_info_struct *si = v; |
1705 | struct file *file; | 1691 | struct file *file; |
1706 | int len; | 1692 | int len; |
1707 | 1693 | ||
1708 | if (ptr == SEQ_START_TOKEN) { | 1694 | if (si == SEQ_START_TOKEN) { |
1709 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1695 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1710 | return 0; | 1696 | return 0; |
1711 | } | 1697 | } |
1712 | 1698 | ||
1713 | file = ptr->swap_file; | 1699 | file = si->swap_file; |
1714 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1700 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1715 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1701 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1716 | len < 40 ? 40 - len : 1, " ", | 1702 | len < 40 ? 40 - len : 1, " ", |
1717 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1703 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1718 | "partition" : "file\t", | 1704 | "partition" : "file\t", |
1719 | ptr->pages << (PAGE_SHIFT - 10), | 1705 | si->pages << (PAGE_SHIFT - 10), |
1720 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1706 | si->inuse_pages << (PAGE_SHIFT - 10), |
1721 | ptr->prio); | 1707 | si->prio); |
1722 | return 0; | 1708 | return 0; |
1723 | } | 1709 | } |
1724 | 1710 | ||
@@ -1765,7 +1751,7 @@ late_initcall(max_swapfiles_check); | |||
1765 | */ | 1751 | */ |
1766 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1752 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1767 | { | 1753 | { |
1768 | struct swap_info_struct * p; | 1754 | struct swap_info_struct *p; |
1769 | char *name = NULL; | 1755 | char *name = NULL; |
1770 | struct block_device *bdev = NULL; | 1756 | struct block_device *bdev = NULL; |
1771 | struct file *swap_file = NULL; | 1757 | struct file *swap_file = NULL; |
@@ -1779,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1779 | sector_t span; | 1765 | sector_t span; |
1780 | unsigned long maxpages = 1; | 1766 | unsigned long maxpages = 1; |
1781 | unsigned long swapfilepages; | 1767 | unsigned long swapfilepages; |
1782 | unsigned short *swap_map = NULL; | 1768 | unsigned char *swap_map = NULL; |
1783 | struct page *page = NULL; | 1769 | struct page *page = NULL; |
1784 | struct inode *inode = NULL; | 1770 | struct inode *inode = NULL; |
1785 | int did_down = 0; | 1771 | int did_down = 0; |
1786 | 1772 | ||
1787 | if (!capable(CAP_SYS_ADMIN)) | 1773 | if (!capable(CAP_SYS_ADMIN)) |
1788 | return -EPERM; | 1774 | return -EPERM; |
1775 | |||
1776 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
1777 | if (!p) | ||
1778 | return -ENOMEM; | ||
1779 | |||
1789 | spin_lock(&swap_lock); | 1780 | spin_lock(&swap_lock); |
1790 | p = swap_info; | 1781 | for (type = 0; type < nr_swapfiles; type++) { |
1791 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1782 | if (!(swap_info[type]->flags & SWP_USED)) |
1792 | if (!(p->flags & SWP_USED)) | ||
1793 | break; | 1783 | break; |
1784 | } | ||
1794 | error = -EPERM; | 1785 | error = -EPERM; |
1795 | if (type >= MAX_SWAPFILES) { | 1786 | if (type >= MAX_SWAPFILES) { |
1796 | spin_unlock(&swap_lock); | 1787 | spin_unlock(&swap_lock); |
1788 | kfree(p); | ||
1797 | goto out; | 1789 | goto out; |
1798 | } | 1790 | } |
1799 | if (type >= nr_swapfiles) | 1791 | if (type >= nr_swapfiles) { |
1800 | nr_swapfiles = type+1; | 1792 | p->type = type; |
1801 | memset(p, 0, sizeof(*p)); | 1793 | swap_info[type] = p; |
1802 | INIT_LIST_HEAD(&p->extent_list); | 1794 | /* |
1795 | * Write swap_info[type] before nr_swapfiles, in case a | ||
1796 | * racing procfs swap_start() or swap_next() is reading them. | ||
1797 | * (We never shrink nr_swapfiles, we never free this entry.) | ||
1798 | */ | ||
1799 | smp_wmb(); | ||
1800 | nr_swapfiles++; | ||
1801 | } else { | ||
1802 | kfree(p); | ||
1803 | p = swap_info[type]; | ||
1804 | /* | ||
1805 | * Do not memset this entry: a racing procfs swap_next() | ||
1806 | * would be relying on p->type to remain valid. | ||
1807 | */ | ||
1808 | } | ||
1809 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
1803 | p->flags = SWP_USED; | 1810 | p->flags = SWP_USED; |
1804 | p->next = -1; | 1811 | p->next = -1; |
1805 | spin_unlock(&swap_lock); | 1812 | spin_unlock(&swap_lock); |
1813 | |||
1806 | name = getname(specialfile); | 1814 | name = getname(specialfile); |
1807 | error = PTR_ERR(name); | 1815 | error = PTR_ERR(name); |
1808 | if (IS_ERR(name)) { | 1816 | if (IS_ERR(name)) { |
@@ -1822,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1822 | 1830 | ||
1823 | error = -EBUSY; | 1831 | error = -EBUSY; |
1824 | for (i = 0; i < nr_swapfiles; i++) { | 1832 | for (i = 0; i < nr_swapfiles; i++) { |
1825 | struct swap_info_struct *q = &swap_info[i]; | 1833 | struct swap_info_struct *q = swap_info[i]; |
1826 | 1834 | ||
1827 | if (i == type || !q->swap_file) | 1835 | if (i == type || !q->swap_file) |
1828 | continue; | 1836 | continue; |
@@ -1897,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1897 | 1905 | ||
1898 | p->lowest_bit = 1; | 1906 | p->lowest_bit = 1; |
1899 | p->cluster_next = 1; | 1907 | p->cluster_next = 1; |
1908 | p->cluster_nr = 0; | ||
1900 | 1909 | ||
1901 | /* | 1910 | /* |
1902 | * Find out how many pages are allowed for a single swap | 1911 | * Find out how many pages are allowed for a single swap |
@@ -1932,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1932 | goto bad_swap; | 1941 | goto bad_swap; |
1933 | 1942 | ||
1934 | /* OK, set up the swap map and apply the bad block list */ | 1943 | /* OK, set up the swap map and apply the bad block list */ |
1935 | swap_map = vmalloc(maxpages * sizeof(short)); | 1944 | swap_map = vmalloc(maxpages); |
1936 | if (!swap_map) { | 1945 | if (!swap_map) { |
1937 | error = -ENOMEM; | 1946 | error = -ENOMEM; |
1938 | goto bad_swap; | 1947 | goto bad_swap; |
1939 | } | 1948 | } |
1940 | 1949 | ||
1941 | memset(swap_map, 0, maxpages * sizeof(short)); | 1950 | memset(swap_map, 0, maxpages); |
1942 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1951 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1943 | int page_nr = swap_header->info.badpages[i]; | 1952 | int page_nr = swap_header->info.badpages[i]; |
1944 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1953 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
@@ -2003,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2003 | 2012 | ||
2004 | /* insert swap space into swap_list: */ | 2013 | /* insert swap space into swap_list: */ |
2005 | prev = -1; | 2014 | prev = -1; |
2006 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2015 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
2007 | if (p->prio >= swap_info[i].prio) { | 2016 | if (p->prio >= swap_info[i]->prio) |
2008 | break; | 2017 | break; |
2009 | } | ||
2010 | prev = i; | 2018 | prev = i; |
2011 | } | 2019 | } |
2012 | p->next = i; | 2020 | p->next = i; |
2013 | if (prev < 0) { | 2021 | if (prev < 0) |
2014 | swap_list.head = swap_list.next = p - swap_info; | 2022 | swap_list.head = swap_list.next = type; |
2015 | } else { | 2023 | else |
2016 | swap_info[prev].next = p - swap_info; | 2024 | swap_info[prev]->next = type; |
2017 | } | ||
2018 | spin_unlock(&swap_lock); | 2025 | spin_unlock(&swap_lock); |
2019 | mutex_unlock(&swapon_mutex); | 2026 | mutex_unlock(&swapon_mutex); |
2020 | error = 0; | 2027 | error = 0; |
@@ -2051,15 +2058,15 @@ out: | |||
2051 | 2058 | ||
2052 | void si_swapinfo(struct sysinfo *val) | 2059 | void si_swapinfo(struct sysinfo *val) |
2053 | { | 2060 | { |
2054 | unsigned int i; | 2061 | unsigned int type; |
2055 | unsigned long nr_to_be_unused = 0; | 2062 | unsigned long nr_to_be_unused = 0; |
2056 | 2063 | ||
2057 | spin_lock(&swap_lock); | 2064 | spin_lock(&swap_lock); |
2058 | for (i = 0; i < nr_swapfiles; i++) { | 2065 | for (type = 0; type < nr_swapfiles; type++) { |
2059 | if (!(swap_info[i].flags & SWP_USED) || | 2066 | struct swap_info_struct *si = swap_info[type]; |
2060 | (swap_info[i].flags & SWP_WRITEOK)) | 2067 | |
2061 | continue; | 2068 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2062 | nr_to_be_unused += swap_info[i].inuse_pages; | 2069 | nr_to_be_unused += si->inuse_pages; |
2063 | } | 2070 | } |
2064 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2071 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
2065 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2072 | val->totalswap = total_swap_pages + nr_to_be_unused; |
@@ -2069,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val) | |||
2069 | /* | 2076 | /* |
2070 | * Verify that a swap entry is valid and increment its swap map count. | 2077 | * Verify that a swap entry is valid and increment its swap map count. |
2071 | * | 2078 | * |
2072 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | ||
2073 | * "permanent", but will be reclaimed by the next swapoff. | ||
2074 | * Returns error code in following case. | 2079 | * Returns error code in following case. |
2075 | * - success -> 0 | 2080 | * - success -> 0 |
2076 | * - swp_entry is invalid -> EINVAL | 2081 | * - swp_entry is invalid -> EINVAL |
2077 | * - swp_entry is migration entry -> EINVAL | 2082 | * - swp_entry is migration entry -> EINVAL |
2078 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2083 | * - swap-cache reference is requested but there is already one. -> EEXIST |
2079 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2084 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
2085 | * - swap-mapped reference requested but needs continued swap count. -> ENOMEM | ||
2080 | */ | 2086 | */ |
2081 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2087 | static int __swap_duplicate(swp_entry_t entry, unsigned char usage) |
2082 | { | 2088 | { |
2083 | struct swap_info_struct * p; | 2089 | struct swap_info_struct *p; |
2084 | unsigned long offset, type; | 2090 | unsigned long offset, type; |
2085 | int result = -EINVAL; | 2091 | unsigned char count; |
2086 | int count; | 2092 | unsigned char has_cache; |
2087 | bool has_cache; | 2093 | int err = -EINVAL; |
2088 | 2094 | ||
2089 | if (non_swap_entry(entry)) | 2095 | if (non_swap_entry(entry)) |
2090 | return -EINVAL; | 2096 | goto out; |
2091 | 2097 | ||
2092 | type = swp_type(entry); | 2098 | type = swp_type(entry); |
2093 | if (type >= nr_swapfiles) | 2099 | if (type >= nr_swapfiles) |
2094 | goto bad_file; | 2100 | goto bad_file; |
2095 | p = type + swap_info; | 2101 | p = swap_info[type]; |
2096 | offset = swp_offset(entry); | 2102 | offset = swp_offset(entry); |
2097 | 2103 | ||
2098 | spin_lock(&swap_lock); | 2104 | spin_lock(&swap_lock); |
2099 | |||
2100 | if (unlikely(offset >= p->max)) | 2105 | if (unlikely(offset >= p->max)) |
2101 | goto unlock_out; | 2106 | goto unlock_out; |
2102 | 2107 | ||
2103 | count = swap_count(p->swap_map[offset]); | 2108 | count = p->swap_map[offset]; |
2104 | has_cache = swap_has_cache(p->swap_map[offset]); | 2109 | has_cache = count & SWAP_HAS_CACHE; |
2110 | count &= ~SWAP_HAS_CACHE; | ||
2111 | err = 0; | ||
2105 | 2112 | ||
2106 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2113 | if (usage == SWAP_HAS_CACHE) { |
2107 | 2114 | ||
2108 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2115 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
2109 | if (!has_cache && count) { | 2116 | if (!has_cache && count) |
2110 | p->swap_map[offset] = encode_swapmap(count, true); | 2117 | has_cache = SWAP_HAS_CACHE; |
2111 | result = 0; | 2118 | else if (has_cache) /* someone else added cache */ |
2112 | } else if (has_cache) /* someone added cache */ | 2119 | err = -EEXIST; |
2113 | result = -EEXIST; | 2120 | else /* no users remaining */ |
2114 | else if (!count) /* no users */ | 2121 | err = -ENOENT; |
2115 | result = -ENOENT; | ||
2116 | 2122 | ||
2117 | } else if (count || has_cache) { | 2123 | } else if (count || has_cache) { |
2118 | if (count < SWAP_MAP_MAX - 1) { | 2124 | |
2119 | p->swap_map[offset] = encode_swapmap(count + 1, | 2125 | if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) |
2120 | has_cache); | 2126 | count += usage; |
2121 | result = 0; | 2127 | else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) |
2122 | } else if (count <= SWAP_MAP_MAX) { | 2128 | err = -EINVAL; |
2123 | if (swap_overflow++ < 5) | 2129 | else if (swap_count_continued(p, offset, count)) |
2124 | printk(KERN_WARNING | 2130 | count = COUNT_CONTINUED; |
2125 | "swap_dup: swap entry overflow\n"); | 2131 | else |
2126 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2132 | err = -ENOMEM; |
2127 | has_cache); | ||
2128 | result = 0; | ||
2129 | } | ||
2130 | } else | 2133 | } else |
2131 | result = -ENOENT; /* unused swap entry */ | 2134 | err = -ENOENT; /* unused swap entry */ |
2135 | |||
2136 | p->swap_map[offset] = count | has_cache; | ||
2137 | |||
2132 | unlock_out: | 2138 | unlock_out: |
2133 | spin_unlock(&swap_lock); | 2139 | spin_unlock(&swap_lock); |
2134 | out: | 2140 | out: |
2135 | return result; | 2141 | return err; |
2136 | 2142 | ||
2137 | bad_file: | 2143 | bad_file: |
2138 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2144 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
2139 | goto out; | 2145 | goto out; |
2140 | } | 2146 | } |
2147 | |||
2148 | /* | ||
2149 | * Help swapoff by noting that swap entry belongs to shmem/tmpfs | ||
2150 | * (in which case its reference count is never incremented). | ||
2151 | */ | ||
2152 | void swap_shmem_alloc(swp_entry_t entry) | ||
2153 | { | ||
2154 | __swap_duplicate(entry, SWAP_MAP_SHMEM); | ||
2155 | } | ||
2156 | |||
2141 | /* | 2157 | /* |
2142 | * increase reference count of swap entry by 1. | 2158 | * increase reference count of swap entry by 1. |
2143 | */ | 2159 | */ |
2144 | void swap_duplicate(swp_entry_t entry) | 2160 | int swap_duplicate(swp_entry_t entry) |
2145 | { | 2161 | { |
2146 | __swap_duplicate(entry, SWAP_MAP); | 2162 | int err = 0; |
2163 | |||
2164 | while (!err && __swap_duplicate(entry, 1) == -ENOMEM) | ||
2165 | err = add_swap_count_continuation(entry, GFP_ATOMIC); | ||
2166 | return err; | ||
2147 | } | 2167 | } |
2148 | 2168 | ||
2149 | /* | 2169 | /* |
2150 | * @entry: swap entry for which we allocate swap cache. | 2170 | * @entry: swap entry for which we allocate swap cache. |
2151 | * | 2171 | * |
2152 | * Called when allocating swap cache for exising swap entry, | 2172 | * Called when allocating swap cache for existing swap entry, |
2153 | * This can return error codes. Returns 0 at success. | 2173 | * This can return error codes. Returns 0 at success. |
2154 | * -EBUSY means there is a swap cache. | 2174 | * -EBUSY means there is a swap cache. |
2155 | * Note: return code is different from swap_duplicate(). | 2175 | * Note: return code is different from swap_duplicate(). |
2156 | */ | 2176 | */ |
2157 | int swapcache_prepare(swp_entry_t entry) | 2177 | int swapcache_prepare(swp_entry_t entry) |
2158 | { | 2178 | { |
2159 | return __swap_duplicate(entry, SWAP_CACHE); | 2179 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2160 | } | ||
2161 | |||
2162 | |||
2163 | struct swap_info_struct * | ||
2164 | get_swap_info_struct(unsigned type) | ||
2165 | { | ||
2166 | return &swap_info[type]; | ||
2167 | } | 2180 | } |
2168 | 2181 | ||
2169 | /* | 2182 | /* |
@@ -2181,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2181 | if (!our_page_cluster) /* no readahead */ | 2194 | if (!our_page_cluster) /* no readahead */ |
2182 | return 0; | 2195 | return 0; |
2183 | 2196 | ||
2184 | si = &swap_info[swp_type(entry)]; | 2197 | si = swap_info[swp_type(entry)]; |
2185 | target = swp_offset(entry); | 2198 | target = swp_offset(entry); |
2186 | base = (target >> our_page_cluster) << our_page_cluster; | 2199 | base = (target >> our_page_cluster) << our_page_cluster; |
2187 | end = base + (1 << our_page_cluster); | 2200 | end = base + (1 << our_page_cluster); |
@@ -2217,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2217 | *offset = ++toff; | 2230 | *offset = ++toff; |
2218 | return nr_pages? ++nr_pages: 0; | 2231 | return nr_pages? ++nr_pages: 0; |
2219 | } | 2232 | } |
2233 | |||
2234 | /* | ||
2235 | * add_swap_count_continuation - called when a swap count is duplicated | ||
2236 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | ||
2237 | * page of the original vmalloc'ed swap_map, to hold the continuation count | ||
2238 | * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called | ||
2239 | * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. | ||
2240 | * | ||
2241 | * These continuation pages are seldom referenced: the common paths all work | ||
2242 | * on the original swap_map, only referring to a continuation page when the | ||
2243 | * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. | ||
2244 | * | ||
2245 | * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding | ||
2246 | * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) | ||
2247 | * can be called after dropping locks. | ||
2248 | */ | ||
2249 | int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | ||
2250 | { | ||
2251 | struct swap_info_struct *si; | ||
2252 | struct page *head; | ||
2253 | struct page *page; | ||
2254 | struct page *list_page; | ||
2255 | pgoff_t offset; | ||
2256 | unsigned char count; | ||
2257 | |||
2258 | /* | ||
2259 | * When debugging, it's easier to use __GFP_ZERO here; but it's better | ||
2260 | * for latency not to zero a page while GFP_ATOMIC and holding locks. | ||
2261 | */ | ||
2262 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); | ||
2263 | |||
2264 | si = swap_info_get(entry); | ||
2265 | if (!si) { | ||
2266 | /* | ||
2267 | * An acceptable race has occurred since the failing | ||
2268 | * __swap_duplicate(): the swap entry has been freed, | ||
2269 | * perhaps even the whole swap_map cleared for swapoff. | ||
2270 | */ | ||
2271 | goto outer; | ||
2272 | } | ||
2273 | |||
2274 | offset = swp_offset(entry); | ||
2275 | count = si->swap_map[offset] & ~SWAP_HAS_CACHE; | ||
2276 | |||
2277 | if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { | ||
2278 | /* | ||
2279 | * The higher the swap count, the more likely it is that tasks | ||
2280 | * will race to add swap count continuation: we need to avoid | ||
2281 | * over-provisioning. | ||
2282 | */ | ||
2283 | goto out; | ||
2284 | } | ||
2285 | |||
2286 | if (!page) { | ||
2287 | spin_unlock(&swap_lock); | ||
2288 | return -ENOMEM; | ||
2289 | } | ||
2290 | |||
2291 | /* | ||
2292 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | ||
2293 | * no architecture is using highmem pages for kernel pagetables: so it | ||
2294 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | ||
2295 | */ | ||
2296 | head = vmalloc_to_page(si->swap_map + offset); | ||
2297 | offset &= ~PAGE_MASK; | ||
2298 | |||
2299 | /* | ||
2300 | * Page allocation does not initialize the page's lru field, | ||
2301 | * but it does always reset its private field. | ||
2302 | */ | ||
2303 | if (!page_private(head)) { | ||
2304 | BUG_ON(count & COUNT_CONTINUED); | ||
2305 | INIT_LIST_HEAD(&head->lru); | ||
2306 | set_page_private(head, SWP_CONTINUED); | ||
2307 | si->flags |= SWP_CONTINUED; | ||
2308 | } | ||
2309 | |||
2310 | list_for_each_entry(list_page, &head->lru, lru) { | ||
2311 | unsigned char *map; | ||
2312 | |||
2313 | /* | ||
2314 | * If the previous map said no continuation, but we've found | ||
2315 | * a continuation page, free our allocation and use this one. | ||
2316 | */ | ||
2317 | if (!(count & COUNT_CONTINUED)) | ||
2318 | goto out; | ||
2319 | |||
2320 | map = kmap_atomic(list_page, KM_USER0) + offset; | ||
2321 | count = *map; | ||
2322 | kunmap_atomic(map, KM_USER0); | ||
2323 | |||
2324 | /* | ||
2325 | * If this continuation count now has some space in it, | ||
2326 | * free our allocation and use this one. | ||
2327 | */ | ||
2328 | if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) | ||
2329 | goto out; | ||
2330 | } | ||
2331 | |||
2332 | list_add_tail(&page->lru, &head->lru); | ||
2333 | page = NULL; /* now it's attached, don't free it */ | ||
2334 | out: | ||
2335 | spin_unlock(&swap_lock); | ||
2336 | outer: | ||
2337 | if (page) | ||
2338 | __free_page(page); | ||
2339 | return 0; | ||
2340 | } | ||
2341 | |||
2342 | /* | ||
2343 | * swap_count_continued - when the original swap_map count is incremented | ||
2344 | * from SWAP_MAP_MAX, check if there is already a continuation page to carry | ||
2345 | * into, carry if so, or else fail until a new continuation page is allocated; | ||
2346 | * when the original swap_map count is decremented from 0 with continuation, | ||
2347 | * borrow from the continuation and report whether it still holds more. | ||
2348 | * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. | ||
2349 | */ | ||
2350 | static bool swap_count_continued(struct swap_info_struct *si, | ||
2351 | pgoff_t offset, unsigned char count) | ||
2352 | { | ||
2353 | struct page *head; | ||
2354 | struct page *page; | ||
2355 | unsigned char *map; | ||
2356 | |||
2357 | head = vmalloc_to_page(si->swap_map + offset); | ||
2358 | if (page_private(head) != SWP_CONTINUED) { | ||
2359 | BUG_ON(count & COUNT_CONTINUED); | ||
2360 | return false; /* need to add count continuation */ | ||
2361 | } | ||
2362 | |||
2363 | offset &= ~PAGE_MASK; | ||
2364 | page = list_entry(head->lru.next, struct page, lru); | ||
2365 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2366 | |||
2367 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ | ||
2368 | goto init_map; /* jump over SWAP_CONT_MAX checks */ | ||
2369 | |||
2370 | if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ | ||
2371 | /* | ||
2372 | * Think of how you add 1 to 999 | ||
2373 | */ | ||
2374 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { | ||
2375 | kunmap_atomic(map, KM_USER0); | ||
2376 | page = list_entry(page->lru.next, struct page, lru); | ||
2377 | BUG_ON(page == head); | ||
2378 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2379 | } | ||
2380 | if (*map == SWAP_CONT_MAX) { | ||
2381 | kunmap_atomic(map, KM_USER0); | ||
2382 | page = list_entry(page->lru.next, struct page, lru); | ||
2383 | if (page == head) | ||
2384 | return false; /* add count continuation */ | ||
2385 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2386 | init_map: *map = 0; /* we didn't zero the page */ | ||
2387 | } | ||
2388 | *map += 1; | ||
2389 | kunmap_atomic(map, KM_USER0); | ||
2390 | page = list_entry(page->lru.prev, struct page, lru); | ||
2391 | while (page != head) { | ||
2392 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2393 | *map = COUNT_CONTINUED; | ||
2394 | kunmap_atomic(map, KM_USER0); | ||
2395 | page = list_entry(page->lru.prev, struct page, lru); | ||
2396 | } | ||
2397 | return true; /* incremented */ | ||
2398 | |||
2399 | } else { /* decrementing */ | ||
2400 | /* | ||
2401 | * Think of how you subtract 1 from 1000 | ||
2402 | */ | ||
2403 | BUG_ON(count != COUNT_CONTINUED); | ||
2404 | while (*map == COUNT_CONTINUED) { | ||
2405 | kunmap_atomic(map, KM_USER0); | ||
2406 | page = list_entry(page->lru.next, struct page, lru); | ||
2407 | BUG_ON(page == head); | ||
2408 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2409 | } | ||
2410 | BUG_ON(*map == 0); | ||
2411 | *map -= 1; | ||
2412 | if (*map == 0) | ||
2413 | count = 0; | ||
2414 | kunmap_atomic(map, KM_USER0); | ||
2415 | page = list_entry(page->lru.prev, struct page, lru); | ||
2416 | while (page != head) { | ||
2417 | map = kmap_atomic(page, KM_USER0) + offset; | ||
2418 | *map = SWAP_CONT_MAX | count; | ||
2419 | count = COUNT_CONTINUED; | ||
2420 | kunmap_atomic(map, KM_USER0); | ||
2421 | page = list_entry(page->lru.prev, struct page, lru); | ||
2422 | } | ||
2423 | return count == COUNT_CONTINUED; | ||
2424 | } | ||
2425 | } | ||
2426 | |||
2427 | /* | ||
2428 | * free_swap_count_continuations - swapoff free all the continuation pages | ||
2429 | * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. | ||
2430 | */ | ||
2431 | static void free_swap_count_continuations(struct swap_info_struct *si) | ||
2432 | { | ||
2433 | pgoff_t offset; | ||
2434 | |||
2435 | for (offset = 0; offset < si->max; offset += PAGE_SIZE) { | ||
2436 | struct page *head; | ||
2437 | head = vmalloc_to_page(si->swap_map + offset); | ||
2438 | if (page_private(head)) { | ||
2439 | struct list_head *this, *next; | ||
2440 | list_for_each_safe(this, next, &head->lru) { | ||
2441 | struct page *page; | ||
2442 | page = list_entry(this, struct page, lru); | ||
2443 | list_del(this); | ||
2444 | __free_page(page); | ||
2445 | } | ||
2446 | } | ||
2447 | } | ||
2448 | } | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 2c147a7e5f2c..342deee22684 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
272 | pagevec_release(&pvec); | 272 | pagevec_release(&pvec); |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | mem_cgroup_uncharge_start(); | ||
275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 276 | for (i = 0; i < pagevec_count(&pvec); i++) { |
276 | struct page *page = pvec.pages[i]; | 277 | struct page *page = pvec.pages[i]; |
277 | 278 | ||
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
286 | unlock_page(page); | 287 | unlock_page(page); |
287 | } | 288 | } |
288 | pagevec_release(&pvec); | 289 | pagevec_release(&pvec); |
290 | mem_cgroup_uncharge_end(); | ||
289 | } | 291 | } |
290 | } | 292 | } |
291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 293 | EXPORT_SYMBOL(truncate_inode_pages_range); |
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
327 | pagevec_init(&pvec, 0); | 329 | pagevec_init(&pvec, 0); |
328 | while (next <= end && | 330 | while (next <= end && |
329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 331 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
332 | mem_cgroup_uncharge_start(); | ||
330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 333 | for (i = 0; i < pagevec_count(&pvec); i++) { |
331 | struct page *page = pvec.pages[i]; | 334 | struct page *page = pvec.pages[i]; |
332 | pgoff_t index; | 335 | pgoff_t index; |
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
354 | break; | 357 | break; |
355 | } | 358 | } |
356 | pagevec_release(&pvec); | 359 | pagevec_release(&pvec); |
360 | mem_cgroup_uncharge_end(); | ||
357 | cond_resched(); | 361 | cond_resched(); |
358 | } | 362 | } |
359 | return ret; | 363 | return ret; |
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
428 | while (next <= end && !wrapped && | 432 | while (next <= end && !wrapped && |
429 | pagevec_lookup(&pvec, mapping, next, | 433 | pagevec_lookup(&pvec, mapping, next, |
430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 434 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
435 | mem_cgroup_uncharge_start(); | ||
431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 436 | for (i = 0; i < pagevec_count(&pvec); i++) { |
432 | struct page *page = pvec.pages[i]; | 437 | struct page *page = pvec.pages[i]; |
433 | pgoff_t page_index; | 438 | pgoff_t page_index; |
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
477 | unlock_page(page); | 482 | unlock_page(page); |
478 | } | 483 | } |
479 | pagevec_release(&pvec); | 484 | pagevec_release(&pvec); |
485 | mem_cgroup_uncharge_end(); | ||
480 | cond_resched(); | 486 | cond_resched(); |
481 | } | 487 | } |
482 | return ret; | 488 | return ret; |
@@ -4,6 +4,10 @@ | |||
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/hugetlb.h> | ||
8 | #include <linux/syscalls.h> | ||
9 | #include <linux/mman.h> | ||
10 | #include <linux/file.h> | ||
7 | #include <asm/uaccess.h> | 11 | #include <asm/uaccess.h> |
8 | 12 | ||
9 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
@@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
268 | } | 272 | } |
269 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 273 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
270 | 274 | ||
275 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
276 | unsigned long, prot, unsigned long, flags, | ||
277 | unsigned long, fd, unsigned long, pgoff) | ||
278 | { | ||
279 | struct file * file = NULL; | ||
280 | unsigned long retval = -EBADF; | ||
281 | |||
282 | if (!(flags & MAP_ANONYMOUS)) { | ||
283 | if (unlikely(flags & MAP_HUGETLB)) | ||
284 | return -EINVAL; | ||
285 | file = fget(fd); | ||
286 | if (!file) | ||
287 | goto out; | ||
288 | } else if (flags & MAP_HUGETLB) { | ||
289 | struct user_struct *user = NULL; | ||
290 | /* | ||
291 | * VM_NORESERVE is used because the reservations will be | ||
292 | * taken when vm_ops->mmap() is called | ||
293 | * A dummy user value is used because we are not locking | ||
294 | * memory so no accounting is necessary | ||
295 | */ | ||
296 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
297 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
298 | &user, HUGETLB_ANONHUGE_INODE); | ||
299 | if (IS_ERR(file)) | ||
300 | return PTR_ERR(file); | ||
301 | } | ||
302 | |||
303 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
304 | |||
305 | down_write(&current->mm->mmap_sem); | ||
306 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
307 | up_write(&current->mm->mmap_sem); | ||
308 | |||
309 | if (file) | ||
310 | fput(file); | ||
311 | out: | ||
312 | return retval; | ||
313 | } | ||
314 | |||
271 | /* Tracepoints definitions. */ | 315 | /* Tracepoints definitions. */ |
272 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 316 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
273 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 317 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4a44cd..37e69295f250 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -761,7 +761,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
761 | spin_lock(&vbq->lock); | 761 | spin_lock(&vbq->lock); |
762 | list_add(&vb->free_list, &vbq->free); | 762 | list_add(&vb->free_list, &vbq->free); |
763 | spin_unlock(&vbq->lock); | 763 | spin_unlock(&vbq->lock); |
764 | put_cpu_var(vmap_cpu_blocks); | 764 | put_cpu_var(vmap_block_queue); |
765 | 765 | ||
766 | return vb; | 766 | return vb; |
767 | } | 767 | } |
@@ -826,7 +826,7 @@ again: | |||
826 | } | 826 | } |
827 | spin_unlock(&vb->lock); | 827 | spin_unlock(&vb->lock); |
828 | } | 828 | } |
829 | put_cpu_var(vmap_cpu_blocks); | 829 | put_cpu_var(vmap_block_queue); |
830 | rcu_read_unlock(); | 830 | rcu_read_unlock(); |
831 | 831 | ||
832 | if (!addr) { | 832 | if (!addr) { |
@@ -1411,6 +1411,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1411 | { | 1411 | { |
1412 | struct page **pages; | 1412 | struct page **pages; |
1413 | unsigned int nr_pages, array_size, i; | 1413 | unsigned int nr_pages, array_size, i; |
1414 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | ||
1414 | 1415 | ||
1415 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | 1416 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; |
1416 | array_size = (nr_pages * sizeof(struct page *)); | 1417 | array_size = (nr_pages * sizeof(struct page *)); |
@@ -1418,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1418 | area->nr_pages = nr_pages; | 1419 | area->nr_pages = nr_pages; |
1419 | /* Please note that the recursion is strictly bounded. */ | 1420 | /* Please note that the recursion is strictly bounded. */ |
1420 | if (array_size > PAGE_SIZE) { | 1421 | if (array_size > PAGE_SIZE) { |
1421 | pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, | 1422 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
1422 | PAGE_KERNEL, node, caller); | 1423 | PAGE_KERNEL, node, caller); |
1423 | area->flags |= VM_VPAGES; | 1424 | area->flags |= VM_VPAGES; |
1424 | } else { | 1425 | } else { |
1425 | pages = kmalloc_node(array_size, | 1426 | pages = kmalloc_node(array_size, nested_gfp, node); |
1426 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, | ||
1427 | node); | ||
1428 | } | 1427 | } |
1429 | area->pages = pages; | 1428 | area->pages = pages; |
1430 | area->caller = caller; | 1429 | area->caller = caller; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..885207a6b6b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -55,6 +55,11 @@ struct scan_control { | |||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | 55 | /* Number of pages freed so far during a call to shrink_zones() */ |
56 | unsigned long nr_reclaimed; | 56 | unsigned long nr_reclaimed; |
57 | 57 | ||
58 | /* How many pages shrink_list() should reclaim */ | ||
59 | unsigned long nr_to_reclaim; | ||
60 | |||
61 | unsigned long hibernation_mode; | ||
62 | |||
58 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
59 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
60 | 65 | ||
@@ -66,12 +71,6 @@ struct scan_control { | |||
66 | /* Can pages be swapped as part of reclaim? */ | 71 | /* Can pages be swapped as part of reclaim? */ |
67 | int may_swap; | 72 | int may_swap; |
68 | 73 | ||
69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | ||
70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | ||
71 | * In this context, it doesn't matter that we scan the | ||
72 | * whole list at once. */ | ||
73 | int swap_cluster_max; | ||
74 | |||
75 | int swappiness; | 74 | int swappiness; |
76 | 75 | ||
77 | int all_unreclaimable; | 76 | int all_unreclaimable; |
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
358 | * stalls if we need to run get_block(). We could test | 357 | * stalls if we need to run get_block(). We could test |
359 | * PagePrivate for that. | 358 | * PagePrivate for that. |
360 | * | 359 | * |
361 | * If this process is currently in generic_file_write() against | 360 | * If this process is currently in __generic_file_aio_write() against |
362 | * this page's queue, we can perform writeback even if that | 361 | * this page's queue, we can perform writeback even if that |
363 | * will block. | 362 | * will block. |
364 | * | 363 | * |
@@ -1132,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1132 | unsigned long nr_anon; | 1131 | unsigned long nr_anon; |
1133 | unsigned long nr_file; | 1132 | unsigned long nr_file; |
1134 | 1133 | ||
1135 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1134 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, |
1136 | &page_list, &nr_scan, sc->order, mode, | 1135 | &page_list, &nr_scan, sc->order, mode, |
1137 | zone, sc->mem_cgroup, 0, file); | 1136 | zone, sc->mem_cgroup, 0, file); |
1138 | 1137 | ||
@@ -1166,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1166 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | 1165 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); |
1167 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | 1166 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); |
1168 | 1167 | ||
1169 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1168 | reclaim_stat->recent_scanned[0] += nr_anon; |
1170 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1169 | reclaim_stat->recent_scanned[1] += nr_file; |
1171 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
1172 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
1173 | 1170 | ||
1174 | spin_unlock_irq(&zone->lru_lock); | 1171 | spin_unlock_irq(&zone->lru_lock); |
1175 | 1172 | ||
@@ -1464,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1464 | return low; | 1461 | return low; |
1465 | } | 1462 | } |
1466 | 1463 | ||
1464 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | ||
1465 | int file) | ||
1466 | { | ||
1467 | if (file) | ||
1468 | return inactive_file_is_low(zone, sc); | ||
1469 | else | ||
1470 | return inactive_anon_is_low(zone, sc); | ||
1471 | } | ||
1472 | |||
1467 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1473 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1468 | struct zone *zone, struct scan_control *sc, int priority) | 1474 | struct zone *zone, struct scan_control *sc, int priority) |
1469 | { | 1475 | { |
1470 | int file = is_file_lru(lru); | 1476 | int file = is_file_lru(lru); |
1471 | 1477 | ||
1472 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { | 1478 | if (is_active_lru(lru)) { |
1473 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1479 | if (inactive_list_is_low(zone, sc, file)) |
1480 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1474 | return 0; | 1481 | return 0; |
1475 | } | 1482 | } |
1476 | 1483 | ||
1477 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { | ||
1478 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1479 | return 0; | ||
1480 | } | ||
1481 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1484 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1482 | } | 1485 | } |
1483 | 1486 | ||
@@ -1567,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1567 | * until we collected @swap_cluster_max pages to scan. | 1570 | * until we collected @swap_cluster_max pages to scan. |
1568 | */ | 1571 | */ |
1569 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | 1572 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, |
1570 | unsigned long *nr_saved_scan, | 1573 | unsigned long *nr_saved_scan) |
1571 | unsigned long swap_cluster_max) | ||
1572 | { | 1574 | { |
1573 | unsigned long nr; | 1575 | unsigned long nr; |
1574 | 1576 | ||
1575 | *nr_saved_scan += nr_to_scan; | 1577 | *nr_saved_scan += nr_to_scan; |
1576 | nr = *nr_saved_scan; | 1578 | nr = *nr_saved_scan; |
1577 | 1579 | ||
1578 | if (nr >= swap_cluster_max) | 1580 | if (nr >= SWAP_CLUSTER_MAX) |
1579 | *nr_saved_scan = 0; | 1581 | *nr_saved_scan = 0; |
1580 | else | 1582 | else |
1581 | nr = 0; | 1583 | nr = 0; |
@@ -1594,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1594 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1596 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1595 | enum lru_list l; | 1597 | enum lru_list l; |
1596 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1598 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1597 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1599 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1598 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1600 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1599 | int noswap = 0; | 1601 | int noswap = 0; |
1600 | 1602 | ||
@@ -1616,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1616 | scan = (scan * percent[file]) / 100; | 1618 | scan = (scan * percent[file]) / 100; |
1617 | } | 1619 | } |
1618 | nr[l] = nr_scan_try_batch(scan, | 1620 | nr[l] = nr_scan_try_batch(scan, |
1619 | &reclaim_stat->nr_saved_scan[l], | 1621 | &reclaim_stat->nr_saved_scan[l]); |
1620 | swap_cluster_max); | ||
1621 | } | 1622 | } |
1622 | 1623 | ||
1623 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1624 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1624 | nr[LRU_INACTIVE_FILE]) { | 1625 | nr[LRU_INACTIVE_FILE]) { |
1625 | for_each_evictable_lru(l) { | 1626 | for_each_evictable_lru(l) { |
1626 | if (nr[l]) { | 1627 | if (nr[l]) { |
1627 | nr_to_scan = min(nr[l], swap_cluster_max); | 1628 | nr_to_scan = min_t(unsigned long, |
1629 | nr[l], SWAP_CLUSTER_MAX); | ||
1628 | nr[l] -= nr_to_scan; | 1630 | nr[l] -= nr_to_scan; |
1629 | 1631 | ||
1630 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1632 | nr_reclaimed += shrink_list(l, nr_to_scan, |
@@ -1639,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1639 | * with multiple processes reclaiming pages, the total | 1641 | * with multiple processes reclaiming pages, the total |
1640 | * freeing target can get unreasonably large. | 1642 | * freeing target can get unreasonably large. |
1641 | */ | 1643 | */ |
1642 | if (nr_reclaimed > swap_cluster_max && | 1644 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1643 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1644 | break; | 1645 | break; |
1645 | } | 1646 | } |
1646 | 1647 | ||
@@ -1738,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1738 | struct zoneref *z; | 1739 | struct zoneref *z; |
1739 | struct zone *zone; | 1740 | struct zone *zone; |
1740 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1741 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1742 | unsigned long writeback_threshold; | ||
1741 | 1743 | ||
1742 | delayacct_freepages_start(); | 1744 | delayacct_freepages_start(); |
1743 | 1745 | ||
@@ -1773,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1773 | } | 1775 | } |
1774 | } | 1776 | } |
1775 | total_scanned += sc->nr_scanned; | 1777 | total_scanned += sc->nr_scanned; |
1776 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { | 1778 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1777 | ret = sc->nr_reclaimed; | 1779 | ret = sc->nr_reclaimed; |
1778 | goto out; | 1780 | goto out; |
1779 | } | 1781 | } |
@@ -1785,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1785 | * that's undesirable in laptop mode, where we *want* lumpy | 1787 | * that's undesirable in laptop mode, where we *want* lumpy |
1786 | * writeout. So in laptop mode, write out the whole world. | 1788 | * writeout. So in laptop mode, write out the whole world. |
1787 | */ | 1789 | */ |
1788 | if (total_scanned > sc->swap_cluster_max + | 1790 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
1789 | sc->swap_cluster_max / 2) { | 1791 | if (total_scanned > writeback_threshold) { |
1790 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 1792 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
1791 | sc->may_writepage = 1; | 1793 | sc->may_writepage = 1; |
1792 | } | 1794 | } |
1793 | 1795 | ||
1794 | /* Take a nap, wait for some writeback to complete */ | 1796 | /* Take a nap, wait for some writeback to complete */ |
1795 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1797 | if (!sc->hibernation_mode && sc->nr_scanned && |
1798 | priority < DEF_PRIORITY - 2) | ||
1796 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1799 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1797 | } | 1800 | } |
1798 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1801 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
@@ -1831,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1831 | struct scan_control sc = { | 1834 | struct scan_control sc = { |
1832 | .gfp_mask = gfp_mask, | 1835 | .gfp_mask = gfp_mask, |
1833 | .may_writepage = !laptop_mode, | 1836 | .may_writepage = !laptop_mode, |
1834 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1837 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1835 | .may_unmap = 1, | 1838 | .may_unmap = 1, |
1836 | .may_swap = 1, | 1839 | .may_swap = 1, |
1837 | .swappiness = vm_swappiness, | 1840 | .swappiness = vm_swappiness, |
@@ -1855,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
1855 | .may_writepage = !laptop_mode, | 1858 | .may_writepage = !laptop_mode, |
1856 | .may_unmap = 1, | 1859 | .may_unmap = 1, |
1857 | .may_swap = !noswap, | 1860 | .may_swap = !noswap, |
1858 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1859 | .swappiness = swappiness, | 1861 | .swappiness = swappiness, |
1860 | .order = 0, | 1862 | .order = 0, |
1861 | .mem_cgroup = mem, | 1863 | .mem_cgroup = mem, |
@@ -1889,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1889 | .may_writepage = !laptop_mode, | 1891 | .may_writepage = !laptop_mode, |
1890 | .may_unmap = 1, | 1892 | .may_unmap = 1, |
1891 | .may_swap = !noswap, | 1893 | .may_swap = !noswap, |
1892 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1894 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1893 | .swappiness = swappiness, | 1895 | .swappiness = swappiness, |
1894 | .order = 0, | 1896 | .order = 0, |
1895 | .mem_cgroup = mem_cont, | 1897 | .mem_cgroup = mem_cont, |
@@ -1904,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1904 | } | 1906 | } |
1905 | #endif | 1907 | #endif |
1906 | 1908 | ||
1909 | /* is kswapd sleeping prematurely? */ | ||
1910 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | ||
1911 | { | ||
1912 | int i; | ||
1913 | |||
1914 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | ||
1915 | if (remaining) | ||
1916 | return 1; | ||
1917 | |||
1918 | /* If after HZ/10, a zone is below the high mark, it's premature */ | ||
1919 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
1920 | struct zone *zone = pgdat->node_zones + i; | ||
1921 | |||
1922 | if (!populated_zone(zone)) | ||
1923 | continue; | ||
1924 | |||
1925 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | ||
1926 | 0, 0)) | ||
1927 | return 1; | ||
1928 | } | ||
1929 | |||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1907 | /* | 1933 | /* |
1908 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1934 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1909 | * they are all at high_wmark_pages(zone). | 1935 | * they are all at high_wmark_pages(zone). |
@@ -1936,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1936 | .gfp_mask = GFP_KERNEL, | 1962 | .gfp_mask = GFP_KERNEL, |
1937 | .may_unmap = 1, | 1963 | .may_unmap = 1, |
1938 | .may_swap = 1, | 1964 | .may_swap = 1, |
1939 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1965 | /* |
1966 | * kswapd doesn't want to be bailed out while reclaim. because | ||
1967 | * we want to put equal scanning pressure on each zone. | ||
1968 | */ | ||
1969 | .nr_to_reclaim = ULONG_MAX, | ||
1940 | .swappiness = vm_swappiness, | 1970 | .swappiness = vm_swappiness, |
1941 | .order = order, | 1971 | .order = order, |
1942 | .mem_cgroup = NULL, | 1972 | .mem_cgroup = NULL, |
@@ -1961,6 +1991,7 @@ loop_again: | |||
1961 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1962 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1992 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
1963 | unsigned long lru_pages = 0; | 1993 | unsigned long lru_pages = 0; |
1994 | int has_under_min_watermark_zone = 0; | ||
1964 | 1995 | ||
1965 | /* The swap token gets in the way of swapout... */ | 1996 | /* The swap token gets in the way of swapout... */ |
1966 | if (!priority) | 1997 | if (!priority) |
@@ -2067,6 +2098,15 @@ loop_again: | |||
2067 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2098 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
2068 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2099 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2069 | sc.may_writepage = 1; | 2100 | sc.may_writepage = 1; |
2101 | |||
2102 | /* | ||
2103 | * We are still under min water mark. it mean we have | ||
2104 | * GFP_ATOMIC allocation failure risk. Hurry up! | ||
2105 | */ | ||
2106 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | ||
2107 | end_zone, 0)) | ||
2108 | has_under_min_watermark_zone = 1; | ||
2109 | |||
2070 | } | 2110 | } |
2071 | if (all_zones_ok) | 2111 | if (all_zones_ok) |
2072 | break; /* kswapd: all done */ | 2112 | break; /* kswapd: all done */ |
@@ -2074,8 +2114,12 @@ loop_again: | |||
2074 | * OK, kswapd is getting into trouble. Take a nap, then take | 2114 | * OK, kswapd is getting into trouble. Take a nap, then take |
2075 | * another pass across the zones. | 2115 | * another pass across the zones. |
2076 | */ | 2116 | */ |
2077 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2117 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
2078 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2118 | if (has_under_min_watermark_zone) |
2119 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2120 | else | ||
2121 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
2122 | } | ||
2079 | 2123 | ||
2080 | /* | 2124 | /* |
2081 | * We do this so kswapd doesn't build up large priorities for | 2125 | * We do this so kswapd doesn't build up large priorities for |
@@ -2173,6 +2217,7 @@ static int kswapd(void *p) | |||
2173 | order = 0; | 2217 | order = 0; |
2174 | for ( ; ; ) { | 2218 | for ( ; ; ) { |
2175 | unsigned long new_order; | 2219 | unsigned long new_order; |
2220 | int ret; | ||
2176 | 2221 | ||
2177 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2222 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2178 | new_order = pgdat->kswapd_max_order; | 2223 | new_order = pgdat->kswapd_max_order; |
@@ -2184,19 +2229,45 @@ static int kswapd(void *p) | |||
2184 | */ | 2229 | */ |
2185 | order = new_order; | 2230 | order = new_order; |
2186 | } else { | 2231 | } else { |
2187 | if (!freezing(current)) | 2232 | if (!freezing(current) && !kthread_should_stop()) { |
2188 | schedule(); | 2233 | long remaining = 0; |
2234 | |||
2235 | /* Try to sleep for a short interval */ | ||
2236 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2237 | remaining = schedule_timeout(HZ/10); | ||
2238 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2239 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2240 | } | ||
2241 | |||
2242 | /* | ||
2243 | * After a short sleep, check if it was a | ||
2244 | * premature sleep. If not, then go fully | ||
2245 | * to sleep until explicitly woken up | ||
2246 | */ | ||
2247 | if (!sleeping_prematurely(pgdat, order, remaining)) | ||
2248 | schedule(); | ||
2249 | else { | ||
2250 | if (remaining) | ||
2251 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2252 | else | ||
2253 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2254 | } | ||
2255 | } | ||
2189 | 2256 | ||
2190 | order = pgdat->kswapd_max_order; | 2257 | order = pgdat->kswapd_max_order; |
2191 | } | 2258 | } |
2192 | finish_wait(&pgdat->kswapd_wait, &wait); | 2259 | finish_wait(&pgdat->kswapd_wait, &wait); |
2193 | 2260 | ||
2194 | if (!try_to_freeze()) { | 2261 | ret = try_to_freeze(); |
2195 | /* We can speed up thawing tasks if we don't call | 2262 | if (kthread_should_stop()) |
2196 | * balance_pgdat after returning from the refrigerator | 2263 | break; |
2197 | */ | 2264 | |
2265 | /* | ||
2266 | * We can speed up thawing tasks if we don't call balance_pgdat | ||
2267 | * after returning from the refrigerator | ||
2268 | */ | ||
2269 | if (!ret) | ||
2198 | balance_pgdat(pgdat, order); | 2270 | balance_pgdat(pgdat, order); |
2199 | } | ||
2200 | } | 2271 | } |
2201 | return 0; | 2272 | return 0; |
2202 | } | 2273 | } |
@@ -2260,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
2260 | 2331 | ||
2261 | #ifdef CONFIG_HIBERNATION | 2332 | #ifdef CONFIG_HIBERNATION |
2262 | /* | 2333 | /* |
2263 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2334 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
2264 | * from LRU lists system-wide, for given pass and priority. | ||
2265 | * | ||
2266 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
2267 | */ | ||
2268 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
2269 | int pass, struct scan_control *sc) | ||
2270 | { | ||
2271 | struct zone *zone; | ||
2272 | unsigned long nr_reclaimed = 0; | ||
2273 | struct zone_reclaim_stat *reclaim_stat; | ||
2274 | |||
2275 | for_each_populated_zone(zone) { | ||
2276 | enum lru_list l; | ||
2277 | |||
2278 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
2279 | continue; | ||
2280 | |||
2281 | for_each_evictable_lru(l) { | ||
2282 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
2283 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
2284 | |||
2285 | /* For pass = 0, we don't shrink the active list */ | ||
2286 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
2287 | l == LRU_ACTIVE_FILE)) | ||
2288 | continue; | ||
2289 | |||
2290 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
2291 | reclaim_stat->nr_saved_scan[l] += | ||
2292 | (lru_pages >> prio) + 1; | ||
2293 | if (reclaim_stat->nr_saved_scan[l] | ||
2294 | >= nr_pages || pass > 3) { | ||
2295 | unsigned long nr_to_scan; | ||
2296 | |||
2297 | reclaim_stat->nr_saved_scan[l] = 0; | ||
2298 | nr_to_scan = min(nr_pages, lru_pages); | ||
2299 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
2300 | sc, prio); | ||
2301 | if (nr_reclaimed >= nr_pages) { | ||
2302 | sc->nr_reclaimed += nr_reclaimed; | ||
2303 | return; | ||
2304 | } | ||
2305 | } | ||
2306 | } | ||
2307 | } | ||
2308 | sc->nr_reclaimed += nr_reclaimed; | ||
2309 | } | ||
2310 | |||
2311 | /* | ||
2312 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
2313 | * freed pages. | 2335 | * freed pages. |
2314 | * | 2336 | * |
2315 | * Rather than trying to age LRUs the aim is to preserve the overall | 2337 | * Rather than trying to age LRUs the aim is to preserve the overall |
2316 | * LRU order by reclaiming preferentially | 2338 | * LRU order by reclaiming preferentially |
2317 | * inactive > active > active referenced > active mapped | 2339 | * inactive > active > active referenced > active mapped |
2318 | */ | 2340 | */ |
2319 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2341 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
2320 | { | 2342 | { |
2321 | unsigned long lru_pages, nr_slab; | ||
2322 | int pass; | ||
2323 | struct reclaim_state reclaim_state; | 2343 | struct reclaim_state reclaim_state; |
2324 | struct scan_control sc = { | 2344 | struct scan_control sc = { |
2325 | .gfp_mask = GFP_KERNEL, | 2345 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
2326 | .may_unmap = 0, | 2346 | .may_swap = 1, |
2347 | .may_unmap = 1, | ||
2327 | .may_writepage = 1, | 2348 | .may_writepage = 1, |
2349 | .nr_to_reclaim = nr_to_reclaim, | ||
2350 | .hibernation_mode = 1, | ||
2351 | .swappiness = vm_swappiness, | ||
2352 | .order = 0, | ||
2328 | .isolate_pages = isolate_pages_global, | 2353 | .isolate_pages = isolate_pages_global, |
2329 | .nr_reclaimed = 0, | ||
2330 | }; | 2354 | }; |
2355 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2356 | struct task_struct *p = current; | ||
2357 | unsigned long nr_reclaimed; | ||
2331 | 2358 | ||
2332 | current->reclaim_state = &reclaim_state; | 2359 | p->flags |= PF_MEMALLOC; |
2333 | 2360 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
2334 | lru_pages = global_reclaimable_pages(); | 2361 | reclaim_state.reclaimed_slab = 0; |
2335 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2362 | p->reclaim_state = &reclaim_state; |
2336 | /* If slab caches are huge, it's better to hit them first */ | ||
2337 | while (nr_slab >= lru_pages) { | ||
2338 | reclaim_state.reclaimed_slab = 0; | ||
2339 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
2340 | if (!reclaim_state.reclaimed_slab) | ||
2341 | break; | ||
2342 | |||
2343 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2344 | if (sc.nr_reclaimed >= nr_pages) | ||
2345 | goto out; | ||
2346 | |||
2347 | nr_slab -= reclaim_state.reclaimed_slab; | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * We try to shrink LRUs in 5 passes: | ||
2352 | * 0 = Reclaim from inactive_list only | ||
2353 | * 1 = Reclaim from active list but don't reclaim mapped | ||
2354 | * 2 = 2nd pass of type 1 | ||
2355 | * 3 = Reclaim mapped (normal reclaim) | ||
2356 | * 4 = 2nd pass of type 3 | ||
2357 | */ | ||
2358 | for (pass = 0; pass < 5; pass++) { | ||
2359 | int prio; | ||
2360 | |||
2361 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
2362 | if (pass > 2) | ||
2363 | sc.may_unmap = 1; | ||
2364 | |||
2365 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
2366 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
2367 | |||
2368 | sc.nr_scanned = 0; | ||
2369 | sc.swap_cluster_max = nr_to_scan; | ||
2370 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
2371 | if (sc.nr_reclaimed >= nr_pages) | ||
2372 | goto out; | ||
2373 | |||
2374 | reclaim_state.reclaimed_slab = 0; | ||
2375 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
2376 | global_reclaimable_pages()); | ||
2377 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2378 | if (sc.nr_reclaimed >= nr_pages) | ||
2379 | goto out; | ||
2380 | |||
2381 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
2382 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
2383 | } | ||
2384 | } | ||
2385 | |||
2386 | /* | ||
2387 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
2388 | * something in slab caches | ||
2389 | */ | ||
2390 | if (!sc.nr_reclaimed) { | ||
2391 | do { | ||
2392 | reclaim_state.reclaimed_slab = 0; | ||
2393 | shrink_slab(nr_pages, sc.gfp_mask, | ||
2394 | global_reclaimable_pages()); | ||
2395 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2396 | } while (sc.nr_reclaimed < nr_pages && | ||
2397 | reclaim_state.reclaimed_slab > 0); | ||
2398 | } | ||
2399 | 2363 | ||
2364 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
2400 | 2365 | ||
2401 | out: | 2366 | p->reclaim_state = NULL; |
2402 | current->reclaim_state = NULL; | 2367 | lockdep_clear_current_reclaim_state(); |
2368 | p->flags &= ~PF_MEMALLOC; | ||
2403 | 2369 | ||
2404 | return sc.nr_reclaimed; | 2370 | return nr_reclaimed; |
2405 | } | 2371 | } |
2406 | #endif /* CONFIG_HIBERNATION */ | 2372 | #endif /* CONFIG_HIBERNATION */ |
2407 | 2373 | ||
@@ -2451,6 +2417,17 @@ int kswapd_run(int nid) | |||
2451 | return ret; | 2417 | return ret; |
2452 | } | 2418 | } |
2453 | 2419 | ||
2420 | /* | ||
2421 | * Called by memory hotplug when all memory in a node is offlined. | ||
2422 | */ | ||
2423 | void kswapd_stop(int nid) | ||
2424 | { | ||
2425 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | ||
2426 | |||
2427 | if (kswapd) | ||
2428 | kthread_stop(kswapd); | ||
2429 | } | ||
2430 | |||
2454 | static int __init kswapd_init(void) | 2431 | static int __init kswapd_init(void) |
2455 | { | 2432 | { |
2456 | int nid; | 2433 | int nid; |
@@ -2553,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2553 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2530 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2554 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2531 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
2555 | .may_swap = 1, | 2532 | .may_swap = 1, |
2556 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2533 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
2557 | SWAP_CLUSTER_MAX), | 2534 | SWAP_CLUSTER_MAX), |
2558 | .gfp_mask = gfp_mask, | 2535 | .gfp_mask = gfp_mask, |
2559 | .swappiness = vm_swappiness, | 2536 | .swappiness = vm_swappiness, |
2560 | .order = order, | 2537 | .order = order, |
diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f9feec..6051fbab67ba 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = { | |||
683 | "slabs_scanned", | 683 | "slabs_scanned", |
684 | "kswapd_steal", | 684 | "kswapd_steal", |
685 | "kswapd_inodesteal", | 685 | "kswapd_inodesteal", |
686 | "kswapd_low_wmark_hit_quickly", | ||
687 | "kswapd_high_wmark_hit_quickly", | ||
688 | "kswapd_skip_congestion_wait", | ||
686 | "pageoutrun", | 689 | "pageoutrun", |
687 | "allocstall", | 690 | "allocstall", |
688 | 691 | ||
@@ -883,11 +886,10 @@ static void vmstat_update(struct work_struct *w) | |||
883 | 886 | ||
884 | static void __cpuinit start_cpu_timer(int cpu) | 887 | static void __cpuinit start_cpu_timer(int cpu) |
885 | { | 888 | { |
886 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | 889 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
887 | 890 | ||
888 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | 891 | INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); |
889 | schedule_delayed_work_on(cpu, vmstat_work, | 892 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
890 | __round_jiffies_relative(HZ, cpu)); | ||
891 | } | 893 | } |
892 | 894 | ||
893 | /* | 895 | /* |