Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 52
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/ashmem.c | 748
-rw-r--r-- | mm/backing-dev.c | 128
-rw-r--r-- | mm/bootmem.c | 5
-rw-r--r-- | mm/cma-best-fit.c | 408
-rw-r--r-- | mm/cma.c | 1413
-rw-r--r-- | mm/compaction.c | 55
-rw-r--r-- | mm/filemap.c | 44
-rw-r--r-- | mm/filemap_xip.c | 7
-rw-r--r-- | mm/huge_memory.c | 47
-rw-r--r-- | mm/hugetlb.c | 75
-rw-r--r-- | mm/internal.h | 46
-rw-r--r-- | mm/ksm.c | 6
-rw-r--r-- | mm/madvise.c | 16
-rw-r--r-- | mm/memcontrol.c | 67
-rw-r--r-- | mm/memory-failure.c | 6
-rw-r--r-- | mm/memory.c | 76
-rw-r--r-- | mm/memory_hotplug.c | 2
-rw-r--r-- | mm/mempolicy.c | 73
-rw-r--r-- | mm/migrate.c | 248
-rw-r--r-- | mm/mincore.c | 2
-rw-r--r-- | mm/mmu_notifier.c | 45
-rw-r--r-- | mm/nobootmem.c | 3
-rw-r--r-- | mm/nommu.c | 9
-rw-r--r-- | mm/oom_kill.c | 6
-rw-r--r-- | mm/page-writeback.c | 894
-rw-r--r-- | mm/page_alloc.c | 222
-rw-r--r-- | mm/pagewalk.c | 2
-rw-r--r-- | mm/percpu-vm.c | 12
-rw-r--r-- | mm/percpu.c | 50
-rw-r--r-- | mm/shmem.c | 516
-rw-r--r-- | mm/slab.c | 13
-rw-r--r-- | mm/slqb.c | 3816
-rw-r--r-- | mm/slub.c | 51
-rw-r--r-- | mm/sparse.c | 30
-rw-r--r-- | mm/swap.c | 85
-rw-r--r-- | mm/swap_state.c | 2
-rw-r--r-- | mm/swapfile.c | 4
-rw-r--r-- | mm/vmalloc.c | 93
-rw-r--r-- | mm/vmscan.c | 422
-rw-r--r-- | mm/vmstat.c | 3
42 files changed, 8873 insertions, 933 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 8ca47a5ee9c..3c2b6739c87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -370,3 +370,55 @@ config CLEANCACHE | |||
370 | in a negligible performance hit. | 370 | in a negligible performance hit. |
371 | 371 | ||
372 | If unsure, say Y to enable cleancache | 372 | If unsure, say Y to enable cleancache |
373 | |||
374 | config CMA | ||
375 | bool "Contiguous Memory Allocator framework" | ||
376 | # Currently there is only one allocator so force it on | ||
377 | select CMA_BEST_FIT | ||
378 | help | ||
379 | This enables the Contiguous Memory Allocator framework which | ||
380 | allows drivers to allocate big physically-contiguous blocks of | ||
381 | memory for use with hardware components that support neither | ||
382 | I/O mapping nor scatter-gather. | ||
383 | |||
384 | If you select this option you will also have to select at least | ||
385 | one allocator algorithm below. | ||
386 | |||
387 | To make use of CMA you need to specify the regions and | ||
388 | driver->region mapping on the command line when booting the kernel. | ||
389 | |||
390 | config CMA_DEVELOPEMENT | ||
391 | bool "Include CMA development features" | ||
392 | depends on CMA | ||
393 | help | ||
394 | This lets you enable some development features of the CMA | ||
395 | framework. | ||
396 | |||
397 | config CMA_DEBUG | ||
398 | bool "CMA debug messages" | ||
399 | depends on CMA_DEVELOPEMENT | ||
400 | help | ||
401 | Enable debug messages in CMA code. | ||
402 | |||
403 | config CMA_SYSFS | ||
404 | bool "CMA SysFS interface support" | ||
405 | depends on CMA_DEVELOPEMENT | ||
406 | help | ||
407 | Enable support for SysFS interface. | ||
408 | |||
409 | config CMA_CMDLINE | ||
410 | bool "CMA command line parameters support" | ||
411 | depends on CMA_DEVELOPEMENT | ||
412 | help | ||
413 | Enable support for cma, cma.map and cma.asterisk command line | ||
414 | parameters. | ||
415 | |||
416 | config CMA_BEST_FIT | ||
417 | bool "CMA best-fit allocator" | ||
418 | depends on CMA | ||
419 | help | ||
420 | This is a best-fit algorithm running in O(n log n) time where | ||
421 | n is the number of existing holes (which is never greater than | ||
422 | the number of allocated regions and usually much smaller). It | ||
423 | allocates from the smallest hole that is big enough for the | ||
424 | allocation in question. | ||
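A note on the CMA options above: the help text says the regions and the driver->region mapping are passed on the kernel command line, and CMA_CMDLINE names the cma and cma.map parameters. Their exact grammar is defined in the Documentation/contiguous-memory.txt file that mm/cma.c refers to, which is not part of this section; the boot line below is only a hypothetical illustration of the idea (define named regions, then map drivers onto them), and its syntax should be treated as an assumption rather than the framework's actual grammar.

    cma=r1=64M;r2=16M cma.map=video=r1;camera=r2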
diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1b..f846ad087a1 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
30 | obj-$(CONFIG_NUMA) += mempolicy.o | 30 | obj-$(CONFIG_NUMA) += mempolicy.o |
31 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 31 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
32 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 32 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
33 | obj-$(CONFIG_ASHMEM) += ashmem.o | ||
33 | obj-$(CONFIG_SLOB) += slob.o | 34 | obj-$(CONFIG_SLOB) += slob.o |
34 | obj-$(CONFIG_COMPACTION) += compaction.o | 35 | obj-$(CONFIG_COMPACTION) += compaction.o |
35 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 36 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
@@ -37,6 +38,7 @@ obj-$(CONFIG_KSM) += ksm.o | |||
37 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 38 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
38 | obj-$(CONFIG_SLAB) += slab.o | 39 | obj-$(CONFIG_SLAB) += slab.o |
39 | obj-$(CONFIG_SLUB) += slub.o | 40 | obj-$(CONFIG_SLUB) += slub.o |
41 | obj-$(CONFIG_SLQB) += slqb.o | ||
40 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | 42 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o |
41 | obj-$(CONFIG_FAILSLAB) += failslab.o | 43 | obj-$(CONFIG_FAILSLAB) += failslab.o |
42 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 44 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
@@ -50,3 +52,5 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | |||
50 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 52 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
51 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 53 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
52 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 54 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
55 | obj-$(CONFIG_CMA) += cma.o | ||
56 | obj-$(CONFIG_CMA_BEST_FIT) += cma-best-fit.o | ||
diff --git a/mm/ashmem.c b/mm/ashmem.c new file mode 100644 index 00000000000..66e3f23ee33 --- /dev/null +++ b/mm/ashmem.c | |||
@@ -0,0 +1,748 @@ | |||
1 | /* mm/ashmem.c | ||
2 | ** | ||
3 | ** Anonymous Shared Memory Subsystem, ashmem | ||
4 | ** | ||
5 | ** Copyright (C) 2008 Google, Inc. | ||
6 | ** | ||
7 | ** Robert Love <rlove@google.com> | ||
8 | ** | ||
9 | ** This software is licensed under the terms of the GNU General Public | ||
10 | ** License version 2, as published by the Free Software Foundation, and | ||
11 | ** may be copied, distributed, and modified under those terms. | ||
12 | ** | ||
13 | ** This program is distributed in the hope that it will be useful, | ||
14 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | ** GNU General Public License for more details. | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/file.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/miscdevice.h> | ||
23 | #include <linux/security.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/mman.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/personality.h> | ||
28 | #include <linux/bitops.h> | ||
29 | #include <linux/mutex.h> | ||
30 | #include <linux/shmem_fs.h> | ||
31 | #include <linux/ashmem.h> | ||
32 | |||
33 | #define ASHMEM_NAME_PREFIX "dev/ashmem/" | ||
34 | #define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1) | ||
35 | #define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN) | ||
36 | |||
37 | /* | ||
38 | * ashmem_area - anonymous shared memory area | ||
39 | * Lifecycle: From our parent file's open() until its release() | ||
40 | * Locking: Protected by `ashmem_mutex' | ||
41 | * Big Note: Mappings do NOT pin this structure; it dies on close() | ||
42 | */ | ||
43 | struct ashmem_area { | ||
44 | char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */ | ||
45 | struct list_head unpinned_list; /* list of all ashmem areas */ | ||
46 | struct file *file; /* the shmem-based backing file */ | ||
47 | size_t size; /* size of the mapping, in bytes */ | ||
48 | unsigned long prot_mask; /* allowed prot bits, as vm_flags */ | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * ashmem_range - represents an interval of unpinned (evictable) pages | ||
53 | * Lifecycle: From unpin to pin | ||
54 | * Locking: Protected by `ashmem_mutex' | ||
55 | */ | ||
56 | struct ashmem_range { | ||
57 | struct list_head lru; /* entry in LRU list */ | ||
58 | struct list_head unpinned; /* entry in its area's unpinned list */ | ||
59 | struct ashmem_area *asma; /* associated area */ | ||
60 | size_t pgstart; /* starting page, inclusive */ | ||
61 | size_t pgend; /* ending page, inclusive */ | ||
62 | unsigned int purged; /* ASHMEM_NOT or ASHMEM_WAS_PURGED */ | ||
63 | }; | ||
64 | |||
65 | /* LRU list of unpinned pages, protected by ashmem_mutex */ | ||
66 | static LIST_HEAD(ashmem_lru_list); | ||
67 | |||
68 | /* Count of pages on our LRU list, protected by ashmem_mutex */ | ||
69 | static unsigned long lru_count; | ||
70 | |||
71 | /* | ||
72 | * ashmem_mutex - protects the list of and each individual ashmem_area | ||
73 | * | ||
74 | * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem | ||
75 | */ | ||
76 | static DEFINE_MUTEX(ashmem_mutex); | ||
77 | |||
78 | static struct kmem_cache *ashmem_area_cachep __read_mostly; | ||
79 | static struct kmem_cache *ashmem_range_cachep __read_mostly; | ||
80 | |||
81 | #define range_size(range) \ | ||
82 | ((range)->pgend - (range)->pgstart + 1) | ||
83 | |||
84 | #define range_on_lru(range) \ | ||
85 | ((range)->purged == ASHMEM_NOT_PURGED) | ||
86 | |||
87 | #define page_range_subsumes_range(range, start, end) \ | ||
88 | (((range)->pgstart >= (start)) && ((range)->pgend <= (end))) | ||
89 | |||
90 | #define page_range_subsumed_by_range(range, start, end) \ | ||
91 | (((range)->pgstart <= (start)) && ((range)->pgend >= (end))) | ||
92 | |||
93 | #define page_in_range(range, page) \ | ||
94 | (((range)->pgstart <= (page)) && ((range)->pgend >= (page))) | ||
95 | |||
96 | #define page_range_in_range(range, start, end) \ | ||
97 | (page_in_range(range, start) || page_in_range(range, end) || \ | ||
98 | page_range_subsumes_range(range, start, end)) | ||
99 | |||
100 | #define range_before_page(range, page) \ | ||
101 | ((range)->pgend < (page)) | ||
102 | |||
103 | #define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE) | ||
104 | |||
105 | static inline void lru_add(struct ashmem_range *range) | ||
106 | { | ||
107 | list_add_tail(&range->lru, &ashmem_lru_list); | ||
108 | lru_count += range_size(range); | ||
109 | } | ||
110 | |||
111 | static inline void lru_del(struct ashmem_range *range) | ||
112 | { | ||
113 | list_del(&range->lru); | ||
114 | lru_count -= range_size(range); | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * range_alloc - allocate and initialize a new ashmem_range structure | ||
119 | * | ||
120 | * 'asma' - associated ashmem_area | ||
121 | * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list | ||
122 | * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED) | ||
123 | * 'start' - starting page, inclusive | ||
124 | * 'end' - ending page, inclusive | ||
125 | * | ||
126 | * Caller must hold ashmem_mutex. | ||
127 | */ | ||
128 | static int range_alloc(struct ashmem_area *asma, | ||
129 | struct ashmem_range *prev_range, unsigned int purged, | ||
130 | size_t start, size_t end) | ||
131 | { | ||
132 | struct ashmem_range *range; | ||
133 | |||
134 | range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL); | ||
135 | if (unlikely(!range)) | ||
136 | return -ENOMEM; | ||
137 | |||
138 | range->asma = asma; | ||
139 | range->pgstart = start; | ||
140 | range->pgend = end; | ||
141 | range->purged = purged; | ||
142 | |||
143 | list_add_tail(&range->unpinned, &prev_range->unpinned); | ||
144 | |||
145 | if (range_on_lru(range)) | ||
146 | lru_add(range); | ||
147 | |||
148 | return 0; | ||
149 | } | ||
150 | |||
151 | static void range_del(struct ashmem_range *range) | ||
152 | { | ||
153 | list_del(&range->unpinned); | ||
154 | if (range_on_lru(range)) | ||
155 | lru_del(range); | ||
156 | kmem_cache_free(ashmem_range_cachep, range); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * range_shrink - shrinks a range | ||
161 | * | ||
162 | * Caller must hold ashmem_mutex. | ||
163 | */ | ||
164 | static inline void range_shrink(struct ashmem_range *range, | ||
165 | size_t start, size_t end) | ||
166 | { | ||
167 | size_t pre = range_size(range); | ||
168 | |||
169 | range->pgstart = start; | ||
170 | range->pgend = end; | ||
171 | |||
172 | if (range_on_lru(range)) | ||
173 | lru_count -= pre - range_size(range); | ||
174 | } | ||
175 | |||
176 | static int ashmem_open(struct inode *inode, struct file *file) | ||
177 | { | ||
178 | struct ashmem_area *asma; | ||
179 | int ret; | ||
180 | |||
181 | ret = generic_file_open(inode, file); | ||
182 | if (unlikely(ret)) | ||
183 | return ret; | ||
184 | |||
185 | asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL); | ||
186 | if (unlikely(!asma)) | ||
187 | return -ENOMEM; | ||
188 | |||
189 | INIT_LIST_HEAD(&asma->unpinned_list); | ||
190 | memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN); | ||
191 | asma->prot_mask = PROT_MASK; | ||
192 | file->private_data = asma; | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int ashmem_release(struct inode *ignored, struct file *file) | ||
198 | { | ||
199 | struct ashmem_area *asma = file->private_data; | ||
200 | struct ashmem_range *range, *next; | ||
201 | |||
202 | mutex_lock(&ashmem_mutex); | ||
203 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) | ||
204 | range_del(range); | ||
205 | mutex_unlock(&ashmem_mutex); | ||
206 | |||
207 | if (asma->file) | ||
208 | fput(asma->file); | ||
209 | kmem_cache_free(ashmem_area_cachep, asma); | ||
210 | |||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static ssize_t ashmem_read(struct file *file, char __user *buf, | ||
215 | size_t len, loff_t *pos) | ||
216 | { | ||
217 | struct ashmem_area *asma = file->private_data; | ||
218 | int ret = 0; | ||
219 | |||
220 | mutex_lock(&ashmem_mutex); | ||
221 | |||
222 | /* If size is not set, or set to 0, always return EOF. */ | ||
223 | if (asma->size == 0) { | ||
224 | goto out; | ||
225 | } | ||
226 | |||
227 | if (!asma->file) { | ||
228 | ret = -EBADF; | ||
229 | goto out; | ||
230 | } | ||
231 | |||
232 | ret = asma->file->f_op->read(asma->file, buf, len, pos); | ||
233 | if (ret < 0) { | ||
234 | goto out; | ||
235 | } | ||
236 | |||
237 | /** Update backing file pos, since f_ops->read() doesn't */ | ||
238 | asma->file->f_pos = *pos; | ||
239 | |||
240 | out: | ||
241 | mutex_unlock(&ashmem_mutex); | ||
242 | return ret; | ||
243 | } | ||
244 | |||
245 | static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin) | ||
246 | { | ||
247 | struct ashmem_area *asma = file->private_data; | ||
248 | int ret; | ||
249 | |||
250 | mutex_lock(&ashmem_mutex); | ||
251 | |||
252 | if (asma->size == 0) { | ||
253 | ret = -EINVAL; | ||
254 | goto out; | ||
255 | } | ||
256 | |||
257 | if (!asma->file) { | ||
258 | ret = -EBADF; | ||
259 | goto out; | ||
260 | } | ||
261 | |||
262 | ret = asma->file->f_op->llseek(asma->file, offset, origin); | ||
263 | if (ret < 0) { | ||
264 | goto out; | ||
265 | } | ||
266 | |||
267 | /** Copy f_pos from backing file, since f_ops->llseek() sets it */ | ||
268 | file->f_pos = asma->file->f_pos; | ||
269 | |||
270 | out: | ||
271 | mutex_unlock(&ashmem_mutex); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | static inline unsigned long | ||
276 | calc_vm_may_flags(unsigned long prot) | ||
277 | { | ||
278 | return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) | | ||
279 | _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) | | ||
280 | _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC); | ||
281 | } | ||
282 | |||
283 | static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
284 | { | ||
285 | struct ashmem_area *asma = file->private_data; | ||
286 | int ret = 0; | ||
287 | |||
288 | mutex_lock(&ashmem_mutex); | ||
289 | |||
290 | /* user needs to SET_SIZE before mapping */ | ||
291 | if (unlikely(!asma->size)) { | ||
292 | ret = -EINVAL; | ||
293 | goto out; | ||
294 | } | ||
295 | |||
296 | /* requested protection bits must match our allowed protection mask */ | ||
297 | if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) & | ||
298 | calc_vm_prot_bits(PROT_MASK))) { | ||
299 | ret = -EPERM; | ||
300 | goto out; | ||
301 | } | ||
302 | vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask); | ||
303 | |||
304 | if (!asma->file) { | ||
305 | char *name = ASHMEM_NAME_DEF; | ||
306 | struct file *vmfile; | ||
307 | |||
308 | if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') | ||
309 | name = asma->name; | ||
310 | |||
311 | /* ... and allocate the backing shmem file */ | ||
312 | vmfile = shmem_file_setup(name, asma->size, vma->vm_flags); | ||
313 | if (unlikely(IS_ERR(vmfile))) { | ||
314 | ret = PTR_ERR(vmfile); | ||
315 | goto out; | ||
316 | } | ||
317 | asma->file = vmfile; | ||
318 | } | ||
319 | get_file(asma->file); | ||
320 | |||
321 | if (vma->vm_flags & VM_SHARED) | ||
322 | shmem_set_file(vma, asma->file); | ||
323 | else { | ||
324 | if (vma->vm_file) | ||
325 | fput(vma->vm_file); | ||
326 | vma->vm_file = asma->file; | ||
327 | } | ||
328 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
329 | |||
330 | out: | ||
331 | mutex_unlock(&ashmem_mutex); | ||
332 | return ret; | ||
333 | } | ||
334 | |||
335 | /* | ||
336 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab | ||
337 | * | ||
338 | * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how | ||
339 | * many objects (pages) we have in total. | ||
340 | * | ||
341 | * 'gfp_mask' is the mask of the allocation that got us into this mess. | ||
342 | * | ||
343 | * Return value is the number of objects (pages) remaining, or -1 if we cannot | ||
344 | * proceed without risk of deadlock (due to gfp_mask). | ||
345 | * | ||
346 | * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial | ||
347 | * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan' | ||
348 | * pages freed. | ||
349 | */ | ||
350 | static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc) | ||
351 | { | ||
352 | struct ashmem_range *range, *next; | ||
353 | |||
354 | /* We might recurse into filesystem code, so bail out if necessary */ | ||
355 | if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) | ||
356 | return -1; | ||
357 | if (!sc->nr_to_scan) | ||
358 | return lru_count; | ||
359 | |||
360 | mutex_lock(&ashmem_mutex); | ||
361 | list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) { | ||
362 | struct inode *inode = range->asma->file->f_dentry->d_inode; | ||
363 | loff_t start = range->pgstart * PAGE_SIZE; | ||
364 | loff_t end = (range->pgend + 1) * PAGE_SIZE - 1; | ||
365 | |||
366 | vmtruncate_range(inode, start, end); | ||
367 | range->purged = ASHMEM_WAS_PURGED; | ||
368 | lru_del(range); | ||
369 | |||
370 | sc->nr_to_scan -= range_size(range); | ||
371 | if (sc->nr_to_scan <= 0) | ||
372 | break; | ||
373 | } | ||
374 | mutex_unlock(&ashmem_mutex); | ||
375 | |||
376 | return lru_count; | ||
377 | } | ||
378 | |||
379 | static struct shrinker ashmem_shrinker = { | ||
380 | .shrink = ashmem_shrink, | ||
381 | .seeks = DEFAULT_SEEKS * 4, | ||
382 | }; | ||
383 | |||
384 | static int set_prot_mask(struct ashmem_area *asma, unsigned long prot) | ||
385 | { | ||
386 | int ret = 0; | ||
387 | |||
388 | mutex_lock(&ashmem_mutex); | ||
389 | |||
390 | /* the user can only remove, not add, protection bits */ | ||
391 | if (unlikely((asma->prot_mask & prot) != prot)) { | ||
392 | ret = -EINVAL; | ||
393 | goto out; | ||
394 | } | ||
395 | |||
396 | /* does the application expect PROT_READ to imply PROT_EXEC? */ | ||
397 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | ||
398 | prot |= PROT_EXEC; | ||
399 | |||
400 | asma->prot_mask = prot; | ||
401 | |||
402 | out: | ||
403 | mutex_unlock(&ashmem_mutex); | ||
404 | return ret; | ||
405 | } | ||
406 | |||
407 | static int set_name(struct ashmem_area *asma, void __user *name) | ||
408 | { | ||
409 | int ret = 0; | ||
410 | |||
411 | mutex_lock(&ashmem_mutex); | ||
412 | |||
413 | /* cannot change an existing mapping's name */ | ||
414 | if (unlikely(asma->file)) { | ||
415 | ret = -EINVAL; | ||
416 | goto out; | ||
417 | } | ||
418 | |||
419 | if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN, | ||
420 | name, ASHMEM_NAME_LEN))) | ||
421 | ret = -EFAULT; | ||
422 | asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0'; | ||
423 | |||
424 | out: | ||
425 | mutex_unlock(&ashmem_mutex); | ||
426 | |||
427 | return ret; | ||
428 | } | ||
429 | |||
430 | static int get_name(struct ashmem_area *asma, void __user *name) | ||
431 | { | ||
432 | int ret = 0; | ||
433 | |||
434 | mutex_lock(&ashmem_mutex); | ||
435 | if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') { | ||
436 | size_t len; | ||
437 | |||
438 | /* | ||
439 | * Copying only `len', instead of ASHMEM_NAME_LEN, bytes | ||
440 | * prevents us from revealing one user's stack to another. | ||
441 | */ | ||
442 | len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1; | ||
443 | if (unlikely(copy_to_user(name, | ||
444 | asma->name + ASHMEM_NAME_PREFIX_LEN, len))) | ||
445 | ret = -EFAULT; | ||
446 | } else { | ||
447 | if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF, | ||
448 | sizeof(ASHMEM_NAME_DEF)))) | ||
449 | ret = -EFAULT; | ||
450 | } | ||
451 | mutex_unlock(&ashmem_mutex); | ||
452 | |||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * ashmem_pin - pin the given ashmem region, returning whether it was | ||
458 | * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED). | ||
459 | * | ||
460 | * Caller must hold ashmem_mutex. | ||
461 | */ | ||
462 | static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend) | ||
463 | { | ||
464 | struct ashmem_range *range, *next; | ||
465 | int ret = ASHMEM_NOT_PURGED; | ||
466 | |||
467 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) { | ||
468 | /* moved past last applicable page; we can short circuit */ | ||
469 | if (range_before_page(range, pgstart)) | ||
470 | break; | ||
471 | |||
472 | /* | ||
473 | * The user can ask us to pin pages that span multiple ranges, | ||
474 | * or to pin pages that aren't even unpinned, so this is messy. | ||
475 | * | ||
476 | * Four cases: | ||
477 | * 1. The requested range subsumes an existing range, so we | ||
478 | * just remove the entire matching range. | ||
479 | * 2. The requested range overlaps the start of an existing | ||
480 | * range, so we just update that range. | ||
481 | * 3. The requested range overlaps the end of an existing | ||
482 | * range, so we just update that range. | ||
483 | * 4. The requested range punches a hole in an existing range, | ||
484 | * so we have to update one side of the range and then | ||
485 | * create a new range for the other side. | ||
486 | */ | ||
487 | if (page_range_in_range(range, pgstart, pgend)) { | ||
488 | ret |= range->purged; | ||
489 | |||
490 | /* Case #1: Easy. Just nuke the whole thing. */ | ||
491 | if (page_range_subsumes_range(range, pgstart, pgend)) { | ||
492 | range_del(range); | ||
493 | continue; | ||
494 | } | ||
495 | |||
496 | /* Case #2: We overlap from the start, so adjust it */ | ||
497 | if (range->pgstart >= pgstart) { | ||
498 | range_shrink(range, pgend + 1, range->pgend); | ||
499 | continue; | ||
500 | } | ||
501 | |||
502 | /* Case #3: We overlap from the rear, so adjust it */ | ||
503 | if (range->pgend <= pgend) { | ||
504 | range_shrink(range, range->pgstart, pgstart-1); | ||
505 | continue; | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * Case #4: We eat a chunk out of the middle. A bit | ||
510 | * more complicated, we allocate a new range for the | ||
511 | * second half and adjust the first chunk's endpoint. | ||
512 | */ | ||
513 | range_alloc(asma, range, range->purged, | ||
514 | pgend + 1, range->pgend); | ||
515 | range_shrink(range, range->pgstart, pgstart - 1); | ||
516 | break; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | return ret; | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * ashmem_unpin - unpin the given range of pages. Returns zero on success. | ||
525 | * | ||
526 | * Caller must hold ashmem_mutex. | ||
527 | */ | ||
528 | static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend) | ||
529 | { | ||
530 | struct ashmem_range *range, *next; | ||
531 | unsigned int purged = ASHMEM_NOT_PURGED; | ||
532 | |||
533 | restart: | ||
534 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) { | ||
535 | /* short circuit: this is our insertion point */ | ||
536 | if (range_before_page(range, pgstart)) | ||
537 | break; | ||
538 | |||
539 | /* | ||
540 | * The user can ask us to unpin pages that are already entirely | ||
541 | * or partially pinned. We handle those two cases here. | ||
542 | */ | ||
543 | if (page_range_subsumed_by_range(range, pgstart, pgend)) | ||
544 | return 0; | ||
545 | if (page_range_in_range(range, pgstart, pgend)) { | ||
546 | pgstart = min_t(size_t, range->pgstart, pgstart), | ||
547 | pgend = max_t(size_t, range->pgend, pgend); | ||
548 | purged |= range->purged; | ||
549 | range_del(range); | ||
550 | goto restart; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | return range_alloc(asma, range, purged, pgstart, pgend); | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the | ||
559 | * given interval are unpinned and ASHMEM_IS_PINNED otherwise. | ||
560 | * | ||
561 | * Caller must hold ashmem_mutex. | ||
562 | */ | ||
563 | static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart, | ||
564 | size_t pgend) | ||
565 | { | ||
566 | struct ashmem_range *range; | ||
567 | int ret = ASHMEM_IS_PINNED; | ||
568 | |||
569 | list_for_each_entry(range, &asma->unpinned_list, unpinned) { | ||
570 | if (range_before_page(range, pgstart)) | ||
571 | break; | ||
572 | if (page_range_in_range(range, pgstart, pgend)) { | ||
573 | ret = ASHMEM_IS_UNPINNED; | ||
574 | break; | ||
575 | } | ||
576 | } | ||
577 | |||
578 | return ret; | ||
579 | } | ||
580 | |||
581 | static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd, | ||
582 | void __user *p) | ||
583 | { | ||
584 | struct ashmem_pin pin; | ||
585 | size_t pgstart, pgend; | ||
586 | int ret = -EINVAL; | ||
587 | |||
588 | if (unlikely(!asma->file)) | ||
589 | return -EINVAL; | ||
590 | |||
591 | if (unlikely(copy_from_user(&pin, p, sizeof(pin)))) | ||
592 | return -EFAULT; | ||
593 | |||
594 | /* per custom, you can pass zero for len to mean "everything onward" */ | ||
595 | if (!pin.len) | ||
596 | pin.len = PAGE_ALIGN(asma->size) - pin.offset; | ||
597 | |||
598 | if (unlikely((pin.offset | pin.len) & ~PAGE_MASK)) | ||
599 | return -EINVAL; | ||
600 | |||
601 | if (unlikely(((__u32) -1) - pin.offset < pin.len)) | ||
602 | return -EINVAL; | ||
603 | |||
604 | if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len)) | ||
605 | return -EINVAL; | ||
606 | |||
607 | pgstart = pin.offset / PAGE_SIZE; | ||
608 | pgend = pgstart + (pin.len / PAGE_SIZE) - 1; | ||
609 | |||
610 | mutex_lock(&ashmem_mutex); | ||
611 | |||
612 | switch (cmd) { | ||
613 | case ASHMEM_PIN: | ||
614 | ret = ashmem_pin(asma, pgstart, pgend); | ||
615 | break; | ||
616 | case ASHMEM_UNPIN: | ||
617 | ret = ashmem_unpin(asma, pgstart, pgend); | ||
618 | break; | ||
619 | case ASHMEM_GET_PIN_STATUS: | ||
620 | ret = ashmem_get_pin_status(asma, pgstart, pgend); | ||
621 | break; | ||
622 | } | ||
623 | |||
624 | mutex_unlock(&ashmem_mutex); | ||
625 | |||
626 | return ret; | ||
627 | } | ||
628 | |||
629 | static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
630 | { | ||
631 | struct ashmem_area *asma = file->private_data; | ||
632 | long ret = -ENOTTY; | ||
633 | |||
634 | switch (cmd) { | ||
635 | case ASHMEM_SET_NAME: | ||
636 | ret = set_name(asma, (void __user *) arg); | ||
637 | break; | ||
638 | case ASHMEM_GET_NAME: | ||
639 | ret = get_name(asma, (void __user *) arg); | ||
640 | break; | ||
641 | case ASHMEM_SET_SIZE: | ||
642 | ret = -EINVAL; | ||
643 | if (!asma->file) { | ||
644 | ret = 0; | ||
645 | asma->size = (size_t) arg; | ||
646 | } | ||
647 | break; | ||
648 | case ASHMEM_GET_SIZE: | ||
649 | ret = asma->size; | ||
650 | break; | ||
651 | case ASHMEM_SET_PROT_MASK: | ||
652 | ret = set_prot_mask(asma, arg); | ||
653 | break; | ||
654 | case ASHMEM_GET_PROT_MASK: | ||
655 | ret = asma->prot_mask; | ||
656 | break; | ||
657 | case ASHMEM_PIN: | ||
658 | case ASHMEM_UNPIN: | ||
659 | case ASHMEM_GET_PIN_STATUS: | ||
660 | ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg); | ||
661 | break; | ||
662 | case ASHMEM_PURGE_ALL_CACHES: | ||
663 | ret = -EPERM; | ||
664 | if (capable(CAP_SYS_ADMIN)) { | ||
665 | struct shrink_control sc = { | ||
666 | .gfp_mask = GFP_KERNEL, | ||
667 | .nr_to_scan = 0, | ||
668 | }; | ||
669 | ret = ashmem_shrink(&ashmem_shrinker, &sc); | ||
670 | sc.nr_to_scan = ret; | ||
671 | ashmem_shrink(&ashmem_shrinker, &sc); | ||
672 | } | ||
673 | break; | ||
674 | } | ||
675 | |||
676 | return ret; | ||
677 | } | ||
678 | |||
679 | static struct file_operations ashmem_fops = { | ||
680 | .owner = THIS_MODULE, | ||
681 | .open = ashmem_open, | ||
682 | .release = ashmem_release, | ||
683 | .read = ashmem_read, | ||
684 | .llseek = ashmem_llseek, | ||
685 | .mmap = ashmem_mmap, | ||
686 | .unlocked_ioctl = ashmem_ioctl, | ||
687 | .compat_ioctl = ashmem_ioctl, | ||
688 | }; | ||
689 | |||
690 | static struct miscdevice ashmem_misc = { | ||
691 | .minor = MISC_DYNAMIC_MINOR, | ||
692 | .name = "ashmem", | ||
693 | .fops = &ashmem_fops, | ||
694 | }; | ||
695 | |||
696 | static int __init ashmem_init(void) | ||
697 | { | ||
698 | int ret; | ||
699 | |||
700 | ashmem_area_cachep = kmem_cache_create("ashmem_area_cache", | ||
701 | sizeof(struct ashmem_area), | ||
702 | 0, 0, NULL); | ||
703 | if (unlikely(!ashmem_area_cachep)) { | ||
704 | printk(KERN_ERR "ashmem: failed to create slab cache\n"); | ||
705 | return -ENOMEM; | ||
706 | } | ||
707 | |||
708 | ashmem_range_cachep = kmem_cache_create("ashmem_range_cache", | ||
709 | sizeof(struct ashmem_range), | ||
710 | 0, 0, NULL); | ||
711 | if (unlikely(!ashmem_range_cachep)) { | ||
712 | printk(KERN_ERR "ashmem: failed to create slab cache\n"); | ||
713 | return -ENOMEM; | ||
714 | } | ||
715 | |||
716 | ret = misc_register(&ashmem_misc); | ||
717 | if (unlikely(ret)) { | ||
718 | printk(KERN_ERR "ashmem: failed to register misc device!\n"); | ||
719 | return ret; | ||
720 | } | ||
721 | |||
722 | register_shrinker(&ashmem_shrinker); | ||
723 | |||
724 | printk(KERN_INFO "ashmem: initialized\n"); | ||
725 | |||
726 | return 0; | ||
727 | } | ||
728 | |||
729 | static void __exit ashmem_exit(void) | ||
730 | { | ||
731 | int ret; | ||
732 | |||
733 | unregister_shrinker(&ashmem_shrinker); | ||
734 | |||
735 | ret = misc_deregister(&ashmem_misc); | ||
736 | if (unlikely(ret)) | ||
737 | printk(KERN_ERR "ashmem: failed to unregister misc device!\n"); | ||
738 | |||
739 | kmem_cache_destroy(ashmem_range_cachep); | ||
740 | kmem_cache_destroy(ashmem_area_cachep); | ||
741 | |||
742 | printk(KERN_INFO "ashmem: unloaded\n"); | ||
743 | } | ||
744 | |||
745 | module_init(ashmem_init); | ||
746 | module_exit(ashmem_exit); | ||
747 | |||
748 | MODULE_LICENSE("GPL"); | ||
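A note on mm/ashmem.c above: the file is driven from userspace through the /dev/ashmem misc device and the ioctl interface declared in <linux/ashmem.h>, which the file includes but which is not shown in this diff. The sketch below illustrates, under that assumption, how the mmap, unpin, pin and shrinker paths fit together; it is an illustration rather than code from the patch, and it assumes 4 KiB pages.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/ashmem.h>

int main(void)
{
	char name[ASHMEM_NAME_LEN] = "example-region";
	size_t size = 4 * 4096;	/* four pages, assuming PAGE_SIZE == 4096 */
	struct ashmem_pin pin = { .offset = 0, .len = 4096 };
	char *p;
	int fd = open("/dev/ashmem", O_RDWR);

	if (fd < 0)
		return 1;

	/* Name and size must be set before the first mmap(): ashmem_mmap()
	 * rejects a zero size, and set_name() rejects an already mapped area. */
	ioctl(fd, ASHMEM_SET_NAME, name);
	ioctl(fd, ASHMEM_SET_SIZE, size);

	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xab, size);

	/* Unpinned, page-aligned ranges become reclaimable via ashmem_shrink(). */
	ioctl(fd, ASHMEM_UNPIN, &pin);

	/* Re-pinning reports whether the range was purged while unpinned. */
	if (ioctl(fd, ASHMEM_PIN, &pin) == ASHMEM_WAS_PURGED)
		printf("first page was purged; its contents are gone\n");

	munmap(p, size);
	close(fd);
	return 0;
}

Unpinned ranges stay mapped and usable, but the shrinker may discard their backing pages under memory pressure; ASHMEM_PIN's return value tells the caller whether that happened.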
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09..cb9f1c2d01a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
@@ -67,34 +78,44 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiDirtied: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWritten: %10lu kB\n" |
91 | "b_more_io: %8lu\n" | 102 | "BdiWriteBandwidth: %10lu kBps\n" |
92 | "bdi_list: %8u\n" | 103 | "b_dirty: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_io: %10lu\n" |
105 | "b_more_io: %10lu\n" | ||
106 | "bdi_list: %10u\n" | ||
107 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 109 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 110 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 111 | K(dirty_thresh), |
112 | K(background_thresh), | ||
113 | (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), | ||
114 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
115 | (unsigned long) K(bdi->write_bandwidth), | ||
116 | nr_dirty, | ||
117 | nr_io, | ||
118 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 119 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 120 | #undef K |
100 | 121 | ||
@@ -249,18 +270,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
249 | return wb_has_dirty_io(&bdi->wb); | 270 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 271 | } |
251 | 272 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 273 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 274 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 275 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
@@ -352,6 +361,17 @@ static unsigned long bdi_longest_inactive(void) | |||
352 | return max(5UL * 60 * HZ, interval); | 361 | return max(5UL * 60 * HZ, interval); |
353 | } | 362 | } |
354 | 363 | ||
364 | /* | ||
365 | * Clear pending bit and wakeup anybody waiting for flusher thread creation or | ||
366 | * shutdown | ||
367 | */ | ||
368 | static void bdi_clear_pending(struct backing_dev_info *bdi) | ||
369 | { | ||
370 | clear_bit(BDI_pending, &bdi->state); | ||
371 | smp_mb__after_clear_bit(); | ||
372 | wake_up_bit(&bdi->state, BDI_pending); | ||
373 | } | ||
374 | |||
355 | static int bdi_forker_thread(void *ptr) | 375 | static int bdi_forker_thread(void *ptr) |
356 | { | 376 | { |
357 | struct bdi_writeback *me = ptr; | 377 | struct bdi_writeback *me = ptr; |
@@ -383,6 +403,13 @@ static int bdi_forker_thread(void *ptr) | |||
383 | } | 403 | } |
384 | 404 | ||
385 | spin_lock_bh(&bdi_lock); | 405 | spin_lock_bh(&bdi_lock); |
406 | /* | ||
407 | * In the following loop we are going to check whether we have | ||
408 | * some work to do without any synchronization with tasks | ||
409 | * waking us up to do work for them. So we have to set task | ||
410 | * state already here so that we don't miss wakeups coming | ||
411 | * after we verify some condition. | ||
412 | */ | ||
386 | set_current_state(TASK_INTERRUPTIBLE); | 413 | set_current_state(TASK_INTERRUPTIBLE); |
387 | 414 | ||
388 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | 415 | list_for_each_entry(bdi, &bdi_list, bdi_list) { |
@@ -446,9 +473,11 @@ static int bdi_forker_thread(void *ptr) | |||
446 | if (IS_ERR(task)) { | 473 | if (IS_ERR(task)) { |
447 | /* | 474 | /* |
448 | * If thread creation fails, force writeout of | 475 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 476 | * the bdi from the thread. Hopefully 1024 is |
477 | * large enough for efficient IO. | ||
450 | */ | 478 | */ |
451 | bdi_flush_io(bdi); | 479 | writeback_inodes_wb(&bdi->wb, 1024, |
480 | WB_REASON_FORKER_THREAD); | ||
452 | } else { | 481 | } else { |
453 | /* | 482 | /* |
454 | * The spinlock makes sure we do not lose | 483 | * The spinlock makes sure we do not lose |
@@ -461,11 +490,13 @@ static int bdi_forker_thread(void *ptr) | |||
461 | spin_unlock_bh(&bdi->wb_lock); | 490 | spin_unlock_bh(&bdi->wb_lock); |
462 | wake_up_process(task); | 491 | wake_up_process(task); |
463 | } | 492 | } |
493 | bdi_clear_pending(bdi); | ||
464 | break; | 494 | break; |
465 | 495 | ||
466 | case KILL_THREAD: | 496 | case KILL_THREAD: |
467 | __set_current_state(TASK_RUNNING); | 497 | __set_current_state(TASK_RUNNING); |
468 | kthread_stop(task); | 498 | kthread_stop(task); |
499 | bdi_clear_pending(bdi); | ||
469 | break; | 500 | break; |
470 | 501 | ||
471 | case NO_ACTION: | 502 | case NO_ACTION: |
@@ -481,16 +512,8 @@ static int bdi_forker_thread(void *ptr) | |||
481 | else | 512 | else |
482 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 513 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
483 | try_to_freeze(); | 514 | try_to_freeze(); |
484 | /* Back to the main loop */ | 515 | break; |
485 | continue; | ||
486 | } | 516 | } |
487 | |||
488 | /* | ||
489 | * Clear pending bit and wakeup anybody waiting to tear us down. | ||
490 | */ | ||
491 | clear_bit(BDI_pending, &bdi->state); | ||
492 | smp_mb__after_clear_bit(); | ||
493 | wake_up_bit(&bdi->state, BDI_pending); | ||
494 | } | 517 | } |
495 | 518 | ||
496 | return 0; | 519 | return 0; |
@@ -505,7 +528,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) | |||
505 | list_del_rcu(&bdi->bdi_list); | 528 | list_del_rcu(&bdi->bdi_list); |
506 | spin_unlock_bh(&bdi_lock); | 529 | spin_unlock_bh(&bdi_lock); |
507 | 530 | ||
508 | synchronize_rcu(); | 531 | synchronize_rcu_expedited(); |
509 | } | 532 | } |
510 | 533 | ||
511 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 534 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
@@ -606,6 +629,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
606 | void bdi_unregister(struct backing_dev_info *bdi) | 629 | void bdi_unregister(struct backing_dev_info *bdi) |
607 | { | 630 | { |
608 | if (bdi->dev) { | 631 | if (bdi->dev) { |
632 | bdi_set_min_ratio(bdi, 0); | ||
609 | trace_writeback_bdi_unregister(bdi); | 633 | trace_writeback_bdi_unregister(bdi); |
610 | bdi_prune_sb(bdi); | 634 | bdi_prune_sb(bdi); |
611 | del_timer_sync(&bdi->wb.wakeup_timer); | 635 | del_timer_sync(&bdi->wb.wakeup_timer); |
@@ -628,9 +652,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
628 | INIT_LIST_HEAD(&wb->b_dirty); | 652 | INIT_LIST_HEAD(&wb->b_dirty); |
629 | INIT_LIST_HEAD(&wb->b_io); | 653 | INIT_LIST_HEAD(&wb->b_io); |
630 | INIT_LIST_HEAD(&wb->b_more_io); | 654 | INIT_LIST_HEAD(&wb->b_more_io); |
655 | spin_lock_init(&wb->list_lock); | ||
631 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 656 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
632 | } | 657 | } |
633 | 658 | ||
659 | /* | ||
660 | * Initial write bandwidth: 100 MB/s | ||
661 | */ | ||
662 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
663 | |||
634 | int bdi_init(struct backing_dev_info *bdi) | 664 | int bdi_init(struct backing_dev_info *bdi) |
635 | { | 665 | { |
636 | int i, err; | 666 | int i, err; |
@@ -653,6 +683,15 @@ int bdi_init(struct backing_dev_info *bdi) | |||
653 | } | 683 | } |
654 | 684 | ||
655 | bdi->dirty_exceeded = 0; | 685 | bdi->dirty_exceeded = 0; |
686 | |||
687 | bdi->bw_time_stamp = jiffies; | ||
688 | bdi->written_stamp = 0; | ||
689 | |||
690 | bdi->balanced_dirty_ratelimit = INIT_BW; | ||
691 | bdi->dirty_ratelimit = INIT_BW; | ||
692 | bdi->write_bandwidth = INIT_BW; | ||
693 | bdi->avg_write_bandwidth = INIT_BW; | ||
694 | |||
656 | err = prop_local_init_percpu(&bdi->completions); | 695 | err = prop_local_init_percpu(&bdi->completions); |
657 | 696 | ||
658 | if (err) { | 697 | if (err) { |
@@ -676,15 +715,24 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
676 | if (bdi_has_dirty_io(bdi)) { | 715 | if (bdi_has_dirty_io(bdi)) { |
677 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 716 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
678 | 717 | ||
679 | spin_lock(&inode_wb_list_lock); | 718 | bdi_lock_two(&bdi->wb, dst); |
680 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 719 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
681 | list_splice(&bdi->wb.b_io, &dst->b_io); | 720 | list_splice(&bdi->wb.b_io, &dst->b_io); |
682 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 721 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
683 | spin_unlock(&inode_wb_list_lock); | 722 | spin_unlock(&bdi->wb.list_lock); |
723 | spin_unlock(&dst->list_lock); | ||
684 | } | 724 | } |
685 | 725 | ||
686 | bdi_unregister(bdi); | 726 | bdi_unregister(bdi); |
687 | 727 | ||
728 | /* | ||
729 | * If bdi_unregister() had already been called earlier, the | ||
730 | * wakeup_timer could still be armed because bdi_prune_sb() | ||
731 | * can race with the bdi_wakeup_thread_delayed() calls from | ||
732 | * __mark_inode_dirty(). | ||
733 | */ | ||
734 | del_timer_sync(&bdi->wb.wakeup_timer); | ||
735 | |||
688 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 736 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
689 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 737 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
690 | 738 | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index 01d5a4b3dd0..9686c4e3f80 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -768,14 +768,13 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
768 | unsigned long section_nr) | 768 | unsigned long section_nr) |
769 | { | 769 | { |
770 | bootmem_data_t *bdata; | 770 | bootmem_data_t *bdata; |
771 | unsigned long pfn, goal, limit; | 771 | unsigned long pfn, goal; |
772 | 772 | ||
773 | pfn = section_nr_to_pfn(section_nr); | 773 | pfn = section_nr_to_pfn(section_nr); |
774 | goal = pfn << PAGE_SHIFT; | 774 | goal = pfn << PAGE_SHIFT; |
775 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
776 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 775 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
777 | 776 | ||
778 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 777 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); |
779 | } | 778 | } |
780 | #endif | 779 | #endif |
781 | 780 | ||
diff --git a/mm/cma-best-fit.c b/mm/cma-best-fit.c new file mode 100644 index 00000000000..24c27c89cae --- /dev/null +++ b/mm/cma-best-fit.c | |||
@@ -0,0 +1,408 @@ | |||
1 | /* | ||
2 | * Contiguous Memory Allocator framework: Best Fit allocator | ||
3 | * Copyright (c) 2010 by Samsung Electronics. | ||
4 | * Written by Michal Nazarewicz (m.nazarewicz@samsung.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation; either version 2 of the | ||
9 | * License or (at your option) any later version of the license. | ||
10 | */ | ||
11 | |||
12 | #define pr_fmt(fmt) "cma: bf: " fmt | ||
13 | |||
14 | #ifdef CONFIG_CMA_DEBUG | ||
15 | # define DEBUG | ||
16 | #endif | ||
17 | |||
18 | #include <linux/errno.h> /* Error numbers */ | ||
19 | #include <linux/slab.h> /* kmalloc() */ | ||
20 | |||
21 | #include <linux/cma.h> /* CMA structures */ | ||
22 | |||
23 | |||
24 | /************************* Data Types *************************/ | ||
25 | |||
26 | struct cma_bf_item { | ||
27 | struct cma_chunk ch; | ||
28 | struct rb_node by_size; | ||
29 | }; | ||
30 | |||
31 | struct cma_bf_private { | ||
32 | struct rb_root by_start_root; | ||
33 | struct rb_root by_size_root; | ||
34 | }; | ||
35 | |||
36 | |||
37 | /************************* Prototypes *************************/ | ||
38 | |||
39 | /* | ||
40 | * Those are only for holes. They must be called whenever hole's | ||
41 | * properties change but also whenever chunk becomes a hole or hole | ||
42 | * becomes a chunk. | ||
43 | */ | ||
44 | static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item); | ||
45 | static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item); | ||
46 | static int __must_check | ||
47 | __cma_bf_hole_insert_by_start(struct cma_bf_item *item); | ||
48 | static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item); | ||
49 | |||
50 | /** | ||
51 | * __cma_bf_hole_take - takes a chunk of memory out of a hole. | ||
52 | * @hole: hole to take chunk from | ||
53 | * @size: chunk's size | ||
54 | * @alignment: chunk's starting address alignment (must be power of two) | ||
55 | * | ||
56 | * Takes a @size bytes large chunk from hole @hole which must be able | ||
57 | * to hold the chunk. The "must be able" includes also alignment | ||
58 | * constraint. | ||
59 | * | ||
60 | * Returns allocated item or NULL on error (if kmalloc() failed). | ||
61 | */ | ||
62 | static struct cma_bf_item *__must_check | ||
63 | __cma_bf_hole_take(struct cma_bf_item *hole, size_t size, dma_addr_t alignment); | ||
64 | |||
65 | /** | ||
66 | * __cma_bf_hole_merge_maybe - tries to merge hole with neighbours. | ||
67 | * @item: hole to try and merge | ||
68 | * | ||
69 | * Which items are preserved is undefined so you may not rely on it. | ||
70 | */ | ||
71 | static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item); | ||
72 | |||
73 | |||
74 | /************************* Device API *************************/ | ||
75 | |||
76 | int cma_bf_init(struct cma_region *reg) | ||
77 | { | ||
78 | struct cma_bf_private *prv; | ||
79 | struct cma_bf_item *item; | ||
80 | |||
81 | prv = kzalloc(sizeof *prv, GFP_KERNEL); | ||
82 | if (unlikely(!prv)) | ||
83 | return -ENOMEM; | ||
84 | |||
85 | item = kzalloc(sizeof *item, GFP_KERNEL); | ||
86 | if (unlikely(!item)) { | ||
87 | kfree(prv); | ||
88 | return -ENOMEM; | ||
89 | } | ||
90 | |||
91 | item->ch.start = reg->start; | ||
92 | item->ch.size = reg->size; | ||
93 | item->ch.reg = reg; | ||
94 | |||
95 | rb_root_init(&prv->by_start_root, &item->ch.by_start); | ||
96 | rb_root_init(&prv->by_size_root, &item->by_size); | ||
97 | |||
98 | reg->private_data = prv; | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | void cma_bf_cleanup(struct cma_region *reg) | ||
103 | { | ||
104 | struct cma_bf_private *prv = reg->private_data; | ||
105 | struct cma_bf_item *item = | ||
106 | rb_entry(prv->by_size_root.rb_node, | ||
107 | struct cma_bf_item, by_size); | ||
108 | |||
109 | /* We can assume there is only a single hole in the tree. */ | ||
110 | WARN_ON(item->by_size.rb_left || item->by_size.rb_right || | ||
111 | item->ch.by_start.rb_left || item->ch.by_start.rb_right); | ||
112 | |||
113 | kfree(item); | ||
114 | kfree(prv); | ||
115 | } | ||
116 | |||
117 | struct cma_chunk *cma_bf_alloc(struct cma_region *reg, | ||
118 | size_t size, dma_addr_t alignment) | ||
119 | { | ||
120 | struct cma_bf_private *prv = reg->private_data; | ||
121 | struct rb_node *node = prv->by_size_root.rb_node; | ||
122 | struct cma_bf_item *item = NULL; | ||
123 | |||
124 | /* First find hole that is large enough */ | ||
125 | while (node) { | ||
126 | struct cma_bf_item *i = | ||
127 | rb_entry(node, struct cma_bf_item, by_size); | ||
128 | |||
129 | if (i->ch.size < size) { | ||
130 | node = node->rb_right; | ||
131 | } else if (i->ch.size >= size) { | ||
132 | node = node->rb_left; | ||
133 | item = i; | ||
134 | } | ||
135 | } | ||
136 | if (!item) | ||
137 | return NULL; | ||
138 | |||
139 | /* Now look for items which can satisfy alignment requirements */ | ||
140 | node = &item->by_size; | ||
141 | for (;;) { | ||
142 | dma_addr_t start = ALIGN(item->ch.start, alignment); | ||
143 | dma_addr_t end = item->ch.start + item->ch.size; | ||
144 | if (start < end && end - start >= size) { | ||
145 | item = __cma_bf_hole_take(item, size, alignment); | ||
146 | return likely(item) ? &item->ch : NULL; | ||
147 | } | ||
148 | |||
149 | node = rb_next(node); | ||
150 | if (!node) | ||
151 | return NULL; | ||
152 | |||
153 | item = rb_entry(node, struct cma_bf_item, by_size); | ||
154 | } | ||
155 | } | ||
156 | |||
157 | void cma_bf_free(struct cma_chunk *chunk) | ||
158 | { | ||
159 | struct cma_bf_item *item = container_of(chunk, struct cma_bf_item, ch); | ||
160 | |||
161 | /* Add new hole */ | ||
162 | if (unlikely(__cma_bf_hole_insert_by_start(item))) { | ||
163 | /* | ||
164 | * We're screwed... Just free the item and forget | ||
165 | * about it. Things are broken beyond repair so no | ||
166 | * sense in trying to recover. | ||
167 | */ | ||
168 | kfree(item); | ||
169 | } else { | ||
170 | __cma_bf_hole_insert_by_size(item); | ||
171 | |||
172 | /* Merge with prev and next sibling */ | ||
173 | __cma_bf_hole_merge_maybe(item); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | |||
178 | /************************* Basic Tree Manipulation *************************/ | ||
179 | |||
180 | static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item) | ||
181 | { | ||
182 | struct cma_bf_private *prv = item->ch.reg->private_data; | ||
183 | struct rb_node **link = &prv->by_size_root.rb_node, *parent = NULL; | ||
184 | const typeof(item->ch.size) value = item->ch.size; | ||
185 | |||
186 | while (*link) { | ||
187 | struct cma_bf_item *i; | ||
188 | parent = *link; | ||
189 | i = rb_entry(parent, struct cma_bf_item, by_size); | ||
190 | link = value <= i->ch.size | ||
191 | ? &parent->rb_left | ||
192 | : &parent->rb_right; | ||
193 | } | ||
194 | |||
195 | rb_link_node(&item->by_size, parent, link); | ||
196 | rb_insert_color(&item->by_size, &prv->by_size_root); | ||
197 | } | ||
198 | |||
199 | static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item) | ||
200 | { | ||
201 | struct cma_bf_private *prv = item->ch.reg->private_data; | ||
202 | rb_erase(&item->by_size, &prv->by_size_root); | ||
203 | } | ||
204 | |||
205 | static int __must_check | ||
206 | __cma_bf_hole_insert_by_start(struct cma_bf_item *item) | ||
207 | { | ||
208 | struct cma_bf_private *prv = item->ch.reg->private_data; | ||
209 | struct rb_node **link = &prv->by_start_root.rb_node, *parent = NULL; | ||
210 | const typeof(item->ch.start) value = item->ch.start; | ||
211 | |||
212 | while (*link) { | ||
213 | struct cma_bf_item *i; | ||
214 | parent = *link; | ||
215 | i = rb_entry(parent, struct cma_bf_item, ch.by_start); | ||
216 | |||
217 | if (WARN_ON(value == i->ch.start)) | ||
218 | /* | ||
219 | * This should *never* happen. And I mean | ||
220 | * *never*. We could even BUG on it but | ||
221 | * hopefully things are only a bit broken, | ||
222 | * ie. system can still run. We produce | ||
223 | * a warning and return an error. | ||
224 | */ | ||
225 | return -EBUSY; | ||
226 | |||
227 | link = value <= i->ch.start | ||
228 | ? &parent->rb_left | ||
229 | : &parent->rb_right; | ||
230 | } | ||
231 | |||
232 | rb_link_node(&item->ch.by_start, parent, link); | ||
233 | rb_insert_color(&item->ch.by_start, &prv->by_start_root); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item) | ||
238 | { | ||
239 | struct cma_bf_private *prv = item->ch.reg->private_data; | ||
240 | rb_erase(&item->ch.by_start, &prv->by_start_root); | ||
241 | } | ||
242 | |||
243 | |||
244 | /************************* More Tree Manipulation *************************/ | ||
245 | |||
246 | static struct cma_bf_item *__must_check | ||
247 | __cma_bf_hole_take(struct cma_bf_item *hole, size_t size, size_t alignment) | ||
248 | { | ||
249 | struct cma_bf_item *item; | ||
250 | |||
251 | /* | ||
252 | * There are three cases: | ||
253 | * 1. the chunk takes the whole hole, | ||
254 | * 2. the chunk is at the beginning or at the end of the hole, or | ||
255 | * 3. the chunk is in the middle of the hole. | ||
256 | */ | ||
257 | |||
258 | |||
259 | /* Case 1, the whole hole */ | ||
260 | if (size == hole->ch.size) { | ||
261 | __cma_bf_hole_erase_by_size(hole); | ||
262 | __cma_bf_hole_erase_by_start(hole); | ||
263 | return hole; | ||
264 | } | ||
265 | |||
266 | |||
267 | /* Allocate */ | ||
268 | item = kmalloc(sizeof *item, GFP_KERNEL); | ||
269 | if (unlikely(!item)) | ||
270 | return NULL; | ||
271 | |||
272 | item->ch.start = ALIGN(hole->ch.start, alignment); | ||
273 | item->ch.size = size; | ||
274 | |||
275 | /* Case 3, in the middle */ | ||
276 | if (item->ch.start != hole->ch.start | ||
277 | && item->ch.start + item->ch.size != | ||
278 | hole->ch.start + hole->ch.size) { | ||
279 | struct cma_bf_item *tail; | ||
280 | |||
281 | /* | ||
282 | * Space between the end of the chunk and the end of | ||
283 | * the region, ie. space left after the end of the | ||
284 | * chunk. If this is divisible by alignment we can | ||
285 | * move the chunk to the end of the hole. | ||
286 | */ | ||
287 | size_t left = | ||
288 | hole->ch.start + hole->ch.size - | ||
289 | (item->ch.start + item->ch.size); | ||
290 | if (left % alignment == 0) { | ||
291 | item->ch.start += left; | ||
292 | goto case_2; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * We are going to add a hole at the end. This way, | ||
297 | * we will reduce the problem to case 2 -- the chunk | ||
298 | * will be at the end of the hole. | ||
299 | */ | ||
300 | tail = kmalloc(sizeof *tail, GFP_KERNEL); | ||
301 | if (unlikely(!tail)) { | ||
302 | kfree(item); | ||
303 | return NULL; | ||
304 | } | ||
305 | |||
306 | tail->ch.start = item->ch.start + item->ch.size; | ||
307 | tail->ch.size = | ||
308 | hole->ch.start + hole->ch.size - tail->ch.start; | ||
309 | tail->ch.reg = hole->ch.reg; | ||
310 | |||
311 | if (unlikely(__cma_bf_hole_insert_by_start(tail))) { | ||
312 | /* | ||
313 | * Things are broken beyond repair... Abort | ||
314 | * inserting the hole but still continue with | ||
315 | * allocation (seems like the best we can do). | ||
316 | */ | ||
317 | |||
318 | hole->ch.size = tail->ch.start - hole->ch.start; | ||
319 | kfree(tail); | ||
320 | } else { | ||
321 | __cma_bf_hole_insert_by_size(tail); | ||
322 | /* | ||
323 | * It's important that we first insert the new | ||
324 | * hole in the tree sorted by size and later | ||
325 | * reduce the size of the old hole. We will | ||
326 | * update the position of the old hole in the | ||
327 | * rb tree in code that handles case 2. | ||
328 | */ | ||
329 | hole->ch.size = tail->ch.start - hole->ch.start; | ||
330 | } | ||
331 | |||
332 | /* Go to case 2 */ | ||
333 | } | ||
334 | |||
335 | |||
336 | /* Case 2, at the beginning or at the end */ | ||
337 | case_2: | ||
338 | /* No need to update the tree; order preserved. */ | ||
339 | if (item->ch.start == hole->ch.start) | ||
340 | hole->ch.start += item->ch.size; | ||
341 | |||
342 | /* Alter hole's size */ | ||
343 | hole->ch.size -= size; | ||
344 | __cma_bf_hole_erase_by_size(hole); | ||
345 | __cma_bf_hole_insert_by_size(hole); | ||
346 | |||
347 | return item; | ||
348 | } | ||
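/*
 * Illustrative walk-through (addresses are made up): taking
 * size = 0x2000 with alignment = 0x4000 from a hole spanning
 * [0x1000, 0x9000) aligns the chunk to [0x4000, 0x6000), which touches
 * neither end of the hole (case 3).  The space left at the end,
 * 0x9000 - 0x6000 = 0x3000, is not a multiple of the alignment, so a
 * tail hole [0x6000, 0x9000) is split off and the case 2 code then
 * trims the remaining hole to [0x1000, 0x4000).
 */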
349 | |||
350 | |||
351 | static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item) | ||
352 | { | ||
353 | struct cma_bf_item *prev; | ||
354 | struct rb_node *node; | ||
355 | int twice = 2; | ||
356 | |||
357 | node = rb_prev(&item->ch.by_start); | ||
358 | if (unlikely(!node)) | ||
359 | goto next; | ||
360 | prev = rb_entry(node, struct cma_bf_item, ch.by_start); | ||
361 | |||
362 | for (;;) { | ||
363 | if (prev->ch.start + prev->ch.size == item->ch.start) { | ||
364 | /* Remove previous hole from trees */ | ||
365 | __cma_bf_hole_erase_by_size(prev); | ||
366 | __cma_bf_hole_erase_by_start(prev); | ||
367 | |||
368 | /* Alter this hole */ | ||
369 | item->ch.size += prev->ch.size; | ||
370 | item->ch.start = prev->ch.start; | ||
371 | __cma_bf_hole_erase_by_size(item); | ||
372 | __cma_bf_hole_insert_by_size(item); | ||
373 | /* | ||
374 | * No need to update by start trees as we do | ||
375 | * not break sequence order | ||
376 | */ | ||
377 | |||
378 | /* Free prev hole */ | ||
379 | kfree(prev); | ||
380 | } | ||
381 | |||
382 | next: | ||
383 | if (!--twice) | ||
384 | break; | ||
385 | |||
386 | node = rb_next(&item->ch.by_start); | ||
387 | if (unlikely(!node)) | ||
388 | break; | ||
389 | prev = item; | ||
390 | item = rb_entry(node, struct cma_bf_item, ch.by_start); | ||
391 | } | ||
392 | } | ||
393 | |||
394 | |||
395 | |||
396 | /************************* Register *************************/ | ||
397 | static int cma_bf_module_init(void) | ||
398 | { | ||
399 | static struct cma_allocator alloc = { | ||
400 | .name = "bf", | ||
401 | .init = cma_bf_init, | ||
402 | .cleanup = cma_bf_cleanup, | ||
403 | .alloc = cma_bf_alloc, | ||
404 | .free = cma_bf_free, | ||
405 | }; | ||
406 | return cma_allocator_register(&alloc); | ||
407 | } | ||
408 | module_init(cma_bf_module_init); | ||
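For orientation, this registration boilerplate is all the framework needs from an allocator. A rough sketch of a second, hypothetical allocator (a trivial bump allocator that never reuses freed space) might look like the following; the hook signatures are inferred from how mm/cma.c, added below, invokes them, so treat this as an illustration rather than a drop-in module:

#include <linux/kernel.h>	/* ALIGN() */
#include <linux/module.h>
#include <linux/slab.h>		/* kzalloc(), kfree() */
#include <linux/cma.h>

/*
 * private_data is cleared by the framework before the allocator is
 * attached; this toy allocator reuses it as a "bytes handed out so
 * far" counter and never gives freed space back.
 */
static struct cma_chunk *cma_bump_alloc(struct cma_region *reg,
					size_t size, dma_addr_t alignment)
{
	size_t used = (size_t)reg->private_data;
	dma_addr_t start = ALIGN(reg->start + used, alignment);
	struct cma_chunk *chunk;

	if (start + size > reg->start + reg->size)
		return NULL;			/* region exhausted */

	chunk = kzalloc(sizeof *chunk, GFP_KERNEL);
	if (!chunk)
		return NULL;

	chunk->start = start;
	chunk->size  = size;
	reg->private_data = (void *)(size_t)(start - reg->start + size);
	return chunk;
}

static void cma_bump_free(struct cma_chunk *chunk)
{
	kfree(chunk);			/* the space itself is leaked */
}

static int cma_bump_module_init(void)
{
	static struct cma_allocator alloc = {
		.name  = "bump",
		.alloc = cma_bump_alloc,
		.free  = cma_bump_free,
	};
	return cma_allocator_register(&alloc);
}
module_init(cma_bump_module_init);

With CONFIG_CMA_CMDLINE such an allocator would be selected per region with something like cma=reg1=16M:bump.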
diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 00000000000..546dd861bdb --- /dev/null +++ b/mm/cma.c | |||
@@ -0,0 +1,1413 @@ | |||
1 | /* | ||
2 | * Contiguous Memory Allocator framework | ||
3 | * Copyright (c) 2010 by Samsung Electronics. | ||
4 | * Written by Michal Nazarewicz (m.nazarewicz@samsung.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation; either version 2 of the | ||
9 | * License or (at your option) any later version of the license. | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | * See Documentation/contiguous-memory.txt for details. | ||
14 | */ | ||
15 | |||
16 | #define pr_fmt(fmt) "cma: " fmt | ||
17 | |||
18 | #ifdef CONFIG_CMA_DEBUG | ||
19 | # define DEBUG | ||
20 | #endif | ||
21 | |||
22 | #ifndef CONFIG_NO_BOOTMEM | ||
23 | # include <linux/bootmem.h> /* alloc_bootmem_pages_nopanic() */ | ||
24 | #endif | ||
25 | #ifdef CONFIG_HAVE_MEMBLOCK | ||
26 | # include <linux/memblock.h> /* memblock*() */ | ||
27 | #endif | ||
28 | #include <linux/device.h> /* struct device, dev_name() */ | ||
29 | #include <linux/errno.h> /* Error numbers */ | ||
30 | #include <linux/err.h> /* IS_ERR, PTR_ERR, etc. */ | ||
31 | #include <linux/mm.h> /* PAGE_ALIGN() */ | ||
32 | #include <linux/module.h> /* EXPORT_SYMBOL_GPL() */ | ||
33 | #include <linux/mutex.h> /* mutex */ | ||
34 | #include <linux/slab.h> /* kmalloc() */ | ||
35 | #include <linux/string.h> /* str*() */ | ||
36 | |||
37 | #include <linux/cma.h> | ||
38 | #include <linux/vmalloc.h> | ||
39 | |||
40 | /* | ||
41 | * Protects cma_regions, cma_allocators, cma_map, cma_map_length, | ||
42 | * cma_kobj, cma_sysfs_regions and cma_chunks_by_start. | ||
43 | */ | ||
44 | static DEFINE_MUTEX(cma_mutex); | ||
45 | |||
46 | |||
47 | |||
48 | /************************* Map attribute *************************/ | ||
49 | |||
50 | static const char *cma_map; | ||
51 | static size_t cma_map_length; | ||
52 | |||
53 | /* | ||
54 | * map-attr ::= [ rules [ ';' ] ] | ||
55 | * rules ::= rule [ ';' rules ] | ||
56 | * rule ::= patterns '=' regions | ||
57 | * patterns ::= pattern [ ',' patterns ] | ||
58 | * regions ::= REG-NAME [ ',' regions ] | ||
59 | * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME | ||
60 | * | ||
61 | * See Documentation/contiguous-memory.txt for details. | ||
62 | */ | ||
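/*
 * Illustrative example (device, type and region names are made up):
 * with
 *
 *	cma.map=camera/jpeg=jpeg;video,camera=multimedia
 *
 * on the kernel command line, allocations for the "video" and "camera"
 * devices are served from the "multimedia" region, except "camera"
 * allocations of type "jpeg", which are served from the "jpeg" region.
 * Rules are matched in the order given, so the more specific rule has
 * to come first.
 */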
63 | static ssize_t cma_map_validate(const char *param) | ||
64 | { | ||
65 | const char *ch = param; | ||
66 | |||
67 | if (*ch == '\0' || *ch == '\n') | ||
68 | return 0; | ||
69 | |||
70 | for (;;) { | ||
71 | const char *start = ch; | ||
72 | |||
73 | while (*ch && *ch != '\n' && *ch != ';' && *ch != '=') | ||
74 | ++ch; | ||
75 | |||
76 | if (*ch != '=' || start == ch) { | ||
77 | pr_err("map: expecting \"<patterns>=<regions>\" near %s\n", | ||
78 | start); | ||
79 | return -EINVAL; | ||
80 | } | ||
81 | |||
82 | while (*++ch != ';') | ||
83 | if (*ch == '\0' || *ch == '\n') | ||
84 | return ch - param; | ||
85 | if (ch[1] == '\0' || ch[1] == '\n') | ||
86 | return ch - param; | ||
87 | ++ch; | ||
88 | } | ||
89 | } | ||
90 | |||
91 | static int __init cma_map_param(char *param) | ||
92 | { | ||
93 | ssize_t len; | ||
94 | |||
95 | pr_debug("param: map: %s\n", param); | ||
96 | |||
97 | len = cma_map_validate(param); | ||
98 | if (len < 0) | ||
99 | return len; | ||
100 | |||
101 | cma_map = param; | ||
102 | cma_map_length = len; | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | #if defined CONFIG_CMA_CMDLINE | ||
107 | |||
108 | early_param("cma.map", cma_map_param); | ||
109 | |||
110 | #endif | ||
111 | |||
112 | |||
113 | |||
114 | /************************* Early regions *************************/ | ||
115 | |||
116 | struct list_head cma_early_regions __initdata = | ||
117 | LIST_HEAD_INIT(cma_early_regions); | ||
118 | |||
119 | #ifdef CONFIG_CMA_CMDLINE | ||
120 | |||
121 | /* | ||
122 | * regions-attr ::= [ regions [ ';' ] ] | ||
123 | * regions ::= region [ ';' regions ] | ||
124 | * | ||
125 | * region ::= [ '-' ] reg-name | ||
126 | * '=' size | ||
127 | * [ '@' start ] | ||
128 | * [ '/' alignment ] | ||
129 | * [ ':' alloc-name ] | ||
130 | * | ||
131 | * See Documentation/contiguous-memory.txt for details. | ||
132 | * | ||
133 | * Example: | ||
134 | * cma=reg1=64M:bf;reg2=32M@0x100000:bf;reg3=64M/1M:bf | ||
135 | * | ||
136 | * If the allocator is omitted, the first available allocator will be used. | ||
137 | */ | ||
138 | |||
139 | #define NUMPARSE(cond_ch, type, cond) ({ \ | ||
140 | unsigned long long v = 0; \ | ||
141 | if (*param == (cond_ch)) { \ | ||
142 | const char *const msg = param + 1; \ | ||
143 | v = memparse(msg, ¶m); \ | ||
144 | if (!v || v > ~(type)0 || !(cond)) { \ | ||
145 | pr_err("param: invalid value near %s\n", msg); \ | ||
146 | ret = -EINVAL; \ | ||
147 | break; \ | ||
148 | } \ | ||
149 | } \ | ||
150 | v; \ | ||
151 | }) | ||
152 | |||
153 | static int __init cma_param_parse(char *param) | ||
154 | { | ||
155 | static struct cma_region regions[16]; | ||
156 | |||
157 | size_t left = ARRAY_SIZE(regions); | ||
158 | struct cma_region *reg = regions; | ||
159 | int ret = 0; | ||
160 | |||
161 | pr_debug("param: %s\n", param); | ||
162 | |||
163 | for (; *param; ++reg) { | ||
164 | dma_addr_t start, alignment; | ||
165 | size_t size; | ||
166 | |||
167 | if (unlikely(!--left)) { | ||
168 | pr_err("param: too many early regions\n"); | ||
169 | return -ENOSPC; | ||
170 | } | ||
171 | |||
172 | /* Parse name */ | ||
173 | reg->name = param; | ||
174 | param = strchr(param, '='); | ||
175 | if (!param || param == reg->name) { | ||
176 | pr_err("param: expected \"<name>=\" near %s\n", | ||
177 | reg->name); | ||
178 | ret = -EINVAL; | ||
179 | break; | ||
180 | } | ||
181 | *param = '\0'; | ||
182 | |||
183 | /* Parse numbers */ | ||
184 | size = NUMPARSE('\0', size_t, true); | ||
185 | start = NUMPARSE('@', dma_addr_t, true); | ||
186 | alignment = NUMPARSE('/', dma_addr_t, (v & (v - 1)) == 0); | ||
187 | |||
188 | alignment = max(alignment, (dma_addr_t)PAGE_SIZE); | ||
189 | start = ALIGN(start, alignment); | ||
190 | size = PAGE_ALIGN(size); | ||
191 | if (start + size < start) { | ||
192 | pr_err("param: invalid start, size combination\n"); | ||
193 | ret = -EINVAL; | ||
194 | break; | ||
195 | } | ||
196 | |||
197 | /* Parse allocator */ | ||
198 | if (*param == ':') { | ||
199 | reg->alloc_name = ++param; | ||
200 | while (*param && *param != ';') | ||
201 | ++param; | ||
202 | if (param == reg->alloc_name) | ||
203 | reg->alloc_name = NULL; | ||
204 | } | ||
205 | |||
206 | /* Go to next */ | ||
207 | if (*param == ';') { | ||
208 | *param = '\0'; | ||
209 | ++param; | ||
210 | } else if (*param) { | ||
211 | pr_err("param: expecting ';' or end of parameter near %s\n", | ||
212 | param); | ||
213 | ret = -EINVAL; | ||
214 | break; | ||
215 | } | ||
216 | |||
217 | /* Add */ | ||
218 | reg->size = size; | ||
219 | reg->start = start; | ||
220 | reg->alignment = alignment; | ||
221 | reg->copy_name = 1; | ||
222 | |||
223 | list_add_tail(®->list, &cma_early_regions); | ||
224 | |||
225 | pr_debug("param: registering early region %s (%p@%p/%p)\n", | ||
226 | reg->name, (void *)reg->size, (void *)reg->start, | ||
227 | (void *)reg->alignment); | ||
228 | } | ||
229 | |||
230 | return ret; | ||
231 | } | ||
232 | early_param("cma", cma_param_parse); | ||
233 | |||
234 | #undef NUMPARSE | ||
235 | |||
236 | #endif | ||
237 | |||
238 | |||
239 | int __init __must_check cma_early_region_register(struct cma_region *reg) | ||
240 | { | ||
241 | dma_addr_t start, alignment; | ||
242 | size_t size; | ||
243 | |||
244 | if (reg->alignment & (reg->alignment - 1)) | ||
245 | return -EINVAL; | ||
246 | |||
247 | alignment = max(reg->alignment, (dma_addr_t)PAGE_SIZE); | ||
248 | start = ALIGN(reg->start, alignment); | ||
249 | size = PAGE_ALIGN(reg->size); | ||
250 | |||
251 | if (start + size < start) | ||
252 | return -EINVAL; | ||
253 | |||
254 | reg->size = size; | ||
255 | reg->start = start; | ||
256 | reg->alignment = alignment; | ||
257 | |||
258 | list_add_tail(®->list, &cma_early_regions); | ||
259 | |||
260 | pr_debug("param: registering early region %s (%p@%p/%p)\n", | ||
261 | reg->name, (void *)reg->size, (void *)reg->start, | ||
262 | (void *)reg->alignment); | ||
263 | |||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | |||
268 | |||
269 | /************************* Regions & Allocators *************************/ | ||
270 | |||
271 | static void __cma_sysfs_region_add(struct cma_region *reg); | ||
272 | |||
273 | static int __cma_region_attach_alloc(struct cma_region *reg); | ||
274 | static void __maybe_unused __cma_region_detach_alloc(struct cma_region *reg); | ||
275 | |||
276 | |||
277 | /* List of all regions. Named regions are kept before unnamed. */ | ||
278 | static LIST_HEAD(cma_regions); | ||
279 | |||
280 | #define cma_foreach_region(reg) \ | ||
281 | list_for_each_entry(reg, &cma_regions, list) | ||
282 | |||
283 | int __must_check cma_region_register(struct cma_region *reg) | ||
284 | { | ||
285 | const char *name, *alloc_name; | ||
286 | struct cma_region *r; | ||
287 | char *ch = NULL; | ||
288 | int ret = 0; | ||
289 | |||
290 | if (!reg->size || reg->start + reg->size < reg->start) | ||
291 | return -EINVAL; | ||
292 | |||
293 | reg->users = 0; | ||
294 | reg->used = 0; | ||
295 | reg->private_data = NULL; | ||
296 | reg->registered = 0; | ||
297 | reg->free_space = reg->size; | ||
298 | |||
299 | /* Copy name and alloc_name */ | ||
300 | name = reg->name; | ||
301 | alloc_name = reg->alloc_name; | ||
302 | if (reg->copy_name && (reg->name || reg->alloc_name)) { | ||
303 | size_t name_size, alloc_size; | ||
304 | |||
305 | name_size = reg->name ? strlen(reg->name) + 1 : 0; | ||
306 | alloc_size = reg->alloc_name ? strlen(reg->alloc_name) + 1 : 0; | ||
307 | |||
308 | ch = kmalloc(name_size + alloc_size, GFP_KERNEL); | ||
309 | if (!ch) { | ||
310 | pr_err("%s: not enough memory to allocate name\n", | ||
311 | reg->name ?: "(private)"); | ||
312 | return -ENOMEM; | ||
313 | } | ||
314 | |||
315 | if (name_size) { | ||
316 | memcpy(ch, reg->name, name_size); | ||
317 | name = ch; | ||
318 | ch += name_size; | ||
319 | } | ||
320 | |||
321 | if (alloc_size) { | ||
322 | memcpy(ch, reg->alloc_name, alloc_size); | ||
323 | alloc_name = ch; | ||
324 | } | ||
325 | } | ||
326 | |||
327 | mutex_lock(&cma_mutex); | ||
328 | |||
329 | /* Don't let regions overlap */ | ||
330 | cma_foreach_region(r) | ||
331 | if (r->start + r->size > reg->start && | ||
332 | r->start < reg->start + reg->size) { | ||
333 | ret = -EADDRINUSE; | ||
334 | goto done; | ||
335 | } | ||
336 | |||
337 | if (reg->alloc) { | ||
338 | ret = __cma_region_attach_alloc(reg); | ||
339 | if (unlikely(ret < 0)) | ||
340 | goto done; | ||
341 | } | ||
342 | |||
343 | reg->name = name; | ||
344 | reg->alloc_name = alloc_name; | ||
345 | reg->registered = 1; | ||
346 | ch = NULL; | ||
347 | |||
348 | /* | ||
349 | * Keep named at the beginning and unnamed (private) at the | ||
350 | * end. This helps in traversal when a named region is looked | ||
351 | * for. | ||
352 | */ | ||
353 | if (name) | ||
354 | list_add(®->list, &cma_regions); | ||
355 | else | ||
356 | list_add_tail(®->list, &cma_regions); | ||
357 | |||
358 | __cma_sysfs_region_add(reg); | ||
359 | |||
360 | done: | ||
361 | mutex_unlock(&cma_mutex); | ||
362 | |||
363 | pr_debug("%s: region %sregistered\n", | ||
364 | reg->name ?: "(private)", ret ? "not " : ""); | ||
365 | kfree(ch); | ||
366 | |||
367 | return ret; | ||
368 | } | ||
369 | EXPORT_SYMBOL_GPL(cma_region_register); | ||
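/*
 * Rough usage sketch (hypothetical driver, address and names made up):
 * a driver that owns a physically contiguous range can publish it as
 * a region at run time:
 *
 *	static struct cma_region foo_region = {
 *		.name       = "foo",
 *		.start      = 0x40000000,
 *		.size       = 8 << 20,
 *		.alloc_name = "bf",
 *	};
 *
 *	err = cma_region_register(&foo_region);
 *
 * The region must not overlap any already registered region, otherwise
 * -EADDRINUSE is returned.
 */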
370 | |||
371 | static struct cma_region *__must_check | ||
372 | __cma_region_find(const char **namep) | ||
373 | { | ||
374 | struct cma_region *reg; | ||
375 | const char *ch, *name; | ||
376 | size_t n; | ||
377 | |||
378 | ch = *namep; | ||
379 | while (*ch && *ch != ',' && *ch != ';') | ||
380 | ++ch; | ||
381 | name = *namep; | ||
382 | *namep = *ch == ',' ? ch + 1 : ch; | ||
383 | n = ch - name; | ||
384 | |||
385 | /* | ||
386 | * Named regions are kept in front of unnamed so if we | ||
387 | * encounter an unnamed region we can stop. | ||
388 | */ | ||
389 | cma_foreach_region(reg) | ||
390 | if (!reg->name) | ||
391 | break; | ||
392 | else if (!strncmp(name, reg->name, n) && !reg->name[n]) | ||
393 | return reg; | ||
394 | |||
395 | return NULL; | ||
396 | } | ||
397 | |||
398 | |||
399 | /* List of all allocators. */ | ||
400 | static LIST_HEAD(cma_allocators); | ||
401 | |||
402 | #define cma_foreach_allocator(alloc) \ | ||
403 | list_for_each_entry(alloc, &cma_allocators, list) | ||
404 | |||
405 | int cma_allocator_register(struct cma_allocator *alloc) | ||
406 | { | ||
407 | struct cma_region *reg; | ||
408 | int first; | ||
409 | |||
410 | if (!alloc->alloc || !alloc->free) | ||
411 | return -EINVAL; | ||
412 | |||
413 | mutex_lock(&cma_mutex); | ||
414 | |||
415 | first = list_empty(&cma_allocators); | ||
416 | |||
417 | list_add_tail(&alloc->list, &cma_allocators); | ||
418 | |||
419 | /* | ||
420 | * Attach this allocator to all allocator-less regions that | ||
421 | * request this particular allocator (reg->alloc_name equals | ||
422 | * alloc->name) or if region wants the first available | ||
423 | * allocator and we are the first. | ||
424 | */ | ||
425 | cma_foreach_region(reg) { | ||
426 | if (reg->alloc) | ||
427 | continue; | ||
428 | if (reg->alloc_name | ||
429 | ? alloc->name && !strcmp(alloc->name, reg->alloc_name) | ||
430 | : (!reg->used && first)) | ||
431 | continue; | ||
432 | |||
433 | reg->alloc = alloc; | ||
434 | __cma_region_attach_alloc(reg); | ||
435 | } | ||
436 | |||
437 | mutex_unlock(&cma_mutex); | ||
438 | |||
439 | pr_debug("%s: allocator registered\n", alloc->name ?: "(unnamed)"); | ||
440 | |||
441 | return 0; | ||
442 | } | ||
443 | EXPORT_SYMBOL_GPL(cma_allocator_register); | ||
444 | |||
445 | static struct cma_allocator *__must_check | ||
446 | __cma_allocator_find(const char *name) | ||
447 | { | ||
448 | struct cma_allocator *alloc; | ||
449 | |||
450 | if (!name) | ||
451 | return list_empty(&cma_allocators) | ||
452 | ? NULL | ||
453 | : list_entry(cma_allocators.next, | ||
454 | struct cma_allocator, list); | ||
455 | |||
456 | cma_foreach_allocator(alloc) | ||
457 | if (alloc->name && !strcmp(name, alloc->name)) | ||
458 | return alloc; | ||
459 | |||
460 | return NULL; | ||
461 | } | ||
462 | |||
463 | |||
464 | |||
465 | /************************* Initialise CMA *************************/ | ||
466 | |||
467 | int __init cma_set_defaults(struct cma_region *regions, const char *map) | ||
468 | { | ||
469 | if (map) { | ||
470 | int ret = cma_map_param((char *)map); | ||
471 | if (unlikely(ret < 0)) | ||
472 | return ret; | ||
473 | } | ||
474 | |||
475 | if (!regions) | ||
476 | return 0; | ||
477 | |||
478 | for (; regions->size; ++regions) { | ||
479 | int ret = cma_early_region_register(regions); | ||
480 | if (unlikely(ret < 0)) | ||
481 | return ret; | ||
482 | } | ||
483 | |||
484 | return 0; | ||
485 | } | ||
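/*
 * Typical platform usage, sketched (hypothetical board code; names and
 * sizes made up): a zero-terminated array of regions plus a default
 * map, registered before cma_init() runs:
 *
 *	static struct cma_region regions[] = {
 *		{ .name = "mm",  .size = 16 << 20 },
 *		{ .name = "dma", .size =  4 << 20 },
 *		{ }
 *	};
 *
 *	cma_set_defaults(regions, "video,camera=mm;/dma=dma");
 *	cma_early_regions_reserve(NULL);
 *
 * Both calls belong in early boot code (e.g. the machine's reserve
 * callback), before bootmem/memblock hands memory over to the page
 * allocator.
 */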
486 | |||
487 | |||
488 | int __init cma_early_region_reserve(struct cma_region *reg) | ||
489 | { | ||
490 | int tried = 0; | ||
491 | |||
492 | if (!reg->size || (reg->alignment & (reg->alignment - 1)) || | ||
493 | reg->reserved) | ||
494 | return -EINVAL; | ||
495 | |||
496 | #ifndef CONFIG_NO_BOOTMEM | ||
497 | |||
498 | tried = 1; | ||
499 | |||
500 | { | ||
501 | void *ptr = __alloc_bootmem_nopanic(reg->size, reg->alignment, | ||
502 | reg->start); | ||
503 | if (ptr) { | ||
504 | reg->start = virt_to_phys(ptr); | ||
505 | reg->reserved = 1; | ||
506 | return 0; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | #endif | ||
511 | |||
512 | #ifdef CONFIG_HAVE_MEMBLOCK | ||
513 | |||
514 | tried = 1; | ||
515 | |||
516 | if (reg->start) { | ||
517 | if (!memblock_is_region_reserved(reg->start, reg->size) && | ||
518 | memblock_reserve(reg->start, reg->size) >= 0) { | ||
519 | reg->reserved = 1; | ||
520 | return 0; | ||
521 | } | ||
522 | } else { | ||
523 | /* | ||
524 | * Use __memblock_alloc_base() since | ||
525 | * memblock_alloc_base() panic()s. | ||
526 | */ | ||
527 | u64 ret = __memblock_alloc_base(reg->size, reg->alignment, 0); | ||
528 | if (ret && | ||
529 | ret < ~(dma_addr_t)0 && | ||
530 | ret + reg->size < ~(dma_addr_t)0 && | ||
531 | ret + reg->size > ret) { | ||
532 | reg->start = ret; | ||
533 | reg->reserved = 1; | ||
534 | return 0; | ||
535 | } | ||
536 | |||
537 | if (ret) | ||
538 | memblock_free(ret, reg->size); | ||
539 | } | ||
540 | |||
541 | #endif | ||
542 | |||
543 | return tried ? -ENOMEM : -EOPNOTSUPP; | ||
544 | } | ||
545 | |||
546 | void __init cma_early_regions_reserve(int (*reserve)(struct cma_region *reg)) | ||
547 | { | ||
548 | struct cma_region *reg; | ||
549 | |||
550 | pr_debug("init: reserving early regions\n"); | ||
551 | |||
552 | if (!reserve) | ||
553 | reserve = cma_early_region_reserve; | ||
554 | |||
555 | list_for_each_entry(reg, &cma_early_regions, list) { | ||
556 | if (reg->reserved) { | ||
557 | /* nothing */ | ||
558 | } else if (reserve(reg) >= 0) { | ||
559 | pr_debug("init: %s: reserved %p@%p\n", | ||
560 | reg->name ?: "(private)", | ||
561 | (void *)reg->size, (void *)reg->start); | ||
562 | reg->reserved = 1; | ||
563 | } else { | ||
564 | pr_warn("init: %s: unable to reserve %p@%p/%p\n", | ||
565 | reg->name ?: "(private)", | ||
566 | (void *)reg->size, (void *)reg->start, | ||
567 | (void *)reg->alignment); | ||
568 | } | ||
569 | } | ||
570 | } | ||
571 | |||
572 | |||
573 | static int __init cma_init(void) | ||
574 | { | ||
575 | struct cma_region *reg, *n; | ||
576 | |||
577 | pr_debug("init: initialising\n"); | ||
578 | |||
579 | if (cma_map) { | ||
580 | char *val = kmemdup(cma_map, cma_map_length + 1, GFP_KERNEL); | ||
581 | cma_map = val; | ||
582 | if (!val) | ||
583 | return -ENOMEM; | ||
584 | val[cma_map_length] = '\0'; | ||
585 | } | ||
586 | |||
587 | list_for_each_entry_safe(reg, n, &cma_early_regions, list) { | ||
588 | INIT_LIST_HEAD(®->list); | ||
589 | /* | ||
590 | * We don't care if there was an error. It's a pity | ||
591 | * but there's not much we can do about it anyway. | ||
592 | * If the error is on a region that was parsed from the | ||
593 | * command line then it will stay and waste a bit of | ||
594 | * space; if it was registered using | ||
595 | * cma_early_region_register() it's the caller's | ||
596 | * responsibility to do something about it. | ||
597 | */ | ||
598 | if (reg->reserved && cma_region_register(reg) < 0) | ||
599 | /* ignore error */; | ||
600 | } | ||
601 | |||
602 | INIT_LIST_HEAD(&cma_early_regions); | ||
603 | |||
604 | return 0; | ||
605 | } | ||
606 | /* | ||
607 | * We want to be initialised earlier than module_init/__initcall so | ||
608 | * that drivers that want to grab memory at boot time will get CMA | ||
609 | * ready. subsys_initcall() seems early enough and not too early at | ||
610 | * the same time. | ||
611 | */ | ||
612 | subsys_initcall(cma_init); | ||
613 | |||
614 | |||
615 | |||
616 | /************************* SysFS *************************/ | ||
617 | |||
618 | #if defined CONFIG_CMA_SYSFS | ||
619 | |||
620 | static struct kobject cma_sysfs_regions; | ||
621 | static int cma_sysfs_regions_ready; | ||
622 | |||
623 | |||
624 | #define CMA_ATTR_INLINE(_type, _name) \ | ||
625 | (&((struct cma_ ## _type ## _attribute){ \ | ||
626 | .attr = { \ | ||
627 | .name = __stringify(_name), \ | ||
628 | .mode = 0644, \ | ||
629 | }, \ | ||
630 | .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \ | ||
631 | .store = cma_sysfs_ ## _type ## _ ## _name ## _store, \ | ||
632 | }).attr) | ||
633 | |||
634 | #define CMA_ATTR_RO_INLINE(_type, _name) \ | ||
635 | (&((struct cma_ ## _type ## _attribute){ \ | ||
636 | .attr = { \ | ||
637 | .name = __stringify(_name), \ | ||
638 | .mode = 0444, \ | ||
639 | }, \ | ||
640 | .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \ | ||
641 | }).attr) | ||
642 | |||
643 | |||
644 | struct cma_root_attribute { | ||
645 | struct attribute attr; | ||
646 | ssize_t (*show)(char *buf); | ||
647 | int (*store)(const char *buf); | ||
648 | }; | ||
649 | |||
650 | static ssize_t cma_sysfs_root_map_show(char *page) | ||
651 | { | ||
652 | ssize_t len; | ||
653 | |||
654 | len = cma_map_length; | ||
655 | if (!len) { | ||
656 | *page = 0; | ||
657 | len = 0; | ||
658 | } else { | ||
659 | if (len > (size_t)PAGE_SIZE - 1) | ||
660 | len = (size_t)PAGE_SIZE - 1; | ||
661 | memcpy(page, cma_map, len); | ||
662 | page[len++] = '\n'; | ||
663 | } | ||
664 | |||
665 | return len; | ||
666 | } | ||
667 | |||
668 | static int cma_sysfs_root_map_store(const char *page) | ||
669 | { | ||
670 | ssize_t len = cma_map_validate(page); | ||
671 | char *val = NULL; | ||
672 | |||
673 | if (len < 0) | ||
674 | return len; | ||
675 | |||
676 | if (len) { | ||
677 | val = kmemdup(page, len + 1, GFP_KERNEL); | ||
678 | if (!val) | ||
679 | return -ENOMEM; | ||
680 | val[len] = '\0'; | ||
681 | } | ||
682 | |||
683 | kfree(cma_map); | ||
684 | cma_map = val; | ||
685 | cma_map_length = len; | ||
686 | |||
687 | return 0; | ||
688 | } | ||
689 | |||
690 | static ssize_t cma_sysfs_root_allocators_show(char *page) | ||
691 | { | ||
692 | struct cma_allocator *alloc; | ||
693 | size_t left = PAGE_SIZE; | ||
694 | char *ch = page; | ||
695 | |||
696 | cma_foreach_allocator(alloc) { | ||
697 | ssize_t l = snprintf(ch, left, "%s ", alloc->name ?: "-"); | ||
698 | ch += l; | ||
699 | left -= l; | ||
700 | } | ||
701 | |||
702 | if (ch != page) | ||
703 | ch[-1] = '\n'; | ||
704 | return ch - page; | ||
705 | } | ||
706 | |||
707 | static ssize_t | ||
708 | cma_sysfs_root_show(struct kobject *kobj, struct attribute *attr, char *buf) | ||
709 | { | ||
710 | struct cma_root_attribute *rattr = | ||
711 | container_of(attr, struct cma_root_attribute, attr); | ||
712 | ssize_t ret; | ||
713 | |||
714 | mutex_lock(&cma_mutex); | ||
715 | ret = rattr->show(buf); | ||
716 | mutex_unlock(&cma_mutex); | ||
717 | |||
718 | return ret; | ||
719 | } | ||
720 | |||
721 | static ssize_t | ||
722 | cma_sysfs_root_store(struct kobject *kobj, struct attribute *attr, | ||
723 | const char *buf, size_t count) | ||
724 | { | ||
725 | struct cma_root_attribute *rattr = | ||
726 | container_of(attr, struct cma_root_attribute, attr); | ||
727 | int ret; | ||
728 | |||
729 | mutex_lock(&cma_mutex); | ||
730 | ret = rattr->store(buf); | ||
731 | mutex_unlock(&cma_mutex); | ||
732 | |||
733 | return ret < 0 ? ret : count; | ||
734 | } | ||
735 | |||
736 | static struct kobj_type cma_sysfs_root_type = { | ||
737 | .sysfs_ops = &(const struct sysfs_ops){ | ||
738 | .show = cma_sysfs_root_show, | ||
739 | .store = cma_sysfs_root_store, | ||
740 | }, | ||
741 | .default_attrs = (struct attribute * []) { | ||
742 | CMA_ATTR_INLINE(root, map), | ||
743 | CMA_ATTR_RO_INLINE(root, allocators), | ||
744 | NULL | ||
745 | }, | ||
746 | }; | ||
747 | |||
748 | static int __init cma_sysfs_init(void) | ||
749 | { | ||
750 | static struct kobject root; | ||
751 | static struct kobj_type fake_type; | ||
752 | |||
753 | struct cma_region *reg; | ||
754 | int ret; | ||
755 | |||
756 | /* Root */ | ||
757 | ret = kobject_init_and_add(&root, &cma_sysfs_root_type, | ||
758 | mm_kobj, "contiguous"); | ||
759 | if (unlikely(ret < 0)) { | ||
760 | pr_err("init: unable to add root kobject: %d\n", ret); | ||
761 | return ret; | ||
762 | } | ||
763 | |||
764 | /* Regions */ | ||
765 | ret = kobject_init_and_add(&cma_sysfs_regions, &fake_type, | ||
766 | &root, "regions"); | ||
767 | if (unlikely(ret < 0)) { | ||
768 | pr_err("init: unable to add regions kobject: %d\n", ret); | ||
769 | return ret; | ||
770 | } | ||
771 | |||
772 | mutex_lock(&cma_mutex); | ||
773 | cma_sysfs_regions_ready = 1; | ||
774 | cma_foreach_region(reg) | ||
775 | __cma_sysfs_region_add(reg); | ||
776 | mutex_unlock(&cma_mutex); | ||
777 | |||
778 | return 0; | ||
779 | } | ||
780 | device_initcall(cma_sysfs_init); | ||
781 | |||
782 | |||
783 | |||
784 | struct cma_region_attribute { | ||
785 | struct attribute attr; | ||
786 | ssize_t (*show)(struct cma_region *reg, char *buf); | ||
787 | int (*store)(struct cma_region *reg, const char *buf); | ||
788 | }; | ||
789 | |||
790 | |||
791 | static ssize_t cma_sysfs_region_name_show(struct cma_region *reg, char *page) | ||
792 | { | ||
793 | return reg->name ? snprintf(page, PAGE_SIZE, "%s\n", reg->name) : 0; | ||
794 | } | ||
795 | |||
796 | static ssize_t cma_sysfs_region_start_show(struct cma_region *reg, char *page) | ||
797 | { | ||
798 | return snprintf(page, PAGE_SIZE, "%p\n", (void *)reg->start); | ||
799 | } | ||
800 | |||
801 | static ssize_t cma_sysfs_region_size_show(struct cma_region *reg, char *page) | ||
802 | { | ||
803 | return snprintf(page, PAGE_SIZE, "%zu\n", reg->size); | ||
804 | } | ||
805 | |||
806 | static ssize_t cma_sysfs_region_free_show(struct cma_region *reg, char *page) | ||
807 | { | ||
808 | return snprintf(page, PAGE_SIZE, "%zu\n", reg->free_space); | ||
809 | } | ||
810 | |||
811 | static ssize_t cma_sysfs_region_users_show(struct cma_region *reg, char *page) | ||
812 | { | ||
813 | return snprintf(page, PAGE_SIZE, "%u\n", reg->users); | ||
814 | } | ||
815 | |||
816 | static ssize_t cma_sysfs_region_alloc_show(struct cma_region *reg, char *page) | ||
817 | { | ||
818 | if (reg->alloc) | ||
819 | return snprintf(page, PAGE_SIZE, "%s\n", | ||
820 | reg->alloc->name ?: "-"); | ||
821 | else if (reg->alloc_name) | ||
822 | return snprintf(page, PAGE_SIZE, "[%s]\n", reg->alloc_name); | ||
823 | else | ||
824 | return 0; | ||
825 | } | ||
826 | |||
827 | static int | ||
828 | cma_sysfs_region_alloc_store(struct cma_region *reg, const char *page) | ||
829 | { | ||
830 | char *s; | ||
831 | |||
832 | if (reg->alloc && reg->users) | ||
833 | return -EBUSY; | ||
834 | |||
835 | if (!*page || *page == '\n') { | ||
836 | s = NULL; | ||
837 | } else { | ||
838 | size_t len; | ||
839 | |||
840 | for (s = (char *)page; *++s && *s != '\n'; ) | ||
841 | /* nop */; | ||
842 | |||
843 | len = s - page; | ||
844 | s = kmemdup(page, len + 1, GFP_KERNEL); | ||
845 | if (!s) | ||
846 | return -ENOMEM; | ||
847 | s[len] = '\0'; | ||
848 | } | ||
849 | |||
850 | if (reg->alloc) | ||
851 | __cma_region_detach_alloc(reg); | ||
852 | |||
853 | if (reg->free_alloc_name) | ||
854 | kfree(reg->alloc_name); | ||
855 | |||
856 | reg->alloc_name = s; | ||
857 | reg->free_alloc_name = !!s; | ||
858 | |||
859 | return 0; | ||
860 | } | ||
861 | |||
862 | |||
863 | static ssize_t | ||
864 | cma_sysfs_region_show(struct kobject *kobj, struct attribute *attr, | ||
865 | char *buf) | ||
866 | { | ||
867 | struct cma_region *reg = container_of(kobj, struct cma_region, kobj); | ||
868 | struct cma_region_attribute *rattr = | ||
869 | container_of(attr, struct cma_region_attribute, attr); | ||
870 | ssize_t ret; | ||
871 | |||
872 | mutex_lock(&cma_mutex); | ||
873 | ret = rattr->show(reg, buf); | ||
874 | mutex_unlock(&cma_mutex); | ||
875 | |||
876 | return ret; | ||
877 | } | ||
878 | |||
879 | static int | ||
880 | cma_sysfs_region_store(struct kobject *kobj, struct attribute *attr, | ||
881 | const char *buf, size_t count) | ||
882 | { | ||
883 | struct cma_region *reg = container_of(kobj, struct cma_region, kobj); | ||
884 | struct cma_region_attribute *rattr = | ||
885 | container_of(attr, struct cma_region_attribute, attr); | ||
886 | int ret; | ||
887 | |||
888 | mutex_lock(&cma_mutex); | ||
889 | ret = rattr->store(reg, buf); | ||
890 | mutex_unlock(&cma_mutex); | ||
891 | |||
892 | return ret < 0 ? ret : count; | ||
893 | } | ||
894 | |||
895 | static struct kobj_type cma_sysfs_region_type = { | ||
896 | .sysfs_ops = &(const struct sysfs_ops){ | ||
897 | .show = cma_sysfs_region_show, | ||
898 | .store = cma_sysfs_region_store, | ||
899 | }, | ||
900 | .default_attrs = (struct attribute * []) { | ||
901 | CMA_ATTR_RO_INLINE(region, name), | ||
902 | CMA_ATTR_RO_INLINE(region, start), | ||
903 | CMA_ATTR_RO_INLINE(region, size), | ||
904 | CMA_ATTR_RO_INLINE(region, free), | ||
905 | CMA_ATTR_RO_INLINE(region, users), | ||
906 | CMA_ATTR_INLINE(region, alloc), | ||
907 | NULL | ||
908 | }, | ||
909 | }; | ||
910 | |||
911 | static void __cma_sysfs_region_add(struct cma_region *reg) | ||
912 | { | ||
913 | int ret; | ||
914 | |||
915 | if (!cma_sysfs_regions_ready) | ||
916 | return; | ||
917 | |||
918 | memset(®->kobj, 0, sizeof reg->kobj); | ||
919 | |||
920 | ret = kobject_init_and_add(®->kobj, &cma_sysfs_region_type, | ||
921 | &cma_sysfs_regions, | ||
922 | "%p", (void *)reg->start); | ||
923 | |||
924 | if (reg->name && | ||
925 | sysfs_create_link(&cma_sysfs_regions, ®->kobj, reg->name) < 0) | ||
926 | /* Ignore any errors. */; | ||
927 | } | ||
928 | |||
929 | #else | ||
930 | |||
931 | static void __cma_sysfs_region_add(struct cma_region *reg) | ||
932 | { | ||
933 | /* nop */ | ||
934 | } | ||
935 | |||
936 | #endif | ||
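/*
 * With CONFIG_CMA_SYSFS enabled the interface above ends up under the
 * mm kobject, i.e. (on a typical kernel) /sys/kernel/mm/contiguous/,
 * with the "map" and "allocators" files at the top level and one
 * directory per region under regions/, named after the region's start
 * address and containing the name, start, size, free, users and alloc
 * attributes shown above.
 */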
937 | |||
938 | |||
939 | /************************* Chunks *************************/ | ||
940 | |||
941 | /* All chunks sorted by start address. */ | ||
942 | static struct rb_root cma_chunks_by_start; | ||
943 | |||
944 | static struct cma_chunk *__must_check __cma_chunk_find(dma_addr_t addr) | ||
945 | { | ||
946 | struct cma_chunk *chunk; | ||
947 | struct rb_node *n; | ||
948 | |||
949 | for (n = cma_chunks_by_start.rb_node; n; ) { | ||
950 | chunk = rb_entry(n, struct cma_chunk, by_start); | ||
951 | if (addr < chunk->start) | ||
952 | n = n->rb_left; | ||
953 | else if (addr > chunk->start) | ||
954 | n = n->rb_right; | ||
955 | else | ||
956 | return chunk; | ||
957 | } | ||
958 | WARN(1, KERN_WARNING "no chunk starting at %p\n", (void *)addr); | ||
959 | return NULL; | ||
960 | } | ||
961 | |||
962 | static int __must_check __cma_chunk_insert(struct cma_chunk *chunk) | ||
963 | { | ||
964 | struct rb_node **new, *parent = NULL; | ||
965 | typeof(chunk->start) addr = chunk->start; | ||
966 | |||
967 | for (new = &cma_chunks_by_start.rb_node; *new; ) { | ||
968 | struct cma_chunk *c = | ||
969 | container_of(*new, struct cma_chunk, by_start); | ||
970 | |||
971 | parent = *new; | ||
972 | if (addr < c->start) { | ||
973 | new = &(*new)->rb_left; | ||
974 | } else if (addr > c->start) { | ||
975 | new = &(*new)->rb_right; | ||
976 | } else { | ||
977 | /* | ||
978 | * We should never be here. If we are it | ||
979 | * means the allocator gave us an invalid chunk | ||
980 | * (one that has already been allocated) so we | ||
981 | * refuse to accept it. Our caller will | ||
982 | * recover by freeing the chunk. | ||
983 | */ | ||
984 | WARN_ON(1); | ||
985 | return -EADDRINUSE; | ||
986 | } | ||
987 | } | ||
988 | |||
989 | rb_link_node(&chunk->by_start, parent, new); | ||
990 | rb_insert_color(&chunk->by_start, &cma_chunks_by_start); | ||
991 | |||
992 | return 0; | ||
993 | } | ||
994 | |||
995 | static void __cma_chunk_free(struct cma_chunk *chunk) | ||
996 | { | ||
997 | rb_erase(&chunk->by_start, &cma_chunks_by_start); | ||
998 | |||
999 | chunk->reg->free_space += chunk->size; | ||
1000 | --chunk->reg->users; | ||
1001 | |||
1002 | chunk->reg->alloc->free(chunk); | ||
1003 | } | ||
1004 | |||
1005 | |||
1006 | /************************* The Device API *************************/ | ||
1007 | |||
1008 | static const char *__must_check | ||
1009 | __cma_where_from(const struct device *dev, const char *type); | ||
1010 | |||
1011 | |||
1012 | /* Allocate. */ | ||
1013 | |||
1014 | static dma_addr_t __must_check | ||
1015 | __cma_alloc_from_region(struct cma_region *reg, | ||
1016 | size_t size, dma_addr_t alignment) | ||
1017 | { | ||
1018 | struct cma_chunk *chunk; | ||
1019 | |||
1020 | pr_debug("allocate %p/%p from %s\n", | ||
1021 | (void *)size, (void *)alignment, | ||
1022 | reg ? reg->name ?: "(private)" : "(null)"); | ||
1023 | |||
1024 | if (!reg || reg->free_space < size) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1027 | if (!reg->alloc) { | ||
1028 | if (!reg->used) | ||
1029 | __cma_region_attach_alloc(reg); | ||
1030 | if (!reg->alloc) | ||
1031 | return -ENOMEM; | ||
1032 | } | ||
1033 | |||
1034 | chunk = reg->alloc->alloc(reg, size, alignment); | ||
1035 | if (!chunk) | ||
1036 | return -ENOMEM; | ||
1037 | |||
1038 | if (unlikely(__cma_chunk_insert(chunk) < 0)) { | ||
1039 | /* We should *never* be here. */ | ||
1040 | chunk->reg->alloc->free(chunk); | ||
1041 | kfree(chunk); | ||
1042 | return -EADDRINUSE; | ||
1043 | } | ||
1044 | |||
1045 | chunk->reg = reg; | ||
1046 | ++reg->users; | ||
1047 | reg->free_space -= chunk->size; | ||
1048 | pr_debug("allocated at %p\n", (void *)chunk->start); | ||
1049 | return chunk->start; | ||
1050 | } | ||
1051 | |||
1052 | dma_addr_t __must_check | ||
1053 | cma_alloc_from_region(struct cma_region *reg, | ||
1054 | size_t size, dma_addr_t alignment) | ||
1055 | { | ||
1056 | dma_addr_t addr; | ||
1057 | |||
1058 | pr_debug("allocate %p/%p from %s\n", | ||
1059 | (void *)size, (void *)alignment, | ||
1060 | reg ? reg->name ?: "(private)" : "(null)"); | ||
1061 | |||
1062 | if (!size || alignment & (alignment - 1) || !reg) | ||
1063 | return -EINVAL; | ||
1064 | |||
1065 | mutex_lock(&cma_mutex); | ||
1066 | |||
1067 | addr = reg->registered ? | ||
1068 | __cma_alloc_from_region(reg, PAGE_ALIGN(size), | ||
1069 | max(alignment, (dma_addr_t)PAGE_SIZE)) : | ||
1070 | -EINVAL; | ||
1071 | |||
1072 | mutex_unlock(&cma_mutex); | ||
1073 | |||
1074 | return addr; | ||
1075 | } | ||
1076 | EXPORT_SYMBOL_GPL(cma_alloc_from_region); | ||
1077 | |||
1078 | dma_addr_t __must_check | ||
1079 | __cma_alloc(const struct device *dev, const char *type, | ||
1080 | dma_addr_t size, dma_addr_t alignment) | ||
1081 | { | ||
1082 | struct cma_region *reg; | ||
1083 | const char *from; | ||
1084 | dma_addr_t addr; | ||
1085 | |||
1086 | if (dev) | ||
1087 | pr_debug("allocate %p/%p for %s/%s\n", | ||
1088 | (void *)size, (void *)alignment, | ||
1089 | dev_name(dev), type ?: ""); | ||
1090 | |||
1091 | if (!size || (alignment & (alignment - 1))) | ||
1092 | return -EINVAL; | ||
1093 | |||
1094 | if (alignment < PAGE_SIZE) | ||
1095 | alignment = PAGE_SIZE; | ||
1096 | |||
1097 | if (!IS_ALIGNED(size, alignment)) | ||
1098 | size = ALIGN(size, alignment); | ||
1099 | |||
1100 | mutex_lock(&cma_mutex); | ||
1101 | |||
1102 | from = __cma_where_from(dev, type); | ||
1103 | if (unlikely(IS_ERR(from))) { | ||
1104 | addr = PTR_ERR(from); | ||
1105 | goto done; | ||
1106 | } | ||
1107 | |||
1108 | pr_debug("allocate %p/%p from one of %s\n", | ||
1109 | (void *)size, (void *)alignment, from); | ||
1110 | |||
1111 | while (*from && *from != ';') { | ||
1112 | reg = __cma_region_find(&from); | ||
1113 | addr = __cma_alloc_from_region(reg, size, alignment); | ||
1114 | if (!IS_ERR_VALUE(addr)) | ||
1115 | goto done; | ||
1116 | } | ||
1117 | |||
1118 | pr_debug("not enough memory\n"); | ||
1119 | addr = -ENOMEM; | ||
1120 | |||
1121 | done: | ||
1122 | mutex_unlock(&cma_mutex); | ||
1123 | |||
1124 | return addr; | ||
1125 | } | ||
1126 | EXPORT_SYMBOL_GPL(__cma_alloc); | ||
1127 | |||
1128 | |||
1129 | void *cma_get_virt(dma_addr_t phys, dma_addr_t size, int noncached) | ||
1130 | { | ||
1131 | unsigned long num_pages, i; | ||
1132 | struct page **pages; | ||
1133 | void *virt; | ||
1134 | |||
1135 | if (noncached) { | ||
1136 | num_pages = size >> PAGE_SHIFT; | ||
1137 | pages = kmalloc(num_pages * sizeof(struct page *), GFP_KERNEL); | ||
1138 | |||
1139 | if (!pages) | ||
1140 | return ERR_PTR(-ENOMEM); | ||
1141 | |||
1142 | for (i = 0; i < num_pages; i++) | ||
1143 | pages[i] = pfn_to_page((phys >> PAGE_SHIFT) + i); | ||
1144 | |||
1145 | virt = vmap(pages, num_pages, VM_MAP, | ||
1146 | pgprot_writecombine(PAGE_KERNEL)); | ||
1147 | |||
1148 | if (!virt) { | ||
1149 | kfree(pages); | ||
1150 | return ERR_PTR(-ENOMEM); | ||
1151 | } | ||
1152 | |||
1153 | kfree(pages); | ||
1154 | } else { | ||
1155 | virt = phys_to_virt((unsigned long)phys); | ||
1156 | } | ||
1157 | |||
1158 | return virt; | ||
1159 | } | ||
1160 | EXPORT_SYMBOL_GPL(cma_get_virt); | ||
1161 | |||
1162 | /* Query information about regions. */ | ||
1163 | static void __cma_info_add(struct cma_info *infop, struct cma_region *reg) | ||
1164 | { | ||
1165 | infop->total_size += reg->size; | ||
1166 | infop->free_size += reg->free_space; | ||
1167 | if (infop->lower_bound > reg->start) | ||
1168 | infop->lower_bound = reg->start; | ||
1169 | if (infop->upper_bound < reg->start + reg->size) | ||
1170 | infop->upper_bound = reg->start + reg->size; | ||
1171 | ++infop->count; | ||
1172 | } | ||
1173 | |||
1174 | int | ||
1175 | __cma_info(struct cma_info *infop, const struct device *dev, const char *type) | ||
1176 | { | ||
1177 | struct cma_info info = { ~(dma_addr_t)0, 0, 0, 0, 0 }; | ||
1178 | struct cma_region *reg; | ||
1179 | const char *from; | ||
1180 | int ret; | ||
1181 | |||
1182 | if (unlikely(!infop)) | ||
1183 | return -EINVAL; | ||
1184 | |||
1185 | mutex_lock(&cma_mutex); | ||
1186 | |||
1187 | from = __cma_where_from(dev, type); | ||
1188 | if (IS_ERR(from)) { | ||
1189 | ret = PTR_ERR(from); | ||
1190 | info.lower_bound = 0; | ||
1191 | goto done; | ||
1192 | } | ||
1193 | |||
1194 | while (*from && *from != ';') { | ||
1195 | reg = __cma_region_find(&from); | ||
1196 | if (reg) | ||
1197 | __cma_info_add(&info, reg); | ||
1198 | } | ||
1199 | |||
1200 | ret = 0; | ||
1201 | done: | ||
1202 | mutex_unlock(&cma_mutex); | ||
1203 | |||
1204 | memcpy(infop, &info, sizeof info); | ||
1205 | return ret; | ||
1206 | } | ||
1207 | EXPORT_SYMBOL_GPL(__cma_info); | ||
1208 | |||
1209 | |||
1210 | /* Freeing. */ | ||
1211 | int cma_free(dma_addr_t addr) | ||
1212 | { | ||
1213 | struct cma_chunk *c; | ||
1214 | int ret; | ||
1215 | |||
1216 | mutex_lock(&cma_mutex); | ||
1217 | |||
1218 | c = __cma_chunk_find(addr); | ||
1219 | |||
1220 | if (c) { | ||
1221 | __cma_chunk_free(c); | ||
1222 | ret = 0; | ||
1223 | } else { | ||
1224 | ret = -ENOENT; | ||
1225 | } | ||
1226 | |||
1227 | mutex_unlock(&cma_mutex); | ||
1228 | |||
1229 | if (c) | ||
1230 | pr_debug("free(%p): freed\n", (void *)addr); | ||
1231 | else | ||
1232 | pr_err("free(%p): not found\n", (void *)addr); | ||
1233 | return ret; | ||
1234 | } | ||
1235 | EXPORT_SYMBOL_GPL(cma_free); | ||
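/*
 * End-to-end usage sketch (hypothetical driver; the "video" type and
 * the sizes are made up, error handling is trimmed), using only the
 * calls exported above:
 *
 *	dma_addr_t phys;
 *	void *virt;
 *
 *	phys = __cma_alloc(dev, "video", 1 << 20, 0);
 *	if (IS_ERR_VALUE(phys))
 *		return (int)phys;
 *
 *	virt = cma_get_virt(phys, 1 << 20, 1);	(1 requests a non-cached vmap)
 *	if (IS_ERR(virt)) {
 *		cma_free(phys);
 *		return PTR_ERR(virt);
 *	}
 *
 *	... use the buffer ...
 *
 *	cma_free(phys);
 *
 * Which region serves the request is decided by the cma_map rules for
 * dev_name(dev) and the "video" type.
 */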
1236 | |||
1237 | |||
1238 | /************************* Miscellaneous *************************/ | ||
1239 | |||
1240 | static int __cma_region_attach_alloc(struct cma_region *reg) | ||
1241 | { | ||
1242 | struct cma_allocator *alloc; | ||
1243 | int ret; | ||
1244 | |||
1245 | /* | ||
1246 | * If reg->alloc is set then caller wants us to use this | ||
1247 | * allocator. Otherwise we need to find one by name. | ||
1248 | */ | ||
1249 | if (reg->alloc) { | ||
1250 | alloc = reg->alloc; | ||
1251 | } else { | ||
1252 | alloc = __cma_allocator_find(reg->alloc_name); | ||
1253 | if (!alloc) { | ||
1254 | pr_warn("init: %s: %s: no such allocator\n", | ||
1255 | reg->name ?: "(private)", | ||
1256 | reg->alloc_name ?: "(default)"); | ||
1257 | reg->used = 1; | ||
1258 | return -ENOENT; | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | /* Try to initialise the allocator. */ | ||
1263 | reg->private_data = NULL; | ||
1264 | ret = alloc->init ? alloc->init(reg) : 0; | ||
1265 | if (unlikely(ret < 0)) { | ||
1266 | pr_err("init: %s: %s: unable to initialise allocator\n", | ||
1267 | reg->name ?: "(private)", alloc->name ?: "(unnamed)"); | ||
1268 | reg->alloc = NULL; | ||
1269 | reg->used = 1; | ||
1270 | } else { | ||
1271 | reg->alloc = alloc; | ||
1272 | pr_debug("init: %s: %s: initialised allocator\n", | ||
1273 | reg->name ?: "(private)", alloc->name ?: "(unnamed)"); | ||
1274 | } | ||
1275 | return ret; | ||
1276 | } | ||
1277 | |||
1278 | static void __cma_region_detach_alloc(struct cma_region *reg) | ||
1279 | { | ||
1280 | if (!reg->alloc) | ||
1281 | return; | ||
1282 | |||
1283 | if (reg->alloc->cleanup) | ||
1284 | reg->alloc->cleanup(reg); | ||
1285 | |||
1286 | reg->alloc = NULL; | ||
1287 | reg->used = 1; | ||
1288 | } | ||
1289 | |||
1290 | |||
1291 | /* | ||
1292 | * s ::= rules | ||
1293 | * rules ::= rule [ ';' rules ] | ||
1294 | * rule ::= patterns '=' regions | ||
1295 | * patterns ::= pattern [ ',' patterns ] | ||
1296 | * regions ::= REG-NAME [ ',' regions ] | ||
1297 | * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME | ||
1298 | */ | ||
1299 | static const char *__must_check | ||
1300 | __cma_where_from(const struct device *dev, const char *type) | ||
1301 | { | ||
1302 | /* | ||
1303 | * This function matches the pattern from the map attribute | ||
1304 | * against the given device name and type. Type may of course | ||
1305 | * be NULL or an empty string. | ||
1306 | */ | ||
1307 | |||
1308 | const char *s, *name; | ||
1309 | int name_matched = 0; | ||
1310 | |||
1311 | /* | ||
1312 | * If dev is NULL we were called in the alternative form where | ||
1313 | * type is the from string. All we have to do is return it. | ||
1314 | */ | ||
1315 | if (!dev) | ||
1316 | return type ?: ERR_PTR(-EINVAL); | ||
1317 | |||
1318 | if (!cma_map) | ||
1319 | return ERR_PTR(-ENOENT); | ||
1320 | |||
1321 | name = dev_name(dev); | ||
1322 | if (WARN_ON(!name || !*name)) | ||
1323 | return ERR_PTR(-EINVAL); | ||
1324 | |||
1325 | if (!type) | ||
1326 | type = "common"; | ||
1327 | |||
1328 | /* | ||
1329 | * Now we go through the cma_map attribute. | ||
1330 | */ | ||
1331 | for (s = cma_map; *s; ++s) { | ||
1332 | const char *c; | ||
1333 | |||
1334 | /* | ||
1335 | * If the pattern starts with a slash, the device part of the | ||
1336 | * pattern matches if it matched previously. | ||
1337 | */ | ||
1338 | if (*s == '/') { | ||
1339 | if (!name_matched) | ||
1340 | goto look_for_next; | ||
1341 | goto match_type; | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1345 | * We are now trying to match the device name. This also | ||
1346 | * updates the name_matched variable. If, while reading the | ||
1347 | * spec, we encounter a comma it means that the pattern does not | ||
1348 | * match and we need to start over with another pattern (the | ||
1349 | * one after the comma). If we encounter an equal sign we need | ||
1350 | * to start over with another rule. If there is a character | ||
1351 | * that does not match, we need to look for a comma (to get | ||
1352 | * another pattern) or semicolon (to get another rule) and try | ||
1353 | * again if there is one somewhere. | ||
1354 | */ | ||
1355 | |||
1356 | name_matched = 0; | ||
1357 | |||
1358 | for (c = name; *s != '*' && *c; ++c, ++s) | ||
1359 | if (*s == '=') | ||
1360 | goto next_rule; | ||
1361 | else if (*s == ',') | ||
1362 | goto next_pattern; | ||
1363 | else if (*s != '?' && *c != *s) | ||
1364 | goto look_for_next; | ||
1365 | if (*s == '*') | ||
1366 | ++s; | ||
1367 | |||
1368 | name_matched = 1; | ||
1369 | |||
1370 | /* | ||
1371 | * Now we need to match the type part of the pattern. If the | ||
1372 | * pattern is missing it, we match only if type points to an | ||
1373 | * empty string. Otherwise we try to match it just like the name. | ||
1374 | */ | ||
1375 | if (*s == '/') { | ||
1376 | match_type: /* s points to '/' */ | ||
1377 | ++s; | ||
1378 | |||
1379 | for (c = type; *s && *c; ++c, ++s) | ||
1380 | if (*s == '=') | ||
1381 | goto next_rule; | ||
1382 | else if (*s == ',') | ||
1383 | goto next_pattern; | ||
1384 | else if (*c != *s) | ||
1385 | goto look_for_next; | ||
1386 | } | ||
1387 | |||
1388 | /* Return the string behind the '=' sign of the rule. */ | ||
1389 | if (*s == '=') | ||
1390 | return s + 1; | ||
1391 | else if (*s == ',') | ||
1392 | return strchr(s, '=') + 1; | ||
1393 | |||
1394 | /* Pattern did not match */ | ||
1395 | |||
1396 | look_for_next: | ||
1397 | do { | ||
1398 | ++s; | ||
1399 | } while (*s != ',' && *s != '='); | ||
1400 | if (*s == ',') | ||
1401 | continue; | ||
1402 | |||
1403 | next_rule: /* s points to '=' */ | ||
1404 | s = strchr(s, ';'); | ||
1405 | if (!s) | ||
1406 | break; | ||
1407 | |||
1408 | next_pattern: | ||
1409 | continue; | ||
1410 | } | ||
1411 | |||
1412 | return ERR_PTR(-ENOENT); | ||
1413 | } | ||
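To make the matcher concrete, a walk-through with made-up names: with cma_map set to "camera/jpeg=jpeg;video,camera=multimedia", a request from a device named "camera" with type "jpeg" matches the first rule and only the "jpeg" region is tried; the same device with the default type ("common") fails the type comparison, falls through to the second rule, and is served from "multimedia"; a device named "videodev" matches neither pattern (there is no trailing wildcard on "video"), so __cma_where_from() returns -ENOENT and the allocation fails.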
diff --git a/mm/compaction.c b/mm/compaction.c index 6cc604bd564..8ea7308601b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -35,10 +35,6 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | /* Account for isolated anon and file pages */ | ||
39 | unsigned long nr_anon; | ||
40 | unsigned long nr_file; | ||
41 | |||
42 | unsigned int order; /* order a direct compactor needs */ | 38 | unsigned int order; /* order a direct compactor needs */ |
43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
44 | struct zone *zone; | 40 | struct zone *zone; |
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone, | |||
223 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 219 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
224 | { | 220 | { |
225 | struct page *page; | 221 | struct page *page; |
226 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 222 | unsigned int count[2] = { 0, }; |
227 | 223 | ||
228 | list_for_each_entry(page, &cc->migratepages, lru) { | 224 | list_for_each_entry(page, &cc->migratepages, lru) |
229 | int lru = page_lru_base_type(page); | 225 | count[!!page_is_file_cache(page)]++; |
230 | count[lru]++; | ||
231 | } | ||
232 | 226 | ||
233 | cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 227 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
234 | cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 228 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
235 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); | ||
236 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); | ||
237 | } | 229 | } |
238 | 230 | ||
239 | /* Similar to reclaim, but different enough that they don't share logic */ | 231 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
269 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 261 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
270 | unsigned long nr_scanned = 0, nr_isolated = 0; | 262 | unsigned long nr_scanned = 0, nr_isolated = 0; |
271 | struct list_head *migratelist = &cc->migratepages; | 263 | struct list_head *migratelist = &cc->migratepages; |
264 | isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; | ||
272 | 265 | ||
273 | /* Do not scan outside zone boundaries */ | 266 | /* Do not scan outside zone boundaries */ |
274 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 267 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
@@ -320,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
320 | } else if (!locked) | 313 | } else if (!locked) |
321 | spin_lock_irq(&zone->lru_lock); | 314 | spin_lock_irq(&zone->lru_lock); |
322 | 315 | ||
316 | /* | ||
317 | * migrate_pfn does not necessarily start aligned to a | ||
318 | * pageblock. Ensure that pfn_valid is called when moving | ||
319 | * into a new MAX_ORDER_NR_PAGES range in case of large | ||
320 | * memory holes within the zone | ||
321 | */ | ||
322 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | ||
323 | if (!pfn_valid(low_pfn)) { | ||
324 | low_pfn += MAX_ORDER_NR_PAGES - 1; | ||
325 | continue; | ||
326 | } | ||
327 | } | ||
328 | |||
323 | if (!pfn_valid_within(low_pfn)) | 329 | if (!pfn_valid_within(low_pfn)) |
324 | continue; | 330 | continue; |
325 | nr_scanned++; | 331 | nr_scanned++; |
326 | 332 | ||
327 | /* Get the page and skip if free */ | 333 | /* |
334 | * Get the page and ensure the page is within the same zone. | ||
335 | * See the comment in isolate_freepages about overlapping | ||
336 | * nodes. It is deliberate that the new zone lock is not taken | ||
337 | * as memory compaction should not move pages between nodes. | ||
338 | */ | ||
328 | page = pfn_to_page(low_pfn); | 339 | page = pfn_to_page(low_pfn); |
340 | if (page_zone(page) != zone) | ||
341 | continue; | ||
342 | |||
343 | /* Skip if free */ | ||
329 | if (PageBuddy(page)) | 344 | if (PageBuddy(page)) |
330 | continue; | 345 | continue; |
331 | 346 | ||
@@ -356,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
356 | continue; | 371 | continue; |
357 | } | 372 | } |
358 | 373 | ||
374 | if (!cc->sync) | ||
375 | mode |= ISOLATE_ASYNC_MIGRATE; | ||
376 | |||
359 | /* Try isolate the page */ | 377 | /* Try isolate the page */ |
360 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 378 | if (__isolate_lru_page(page, mode, 0) != 0) |
361 | continue; | 379 | continue; |
362 | 380 | ||
363 | VM_BUG_ON(PageTransCompound(page)); | 381 | VM_BUG_ON(PageTransCompound(page)); |
@@ -559,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
559 | nr_migrate = cc->nr_migratepages; | 577 | nr_migrate = cc->nr_migratepages; |
560 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 578 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
561 | (unsigned long)cc, false, | 579 | (unsigned long)cc, false, |
562 | cc->sync); | 580 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); |
563 | update_nr_listpages(cc); | 581 | update_nr_listpages(cc); |
564 | nr_remaining = cc->nr_migratepages; | 582 | nr_remaining = cc->nr_migratepages; |
565 | 583 | ||
@@ -574,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
574 | if (err) { | 592 | if (err) { |
575 | putback_lru_pages(&cc->migratepages); | 593 | putback_lru_pages(&cc->migratepages); |
576 | cc->nr_migratepages = 0; | 594 | cc->nr_migratepages = 0; |
595 | if (err == -ENOMEM) { | ||
596 | ret = COMPACT_PARTIAL; | ||
597 | goto out; | ||
598 | } | ||
577 | } | 599 | } |
578 | |||
579 | } | 600 | } |
580 | 601 | ||
581 | out: | 602 | out: |
diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8d345..10481ebd96c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); | |||
396 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | 396 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
397 | { | 397 | { |
398 | int error; | 398 | int error; |
399 | struct mem_cgroup *memcg = NULL; | ||
400 | 399 | ||
401 | VM_BUG_ON(!PageLocked(old)); | 400 | VM_BUG_ON(!PageLocked(old)); |
402 | VM_BUG_ON(!PageLocked(new)); | 401 | VM_BUG_ON(!PageLocked(new)); |
403 | VM_BUG_ON(new->mapping); | 402 | VM_BUG_ON(new->mapping); |
404 | 403 | ||
405 | /* | ||
406 | * This is not page migration, but prepare_migration and | ||
407 | * end_migration does enough work for charge replacement. | ||
408 | * | ||
409 | * In the longer term we probably want a specialized function | ||
410 | * for moving the charge from old to new in a more efficient | ||
411 | * manner. | ||
412 | */ | ||
413 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); | ||
414 | if (error) | ||
415 | return error; | ||
416 | |||
417 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 404 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
418 | if (!error) { | 405 | if (!error) { |
419 | struct address_space *mapping = old->mapping; | 406 | struct address_space *mapping = old->mapping; |
@@ -435,13 +422,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
435 | if (PageSwapBacked(new)) | 422 | if (PageSwapBacked(new)) |
436 | __inc_zone_page_state(new, NR_SHMEM); | 423 | __inc_zone_page_state(new, NR_SHMEM); |
437 | spin_unlock_irq(&mapping->tree_lock); | 424 | spin_unlock_irq(&mapping->tree_lock); |
425 | /* mem_cgroup codes must not be called under tree_lock */ | ||
426 | mem_cgroup_replace_page_cache(old, new); | ||
438 | radix_tree_preload_end(); | 427 | radix_tree_preload_end(); |
439 | if (freepage) | 428 | if (freepage) |
440 | freepage(old); | 429 | freepage(old); |
441 | page_cache_release(old); | 430 | page_cache_release(old); |
442 | mem_cgroup_end_migration(memcg, old, new, true); | ||
443 | } else { | ||
444 | mem_cgroup_end_migration(memcg, old, new, false); | ||
445 | } | 431 | } |
446 | 432 | ||
447 | return error; | 433 | return error; |
@@ -530,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
530 | struct page *page; | 516 | struct page *page; |
531 | 517 | ||
532 | if (cpuset_do_page_mem_spread()) { | 518 | if (cpuset_do_page_mem_spread()) { |
533 | get_mems_allowed(); | 519 | unsigned int cpuset_mems_cookie; |
534 | n = cpuset_mem_spread_node(); | 520 | do { |
535 | page = alloc_pages_exact_node(n, gfp, 0); | 521 | cpuset_mems_cookie = get_mems_allowed(); |
536 | put_mems_allowed(); | 522 | n = cpuset_mem_spread_node(); |
523 | page = alloc_pages_exact_node(n, gfp, 0); | ||
524 | } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | ||
525 | |||
537 | return page; | 526 | return page; |
538 | } | 527 | } |
539 | return alloc_pages(gfp, 0); | 528 | return alloc_pages(gfp, 0); |
@@ -1393,15 +1382,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1393 | unsigned long seg = 0; | 1382 | unsigned long seg = 0; |
1394 | size_t count; | 1383 | size_t count; |
1395 | loff_t *ppos = &iocb->ki_pos; | 1384 | loff_t *ppos = &iocb->ki_pos; |
1396 | struct blk_plug plug; | ||
1397 | 1385 | ||
1398 | count = 0; | 1386 | count = 0; |
1399 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); | 1387 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1400 | if (retval) | 1388 | if (retval) |
1401 | return retval; | 1389 | return retval; |
1402 | 1390 | ||
1403 | blk_start_plug(&plug); | ||
1404 | |||
1405 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1391 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1406 | if (filp->f_flags & O_DIRECT) { | 1392 | if (filp->f_flags & O_DIRECT) { |
1407 | loff_t size; | 1393 | loff_t size; |
@@ -1417,8 +1403,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1417 | retval = filemap_write_and_wait_range(mapping, pos, | 1403 | retval = filemap_write_and_wait_range(mapping, pos, |
1418 | pos + iov_length(iov, nr_segs) - 1); | 1404 | pos + iov_length(iov, nr_segs) - 1); |
1419 | if (!retval) { | 1405 | if (!retval) { |
1406 | struct blk_plug plug; | ||
1407 | |||
1408 | blk_start_plug(&plug); | ||
1420 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1409 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1421 | iov, pos, nr_segs); | 1410 | iov, pos, nr_segs); |
1411 | blk_finish_plug(&plug); | ||
1422 | } | 1412 | } |
1423 | if (retval > 0) { | 1413 | if (retval > 0) { |
1424 | *ppos = pos + retval; | 1414 | *ppos = pos + retval; |
@@ -1474,7 +1464,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1474 | break; | 1464 | break; |
1475 | } | 1465 | } |
1476 | out: | 1466 | out: |
1477 | blk_finish_plug(&plug); | ||
1478 | return retval; | 1467 | return retval; |
1479 | } | 1468 | } |
1480 | EXPORT_SYMBOL(generic_file_aio_read); | 1469 | EXPORT_SYMBOL(generic_file_aio_read); |
@@ -1807,7 +1796,7 @@ repeat: | |||
1807 | page = __page_cache_alloc(gfp | __GFP_COLD); | 1796 | page = __page_cache_alloc(gfp | __GFP_COLD); |
1808 | if (!page) | 1797 | if (!page) |
1809 | return ERR_PTR(-ENOMEM); | 1798 | return ERR_PTR(-ENOMEM); |
1810 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 1799 | err = add_to_page_cache_lru(page, mapping, index, gfp); |
1811 | if (unlikely(err)) { | 1800 | if (unlikely(err)) { |
1812 | page_cache_release(page); | 1801 | page_cache_release(page); |
1813 | if (err == -EEXIST) | 1802 | if (err == -EEXIST) |
@@ -1904,10 +1893,7 @@ static struct page *wait_on_page_read(struct page *page) | |||
1904 | * @gfp: the page allocator flags to use if allocating | 1893 | * @gfp: the page allocator flags to use if allocating |
1905 | * | 1894 | * |
1906 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with | 1895 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with |
1907 | * any new page allocations done using the specified allocation flags. Note | 1896 | * any new page allocations done using the specified allocation flags. |
1908 | * that the Radix tree operations will still use GFP_KERNEL, so you can't | ||
1909 | * expect to do this atomically or anything like that - but you can pass in | ||
1910 | * other page requirements. | ||
1911 | * | 1897 | * |
1912 | * If the page does not get brought uptodate, return -EIO. | 1898 | * If the page does not get brought uptodate, return -EIO. |
1913 | */ | 1899 | */ |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 93356cd1282..dee94297f39 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -263,7 +263,12 @@ found: | |||
263 | xip_pfn); | 263 | xip_pfn); |
264 | if (err == -ENOMEM) | 264 | if (err == -ENOMEM) |
265 | return VM_FAULT_OOM; | 265 | return VM_FAULT_OOM; |
266 | BUG_ON(err); | 266 | /* |
267 | * err == -EBUSY is fine, we've raced against another thread | ||
268 | * that faulted-in the same page | ||
269 | */ | ||
270 | if (err != -EBUSY) | ||
271 | BUG_ON(err); | ||
267 | return VM_FAULT_NOPAGE; | 272 | return VM_FAULT_NOPAGE; |
268 | } else { | 273 | } else { |
269 | int err, ret = VM_FAULT_OOM; | 274 | int err, ret = VM_FAULT_OOM; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f297fd..8cc11dda6a7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -641,6 +641,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
641 | set_pmd_at(mm, haddr, pmd, entry); | 641 | set_pmd_at(mm, haddr, pmd, entry); |
642 | prepare_pmd_huge_pte(pgtable, mm); | 642 | prepare_pmd_huge_pte(pgtable, mm); |
643 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 643 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
644 | mm->nr_ptes++; | ||
644 | spin_unlock(&mm->page_table_lock); | 645 | spin_unlock(&mm->page_table_lock); |
645 | } | 646 | } |
646 | 647 | ||
@@ -759,6 +760,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
759 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 760 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
760 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 761 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
761 | prepare_pmd_huge_pte(pgtable, dst_mm); | 762 | prepare_pmd_huge_pte(pgtable, dst_mm); |
763 | dst_mm->nr_ptes++; | ||
762 | 764 | ||
763 | ret = 0; | 765 | ret = 0; |
764 | out_unlock: | 766 | out_unlock: |
@@ -857,7 +859,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
857 | } | 859 | } |
858 | kfree(pages); | 860 | kfree(pages); |
859 | 861 | ||
860 | mm->nr_ptes++; | ||
861 | smp_wmb(); /* make pte visible before pmd */ | 862 | smp_wmb(); /* make pte visible before pmd */ |
862 | pmd_populate(mm, pmd, pgtable); | 863 | pmd_populate(mm, pmd, pgtable); |
863 | page_remove_rmap(page); | 864 | page_remove_rmap(page); |
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
989 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 990 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
990 | VM_BUG_ON(!PageCompound(page)); | 991 | VM_BUG_ON(!PageCompound(page)); |
991 | if (flags & FOLL_GET) | 992 | if (flags & FOLL_GET) |
992 | get_page(page); | 993 | get_page_foll(page); |
993 | 994 | ||
994 | out: | 995 | out: |
995 | return page; | 996 | return page; |
@@ -1016,6 +1017,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1016 | VM_BUG_ON(page_mapcount(page) < 0); | 1017 | VM_BUG_ON(page_mapcount(page) < 0); |
1017 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1018 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1018 | VM_BUG_ON(!PageHead(page)); | 1019 | VM_BUG_ON(!PageHead(page)); |
1020 | tlb->mm->nr_ptes--; | ||
1019 | spin_unlock(&tlb->mm->page_table_lock); | 1021 | spin_unlock(&tlb->mm->page_table_lock); |
1020 | tlb_remove_page(tlb, page); | 1022 | tlb_remove_page(tlb, page); |
1021 | pte_free(tlb->mm, pgtable); | 1023 | pte_free(tlb->mm, pgtable); |
@@ -1156,6 +1158,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1156 | unsigned long head_index = page->index; | 1158 | unsigned long head_index = page->index; |
1157 | struct zone *zone = page_zone(page); | 1159 | struct zone *zone = page_zone(page); |
1158 | int zonestat; | 1160 | int zonestat; |
1161 | int tail_count = 0; | ||
1159 | 1162 | ||
1160 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1163 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1161 | spin_lock_irq(&zone->lru_lock); | 1164 | spin_lock_irq(&zone->lru_lock); |
@@ -1164,11 +1167,27 @@ static void __split_huge_page_refcount(struct page *page) | |||
1164 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1167 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
1165 | struct page *page_tail = page + i; | 1168 | struct page *page_tail = page + i; |
1166 | 1169 | ||
1167 | /* tail_page->_count cannot change */ | 1170 | /* tail_page->_mapcount cannot change */ |
1168 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | 1171 | BUG_ON(page_mapcount(page_tail) < 0); |
1169 | BUG_ON(page_count(page) <= 0); | 1172 | tail_count += page_mapcount(page_tail); |
1170 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | 1173 | /* check for overflow */ |
1171 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | 1174 | BUG_ON(tail_count < 0); |
1175 | BUG_ON(atomic_read(&page_tail->_count) != 0); | ||
1176 | /* | ||
1177 | * tail_page->_count is zero and not changing from | ||
1178 | * under us. But get_page_unless_zero() may be running | ||
1179 | * from under us on the tail_page. If we used | ||
1180 | * atomic_set() below instead of atomic_add(), we | ||
1181 | * would then run atomic_set() concurrently with | ||
1182 | * get_page_unless_zero(), and atomic_set() is | ||
1183 | * implemented in C not using locked ops. spin_unlock | ||
1184 | * on x86 sometimes uses locked ops because of PPro | ||
1185 | * errata 66, 92, so unless somebody can guarantee | ||
1186 | * atomic_set() here would be safe on all archs (and | ||
1187 | * not only on x86), it's safer to use atomic_add(). | ||
1188 | */ | ||
1189 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | ||
1190 | &page_tail->_count); | ||
1172 | 1191 | ||
1173 | /* after clearing PageTail the gup refcount can be released */ | 1192 | /* after clearing PageTail the gup refcount can be released */ |
1174 | smp_mb(); | 1193 | smp_mb(); |
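The comment in the hunk above explains why __split_huge_page_refcount() raises the tail page's _count with atomic_add() rather than atomic_set(): get_page_unless_zero() may be taking a speculative reference on the tail at that very moment, and a plain store would overwrite its increment. A user-space sketch of the two sides of that race (illustrative names, not the kernel's implementation):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Sketch of get_page_unless_zero(): take a reference only if count != 0. */
static bool get_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	do {
		if (old == 0)
			return false;           /* no reference taken */
	} while (!atomic_compare_exchange_weak(count, &old, old + 1));
	return true;
}

/*
 * Publishing a new reference count while get_unless_zero() may run
 * concurrently: an add preserves any reference the racer just took,
 * whereas a plain store would silently discard it.
 */
static void publish_refs(atomic_int *count, int refs)
{
	atomic_fetch_add(count, refs);          /* what the patch does          */
	/* atomic_store(count, refs);              the race the comment forbids */
}

int main(void)
{
	atomic_int count = 0;

	assert(!get_unless_zero(&count));       /* zero count: speculative get fails */
	publish_refs(&count, 2);
	assert(get_unless_zero(&count));        /* now references can be taken */
	assert(atomic_load(&count) == 3);
	return 0;
}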
@@ -1186,10 +1205,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1186 | (1L << PG_uptodate))); | 1205 | (1L << PG_uptodate))); |
1187 | page_tail->flags |= (1L << PG_dirty); | 1206 | page_tail->flags |= (1L << PG_dirty); |
1188 | 1207 | ||
1189 | /* | 1208 | /* clear PageTail before overwriting first_page */ |
1190 | * 1) clear PageTail before overwriting first_page | ||
1191 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1192 | */ | ||
1193 | smp_wmb(); | 1209 | smp_wmb(); |
1194 | 1210 | ||
1195 | /* | 1211 | /* |
@@ -1206,7 +1222,6 @@ static void __split_huge_page_refcount(struct page *page) | |||
1206 | * status is achieved setting a reserved bit in the | 1222 | * status is achieved setting a reserved bit in the |
1207 | * pmd, not by clearing the present bit. | 1223 | * pmd, not by clearing the present bit. |
1208 | */ | 1224 | */ |
1209 | BUG_ON(page_mapcount(page_tail)); | ||
1210 | page_tail->_mapcount = page->_mapcount; | 1225 | page_tail->_mapcount = page->_mapcount; |
1211 | 1226 | ||
1212 | BUG_ON(page_tail->mapping); | 1227 | BUG_ON(page_tail->mapping); |
@@ -1223,6 +1238,8 @@ static void __split_huge_page_refcount(struct page *page) | |||
1223 | 1238 | ||
1224 | lru_add_page_tail(zone, page, page_tail); | 1239 | lru_add_page_tail(zone, page, page_tail); |
1225 | } | 1240 | } |
1241 | atomic_sub(tail_count, &page->_count); | ||
1242 | BUG_ON(atomic_read(&page->_count) <= 0); | ||
1226 | 1243 | ||
1227 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1244 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1228 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1245 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
@@ -1295,7 +1312,6 @@ static int __split_huge_page_map(struct page *page, | |||
1295 | pte_unmap(pte); | 1312 | pte_unmap(pte); |
1296 | } | 1313 | } |
1297 | 1314 | ||
1298 | mm->nr_ptes++; | ||
1299 | smp_wmb(); /* make pte visible before pmd */ | 1315 | smp_wmb(); /* make pte visible before pmd */ |
1300 | /* | 1316 | /* |
1301 | * Up to this point the pmd is present and huge and | 1317 | * Up to this point the pmd is present and huge and |
@@ -1910,7 +1926,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1910 | set_pmd_at(mm, address, pmd, _pmd); | 1926 | set_pmd_at(mm, address, pmd, _pmd); |
1911 | update_mmu_cache(vma, address, entry); | 1927 | update_mmu_cache(vma, address, entry); |
1912 | prepare_pmd_huge_pte(pgtable, mm); | 1928 | prepare_pmd_huge_pte(pgtable, mm); |
1913 | mm->nr_ptes--; | ||
1914 | spin_unlock(&mm->page_table_lock); | 1929 | spin_unlock(&mm->page_table_lock); |
1915 | 1930 | ||
1916 | #ifndef CONFIG_NUMA | 1931 | #ifndef CONFIG_NUMA |
@@ -2005,7 +2020,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2005 | { | 2020 | { |
2006 | struct mm_struct *mm = mm_slot->mm; | 2021 | struct mm_struct *mm = mm_slot->mm; |
2007 | 2022 | ||
2008 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | 2023 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2009 | 2024 | ||
2010 | if (khugepaged_test_exit(mm)) { | 2025 | if (khugepaged_test_exit(mm)) { |
2011 | /* free mm_slot */ | 2026 | /* free mm_slot */ |
@@ -2033,7 +2048,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2033 | int progress = 0; | 2048 | int progress = 0; |
2034 | 2049 | ||
2035 | VM_BUG_ON(!pages); | 2050 | VM_BUG_ON(!pages); |
2036 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | 2051 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2037 | 2052 | ||
2038 | if (khugepaged_scan.mm_slot) | 2053 | if (khugepaged_scan.mm_slot) |
2039 | mm_slot = khugepaged_scan.mm_slot; | 2054 | mm_slot = khugepaged_scan.mm_slot; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153bc82..037f077b986 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
460 | struct zonelist *zonelist; | 460 | struct zonelist *zonelist; |
461 | struct zone *zone; | 461 | struct zone *zone; |
462 | struct zoneref *z; | 462 | struct zoneref *z; |
463 | unsigned int cpuset_mems_cookie; | ||
463 | 464 | ||
464 | get_mems_allowed(); | 465 | retry_cpuset: |
466 | cpuset_mems_cookie = get_mems_allowed(); | ||
465 | zonelist = huge_zonelist(vma, address, | 467 | zonelist = huge_zonelist(vma, address, |
466 | htlb_alloc_mask, &mpol, &nodemask); | 468 | htlb_alloc_mask, &mpol, &nodemask); |
467 | /* | 469 | /* |
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
488 | } | 490 | } |
489 | } | 491 | } |
490 | } | 492 | } |
491 | err: | 493 | |
492 | mpol_cond_put(mpol); | 494 | mpol_cond_put(mpol); |
493 | put_mems_allowed(); | 495 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
496 | goto retry_cpuset; | ||
494 | return page; | 497 | return page; |
498 | |||
499 | err: | ||
500 | mpol_cond_put(mpol); | ||
501 | return NULL; | ||
495 | } | 502 | } |
496 | 503 | ||
497 | static void update_and_free_page(struct hstate *h, struct page *page) | 504 | static void update_and_free_page(struct hstate *h, struct page *page) |
@@ -575,6 +582,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
575 | __SetPageHead(page); | 582 | __SetPageHead(page); |
576 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 583 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
577 | __SetPageTail(p); | 584 | __SetPageTail(p); |
585 | set_page_count(p, 0); | ||
578 | p->first_page = page; | 586 | p->first_page = page; |
579 | } | 587 | } |
580 | } | 588 | } |
@@ -900,7 +908,6 @@ retry: | |||
900 | h->resv_huge_pages += delta; | 908 | h->resv_huge_pages += delta; |
901 | ret = 0; | 909 | ret = 0; |
902 | 910 | ||
903 | spin_unlock(&hugetlb_lock); | ||
904 | /* Free the needed pages to the hugetlb pool */ | 911 | /* Free the needed pages to the hugetlb pool */ |
905 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 912 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
906 | if ((--needed) < 0) | 913 | if ((--needed) < 0) |
@@ -914,6 +921,7 @@ retry: | |||
914 | VM_BUG_ON(page_count(page)); | 921 | VM_BUG_ON(page_count(page)); |
915 | enqueue_huge_page(h, page); | 922 | enqueue_huge_page(h, page); |
916 | } | 923 | } |
924 | spin_unlock(&hugetlb_lock); | ||
917 | 925 | ||
918 | /* Free unnecessary surplus pages to the buddy allocator */ | 926 | /* Free unnecessary surplus pages to the buddy allocator */ |
919 | free: | 927 | free: |
@@ -2059,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2059 | kref_get(&reservations->refs); | 2067 | kref_get(&reservations->refs); |
2060 | } | 2068 | } |
2061 | 2069 | ||
2070 | static void resv_map_put(struct vm_area_struct *vma) | ||
2071 | { | ||
2072 | struct resv_map *reservations = vma_resv_map(vma); | ||
2073 | |||
2074 | if (!reservations) | ||
2075 | return; | ||
2076 | kref_put(&reservations->refs, resv_map_release); | ||
2077 | } | ||
2078 | |||
2062 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 2079 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
2063 | { | 2080 | { |
2064 | struct hstate *h = hstate_vma(vma); | 2081 | struct hstate *h = hstate_vma(vma); |
@@ -2074,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2074 | reserve = (end - start) - | 2091 | reserve = (end - start) - |
2075 | region_count(&reservations->regions, start, end); | 2092 | region_count(&reservations->regions, start, end); |
2076 | 2093 | ||
2077 | kref_put(&reservations->refs, resv_map_release); | 2094 | resv_map_put(vma); |
2078 | 2095 | ||
2079 | if (reserve) { | 2096 | if (reserve) { |
2080 | hugetlb_acct_memory(h, -reserve); | 2097 | hugetlb_acct_memory(h, -reserve); |
@@ -2284,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2284 | { | 2301 | { |
2285 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2302 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2286 | __unmap_hugepage_range(vma, start, end, ref_page); | 2303 | __unmap_hugepage_range(vma, start, end, ref_page); |
2304 | /* | ||
2305 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2306 | * test will fail on a vma being torn down, and not grab a page table | ||
2307 | * on its way out. We're lucky that the flag has such an appropriate | ||
2308 | * name, and can in fact be safely cleared here. We could clear it | ||
2309 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2310 | * is to clear it before releasing the i_mmap_mutex below. | ||
2311 | * | ||
2312 | * This works because in the contexts this is called, the VMA is | ||
2313 | * going to be destroyed. It is not vulnerable to madvise(DONTNEED) | ||
2314 | * because madvise is not supported on hugetlbfs. The same applies | ||
2315 | * for direct IO. unmap_hugepage_range() is only being called just | ||
2316 | * before free_pgtables() so clearing VM_MAYSHARE will not cause | ||
2317 | * surprises later. | ||
2318 | */ | ||
2319 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2287 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2320 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2288 | } | 2321 | } |
2289 | 2322 | ||
@@ -2397,7 +2430,6 @@ retry_avoidcopy: | |||
2397 | if (outside_reserve) { | 2430 | if (outside_reserve) { |
2398 | BUG_ON(huge_pte_none(pte)); | 2431 | BUG_ON(huge_pte_none(pte)); |
2399 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2432 | if (unmap_ref_private(mm, vma, old_page, address)) { |
2400 | BUG_ON(page_count(old_page) != 1); | ||
2401 | BUG_ON(huge_pte_none(pte)); | 2433 | BUG_ON(huge_pte_none(pte)); |
2402 | spin_lock(&mm->page_table_lock); | 2434 | spin_lock(&mm->page_table_lock); |
2403 | goto retry_avoidcopy; | 2435 | goto retry_avoidcopy; |
@@ -2415,6 +2447,8 @@ retry_avoidcopy: | |||
2415 | * anon_vma prepared. | 2447 | * anon_vma prepared. |
2416 | */ | 2448 | */ |
2417 | if (unlikely(anon_vma_prepare(vma))) { | 2449 | if (unlikely(anon_vma_prepare(vma))) { |
2450 | page_cache_release(new_page); | ||
2451 | page_cache_release(old_page); | ||
2418 | /* Caller expects lock to be held */ | 2452 | /* Caller expects lock to be held */ |
2419 | spin_lock(&mm->page_table_lock); | 2453 | spin_lock(&mm->page_table_lock); |
2420 | return VM_FAULT_OOM; | 2454 | return VM_FAULT_OOM; |
@@ -2676,6 +2710,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2676 | * so no worry about deadlock. | 2710 | * so no worry about deadlock. |
2677 | */ | 2711 | */ |
2678 | page = pte_page(entry); | 2712 | page = pte_page(entry); |
2713 | get_page(page); | ||
2679 | if (page != pagecache_page) | 2714 | if (page != pagecache_page) |
2680 | lock_page(page); | 2715 | lock_page(page); |
2681 | 2716 | ||
@@ -2707,6 +2742,7 @@ out_page_table_lock: | |||
2707 | } | 2742 | } |
2708 | if (page != pagecache_page) | 2743 | if (page != pagecache_page) |
2709 | unlock_page(page); | 2744 | unlock_page(page); |
2745 | put_page(page); | ||
2710 | 2746 | ||
2711 | out_mutex: | 2747 | out_mutex: |
2712 | mutex_unlock(&hugetlb_instantiation_mutex); | 2748 | mutex_unlock(&hugetlb_instantiation_mutex); |
@@ -2833,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2833 | } | 2869 | } |
2834 | } | 2870 | } |
2835 | spin_unlock(&mm->page_table_lock); | 2871 | spin_unlock(&mm->page_table_lock); |
2836 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2872 | /* |
2837 | 2873 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
2874 | * may have cleared our pud entry and done put_page on the page table: | ||
2875 | * once we release i_mmap_mutex, another task can do the final put_page | ||
2876 | * and that page table be reused and filled with junk. | ||
2877 | */ | ||
2838 | flush_tlb_range(vma, start, end); | 2878 | flush_tlb_range(vma, start, end); |
2879 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2839 | } | 2880 | } |
2840 | 2881 | ||
2841 | int hugetlb_reserve_pages(struct inode *inode, | 2882 | int hugetlb_reserve_pages(struct inode *inode, |
@@ -2873,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2873 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); | 2914 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
2874 | } | 2915 | } |
2875 | 2916 | ||
2876 | if (chg < 0) | 2917 | if (chg < 0) { |
2877 | return chg; | 2918 | ret = chg; |
2919 | goto out_err; | ||
2920 | } | ||
2878 | 2921 | ||
2879 | /* There must be enough filesystem quota for the mapping */ | 2922 | /* There must be enough filesystem quota for the mapping */ |
2880 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2923 | if (hugetlb_get_quota(inode->i_mapping, chg)) { |
2881 | return -ENOSPC; | 2924 | ret = -ENOSPC; |
2925 | goto out_err; | ||
2926 | } | ||
2882 | 2927 | ||
2883 | /* | 2928 | /* |
2884 | * Check enough hugepages are available for the reservation. | 2929 | * Check enough hugepages are available for the reservation. |
@@ -2887,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2887 | ret = hugetlb_acct_memory(h, chg); | 2932 | ret = hugetlb_acct_memory(h, chg); |
2888 | if (ret < 0) { | 2933 | if (ret < 0) { |
2889 | hugetlb_put_quota(inode->i_mapping, chg); | 2934 | hugetlb_put_quota(inode->i_mapping, chg); |
2890 | return ret; | 2935 | goto out_err; |
2891 | } | 2936 | } |
2892 | 2937 | ||
2893 | /* | 2938 | /* |
@@ -2904,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2904 | if (!vma || vma->vm_flags & VM_MAYSHARE) | 2949 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
2905 | region_add(&inode->i_mapping->private_list, from, to); | 2950 | region_add(&inode->i_mapping->private_list, from, to); |
2906 | return 0; | 2951 | return 0; |
2952 | out_err: | ||
2953 | if (vma) | ||
2954 | resv_map_put(vma); | ||
2955 | return ret; | ||
2907 | } | 2956 | } |
2908 | 2957 | ||
2909 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 2958 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
diff --git a/mm/internal.h b/mm/internal.h index d071d380fb4..2189af49178 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) | |||
37 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
38 | } | 38 | } |
39 | 39 | ||
40 | static inline void __get_page_tail_foll(struct page *page, | ||
41 | bool get_page_head) | ||
42 | { | ||
43 | /* | ||
44 | * If we're getting a tail page, the elevated page->_count is | ||
45 | * required only in the head page and we will elevate the head | ||
46 | * page->_count and tail page->_mapcount. | ||
47 | * | ||
48 | * We elevate page_tail->_mapcount for tail pages to force | ||
49 | * page_tail->_count to be zero at all times to avoid getting | ||
50 | * false positives from get_page_unless_zero() with | ||
51 | * speculative page access (like in | ||
52 | * page_cache_get_speculative()) on tail pages. | ||
53 | */ | ||
54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | ||
55 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
56 | VM_BUG_ON(page_mapcount(page) < 0); | ||
57 | if (get_page_head) | ||
58 | atomic_inc(&page->first_page->_count); | ||
59 | atomic_inc(&page->_mapcount); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is meant to be called as the FOLL_GET operation of | ||
64 | * follow_page() and it must be called while holding the proper PT | ||
65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | ||
66 | */ | ||
67 | static inline void get_page_foll(struct page *page) | ||
68 | { | ||
69 | if (unlikely(PageTail(page))) | ||
70 | /* | ||
71 | * This is safe only because | ||
72 | * __split_huge_page_refcount() can't run under | ||
73 | * get_page_foll() because we hold the proper PT lock. | ||
74 | */ | ||
75 | __get_page_tail_foll(page, true); | ||
76 | else { | ||
77 | /* | ||
78 | * Getting a normal page or the head of a compound page | ||
79 | * requires to already have an elevated page->_count. | ||
80 | */ | ||
81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
82 | atomic_inc(&page->_count); | ||
83 | } | ||
84 | } | ||
85 | |||
40 | extern unsigned long highest_memmap_pfn; | 86 | extern unsigned long highest_memmap_pfn; |
41 | 87 | ||
42 | /* | 88 | /* |
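The new get_page_foll()/__get_page_tail_foll() helpers above keep a tail page's _count at zero: a FOLL_GET pin on a tail elevates the head's _count and the tail's _mapcount instead, so speculative lookups via get_page_unless_zero() on the tail always fail cleanly. A toy model of that bookkeeping (the field names only loosely mirror the kernel's, and the real _mapcount is biased by -1, which the sketch ignores):

#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>

/* Toy model of the head/tail accounting used by get_page_foll(). */
struct toy_page {
	atomic_int count;		/* like page->_count           */
	atomic_int pins;		/* like the tail's ->_mapcount */
	struct toy_page *head;		/* like page->first_page, NULL for a head page */
};

/* FOLL_GET-style pin, assumed to run under the page-table lock. */
static void get_page_foll_toy(struct toy_page *page)
{
	if (page->head) {			/* tail page */
		assert(atomic_load(&page->count) == 0);
		atomic_fetch_add(&page->head->count, 1);	/* pin the head */
		atomic_fetch_add(&page->pins, 1);		/* record the tail pin */
	} else {				/* head or normal page */
		assert(atomic_load(&page->count) > 0);
		atomic_fetch_add(&page->count, 1);
	}
}

int main(void)
{
	struct toy_page head = { .count = 1 };	/* the allocator's reference */
	struct toy_page tail = { .head = &head };

	get_page_foll_toy(&tail);
	assert(atomic_load(&head.count) == 2);
	assert(atomic_load(&tail.count) == 0);	/* a speculative get on the tail keeps failing */
	return 0;
}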
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -184,15 +184,15 @@ static unsigned long ksm_pages_unshared | |||
184 | static unsigned long ksm_rmap_items; | 184 | static unsigned long ksm_rmap_items; |
185 | 185 | ||
186 | /* Number of pages ksmd should scan in one batch */ | 186 | /* Number of pages ksmd should scan in one batch */ |
187 | static unsigned int ksm_thread_pages_to_scan = 100; | 187 | static unsigned int ksm_thread_pages_to_scan = 128; |
188 | 188 | ||
189 | /* Milliseconds ksmd should sleep between batches */ | 189 | /* Milliseconds ksmd should sleep between batches */ |
190 | static unsigned int ksm_thread_sleep_millisecs = 20; | 190 | static unsigned int ksm_thread_sleep_millisecs = 4000; |
191 | 191 | ||
192 | #define KSM_RUN_STOP 0 | 192 | #define KSM_RUN_STOP 0 |
193 | #define KSM_RUN_MERGE 1 | 193 | #define KSM_RUN_MERGE 1 |
194 | #define KSM_RUN_UNMERGE 2 | 194 | #define KSM_RUN_UNMERGE 2 |
195 | static unsigned int ksm_run = KSM_RUN_STOP; | 195 | static unsigned int ksm_run = KSM_RUN_MERGE; |
196 | 196 | ||
197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
198 | static DEFINE_MUTEX(ksm_thread_mutex); | 198 | static DEFINE_MUTEX(ksm_thread_mutex); |
diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed50..deabe5f603a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
14 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
15 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
16 | #include <linux/file.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * Any behaviour which results in changes to the vma->vm_flags needs to | 19 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
197 | struct address_space *mapping; | 198 | struct address_space *mapping; |
198 | loff_t offset, endoff; | 199 | loff_t offset, endoff; |
199 | int error; | 200 | int error; |
201 | struct file *f; | ||
200 | 202 | ||
201 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 203 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
202 | 204 | ||
203 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 205 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
204 | return -EINVAL; | 206 | return -EINVAL; |
205 | 207 | ||
206 | if (!vma->vm_file || !vma->vm_file->f_mapping | 208 | f = vma->vm_file; |
207 | || !vma->vm_file->f_mapping->host) { | 209 | |
210 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
208 | return -EINVAL; | 211 | return -EINVAL; |
209 | } | 212 | } |
210 | 213 | ||
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
218 | endoff = (loff_t)(end - vma->vm_start - 1) | 221 | endoff = (loff_t)(end - vma->vm_start - 1) |
219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 222 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
220 | 223 | ||
221 | /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ | 224 | /* |
225 | * vmtruncate_range may need to take i_mutex and i_alloc_sem. | ||
226 | * We need to explicitly grab a reference because the vma (and | ||
227 | * hence the vma's reference to the file) can go away as soon as | ||
228 | * we drop mmap_sem. | ||
229 | */ | ||
230 | get_file(f); | ||
222 | up_read(¤t->mm->mmap_sem); | 231 | up_read(¤t->mm->mmap_sem); |
223 | error = vmtruncate_range(mapping->host, offset, endoff); | 232 | error = vmtruncate_range(mapping->host, offset, endoff); |
233 | fput(f); | ||
224 | down_read(¤t->mm->mmap_sem); | 234 | down_read(¤t->mm->mmap_sem); |
225 | return error; | 235 | return error; |
226 | } | 236 | } |
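The madvise_remove() change above is an instance of a general rule: before dropping the lock that keeps an object alive (mmap_sem pins the vma, and the vma pins its file), take a private reference, do the blocking work, then drop the reference. A user-space analogue with a reference-counted object guarded by a registry lock (all names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct object {
	atomic_int refs;
	/* ... payload ... */
};

static pthread_rwlock_t registry_lock = PTHREAD_RWLOCK_INITIALIZER;

static void object_get(struct object *o)
{
	atomic_fetch_add(&o->refs, 1);
}

static void object_put(struct object *o)
{
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		free(o);				/* last reference dropped */
}

static void slow_operation(struct object *o)
{
	(void)o;				/* stand-in for vmtruncate_range() */
}

/* Called with registry_lock held for read; the registry's reference to
 * "o" may disappear the moment the lock is dropped. */
static void use_object_unlocked(struct object *o)
{
	object_get(o);					/* like get_file(f) */
	pthread_rwlock_unlock(&registry_lock);		/* like up_read()   */

	slow_operation(o);
	object_put(o);					/* like fput(f)     */

	pthread_rwlock_rdlock(&registry_lock);		/* like down_read() */
}

int main(void)
{
	struct object *o = calloc(1, sizeof(*o));

	atomic_store(&o->refs, 1);		/* the registry's reference */
	pthread_rwlock_rdlock(&registry_lock);
	use_object_unlocked(o);
	pthread_rwlock_unlock(&registry_lock);
	object_put(o);				/* registry drops its reference */
	return 0;
}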
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d2..57cdf5ad692 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1251 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 1251 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
1252 | struct list_head *dst, | 1252 | struct list_head *dst, |
1253 | unsigned long *scanned, int order, | 1253 | unsigned long *scanned, int order, |
1254 | int mode, struct zone *z, | 1254 | isolate_mode_t mode, |
1255 | struct zone *z, | ||
1255 | struct mem_cgroup *mem_cont, | 1256 | struct mem_cgroup *mem_cont, |
1256 | int active, int file) | 1257 | int active, int file) |
1257 | { | 1258 | { |
@@ -1730,7 +1731,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1731 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1731 | 1732 | ||
1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1733 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1733 | if (!check_soft && root_mem->memsw_is_minimum) | 1734 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) |
1734 | noswap = true; | 1735 | noswap = true; |
1735 | 1736 | ||
1736 | while (1) { | 1737 | while (1) { |
@@ -3422,6 +3423,50 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3422 | return ret; | 3423 | return ret; |
3423 | } | 3424 | } |
3424 | 3425 | ||
3426 | /* | ||
3427 | * At replace page cache, newpage is not under any memcg but it's on | ||
3428 | * LRU. So, this function doesn't touch res_counter but handles LRU | ||
3429 | * in correct way. Both pages are locked so we cannot race with uncharge. | ||
3430 | */ | ||
3431 | void mem_cgroup_replace_page_cache(struct page *oldpage, | ||
3432 | struct page *newpage) | ||
3433 | { | ||
3434 | struct mem_cgroup *memcg; | ||
3435 | struct page_cgroup *pc; | ||
3436 | struct zone *zone; | ||
3437 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3438 | unsigned long flags; | ||
3439 | |||
3440 | if (mem_cgroup_disabled()) | ||
3441 | return; | ||
3442 | |||
3443 | pc = lookup_page_cgroup(oldpage); | ||
3444 | /* fix accounting on old pages */ | ||
3445 | lock_page_cgroup(pc); | ||
3446 | memcg = pc->mem_cgroup; | ||
3447 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | ||
3448 | ClearPageCgroupUsed(pc); | ||
3449 | unlock_page_cgroup(pc); | ||
3450 | |||
3451 | if (PageSwapBacked(oldpage)) | ||
3452 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3453 | |||
3454 | zone = page_zone(newpage); | ||
3455 | pc = lookup_page_cgroup(newpage); | ||
3456 | /* | ||
3457 | * Even if newpage->mapping was NULL before starting replacement, | ||
3458 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | ||
3459 | * LRU while we overwrite pc->mem_cgroup. | ||
3460 | */ | ||
3461 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
3462 | if (PageLRU(newpage)) | ||
3463 | del_page_from_lru_list(zone, newpage, page_lru(newpage)); | ||
3464 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); | ||
3465 | if (PageLRU(newpage)) | ||
3466 | add_page_to_lru_list(zone, newpage, page_lru(newpage)); | ||
3467 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3468 | } | ||
3469 | |||
3425 | #ifdef CONFIG_DEBUG_VM | 3470 | #ifdef CONFIG_DEBUG_VM |
3426 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3471 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3427 | { | 3472 | { |
@@ -4514,6 +4559,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4514 | */ | 4559 | */ |
4515 | BUG_ON(!thresholds); | 4560 | BUG_ON(!thresholds); |
4516 | 4561 | ||
4562 | if (!thresholds->primary) | ||
4563 | goto unlock; | ||
4564 | |||
4517 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 4565 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
4518 | 4566 | ||
4519 | /* Check if a threshold crossed before removing */ | 4567 | /* Check if a threshold crossed before removing */ |
@@ -4558,11 +4606,17 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4558 | swap_buffers: | 4606 | swap_buffers: |
4559 | /* Swap primary and spare array */ | 4607 | /* Swap primary and spare array */ |
4560 | thresholds->spare = thresholds->primary; | 4608 | thresholds->spare = thresholds->primary; |
4609 | /* If all events are unregistered, free the spare array */ | ||
4610 | if (!new) { | ||
4611 | kfree(thresholds->spare); | ||
4612 | thresholds->spare = NULL; | ||
4613 | } | ||
4614 | |||
4561 | rcu_assign_pointer(thresholds->primary, new); | 4615 | rcu_assign_pointer(thresholds->primary, new); |
4562 | 4616 | ||
4563 | /* To be sure that nobody uses thresholds */ | 4617 | /* To be sure that nobody uses thresholds */ |
4564 | synchronize_rcu(); | 4618 | synchronize_rcu(); |
4565 | 4619 | unlock: | |
4566 | mutex_unlock(&memcg->thresholds_lock); | 4620 | mutex_unlock(&memcg->thresholds_lock); |
4567 | } | 4621 | } |
4568 | 4622 | ||
@@ -4963,9 +5017,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4963 | int cpu; | 5017 | int cpu; |
4964 | enable_swap_cgroup(); | 5018 | enable_swap_cgroup(); |
4965 | parent = NULL; | 5019 | parent = NULL; |
4966 | root_mem_cgroup = mem; | ||
4967 | if (mem_cgroup_soft_limit_tree_init()) | 5020 | if (mem_cgroup_soft_limit_tree_init()) |
4968 | goto free_out; | 5021 | goto free_out; |
5022 | root_mem_cgroup = mem; | ||
4969 | for_each_possible_cpu(cpu) { | 5023 | for_each_possible_cpu(cpu) { |
4970 | struct memcg_stock_pcp *stock = | 5024 | struct memcg_stock_pcp *stock = |
4971 | &per_cpu(memcg_stock, cpu); | 5025 | &per_cpu(memcg_stock, cpu); |
@@ -5004,7 +5058,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5004 | return &mem->css; | 5058 | return &mem->css; |
5005 | free_out: | 5059 | free_out: |
5006 | __mem_cgroup_free(mem); | 5060 | __mem_cgroup_free(mem); |
5007 | root_mem_cgroup = NULL; | ||
5008 | return ERR_PTR(error); | 5061 | return ERR_PTR(error); |
5009 | } | 5062 | } |
5010 | 5063 | ||
@@ -5244,6 +5297,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5244 | spinlock_t *ptl; | 5297 | spinlock_t *ptl; |
5245 | 5298 | ||
5246 | split_huge_page_pmd(walk->mm, pmd); | 5299 | split_huge_page_pmd(walk->mm, pmd); |
5300 | if (pmd_trans_unstable(pmd)) | ||
5301 | return 0; | ||
5247 | 5302 | ||
5248 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5303 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5249 | for (; addr != end; pte++, addr += PAGE_SIZE) | 5304 | for (; addr != end; pte++, addr += PAGE_SIZE) |
@@ -5405,6 +5460,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5405 | spinlock_t *ptl; | 5460 | spinlock_t *ptl; |
5406 | 5461 | ||
5407 | split_huge_page_pmd(walk->mm, pmd); | 5462 | split_huge_page_pmd(walk->mm, pmd); |
5463 | if (pmd_trans_unstable(pmd)) | ||
5464 | return 0; | ||
5408 | retry: | 5465 | retry: |
5409 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5466 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5410 | for (; addr != end; addr += PAGE_SIZE) { | 5467 | for (; addr != end; addr += PAGE_SIZE) { |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059..2f49dcf4f47 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1334 | /* Keep page count to indicate a given hugepage is isolated. */ | 1334 | /* Keep page count to indicate a given hugepage is isolated. */ |
1335 | 1335 | ||
1336 | list_add(&hpage->lru, &pagelist); | 1336 | list_add(&hpage->lru, &pagelist); |
1337 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, | 1337 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, |
1338 | true); | 1338 | MIGRATE_SYNC); |
1339 | if (ret) { | 1339 | if (ret) { |
1340 | struct page *page1, *page2; | 1340 | struct page *page1, *page2; |
1341 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | 1341 | list_for_each_entry_safe(page1, page2, &pagelist, lru) |
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1464 | page_is_file_cache(page)); | 1464 | page_is_file_cache(page)); |
1465 | list_add(&page->lru, &pagelist); | 1465 | list_add(&page->lru, &pagelist); |
1466 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1466 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1467 | 0, true); | 1467 | false, MIGRATE_SYNC); |
1468 | if (ret) { | 1468 | if (ret) { |
1469 | putback_lru_pages(&pagelist); | 1469 | putback_lru_pages(&pagelist); |
1470 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1470 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c index 36e889cca24..79ff0613449 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1228,16 +1228,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1228 | do { | 1228 | do { |
1229 | next = pmd_addr_end(addr, end); | 1229 | next = pmd_addr_end(addr, end); |
1230 | if (pmd_trans_huge(*pmd)) { | 1230 | if (pmd_trans_huge(*pmd)) { |
1231 | if (next-addr != HPAGE_PMD_SIZE) { | 1231 | if (next - addr != HPAGE_PMD_SIZE) { |
1232 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1232 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1233 | split_huge_page_pmd(vma->vm_mm, pmd); | 1233 | split_huge_page_pmd(vma->vm_mm, pmd); |
1234 | } else if (zap_huge_pmd(tlb, vma, pmd)) | 1234 | } else if (zap_huge_pmd(tlb, vma, pmd)) |
1235 | continue; | 1235 | goto next; |
1236 | /* fall through */ | 1236 | /* fall through */ |
1237 | } | 1237 | } |
1238 | if (pmd_none_or_clear_bad(pmd)) | 1238 | /* |
1239 | continue; | 1239 | * Here there can be other concurrent MADV_DONTNEED or |
1240 | * trans huge page faults running, and if the pmd is | ||
1241 | * none or trans huge it can change under us. This is | ||
1242 | * because MADV_DONTNEED holds the mmap_sem in read | ||
1243 | * mode. | ||
1244 | */ | ||
1245 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
1246 | goto next; | ||
1240 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); | 1247 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1248 | next: | ||
1241 | cond_resched(); | 1249 | cond_resched(); |
1242 | } while (pmd++, addr = next, addr != end); | 1250 | } while (pmd++, addr = next, addr != end); |
1243 | 1251 | ||
@@ -1514,7 +1522,7 @@ split_fallthrough: | |||
1514 | } | 1522 | } |
1515 | 1523 | ||
1516 | if (flags & FOLL_GET) | 1524 | if (flags & FOLL_GET) |
1517 | get_page(page); | 1525 | get_page_foll(page); |
1518 | if (flags & FOLL_TOUCH) { | 1526 | if (flags & FOLL_TOUCH) { |
1519 | if ((flags & FOLL_WRITE) && | 1527 | if ((flags & FOLL_WRITE) && |
1520 | !pte_dirty(pte) && !PageDirty(page)) | 1528 | !pte_dirty(pte) && !PageDirty(page)) |
@@ -1816,7 +1824,63 @@ next_page: | |||
1816 | } | 1824 | } |
1817 | EXPORT_SYMBOL(__get_user_pages); | 1825 | EXPORT_SYMBOL(__get_user_pages); |
1818 | 1826 | ||
1819 | /** | 1827 | /* |
1828 | * fixup_user_fault() - manually resolve a user page fault | ||
1829 | * @tsk: the task_struct to use for page fault accounting, or | ||
1830 | * NULL if faults are not to be recorded. | ||
1831 | * @mm: mm_struct of target mm | ||
1832 | * @address: user address | ||
1833 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
1834 | * | ||
1835 | * This is meant to be called in the specific scenario where for locking reasons | ||
1836 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1837 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
1838 | * trying again. | ||
1839 | * | ||
1840 | * Typically this is meant to be used by the futex code. | ||
1841 | * | ||
1842 | * The main difference with get_user_pages() is that this function will | ||
1843 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1844 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1845 | * get_user_pages() only guarantees to update these in the struct page. | ||
1846 | * | ||
1847 | * This is important for some architectures where those bits also gate the | ||
1848 | * access permission to the page because they are maintained in software. On | ||
1849 | * such architectures, gup() will not be enough to make a subsequent access | ||
1850 | * succeed. | ||
1851 | * | ||
1852 | * This should be called with the mmap_sem held for read. | ||
1853 | */ | ||
1854 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1855 | unsigned long address, unsigned int fault_flags) | ||
1856 | { | ||
1857 | struct vm_area_struct *vma; | ||
1858 | int ret; | ||
1859 | |||
1860 | vma = find_extend_vma(mm, address); | ||
1861 | if (!vma || address < vma->vm_start) | ||
1862 | return -EFAULT; | ||
1863 | |||
1864 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1865 | if (ret & VM_FAULT_ERROR) { | ||
1866 | if (ret & VM_FAULT_OOM) | ||
1867 | return -ENOMEM; | ||
1868 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1869 | return -EHWPOISON; | ||
1870 | if (ret & VM_FAULT_SIGBUS) | ||
1871 | return -EFAULT; | ||
1872 | BUG(); | ||
1873 | } | ||
1874 | if (tsk) { | ||
1875 | if (ret & VM_FAULT_MAJOR) | ||
1876 | tsk->maj_flt++; | ||
1877 | else | ||
1878 | tsk->min_flt++; | ||
1879 | } | ||
1880 | return 0; | ||
1881 | } | ||
1882 | |||
1883 | /* | ||
1820 | * get_user_pages() - pin user pages in memory | 1884 | * get_user_pages() - pin user pages in memory |
1821 | * @tsk: the task_struct to use for page fault accounting, or | 1885 | * @tsk: the task_struct to use for page fault accounting, or |
1822 | * NULL if faults are not to be recorded. | 1886 | * NULL if faults are not to be recorded. |
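The fixup_user_fault() kernel-doc above describes the futex-style pattern: attempt a user access under pagefault_disable(), and on -EFAULT resolve the fault by hand before retrying. A hedged kernel-style sketch of that calling pattern; only fixup_user_fault()'s signature comes from the hunk above, the read helpers are illustrative and not the actual futex code:

/* Fast path: try the access with page faults disabled. */
static int read_user_u32_atomic(u32 __user *uaddr, u32 *dest)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dest, uaddr, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/* Slow path: fault the page in by hand, then retry the atomic access. */
static int read_user_u32(struct mm_struct *mm, u32 __user *uaddr, u32 *dest)
{
	while (read_user_u32_atomic(uaddr, dest)) {
		int err;

		down_read(&mm->mmap_sem);	/* comment above: hold mmap_sem for read */
		err = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
		up_read(&mm->mmap_sem);
		if (err)
			return err;
	}
	return 0;
}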
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b5a11..ae5a3f21010 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
747 | } | 747 | } |
748 | /* this function returns # of failed pages */ | 748 | /* this function returns # of failed pages */ |
749 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 749 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, |
750 | true, true); | 750 | true, MIGRATE_SYNC); |
751 | if (ret) | 751 | if (ret) |
752 | putback_lru_pages(&source); | 752 | putback_lru_pages(&source); |
753 | } | 753 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54..cff919fe702 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
511 | do { | 511 | do { |
512 | next = pmd_addr_end(addr, end); | 512 | next = pmd_addr_end(addr, end); |
513 | split_huge_page_pmd(vma->vm_mm, pmd); | 513 | split_huge_page_pmd(vma->vm_mm, pmd); |
514 | if (pmd_none_or_clear_bad(pmd)) | 514 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
515 | continue; | 515 | continue; |
516 | if (check_pte_range(vma, pmd, addr, next, nodes, | 516 | if (check_pte_range(vma, pmd, addr, next, nodes, |
517 | flags, private)) | 517 | flags, private)) |
@@ -606,27 +606,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
606 | return first; | 606 | return first; |
607 | } | 607 | } |
608 | 608 | ||
609 | /* Apply policy to a single VMA */ | ||
610 | static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | ||
611 | { | ||
612 | int err = 0; | ||
613 | struct mempolicy *old = vma->vm_policy; | ||
614 | |||
615 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
616 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
617 | vma->vm_ops, vma->vm_file, | ||
618 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
619 | |||
620 | if (vma->vm_ops && vma->vm_ops->set_policy) | ||
621 | err = vma->vm_ops->set_policy(vma, new); | ||
622 | if (!err) { | ||
623 | mpol_get(new); | ||
624 | vma->vm_policy = new; | ||
625 | mpol_put(old); | ||
626 | } | ||
627 | return err; | ||
628 | } | ||
629 | |||
630 | /* Step 2: apply policy to a range and do splits. */ | 609 | /* Step 2: apply policy to a range and do splits. */ |
631 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 610 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
632 | unsigned long end, struct mempolicy *new_pol) | 611 | unsigned long end, struct mempolicy *new_pol) |
@@ -666,9 +645,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
666 | if (err) | 645 | if (err) |
667 | goto out; | 646 | goto out; |
668 | } | 647 | } |
669 | err = policy_vma(vma, new_pol); | 648 | |
670 | if (err) | 649 | /* |
671 | goto out; | 650 | * Apply policy to a single VMA. The reference counting of |
651 | * policy for vma_policy linkages has already been handled by | ||
652 | * vma_merge and split_vma as necessary. If this is a shared | ||
653 | * policy then ->set_policy will increment the reference count | ||
654 | * for an sp node. | ||
655 | */ | ||
656 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
657 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
658 | vma->vm_ops, vma->vm_file, | ||
659 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
660 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
661 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
662 | if (err) | ||
663 | goto out; | ||
664 | } | ||
672 | } | 665 | } |
673 | 666 | ||
674 | out: | 667 | out: |
@@ -933,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
933 | 926 | ||
934 | if (!list_empty(&pagelist)) { | 927 | if (!list_empty(&pagelist)) { |
935 | err = migrate_pages(&pagelist, new_node_page, dest, | 928 | err = migrate_pages(&pagelist, new_node_page, dest, |
936 | false, true); | 929 | false, MIGRATE_SYNC); |
937 | if (err) | 930 | if (err) |
938 | putback_lru_pages(&pagelist); | 931 | putback_lru_pages(&pagelist); |
939 | } | 932 | } |
@@ -1817,18 +1810,24 @@ struct page * | |||
1817 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1810 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1818 | unsigned long addr, int node) | 1811 | unsigned long addr, int node) |
1819 | { | 1812 | { |
1820 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1813 | struct mempolicy *pol; |
1821 | struct zonelist *zl; | 1814 | struct zonelist *zl; |
1822 | struct page *page; | 1815 | struct page *page; |
1816 | unsigned int cpuset_mems_cookie; | ||
1817 | |||
1818 | retry_cpuset: | ||
1819 | pol = get_vma_policy(current, vma, addr); | ||
1820 | cpuset_mems_cookie = get_mems_allowed(); | ||
1823 | 1821 | ||
1824 | get_mems_allowed(); | ||
1825 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1822 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1826 | unsigned nid; | 1823 | unsigned nid; |
1827 | 1824 | ||
1828 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1825 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
1829 | mpol_cond_put(pol); | 1826 | mpol_cond_put(pol); |
1830 | page = alloc_page_interleave(gfp, order, nid); | 1827 | page = alloc_page_interleave(gfp, order, nid); |
1831 | put_mems_allowed(); | 1828 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1829 | goto retry_cpuset; | ||
1830 | |||
1832 | return page; | 1831 | return page; |
1833 | } | 1832 | } |
1834 | zl = policy_zonelist(gfp, pol, node); | 1833 | zl = policy_zonelist(gfp, pol, node); |
@@ -1839,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1839 | struct page *page = __alloc_pages_nodemask(gfp, order, | 1838 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1840 | zl, policy_nodemask(gfp, pol)); | 1839 | zl, policy_nodemask(gfp, pol)); |
1841 | __mpol_put(pol); | 1840 | __mpol_put(pol); |
1842 | put_mems_allowed(); | 1841 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1842 | goto retry_cpuset; | ||
1843 | return page; | 1843 | return page; |
1844 | } | 1844 | } |
1845 | /* | 1845 | /* |
@@ -1847,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1847 | */ | 1847 | */ |
1848 | page = __alloc_pages_nodemask(gfp, order, zl, | 1848 | page = __alloc_pages_nodemask(gfp, order, zl, |
1849 | policy_nodemask(gfp, pol)); | 1849 | policy_nodemask(gfp, pol)); |
1850 | put_mems_allowed(); | 1850 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1851 | goto retry_cpuset; | ||
1851 | return page; | 1852 | return page; |
1852 | } | 1853 | } |
1853 | 1854 | ||
@@ -1874,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1874 | { | 1875 | { |
1875 | struct mempolicy *pol = current->mempolicy; | 1876 | struct mempolicy *pol = current->mempolicy; |
1876 | struct page *page; | 1877 | struct page *page; |
1878 | unsigned int cpuset_mems_cookie; | ||
1877 | 1879 | ||
1878 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1880 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1879 | pol = &default_policy; | 1881 | pol = &default_policy; |
1880 | 1882 | ||
1881 | get_mems_allowed(); | 1883 | retry_cpuset: |
1884 | cpuset_mems_cookie = get_mems_allowed(); | ||
1885 | |||
1882 | /* | 1886 | /* |
1883 | * No reference counting needed for current->mempolicy | 1887 | * No reference counting needed for current->mempolicy |
1884 | * nor system default_policy | 1888 | * nor system default_policy |
@@ -1889,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1889 | page = __alloc_pages_nodemask(gfp, order, | 1893 | page = __alloc_pages_nodemask(gfp, order, |
1890 | policy_zonelist(gfp, pol, numa_node_id()), | 1894 | policy_zonelist(gfp, pol, numa_node_id()), |
1891 | policy_nodemask(gfp, pol)); | 1895 | policy_nodemask(gfp, pol)); |
1892 | put_mems_allowed(); | 1896 | |
1897 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1898 | goto retry_cpuset; | ||
1899 | |||
1893 | return page; | 1900 | return page; |
1894 | } | 1901 | } |
1895 | EXPORT_SYMBOL(alloc_pages_current); | 1902 | EXPORT_SYMBOL(alloc_pages_current); |
diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e67741..480714b6f3f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
120 | 120 | ||
121 | ptep = pte_offset_map(pmd, addr); | 121 | ptep = pte_offset_map(pmd, addr); |
122 | 122 | ||
123 | if (!is_swap_pte(*ptep)) { | 123 | /* |
124 | pte_unmap(ptep); | 124 | * Peek to check is_swap_pte() before taking ptlock? No, we |
125 | goto out; | 125 | * can race mremap's move_ptes(), which skips anon_vma lock. |
126 | } | 126 | */ |
127 | 127 | ||
128 | ptl = pte_lockptr(mm, pmd); | 128 | ptl = pte_lockptr(mm, pmd); |
129 | } | 129 | } |
@@ -220,6 +220,56 @@ out: | |||
220 | pte_unmap_unlock(ptep, ptl); | 220 | pte_unmap_unlock(ptep, ptl); |
221 | } | 221 | } |
222 | 222 | ||
223 | #ifdef CONFIG_BLOCK | ||
224 | /* Returns true if all buffers are successfully locked */ | ||
225 | static bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
226 | enum migrate_mode mode) | ||
227 | { | ||
228 | struct buffer_head *bh = head; | ||
229 | |||
230 | /* Simple case, sync compaction */ | ||
231 | if (mode != MIGRATE_ASYNC) { | ||
232 | do { | ||
233 | get_bh(bh); | ||
234 | lock_buffer(bh); | ||
235 | bh = bh->b_this_page; | ||
236 | |||
237 | } while (bh != head); | ||
238 | |||
239 | return true; | ||
240 | } | ||
241 | |||
242 | /* async case, we cannot block on lock_buffer so use trylock_buffer */ | ||
243 | do { | ||
244 | get_bh(bh); | ||
245 | if (!trylock_buffer(bh)) { | ||
246 | /* | ||
247 | * We failed to lock the buffer and cannot stall in | ||
248 | * async migration. Release the taken locks | ||
249 | */ | ||
250 | struct buffer_head *failed_bh = bh; | ||
251 | put_bh(failed_bh); | ||
252 | bh = head; | ||
253 | while (bh != failed_bh) { | ||
254 | unlock_buffer(bh); | ||
255 | put_bh(bh); | ||
256 | bh = bh->b_this_page; | ||
257 | } | ||
258 | return false; | ||
259 | } | ||
260 | |||
261 | bh = bh->b_this_page; | ||
262 | } while (bh != head); | ||
263 | return true; | ||
264 | } | ||
265 | #else | ||
266 | static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
267 | enum migrate_mode mode) | ||
268 | { | ||
269 | return true; | ||
270 | } | ||
271 | #endif /* CONFIG_BLOCK */ | ||
272 | |||
223 | /* | 273 | /* |
224 | * Replace the page in the mapping. | 274 | * Replace the page in the mapping. |
225 | * | 275 | * |
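buffer_migrate_lock_buffers() above encodes the asynchronous-migration rule: when blocking is not allowed, try-lock each buffer on the page's circular list and, at the first failure, unlock everything already taken and report failure so the caller can back off. The same all-or-nothing pattern over a plain array in user-space pthreads (illustrative, not the kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

/* Take every lock or none: on the first trylock failure, roll back. */
static bool trylock_all(pthread_mutex_t *locks, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			while (i-- > 0)
				pthread_mutex_unlock(&locks[i]);
			return false;		/* caller backs off, like -EAGAIN */
		}
	}
	return true;
}

static void unlock_all(pthread_mutex_t *locks, size_t n)
{
	while (n-- > 0)
		pthread_mutex_unlock(&locks[n]);
}

int main(void)
{
	pthread_mutex_t locks[3] = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
	};

	if (trylock_all(locks, 3))
		unlock_all(locks, 3);
	return 0;
}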
@@ -229,7 +279,8 @@ out: | |||
229 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. | 279 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
230 | */ | 280 | */ |
231 | static int migrate_page_move_mapping(struct address_space *mapping, | 281 | static int migrate_page_move_mapping(struct address_space *mapping, |
232 | struct page *newpage, struct page *page) | 282 | struct page *newpage, struct page *page, |
283 | struct buffer_head *head, enum migrate_mode mode) | ||
233 | { | 284 | { |
234 | int expected_count; | 285 | int expected_count; |
235 | void **pslot; | 286 | void **pslot; |
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
259 | } | 310 | } |
260 | 311 | ||
261 | /* | 312 | /* |
313 | * In the async migration case of moving a page with buffers, lock the | ||
314 | * buffers using trylock before the mapping is moved. If the mapping | ||
315 | * was moved, we later failed to lock the buffers and could not move | ||
316 | * the mapping back due to an elevated page count, we would have to | ||
317 | * block waiting on other references to be dropped. | ||
318 | */ | ||
319 | if (mode == MIGRATE_ASYNC && head && | ||
320 | !buffer_migrate_lock_buffers(head, mode)) { | ||
321 | page_unfreeze_refs(page, expected_count); | ||
322 | spin_unlock_irq(&mapping->tree_lock); | ||
323 | return -EAGAIN; | ||
324 | } | ||
325 | |||
326 | /* | ||
262 | * Now we know that no one else is looking at the page. | 327 | * Now we know that no one else is looking at the page. |
263 | */ | 328 | */ |
264 | get_page(newpage); /* add cache reference */ | 329 | get_page(newpage); /* add cache reference */ |
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page); | |||
415 | * Pages are locked upon entry and exit. | 480 | * Pages are locked upon entry and exit. |
416 | */ | 481 | */ |
417 | int migrate_page(struct address_space *mapping, | 482 | int migrate_page(struct address_space *mapping, |
418 | struct page *newpage, struct page *page) | 483 | struct page *newpage, struct page *page, |
484 | enum migrate_mode mode) | ||
419 | { | 485 | { |
420 | int rc; | 486 | int rc; |
421 | 487 | ||
422 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 488 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
423 | 489 | ||
424 | rc = migrate_page_move_mapping(mapping, newpage, page); | 490 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
425 | 491 | ||
426 | if (rc) | 492 | if (rc) |
427 | return rc; | 493 | return rc; |
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page); | |||
438 | * exist. | 504 | * exist. |
439 | */ | 505 | */ |
440 | int buffer_migrate_page(struct address_space *mapping, | 506 | int buffer_migrate_page(struct address_space *mapping, |
441 | struct page *newpage, struct page *page) | 507 | struct page *newpage, struct page *page, enum migrate_mode mode) |
442 | { | 508 | { |
443 | struct buffer_head *bh, *head; | 509 | struct buffer_head *bh, *head; |
444 | int rc; | 510 | int rc; |
445 | 511 | ||
446 | if (!page_has_buffers(page)) | 512 | if (!page_has_buffers(page)) |
447 | return migrate_page(mapping, newpage, page); | 513 | return migrate_page(mapping, newpage, page, mode); |
448 | 514 | ||
449 | head = page_buffers(page); | 515 | head = page_buffers(page); |
450 | 516 | ||
451 | rc = migrate_page_move_mapping(mapping, newpage, page); | 517 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
452 | 518 | ||
453 | if (rc) | 519 | if (rc) |
454 | return rc; | 520 | return rc; |
455 | 521 | ||
456 | bh = head; | 522 | /* |
457 | do { | 523 | * In the async case, migrate_page_move_mapping locked the buffers |
458 | get_bh(bh); | 524 | * with an IRQ-safe spinlock held. In the sync case, the buffers |
459 | lock_buffer(bh); | 525 | * need to be locked now |
460 | bh = bh->b_this_page; | 526 | */ |
461 | 527 | if (mode != MIGRATE_ASYNC) | |
462 | } while (bh != head); | 528 | BUG_ON(!buffer_migrate_lock_buffers(head, mode)); |
463 | 529 | ||
464 | ClearPagePrivate(page); | 530 | ClearPagePrivate(page); |
465 | set_page_private(newpage, page_private(page)); | 531 | set_page_private(newpage, page_private(page)); |
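
The CONFIG_BLOCK variant of buffer_migrate_lock_buffers() sits outside the context shown above. A minimal sketch consistent with the comments in migrate_page_move_mapping() and buffer_migrate_page(), trylock with rollback for MIGRATE_ASYNC and a blocking lock_buffer() otherwise, could look like the code below; treat it as illustrative rather than the exact implementation the BUG_ON() above relies on:

	/* Sketch: lock a page's buffer ring for migration (assumed CONFIG_BLOCK version). */
	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
						enum migrate_mode mode)
	{
		struct buffer_head *bh = head, *failed_bh;

		/* Synchronous modes may simply block on every buffer lock. */
		if (mode != MIGRATE_ASYNC) {
			do {
				get_bh(bh);
				lock_buffer(bh);
				bh = bh->b_this_page;
			} while (bh != head);
			return true;
		}

		/* Async migration must not block: trylock, and roll back on failure. */
		do {
			get_bh(bh);
			if (!trylock_buffer(bh)) {
				put_bh(bh);
				failed_bh = bh;
				bh = head;
				while (bh != failed_bh) {
					unlock_buffer(bh);
					put_bh(bh);
					bh = bh->b_this_page;
				}
				return false;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		return true;
	}
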
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page) | |||
536 | * Default handling if a filesystem does not provide a migration function. | 602 | * Default handling if a filesystem does not provide a migration function. |
537 | */ | 603 | */ |
538 | static int fallback_migrate_page(struct address_space *mapping, | 604 | static int fallback_migrate_page(struct address_space *mapping, |
539 | struct page *newpage, struct page *page) | 605 | struct page *newpage, struct page *page, enum migrate_mode mode) |
540 | { | 606 | { |
541 | if (PageDirty(page)) | 607 | if (PageDirty(page)) { |
608 | /* Only writeback pages in full synchronous migration */ | ||
609 | if (mode != MIGRATE_SYNC) | ||
610 | return -EBUSY; | ||
542 | return writeout(mapping, page); | 611 | return writeout(mapping, page); |
612 | } | ||
543 | 613 | ||
544 | /* | 614 | /* |
545 | * Buffers may be managed in a filesystem specific way. | 615 | * Buffers may be managed in a filesystem specific way. |
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
549 | !try_to_release_page(page, GFP_KERNEL)) | 619 | !try_to_release_page(page, GFP_KERNEL)) |
550 | return -EAGAIN; | 620 | return -EAGAIN; |
551 | 621 | ||
552 | return migrate_page(mapping, newpage, page); | 622 | return migrate_page(mapping, newpage, page, mode); |
553 | } | 623 | } |
554 | 624 | ||
555 | /* | 625 | /* |
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
564 | * == 0 - success | 634 | * == 0 - success |
565 | */ | 635 | */ |
566 | static int move_to_new_page(struct page *newpage, struct page *page, | 636 | static int move_to_new_page(struct page *newpage, struct page *page, |
567 | int remap_swapcache, bool sync) | 637 | int remap_swapcache, enum migrate_mode mode) |
568 | { | 638 | { |
569 | struct address_space *mapping; | 639 | struct address_space *mapping; |
570 | int rc; | 640 | int rc; |
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
585 | 655 | ||
586 | mapping = page_mapping(page); | 656 | mapping = page_mapping(page); |
587 | if (!mapping) | 657 | if (!mapping) |
588 | rc = migrate_page(mapping, newpage, page); | 658 | rc = migrate_page(mapping, newpage, page, mode); |
589 | else { | 659 | else if (mapping->a_ops->migratepage) |
590 | /* | 660 | /* |
591 | * Do not writeback pages if !sync and migratepage is | 661 | * Most pages have a mapping and most filesystems provide a |
592 | * not pointing to migrate_page() which is nonblocking | 662 | * migratepage callback. Anonymous pages are part of swap |
593 | * (swapcache/tmpfs uses migratepage = migrate_page). | 663 | * space which also has its own migratepage callback. This |
664 | * is the most common path for page migration. | ||
594 | */ | 665 | */ |
595 | if (PageDirty(page) && !sync && | 666 | rc = mapping->a_ops->migratepage(mapping, |
596 | mapping->a_ops->migratepage != migrate_page) | 667 | newpage, page, mode); |
597 | rc = -EBUSY; | 668 | else |
598 | else if (mapping->a_ops->migratepage) | 669 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
599 | /* | ||
600 | * Most pages have a mapping and most filesystems | ||
601 | * should provide a migration function. Anonymous | ||
602 | * pages are part of swap space which also has its | ||
603 | * own migration function. This is the most common | ||
604 | * path for page migration. | ||
605 | */ | ||
606 | rc = mapping->a_ops->migratepage(mapping, | ||
607 | newpage, page); | ||
608 | else | ||
609 | rc = fallback_migrate_page(mapping, newpage, page); | ||
610 | } | ||
611 | 670 | ||
612 | if (rc) { | 671 | if (rc) { |
613 | newpage->mapping = NULL; | 672 | newpage->mapping = NULL; |
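
For context on the ->migratepage callback dispatched above: filesystems opt in through their address_space_operations, and with this series the callback gains the migrate_mode argument. A hypothetical wiring for a block-backed filesystem (not taken from this patch) would look like:

	/* Illustrative only: how a filesystem might select its migration callback. */
	static const struct address_space_operations example_aops = {
		/* .readpage, .writepage, ... elided */
		.migratepage	= buffer_migrate_page,	/* now takes enum migrate_mode */
	};
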
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
621 | return rc; | 680 | return rc; |
622 | } | 681 | } |
623 | 682 | ||
624 | /* | 683 | static int __unmap_and_move(struct page *page, struct page *newpage, |
625 | * Obtain the lock on page, remove all ptes and migrate the page | 684 | int force, bool offlining, enum migrate_mode mode) |
626 | * to the newly allocated page in newpage. | ||
627 | */ | ||
628 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
629 | struct page *page, int force, bool offlining, bool sync) | ||
630 | { | 685 | { |
631 | int rc = 0; | 686 | int rc = -EAGAIN; |
632 | int *result = NULL; | ||
633 | struct page *newpage = get_new_page(page, private, &result); | ||
634 | int remap_swapcache = 1; | 687 | int remap_swapcache = 1; |
635 | int charge = 0; | 688 | int charge = 0; |
636 | struct mem_cgroup *mem; | 689 | struct mem_cgroup *mem; |
637 | struct anon_vma *anon_vma = NULL; | 690 | struct anon_vma *anon_vma = NULL; |
638 | 691 | ||
639 | if (!newpage) | ||
640 | return -ENOMEM; | ||
641 | |||
642 | if (page_count(page) == 1) { | ||
643 | /* page was freed from under us. So we are done. */ | ||
644 | goto move_newpage; | ||
645 | } | ||
646 | if (unlikely(PageTransHuge(page))) | ||
647 | if (unlikely(split_huge_page(page))) | ||
648 | goto move_newpage; | ||
649 | |||
650 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
651 | rc = -EAGAIN; | ||
652 | |||
653 | if (!trylock_page(page)) { | 692 | if (!trylock_page(page)) { |
654 | if (!force || !sync) | 693 | if (!force || mode == MIGRATE_ASYNC) |
655 | goto move_newpage; | 694 | goto out; |
656 | 695 | ||
657 | /* | 696 | /* |
658 | * It's not safe for direct compaction to call lock_page. | 697 | * It's not safe for direct compaction to call lock_page. |
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
668 | * altogether. | 707 | * altogether. |
669 | */ | 708 | */ |
670 | if (current->flags & PF_MEMALLOC) | 709 | if (current->flags & PF_MEMALLOC) |
671 | goto move_newpage; | 710 | goto out; |
672 | 711 | ||
673 | lock_page(page); | 712 | lock_page(page); |
674 | } | 713 | } |
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
697 | 736 | ||
698 | if (PageWriteback(page)) { | 737 | if (PageWriteback(page)) { |
699 | /* | 738 | /* |
700 | * For !sync, there is no point retrying as the retry loop | 739 | * Only in the case of a full synchronous migration is it |
701 | * is expected to be too short for PageWriteback to be cleared | 740 | * necessary to wait for PageWriteback. In the async case, |
741 | * the retry loop is too short and in the sync-light case, | ||
742 | * the overhead of stalling is too much | ||
702 | */ | 743 | */ |
703 | if (!sync) { | 744 | if (mode != MIGRATE_SYNC) { |
704 | rc = -EBUSY; | 745 | rc = -EBUSY; |
705 | goto uncharge; | 746 | goto uncharge; |
706 | } | 747 | } |
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
771 | 812 | ||
772 | skip_unmap: | 813 | skip_unmap: |
773 | if (!page_mapped(page)) | 814 | if (!page_mapped(page)) |
774 | rc = move_to_new_page(newpage, page, remap_swapcache, sync); | 815 | rc = move_to_new_page(newpage, page, remap_swapcache, mode); |
775 | 816 | ||
776 | if (rc && remap_swapcache) | 817 | if (rc && remap_swapcache) |
777 | remove_migration_ptes(page, page); | 818 | remove_migration_ptes(page, page); |
@@ -785,27 +826,53 @@ uncharge: | |||
785 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 826 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
786 | unlock: | 827 | unlock: |
787 | unlock_page(page); | 828 | unlock_page(page); |
829 | out: | ||
830 | return rc; | ||
831 | } | ||
788 | 832 | ||
789 | move_newpage: | 833 | /* |
834 | * Obtain the lock on page, remove all ptes and migrate the page | ||
835 | * to the newly allocated page in newpage. | ||
836 | */ | ||
837 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
838 | struct page *page, int force, bool offlining, | ||
839 | enum migrate_mode mode) | ||
840 | { | ||
841 | int rc = 0; | ||
842 | int *result = NULL; | ||
843 | struct page *newpage = get_new_page(page, private, &result); | ||
844 | |||
845 | if (!newpage) | ||
846 | return -ENOMEM; | ||
847 | |||
848 | if (page_count(page) == 1) { | ||
849 | /* page was freed from under us. So we are done. */ | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | if (unlikely(PageTransHuge(page))) | ||
854 | if (unlikely(split_huge_page(page))) | ||
855 | goto out; | ||
856 | |||
857 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | ||
858 | out: | ||
790 | if (rc != -EAGAIN) { | 859 | if (rc != -EAGAIN) { |
791 | /* | 860 | /* |
792 | * A page that has been migrated has all references | 861 | * A page that has been migrated has all references |
793 | * removed and will be freed. A page that has not been | 862 | * removed and will be freed. A page that has not been |
794 | * migrated will have kept its references and be | 863 | * migrated will have kept its references and be |
795 | * restored. | 864 | * restored. |
796 | */ | 865 | */ |
797 | list_del(&page->lru); | 866 | list_del(&page->lru); |
798 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 867 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
799 | page_is_file_cache(page)); | 868 | page_is_file_cache(page)); |
800 | putback_lru_page(page); | 869 | putback_lru_page(page); |
801 | } | 870 | } |
802 | |||
803 | /* | 871 | /* |
804 | * Move the new page to the LRU. If migration was not successful | 872 | * Move the new page to the LRU. If migration was not successful |
805 | * then this will free the page. | 873 | * then this will free the page. |
806 | */ | 874 | */ |
807 | putback_lru_page(newpage); | 875 | putback_lru_page(newpage); |
808 | |||
809 | if (result) { | 876 | if (result) { |
810 | if (rc) | 877 | if (rc) |
811 | *result = rc; | 878 | *result = rc; |
@@ -835,7 +902,8 @@ move_newpage: | |||
835 | */ | 902 | */ |
836 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 903 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
837 | unsigned long private, struct page *hpage, | 904 | unsigned long private, struct page *hpage, |
838 | int force, bool offlining, bool sync) | 905 | int force, bool offlining, |
906 | enum migrate_mode mode) | ||
839 | { | 907 | { |
840 | int rc = 0; | 908 | int rc = 0; |
841 | int *result = NULL; | 909 | int *result = NULL; |
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
848 | rc = -EAGAIN; | 916 | rc = -EAGAIN; |
849 | 917 | ||
850 | if (!trylock_page(hpage)) { | 918 | if (!trylock_page(hpage)) { |
851 | if (!force || !sync) | 919 | if (!force || mode != MIGRATE_SYNC) |
852 | goto out; | 920 | goto out; |
853 | lock_page(hpage); | 921 | lock_page(hpage); |
854 | } | 922 | } |
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
859 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 927 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
860 | 928 | ||
861 | if (!page_mapped(hpage)) | 929 | if (!page_mapped(hpage)) |
862 | rc = move_to_new_page(new_hpage, hpage, 1, sync); | 930 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
863 | 931 | ||
864 | if (rc) | 932 | if (rc) |
865 | remove_migration_ptes(hpage, hpage); | 933 | remove_migration_ptes(hpage, hpage); |
@@ -902,7 +970,7 @@ out: | |||
902 | */ | 970 | */ |
903 | int migrate_pages(struct list_head *from, | 971 | int migrate_pages(struct list_head *from, |
904 | new_page_t get_new_page, unsigned long private, bool offlining, | 972 | new_page_t get_new_page, unsigned long private, bool offlining, |
905 | bool sync) | 973 | enum migrate_mode mode) |
906 | { | 974 | { |
907 | int retry = 1; | 975 | int retry = 1; |
908 | int nr_failed = 0; | 976 | int nr_failed = 0; |
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from, | |||
923 | 991 | ||
924 | rc = unmap_and_move(get_new_page, private, | 992 | rc = unmap_and_move(get_new_page, private, |
925 | page, pass > 2, offlining, | 993 | page, pass > 2, offlining, |
926 | sync); | 994 | mode); |
927 | 995 | ||
928 | switch(rc) { | 996 | switch(rc) { |
929 | case -ENOMEM: | 997 | case -ENOMEM: |
@@ -953,7 +1021,7 @@ out: | |||
953 | 1021 | ||
954 | int migrate_huge_pages(struct list_head *from, | 1022 | int migrate_huge_pages(struct list_head *from, |
955 | new_page_t get_new_page, unsigned long private, bool offlining, | 1023 | new_page_t get_new_page, unsigned long private, bool offlining, |
956 | bool sync) | 1024 | enum migrate_mode mode) |
957 | { | 1025 | { |
958 | int retry = 1; | 1026 | int retry = 1; |
959 | int nr_failed = 0; | 1027 | int nr_failed = 0; |
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from, | |||
970 | 1038 | ||
971 | rc = unmap_and_move_huge_page(get_new_page, | 1039 | rc = unmap_and_move_huge_page(get_new_page, |
972 | private, page, pass > 2, offlining, | 1040 | private, page, pass > 2, offlining, |
973 | sync); | 1041 | mode); |
974 | 1042 | ||
975 | switch(rc) { | 1043 | switch(rc) { |
976 | case -ENOMEM: | 1044 | case -ENOMEM: |
@@ -1099,7 +1167,7 @@ set_status: | |||
1099 | err = 0; | 1167 | err = 0; |
1100 | if (!list_empty(&pagelist)) { | 1168 | if (!list_empty(&pagelist)) { |
1101 | err = migrate_pages(&pagelist, new_page_node, | 1169 | err = migrate_pages(&pagelist, new_page_node, |
1102 | (unsigned long)pm, 0, true); | 1170 | (unsigned long)pm, 0, MIGRATE_SYNC); |
1103 | if (err) | 1171 | if (err) |
1104 | putback_lru_pages(&pagelist); | 1172 | putback_lru_pages(&pagelist); |
1105 | } | 1173 | } |
diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c7..117ff549279 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -161,7 +161,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
161 | } | 161 | } |
162 | /* fall through */ | 162 | /* fall through */ |
163 | } | 163 | } |
164 | if (pmd_none_or_clear_bad(pmd)) | 164 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
165 | mincore_unmapped_range(vma, addr, next, vec); | 165 | mincore_unmapped_range(vma, addr, next, vec); |
166 | else | 166 | else |
167 | mincore_pte_range(vma, pmd, addr, next, vec); | 167 | mincore_pte_range(vma, pmd, addr, next, vec); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d032de4088..71c78115c45 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait for any running method to finish, of course including | 310 | * Wait for any running method to finish, of course including |
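
The locking reorder above only matters to drivers that actually implement ->release. As a reminder of what such a user looks like, here is a hypothetical consumer sketch; the mydrv_* names are placeholders, while struct mmu_notifier_ops, ->release and mmu_notifier_register() are the existing API:

	/* Hypothetical secondary-MMU driver relying on ->release. */
	static void mydrv_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/*
		 * Runs from exit_mmap() via __mmu_notifier_release(), or from
		 * mmu_notifier_unregister(): the last chance to flush cached
		 * sptes and stop creating new ones before the mm's pages go away.
		 */
		mydrv_flush_all_sptes(mn);		/* placeholder helper */
	}

	static const struct mmu_notifier_ops mydrv_mmu_ops = {
		.release	= mydrv_release,
	};

	static int mydrv_attach(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		mn->ops = &mydrv_mmu_ops;
		/* Must not be called with mm locks held. */
		return mmu_notifier_register(mn, mm);
	}
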
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 6e93dc7f258..e39e3efe4a4 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
83 | 83 | ||
84 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | 84 | static void __init __free_pages_memory(unsigned long start, unsigned long end) |
85 | { | 85 | { |
86 | int i; | 86 | unsigned long i, start_aligned, end_aligned; |
87 | unsigned long start_aligned, end_aligned; | ||
88 | int order = ilog2(BITS_PER_LONG); | 87 | int order = ilog2(BITS_PER_LONG); |
89 | 88 | ||
90 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | 89 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); |
diff --git a/mm/nommu.c b/mm/nommu.c index 9edc897a397..5ff9b35883e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -697,9 +697,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
697 | if (vma->vm_file) { | 697 | if (vma->vm_file) { |
698 | mapping = vma->vm_file->f_mapping; | 698 | mapping = vma->vm_file->f_mapping; |
699 | 699 | ||
700 | mutex_lock(&mapping->i_mmap_mutex); | ||
700 | flush_dcache_mmap_lock(mapping); | 701 | flush_dcache_mmap_lock(mapping); |
701 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 702 | vma_prio_tree_insert(vma, &mapping->i_mmap); |
702 | flush_dcache_mmap_unlock(mapping); | 703 | flush_dcache_mmap_unlock(mapping); |
704 | mutex_unlock(&mapping->i_mmap_mutex); | ||
703 | } | 705 | } |
704 | 706 | ||
705 | /* add the VMA to the tree */ | 707 | /* add the VMA to the tree */ |
@@ -761,9 +763,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
761 | if (vma->vm_file) { | 763 | if (vma->vm_file) { |
762 | mapping = vma->vm_file->f_mapping; | 764 | mapping = vma->vm_file->f_mapping; |
763 | 765 | ||
766 | mutex_lock(&mapping->i_mmap_mutex); | ||
764 | flush_dcache_mmap_lock(mapping); | 767 | flush_dcache_mmap_lock(mapping); |
765 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 768 | vma_prio_tree_remove(vma, &mapping->i_mmap); |
766 | flush_dcache_mmap_unlock(mapping); | 769 | flush_dcache_mmap_unlock(mapping); |
770 | mutex_unlock(&mapping->i_mmap_mutex); | ||
767 | } | 771 | } |
768 | 772 | ||
769 | /* remove from the MM's tree and list */ | 773 | /* remove from the MM's tree and list */ |
@@ -776,8 +780,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
776 | 780 | ||
777 | if (vma->vm_next) | 781 | if (vma->vm_next) |
778 | vma->vm_next->vm_prev = vma->vm_prev; | 782 | vma->vm_next->vm_prev = vma->vm_prev; |
779 | |||
780 | vma->vm_mm = NULL; | ||
781 | } | 783 | } |
782 | 784 | ||
783 | /* | 785 | /* |
@@ -2061,6 +2063,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2061 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2063 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2062 | 2064 | ||
2063 | down_write(&nommu_region_sem); | 2065 | down_write(&nommu_region_sem); |
2066 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | ||
2064 | 2067 | ||
2065 | /* search for VMAs that fall within the dead zone */ | 2068 | /* search for VMAs that fall within the dead zone */ |
2066 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2069 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, |
@@ -2068,6 +2071,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2068 | /* found one - only interested if it's shared out of the page | 2071 | /* found one - only interested if it's shared out of the page |
2069 | * cache */ | 2072 | * cache */ |
2070 | if (vma->vm_flags & VM_SHARED) { | 2073 | if (vma->vm_flags & VM_SHARED) { |
2074 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | ||
2071 | up_write(&nommu_region_sem); | 2075 | up_write(&nommu_region_sem); |
2072 | return -ETXTBSY; /* not quite true, but near enough */ | 2076 | return -ETXTBSY; /* not quite true, but near enough */ |
2073 | } | 2077 | } |
@@ -2095,6 +2099,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2095 | } | 2099 | } |
2096 | } | 2100 | } |
2097 | 2101 | ||
2102 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | ||
2098 | up_write(&nommu_region_sem); | 2103 | up_write(&nommu_region_sem); |
2099 | return 0; | 2104 | return 0; |
2100 | } | 2105 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca35..7c72487ca45 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
162 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | 162 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, |
163 | const nodemask_t *nodemask, unsigned long totalpages) | 163 | const nodemask_t *nodemask, unsigned long totalpages) |
164 | { | 164 | { |
165 | int points; | 165 | long points; |
166 | 166 | ||
167 | if (oom_unkillable_task(p, mem, nodemask)) | 167 | if (oom_unkillable_task(p, mem, nodemask)) |
168 | return 0; | 168 | return 0; |
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
303 | do_each_thread(g, p) { | 303 | do_each_thread(g, p) { |
304 | unsigned int points; | 304 | unsigned int points; |
305 | 305 | ||
306 | if (!p->mm) | 306 | if (p->exit_state) |
307 | continue; | 307 | continue; |
308 | if (oom_unkillable_task(p, mem, nodemask)) | 308 | if (oom_unkillable_task(p, mem, nodemask)) |
309 | continue; | 309 | continue; |
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
319 | */ | 319 | */ |
320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
321 | return ERR_PTR(-1UL); | 321 | return ERR_PTR(-1UL); |
322 | if (!p->mm) | ||
323 | continue; | ||
322 | 324 | ||
323 | if (p->flags & PF_EXITING) { | 325 | if (p->flags & PF_EXITING) { |
324 | /* | 326 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f69886242..903e46bff32 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -37,24 +37,22 @@ | |||
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 40 | * Sleep at most 200ms at a time in balance_dirty_pages(). |
41 | * will look to see if it needs to force writeback or throttling. | ||
42 | */ | 41 | */ |
43 | static long ratelimit_pages = 32; | 42 | #define MAX_PAUSE max(HZ/5, 1) |
44 | 43 | ||
45 | /* | 44 | /* |
46 | * When balance_dirty_pages decides that the caller needs to perform some | 45 | * Estimate write bandwidth at 200ms intervals. |
47 | * non-background writeback, this is how many pages it will attempt to write. | ||
48 | * It should be somewhat larger than dirtied pages to ensure that reasonably | ||
49 | * large amounts of I/O are submitted. | ||
50 | */ | 46 | */ |
51 | static inline long sync_writeback_pages(unsigned long dirtied) | 47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
52 | { | ||
53 | if (dirtied < ratelimit_pages) | ||
54 | dirtied = ratelimit_pages; | ||
55 | 48 | ||
56 | return dirtied + dirtied / 2; | 49 | #define RATELIMIT_CALC_SHIFT 10 |
57 | } | 50 | |
51 | /* | ||
52 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | ||
53 | * will look to see if it needs to force writeback or throttling. | ||
54 | */ | ||
55 | static long ratelimit_pages = 32; | ||
58 | 56 | ||
59 | /* The following parameters are exported via /proc/sys/vm */ | 57 | /* The following parameters are exported via /proc/sys/vm */ |
60 | 58 | ||
@@ -111,6 +109,7 @@ EXPORT_SYMBOL(laptop_mode); | |||
111 | 109 | ||
112 | /* End of sysctl-exported parameters */ | 110 | /* End of sysctl-exported parameters */ |
113 | 111 | ||
112 | unsigned long global_dirty_limit; | ||
114 | 113 | ||
115 | /* | 114 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 115 | * Scale the writeback cache size proportional to the relative writeout speeds. |
@@ -156,6 +155,8 @@ static void update_completion_period(void) | |||
156 | int shift = calc_period_shift(); | 155 | int shift = calc_period_shift(); |
157 | prop_change_shift(&vm_completions, shift); | 156 | prop_change_shift(&vm_completions, shift); |
158 | prop_change_shift(&vm_dirties, shift); | 157 | prop_change_shift(&vm_dirties, shift); |
158 | |||
159 | writeback_set_ratelimit(); | ||
159 | } | 160 | } |
160 | 161 | ||
161 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 162 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
@@ -219,6 +220,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
219 | */ | 220 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 221 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 222 | { |
223 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 224 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 225 | bdi->max_prop_frac); |
224 | } | 226 | } |
@@ -244,50 +246,8 @@ void task_dirty_inc(struct task_struct *tsk) | |||
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 246 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 247 | long *numerator, long *denominator) |
246 | { | 248 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 249 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 250 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
257 | long *numerator, long *denominator) | ||
258 | { | ||
259 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
260 | numerator, denominator); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * task_dirty_limit - scale down dirty throttling threshold for one task | ||
265 | * | ||
266 | * task specific dirty limit: | ||
267 | * | ||
268 | * dirty -= (dirty/8) * p_{t} | ||
269 | * | ||
270 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | ||
271 | * throttling individual tasks before reaching the bdi dirty limit. | ||
272 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | ||
273 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | ||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | ||
275 | * dirty threshold may never get throttled. | ||
276 | */ | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | ||
278 | unsigned long bdi_dirty) | ||
279 | { | ||
280 | long numerator, denominator; | ||
281 | unsigned long dirty = bdi_dirty; | ||
282 | u64 inv = dirty >> 3; | ||
283 | |||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
285 | inv *= numerator; | ||
286 | do_div(inv, denominator); | ||
287 | |||
288 | dirty -= inv; | ||
289 | |||
290 | return max(dirty, bdi_dirty/2); | ||
291 | } | 251 | } |
292 | 252 | ||
293 | /* | 253 | /* |
@@ -397,6 +357,17 @@ unsigned long determine_dirtyable_memory(void) | |||
397 | return x + 1; /* Ensure that we never return 0 */ | 357 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 358 | } |
399 | 359 | ||
360 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, | ||
361 | unsigned long bg_thresh) | ||
362 | { | ||
363 | return (thresh + bg_thresh) / 2; | ||
364 | } | ||
365 | |||
366 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
367 | { | ||
368 | return max(thresh, global_dirty_limit); | ||
369 | } | ||
370 | |||
400 | /* | 371 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 372 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 373 | * |
@@ -435,12 +406,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
435 | } | 406 | } |
436 | *pbackground = background; | 407 | *pbackground = background; |
437 | *pdirty = dirty; | 408 | *pdirty = dirty; |
409 | trace_global_dirty_state(background, dirty); | ||
438 | } | 410 | } |
439 | 411 | ||
440 | /* | 412 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 413 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
414 | * @bdi: the backing_dev_info to query | ||
415 | * @dirty: global dirty limit in pages | ||
416 | * | ||
417 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of | ||
418 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
419 | * The "limit" in the name is not treated as a hard limit in | ||
420 | * balance_dirty_pages(). | ||
442 | * | 421 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 422 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent |
444 | * - starving fast devices | 423 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 424 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 425 | * |
@@ -469,36 +448,588 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
469 | } | 448 | } |
470 | 449 | ||
471 | /* | 450 | /* |
451 | * Dirty position control. | ||
452 | * | ||
453 | * (o) global/bdi setpoints | ||
454 | * | ||
455 | * We want the dirty pages be balanced around the global/bdi setpoints. | ||
456 | * When the number of dirty pages is higher/lower than the setpoint, the | ||
457 | * dirty position control ratio (and hence task dirty ratelimit) will be | ||
458 | * decreased/increased to bring the dirty pages back to the setpoint. | ||
459 | * | ||
460 | * pos_ratio = 1 << RATELIMIT_CALC_SHIFT | ||
461 | * | ||
462 | * if (dirty < setpoint) scale up pos_ratio | ||
463 | * if (dirty > setpoint) scale down pos_ratio | ||
464 | * | ||
465 | * if (bdi_dirty < bdi_setpoint) scale up pos_ratio | ||
466 | * if (bdi_dirty > bdi_setpoint) scale down pos_ratio | ||
467 | * | ||
468 | * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT | ||
469 | * | ||
470 | * (o) global control line | ||
471 | * | ||
472 | * ^ pos_ratio | ||
473 | * | | ||
474 | * | |<===== global dirty control scope ======>| | ||
475 | * 2.0 .............* | ||
476 | * | .* | ||
477 | * | . * | ||
478 | * | . * | ||
479 | * | . * | ||
480 | * | . * | ||
481 | * | . * | ||
482 | * 1.0 ................................* | ||
483 | * | . . * | ||
484 | * | . . * | ||
485 | * | . . * | ||
486 | * | . . * | ||
487 | * | . . * | ||
488 | * 0 +------------.------------------.----------------------*-------------> | ||
489 | * freerun^ setpoint^ limit^ dirty pages | ||
490 | * | ||
491 | * (o) bdi control line | ||
492 | * | ||
493 | * ^ pos_ratio | ||
494 | * | | ||
495 | * | * | ||
496 | * | * | ||
497 | * | * | ||
498 | * | * | ||
499 | * | * |<=========== span ============>| | ||
500 | * 1.0 .......................* | ||
501 | * | . * | ||
502 | * | . * | ||
503 | * | . * | ||
504 | * | . * | ||
505 | * | . * | ||
506 | * | . * | ||
507 | * | . * | ||
508 | * | . * | ||
509 | * | . * | ||
510 | * | . * | ||
511 | * | . * | ||
512 | * 1/4 ...............................................* * * * * * * * * * * * | ||
513 | * | . . | ||
514 | * | . . | ||
515 | * | . . | ||
516 | * 0 +----------------------.-------------------------------.-------------> | ||
517 | * bdi_setpoint^ x_intercept^ | ||
518 | * | ||
519 | * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can | ||
520 | * be smoothly throttled down to normal if it starts high in situations like | ||
521 | * - start writing to a slow SD card and a fast disk at the same time. The SD | ||
522 | * card's bdi_dirty may rush to many times higher than bdi_setpoint. | ||
523 | * - the bdi dirty thresh drops quickly due to change of JBOD workload | ||
524 | */ | ||
525 | static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, | ||
526 | unsigned long thresh, | ||
527 | unsigned long bg_thresh, | ||
528 | unsigned long dirty, | ||
529 | unsigned long bdi_thresh, | ||
530 | unsigned long bdi_dirty) | ||
531 | { | ||
532 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
533 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
534 | unsigned long limit = hard_dirty_limit(thresh); | ||
535 | unsigned long x_intercept; | ||
536 | unsigned long setpoint; /* dirty pages' target balance point */ | ||
537 | unsigned long bdi_setpoint; | ||
538 | unsigned long span; | ||
539 | long long pos_ratio; /* for scaling up/down the rate limit */ | ||
540 | long x; | ||
541 | |||
542 | if (unlikely(dirty >= limit)) | ||
543 | return 0; | ||
544 | |||
545 | /* | ||
546 | * global setpoint | ||
547 | * | ||
548 | * setpoint - dirty 3 | ||
549 | * f(dirty) := 1.0 + (----------------) | ||
550 | * limit - setpoint | ||
551 | * | ||
552 | * it's a 3rd order polynomial that subjects to | ||
553 | * | ||
554 | * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast | ||
555 | * (2) f(setpoint) = 1.0 => the balance point | ||
556 | * (3) f(limit) = 0 => the hard limit | ||
557 | * (4) df/dx <= 0 => negative feedback control | ||
558 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
559 | * => fast response on large errors; small oscillation near setpoint | ||
560 | */ | ||
561 | setpoint = (freerun + limit) / 2; | ||
562 | x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, | ||
563 | limit - setpoint + 1); | ||
564 | pos_ratio = x; | ||
565 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
566 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
567 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | ||
568 | |||
569 | /* | ||
570 | * We have computed basic pos_ratio above based on global situation. If | ||
571 | * the bdi is over/under its share of dirty pages, we want to scale | ||
572 | * pos_ratio further down/up. That is done by the following mechanism. | ||
573 | */ | ||
574 | |||
575 | /* | ||
576 | * bdi setpoint | ||
577 | * | ||
578 | * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) | ||
579 | * | ||
580 | * x_intercept - bdi_dirty | ||
581 | * := -------------------------- | ||
582 | * x_intercept - bdi_setpoint | ||
583 | * | ||
584 | * The main bdi control line is a linear function that subjects to | ||
585 | * | ||
586 | * (1) f(bdi_setpoint) = 1.0 | ||
587 | * (2) k = - 1 / (8 * write_bw) (in single bdi case) | ||
588 | * or equally: x_intercept = bdi_setpoint + 8 * write_bw | ||
589 | * | ||
590 | * For single bdi case, the dirty pages are observed to fluctuate | ||
591 | * regularly within range | ||
592 | * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] | ||
593 | * for various filesystems, where (2) can yield a reasonable 12.5% | ||
594 | * fluctuation range for pos_ratio. | ||
595 | * | ||
596 | * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its | ||
597 | * own size, so move the slope over accordingly and choose a slope that | ||
598 | * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. | ||
599 | */ | ||
600 | if (unlikely(bdi_thresh > thresh)) | ||
601 | bdi_thresh = thresh; | ||
602 | bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); | ||
603 | /* | ||
604 | * scale global setpoint to bdi's: | ||
605 | * bdi_setpoint = setpoint * bdi_thresh / thresh | ||
606 | */ | ||
607 | x = div_u64((u64)bdi_thresh << 16, thresh + 1); | ||
608 | bdi_setpoint = setpoint * (u64)x >> 16; | ||
609 | /* | ||
610 | * Use span=(8*write_bw) in single bdi case as indicated by | ||
611 | * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. | ||
612 | * | ||
613 | * bdi_thresh thresh - bdi_thresh | ||
614 | * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh | ||
615 | * thresh thresh | ||
616 | */ | ||
617 | span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; | ||
618 | x_intercept = bdi_setpoint + span; | ||
619 | |||
620 | if (bdi_dirty < x_intercept - span / 4) { | ||
621 | pos_ratio *= x_intercept - bdi_dirty; | ||
622 | do_div(pos_ratio, x_intercept - bdi_setpoint + 1); | ||
623 | } else | ||
624 | pos_ratio /= 4; | ||
625 | |||
626 | /* | ||
627 | * bdi reserve area, safeguard against dirty pool underrun and disk idle | ||
628 | * It may push the desired control point of global dirty pages higher | ||
629 | * than setpoint. | ||
630 | */ | ||
631 | x_intercept = bdi_thresh / 2; | ||
632 | if (bdi_dirty < x_intercept) { | ||
633 | if (bdi_dirty > x_intercept / 8) { | ||
634 | pos_ratio *= x_intercept; | ||
635 | do_div(pos_ratio, bdi_dirty); | ||
636 | } else | ||
637 | pos_ratio *= 8; | ||
638 | } | ||
639 | |||
640 | return pos_ratio; | ||
641 | } | ||
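
A quick way to sanity-check the global control line is to evaluate the cubic outside the kernel with made-up numbers (freerun = 100 pages and limit = 200 pages below are arbitrary): the three anchor points come out as roughly 2.0 at freerun, exactly 1.0 at the setpoint and about 0 at the limit.

	#include <stdio.h>

	#define RATELIMIT_CALC_SHIFT	10

	/* Global control line only: 1 + ((setpoint - dirty) / (limit - setpoint))^3 */
	static long long global_pos_ratio(long freerun, long limit, long dirty)
	{
		long setpoint = (freerun + limit) / 2;
		long long x = ((long long)(setpoint - dirty) << RATELIMIT_CALC_SHIFT) /
			      (limit - setpoint + 1);
		long long pos_ratio = x;

		/* plain division here instead of the kernel's >> RATELIMIT_CALC_SHIFT */
		pos_ratio = pos_ratio * x / (1 << RATELIMIT_CALC_SHIFT);
		pos_ratio = pos_ratio * x / (1 << RATELIMIT_CALC_SHIFT);
		return pos_ratio + (1 << RATELIMIT_CALC_SHIFT);
	}

	int main(void)
	{
		long freerun = 100, limit = 200;	/* arbitrary page counts */
		long dirty[] = { 100, 125, 150, 175, 200 };

		for (int i = 0; i < 5; i++)
			printf("dirty=%3ld  pos_ratio=%.2f\n", dirty[i],
			       global_pos_ratio(freerun, limit, dirty[i]) /
			       (double)(1 << RATELIMIT_CALC_SHIFT));
		return 0;
	}
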
642 | |||
643 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
644 | unsigned long elapsed, | ||
645 | unsigned long written) | ||
646 | { | ||
647 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
648 | unsigned long avg = bdi->avg_write_bandwidth; | ||
649 | unsigned long old = bdi->write_bandwidth; | ||
650 | u64 bw; | ||
651 | |||
652 | /* | ||
653 | * bw = written * HZ / elapsed | ||
654 | * | ||
655 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
656 | * write_bandwidth = --------------------------------------------------- | ||
657 | * period | ||
658 | */ | ||
659 | bw = written - bdi->written_stamp; | ||
660 | bw *= HZ; | ||
661 | if (unlikely(elapsed > period)) { | ||
662 | do_div(bw, elapsed); | ||
663 | avg = bw; | ||
664 | goto out; | ||
665 | } | ||
666 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
667 | bw >>= ilog2(period); | ||
668 | |||
669 | /* | ||
670 | * one more level of smoothing, for filtering out sudden spikes | ||
671 | */ | ||
672 | if (avg > old && old >= (unsigned long)bw) | ||
673 | avg -= (avg - old) >> 3; | ||
674 | |||
675 | if (avg < old && old <= (unsigned long)bw) | ||
676 | avg += (old - avg) >> 3; | ||
677 | |||
678 | out: | ||
679 | bdi->write_bandwidth = bw; | ||
680 | bdi->avg_write_bandwidth = avg; | ||
681 | } | ||
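
To see how gently a single sample moves the estimate, the blend above can be re-run with illustrative numbers (HZ = 1000 is assumed, so period = roundup_pow_of_two(3 * HZ) = 4096): a 200-jiffy window measuring 12800 pages/s against a long-term 10000 pages/s lifts write_bandwidth only to about 10136 pages/s, i.e. by elapsed/period (about 5%) of the error.

	#include <stdio.h>

	/* Re-run of the bdi_update_write_bandwidth() blend with made-up numbers. */
	int main(void)
	{
		const unsigned long HZ = 1000;		/* assumed */
		const unsigned long period = 4096;	/* roundup_pow_of_two(3 * HZ) */
		unsigned long elapsed = 200;		/* jiffies since last update */
		unsigned long written = 2560;		/* pages completed in that window */
		unsigned long long old_bw = 10000;	/* previous estimate, pages/s */
		unsigned long long bw;

		bw = (unsigned long long)written * HZ;	/* sample, scaled by elapsed */
		bw += old_bw * (period - elapsed);	/* weight the old estimate */
		bw >>= 12;				/* ilog2(period) */

		printf("sample = %lu pages/s, blended write_bandwidth = %llu pages/s\n",
		       written * HZ / elapsed, bw);
		return 0;
	}
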
682 | |||
683 | /* | ||
684 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
685 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
686 | * This may throw the system into deep dirty exceeded state and throttle | ||
687 | * heavy/light dirtiers alike. To retain good responsiveness, maintain | ||
688 | * global_dirty_limit for tracking slowly down to the knocked down dirty | ||
689 | * threshold. | ||
690 | */ | ||
691 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
692 | { | ||
693 | unsigned long limit = global_dirty_limit; | ||
694 | |||
695 | /* | ||
696 | * Follow up in one step. | ||
697 | */ | ||
698 | if (limit < thresh) { | ||
699 | limit = thresh; | ||
700 | goto update; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * Follow down slowly. Use the higher one as the target, because thresh | ||
705 | * may drop below dirty. This is exactly the reason to introduce | ||
706 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
707 | */ | ||
708 | thresh = max(thresh, dirty); | ||
709 | if (limit > thresh) { | ||
710 | limit -= (limit - thresh) >> 5; | ||
711 | goto update; | ||
712 | } | ||
713 | return; | ||
714 | update: | ||
715 | global_dirty_limit = limit; | ||
716 | } | ||
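
The 1/32 step above still converges reasonably quickly after a large knock-down. With the arbitrary numbers below, a 64000-page limit tracks a 32000-page threshold to within 32 pages after a couple hundred of the 200ms updates, i.e. in well under a minute:

	#include <stdio.h>

	/* How fast global_dirty_limit tracks a knocked-down threshold (illustrative). */
	int main(void)
	{
		unsigned long limit = 64000, thresh = 32000;	/* arbitrary page counts */
		int steps = 0;

		while ((limit - thresh) >> 5) {		/* same step as update_dirty_limit() */
			limit -= (limit - thresh) >> 5;
			steps++;
		}
		printf("limit settled at %lu after %d updates (~%d seconds)\n",
		       limit, steps, steps / 5);	/* 5 updates/s at BANDWIDTH_INTERVAL = 200ms */
		return 0;
	}
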
717 | |||
718 | static void global_update_bandwidth(unsigned long thresh, | ||
719 | unsigned long dirty, | ||
720 | unsigned long now) | ||
721 | { | ||
722 | static DEFINE_SPINLOCK(dirty_lock); | ||
723 | static unsigned long update_time; | ||
724 | |||
725 | /* | ||
726 | * check locklessly first to optimize away locking most of the time | ||
727 | */ | ||
728 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
729 | return; | ||
730 | |||
731 | spin_lock(&dirty_lock); | ||
732 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
733 | update_dirty_limit(thresh, dirty); | ||
734 | update_time = now; | ||
735 | } | ||
736 | spin_unlock(&dirty_lock); | ||
737 | } | ||
738 | |||
739 | /* | ||
740 | * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. | ||
741 | * | ||
742 | * Normal bdi tasks will be curbed at or below it in long term. | ||
743 | * Obviously it should be around (write_bw / N) when there are N dd tasks. | ||
744 | */ | ||
745 | static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | ||
746 | unsigned long thresh, | ||
747 | unsigned long bg_thresh, | ||
748 | unsigned long dirty, | ||
749 | unsigned long bdi_thresh, | ||
750 | unsigned long bdi_dirty, | ||
751 | unsigned long dirtied, | ||
752 | unsigned long elapsed) | ||
753 | { | ||
754 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
755 | unsigned long limit = hard_dirty_limit(thresh); | ||
756 | unsigned long setpoint = (freerun + limit) / 2; | ||
757 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
758 | unsigned long dirty_ratelimit = bdi->dirty_ratelimit; | ||
759 | unsigned long dirty_rate; | ||
760 | unsigned long task_ratelimit; | ||
761 | unsigned long balanced_dirty_ratelimit; | ||
762 | unsigned long pos_ratio; | ||
763 | unsigned long step; | ||
764 | unsigned long x; | ||
765 | |||
766 | /* | ||
767 | * The dirty rate will match the writeout rate in long term, except | ||
768 | * when dirty pages are truncated by userspace or re-dirtied by FS. | ||
769 | */ | ||
770 | dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; | ||
771 | |||
772 | pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, | ||
773 | bdi_thresh, bdi_dirty); | ||
774 | /* | ||
775 | * task_ratelimit reflects each dd's dirty rate for the past 200ms. | ||
776 | */ | ||
777 | task_ratelimit = (u64)dirty_ratelimit * | ||
778 | pos_ratio >> RATELIMIT_CALC_SHIFT; | ||
779 | task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */ | ||
780 | |||
781 | /* | ||
782 | * A linear estimation of the "balanced" throttle rate. The theory is, | ||
783 | * if there are N dd tasks, each throttled at task_ratelimit, the bdi's | ||
784 | * dirty_rate will be measured to be (N * task_ratelimit). So the below | ||
785 | * formula will yield the balanced rate limit (write_bw / N). | ||
786 | * | ||
787 | * Note that the expanded form is not a pure rate feedback: | ||
788 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) | ||
789 | * but also takes pos_ratio into account: | ||
790 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) | ||
791 | * | ||
792 | * (1) is not realistic because pos_ratio also takes part in balancing | ||
793 | * the dirty rate. Consider the state | ||
794 | * pos_ratio = 0.5 (3) | ||
795 | * rate = 2 * (write_bw / N) (4) | ||
796 | * If (1) is used, it will stuck in that state! Because each dd will | ||
797 | * be throttled at | ||
798 | * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) | ||
799 | * yielding | ||
800 | * dirty_rate = N * task_ratelimit = write_bw (6) | ||
801 | * put (6) into (1) we get | ||
802 | * rate_(i+1) = rate_(i) (7) | ||
803 | * | ||
804 | * So we end up using (2) to always keep | ||
805 | * rate_(i+1) ~= (write_bw / N) (8) | ||
806 | * regardless of the value of pos_ratio. As long as (8) is satisfied, | ||
807 | * pos_ratio is able to drive itself to 1.0, which is not only where | ||
808 | * the dirty count meet the setpoint, but also where the slope of | ||
809 | * pos_ratio is most flat and hence task_ratelimit is least fluctuated. | ||
810 | */ | ||
811 | balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, | ||
812 | dirty_rate | 1); | ||
813 | |||
814 | /* | ||
815 | * We could safely do this and return immediately: | ||
816 | * | ||
817 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | ||
818 | * | ||
819 | * However to get a more stable dirty_ratelimit, the below elaborated | ||
820 | * code makes use of task_ratelimit to filter out singular points and | ||
821 | * limit the step size. | ||
822 | * | ||
823 | * The below code essentially only uses the relative value of | ||
824 | * | ||
825 | * task_ratelimit - dirty_ratelimit | ||
826 | * = (pos_ratio - 1) * dirty_ratelimit | ||
827 | * | ||
828 | * which reflects the direction and size of dirty position error. | ||
829 | */ | ||
830 | |||
831 | /* | ||
832 | * dirty_ratelimit will follow balanced_dirty_ratelimit iff | ||
833 | * task_ratelimit is on the same side of dirty_ratelimit, too. | ||
834 | * For example, when | ||
835 | * - dirty_ratelimit > balanced_dirty_ratelimit | ||
836 | * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) | ||
837 | * lowering dirty_ratelimit will help meet both the position and rate | ||
838 | * control targets. Otherwise, don't update dirty_ratelimit if it will | ||
839 | * only help meet the rate target. After all, what the users ultimately | ||
840 | * feel and care are stable dirty rate and small position error. | ||
841 | * | ||
842 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | ||
843 | * and filter out the singular points of balanced_dirty_ratelimit, which | ||
844 | * keeps jumping around randomly and can even leap far away at times | ||
845 | * due to the small 200ms estimation period of dirty_rate (we want to | ||
846 | * keep that period small to reduce time lags). | ||
847 | */ | ||
848 | step = 0; | ||
849 | if (dirty < setpoint) { | ||
850 | x = min(bdi->balanced_dirty_ratelimit, | ||
851 | min(balanced_dirty_ratelimit, task_ratelimit)); | ||
852 | if (dirty_ratelimit < x) | ||
853 | step = x - dirty_ratelimit; | ||
854 | } else { | ||
855 | x = max(bdi->balanced_dirty_ratelimit, | ||
856 | max(balanced_dirty_ratelimit, task_ratelimit)); | ||
857 | if (dirty_ratelimit > x) | ||
858 | step = dirty_ratelimit - x; | ||
859 | } | ||
860 | |||
861 | /* | ||
862 | * Don't pursue 100% rate matching. It's impossible since the balanced | ||
863 | * rate itself is constantly fluctuating. So decrease the tracking speed | ||
864 | * when it gets close to the target. Helps eliminate pointless tremors. | ||
865 | */ | ||
866 | step >>= dirty_ratelimit / (2 * step + 1); | ||
867 | /* | ||
868 | * Limit the tracking speed to avoid overshooting. | ||
869 | */ | ||
870 | step = (step + 7) / 8; | ||
871 | |||
872 | if (dirty_ratelimit < balanced_dirty_ratelimit) | ||
873 | dirty_ratelimit += step; | ||
874 | else | ||
875 | dirty_ratelimit -= step; | ||
876 | |||
877 | bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); | ||
878 | bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; | ||
879 | |||
880 | trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); | ||
881 | } | ||
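
Plugging illustrative numbers into the balanced-rate estimate makes the (write_bw / N) claim concrete. With four equal dirtiers (N = 4 is an assumption for the example), a 25600 pages/s device and a current base rate of 12800 pages/s at pos_ratio = 1.0, the measured dirty_rate is 4 * 12800 and the estimate lands within a page of write_bw / N = 6400 pages/s:

	#include <stdio.h>

	/* Illustration of balanced_dirty_ratelimit with N identical dirtiers. */
	int main(void)
	{
		unsigned long write_bw = 25600;		/* avg write bandwidth, pages/s (~100MB/s) */
		unsigned long dirty_ratelimit = 12800;	/* current base rate, pages/s */
		unsigned long pos_ratio = 1024;		/* 1.0 in RATELIMIT_CALC_SHIFT fixed point */
		unsigned long N = 4;			/* assumed number of dd-style dirtiers */

		/* Each task is throttled at task_ratelimit, so that is what gets measured. */
		unsigned long task_ratelimit =
			(unsigned long long)dirty_ratelimit * pos_ratio >> 10;
		unsigned long dirty_rate = N * task_ratelimit;

		unsigned long balanced = (unsigned long long)task_ratelimit * write_bw /
					 (dirty_rate | 1);

		printf("balanced_dirty_ratelimit = %lu pages/s (write_bw / N = %lu)\n",
		       balanced, write_bw / N);
		return 0;
	}
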
882 | |||
883 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
884 | unsigned long thresh, | ||
885 | unsigned long bg_thresh, | ||
886 | unsigned long dirty, | ||
887 | unsigned long bdi_thresh, | ||
888 | unsigned long bdi_dirty, | ||
889 | unsigned long start_time) | ||
890 | { | ||
891 | unsigned long now = jiffies; | ||
892 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
893 | unsigned long dirtied; | ||
894 | unsigned long written; | ||
895 | |||
896 | /* | ||
897 | * rate-limit, only update once every 200ms. | ||
898 | */ | ||
899 | if (elapsed < BANDWIDTH_INTERVAL) | ||
900 | return; | ||
901 | |||
902 | dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); | ||
903 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
904 | |||
905 | /* | ||
906 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
907 | * (at least 1s idle time between two flusher runs) | ||
908 | */ | ||
909 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
910 | goto snapshot; | ||
911 | |||
912 | if (thresh) { | ||
913 | global_update_bandwidth(thresh, dirty, now); | ||
914 | bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, | ||
915 | bdi_thresh, bdi_dirty, | ||
916 | dirtied, elapsed); | ||
917 | } | ||
918 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
919 | |||
920 | snapshot: | ||
921 | bdi->dirtied_stamp = dirtied; | ||
922 | bdi->written_stamp = written; | ||
923 | bdi->bw_time_stamp = now; | ||
924 | } | ||
925 | |||
926 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
927 | unsigned long thresh, | ||
928 | unsigned long bg_thresh, | ||
929 | unsigned long dirty, | ||
930 | unsigned long bdi_thresh, | ||
931 | unsigned long bdi_dirty, | ||
932 | unsigned long start_time) | ||
933 | { | ||
934 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
935 | return; | ||
936 | spin_lock(&bdi->wb.list_lock); | ||
937 | __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, | ||
938 | bdi_thresh, bdi_dirty, start_time); | ||
939 | spin_unlock(&bdi->wb.list_lock); | ||
940 | } | ||
941 | |||
942 | /* | ||
943 | * After a task has dirtied this many pages, balance_dirty_pages_ratelimited_nr() | ||
944 | * will look to see if it needs to start dirty throttling. | ||
945 | * | ||
946 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | ||
947 | * global_page_state() too often. So scale it near-sqrt to the safety margin | ||
948 | * (the number of pages we may dirty without exceeding the dirty limits). | ||
949 | */ | ||
950 | static unsigned long dirty_poll_interval(unsigned long dirty, | ||
951 | unsigned long thresh) | ||
952 | { | ||
953 | if (thresh > dirty) | ||
954 | return 1UL << (ilog2(thresh - dirty) >> 1); | ||
955 | |||
956 | return 1; | ||
957 | } | ||
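
The near-sqrt scaling is easiest to see with a few sample safety margins (arbitrary numbers): a task that is 16000 pages below its limit re-checks the global counters only every 64 pages dirtied, while one that is 100 pages away re-checks every 8.

	#include <stdio.h>

	static unsigned long ilog2_ul(unsigned long v)	/* stand-in for the kernel's ilog2() */
	{
		unsigned long r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	/* dirty_poll_interval(): 2^(ilog2(margin)/2), i.e. roughly sqrt(margin). */
	int main(void)
	{
		unsigned long margin[] = { 100, 1600, 16000, 160000 };	/* thresh - dirty */

		for (int i = 0; i < 4; i++)
			printf("margin=%6lu pages -> poll every %lu dirtied pages\n",
			       margin[i], 1UL << (ilog2_ul(margin[i]) >> 1));
		return 0;
	}
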
958 | |||
959 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | ||
960 | unsigned long bdi_dirty) | ||
961 | { | ||
962 | unsigned long bw = bdi->avg_write_bandwidth; | ||
963 | unsigned long hi = ilog2(bw); | ||
964 | unsigned long lo = ilog2(bdi->dirty_ratelimit); | ||
965 | unsigned long t; | ||
966 | |||
967 | /* target for 20ms max pause on 1-dd case */ | ||
968 | t = HZ / 50; | ||
969 | |||
970 | /* | ||
971 | * Scale up pause time for concurrent dirtiers in order to reduce CPU | ||
972 | * overheads. | ||
973 | * | ||
974 | * (N * 20ms) on 2^N concurrent tasks. | ||
975 | */ | ||
976 | if (hi > lo) | ||
977 | t += (hi - lo) * (20 * HZ) / 1024; | ||
978 | |||
979 | /* | ||
980 | * Limit pause time for small memory systems. If sleeping for too | ||
981 | * long, a small pool of dirty/writeback pages may go empty and the | ||
982 | * disk may go idle. | ||
983 | * | ||
984 | * 8 serves as the safety ratio. | ||
985 | */ | ||
986 | if (bdi_dirty) | ||
987 | t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | ||
988 | |||
989 | /* | ||
990 | * The pause time will be settled within range (max_pause/4, max_pause). | ||
991 | * Apply a minimal value of 4 to get a non-zero max_pause/4. | ||
992 | */ | ||
993 | return clamp_val(t, 4, MAX_PAUSE); | ||
994 | } | ||
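
The same kind of spot check works for the pause ceiling above. With the assumed numbers below (HZ = 1000, roughly a 100MB/s device, a base rate suggesting about eight concurrent dirtiers, and only 2000 dirty pages on the bdi), the concurrency term alone would allow about 78ms, but the small dirty pool caps the pause at about 9ms:

	#include <stdio.h>

	static unsigned long ilog2_ul(unsigned long v)
	{
		unsigned long r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	/* Userspace re-run of the bdi_max_pause() arithmetic (HZ assumed to be 1000). */
	int main(void)
	{
		const unsigned long HZ = 1000, MAX_PAUSE = HZ / 5;
		unsigned long bw = 25600;		/* avg_write_bandwidth, pages/s */
		unsigned long ratelimit = 3200;		/* dirty_ratelimit, pages/s */
		unsigned long bdi_dirty = 2000;		/* dirty+writeback pages on this bdi */
		unsigned long hi = ilog2_ul(bw), lo = ilog2_ul(ratelimit);
		unsigned long t = HZ / 50;		/* 20ms base pause, 1-dd case */
		unsigned long cap;

		if (hi > lo)				/* +20ms per doubling of concurrency */
			t += (hi - lo) * (20 * HZ) / 1024;
		if (bdi_dirty) {			/* keep a small dirty pool from draining */
			cap = bdi_dirty * HZ / (8 * bw + 1);
			if (cap < t)
				t = cap;
		}
		if (t < 4)				/* clamp_val(t, 4, MAX_PAUSE) */
			t = 4;
		if (t > MAX_PAUSE)
			t = MAX_PAUSE;

		printf("max pause = %lu jiffies (%lums at HZ=1000)\n", t, t);
		return 0;
	}
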
995 | |||
996 | /* | ||
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 997 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 998 | * data. It looks at the number of dirty pages in the machine and will force |
474 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 999 | * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. |
475 | * If we're over `background_thresh' then the writeback threads are woken to | 1000 | * If we're over `background_thresh' then the writeback threads are woken to |
476 | * perform some writeout. | 1001 | * perform some writeout. |
477 | */ | 1002 | */ |
478 | static void balance_dirty_pages(struct address_space *mapping, | 1003 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 1004 | unsigned long pages_dirtied) |
480 | { | 1005 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 1006 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
482 | long nr_writeback, bdi_nr_writeback; | 1007 | unsigned long bdi_reclaimable; |
1008 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | ||
1009 | unsigned long bdi_dirty; | ||
1010 | unsigned long freerun; | ||
483 | unsigned long background_thresh; | 1011 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 1012 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 1013 | unsigned long bdi_thresh; |
486 | unsigned long pages_written = 0; | 1014 | long pause = 0; |
487 | unsigned long pause = 1; | 1015 | long max_pause; |
488 | bool dirty_exceeded = false; | 1016 | bool dirty_exceeded = false; |
1017 | unsigned long task_ratelimit; | ||
1018 | unsigned long dirty_ratelimit; | ||
1019 | unsigned long pos_ratio; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1020 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1021 | unsigned long start_time = jiffies; | ||
490 | 1022 | ||
491 | for (;;) { | 1023 | for (;;) { |
492 | struct writeback_control wbc = { | 1024 | /* |
493 | .sync_mode = WB_SYNC_NONE, | 1025 | * Unstable writes are a feature of certain networked |
494 | .older_than_this = NULL, | 1026 | * filesystems (i.e. NFS) in which data may have been |
495 | .nr_to_write = write_chunk, | 1027 | * written to the server's write cache, but has not yet |
496 | .range_cyclic = 1, | 1028 | * been flushed to permanent storage. |
497 | }; | 1029 | */ |
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 1030 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 1031 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 1032 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 1033 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1034 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 1035 | ||
@@ -507,12 +1038,28 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
507 | * catch-up. This avoids (excessively) small writeouts | 1038 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 1039 | * when the bdi limits are ramping up. |
509 | */ | 1040 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 1041 | freerun = dirty_freerun_ceiling(dirty_thresh, |
511 | (background_thresh + dirty_thresh) / 2) | 1042 | background_thresh); |
1043 | if (nr_dirty <= freerun) | ||
512 | break; | 1044 | break; |
513 | 1045 | ||
1046 | if (unlikely(!writeback_in_progress(bdi))) | ||
1047 | bdi_start_background_writeback(bdi); | ||
1048 | |||
1049 | /* | ||
1050 | * bdi_thresh is not treated as a hard limiting factor the way | ||
1051 | * dirty_thresh is, for two reasons: | ||
1052 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
1053 | * - in a system with HDD and USB key, the USB key may somehow | ||
1054 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
1055 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
1056 | * In this case we don't want to hard throttle the USB key | ||
1057 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
1058 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
1059 | * bdi_position_ratio() will let the dirtier task progress | ||
1060 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
1061 | */ | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 1062 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 1063 | ||
517 | /* | 1064 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 1065 | * In order to avoid the stacked BDI deadlock we need |
@@ -524,63 +1071,98 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | * actually dirty; with m+n sitting in the percpu | 1071 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 1072 | * deltas. |
526 | */ | 1073 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 1074 | if (bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 1075 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 1076 | bdi_dirty = bdi_reclaimable + |
1077 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 1078 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 1079 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 1080 | bdi_dirty = bdi_reclaimable + |
1081 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 1082 | } |
534 | 1083 | ||
535 | /* | 1084 | dirty_exceeded = (bdi_dirty > bdi_thresh) || |
536 | * The bdi thresh is somehow "soft" limit derived from the | 1085 | (nr_dirty > dirty_thresh); |
537 | * global "hard" limit. The former helps to prevent heavy IO | 1086 | if (dirty_exceeded && !bdi->dirty_exceeded) |
538 | * bdi or process from holding back light ones; The latter is | 1087 | bdi->dirty_exceeded = 1; |
539 | * the last resort safeguard. | ||
540 | */ | ||
541 | dirty_exceeded = | ||
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | ||
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | ||
544 | 1088 | ||
545 | if (!dirty_exceeded) | 1089 | bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, |
546 | break; | 1090 | nr_dirty, bdi_thresh, bdi_dirty, |
1091 | start_time); | ||
547 | 1092 | ||
548 | if (!bdi->dirty_exceeded) | 1093 | max_pause = bdi_max_pause(bdi, bdi_dirty); |
549 | bdi->dirty_exceeded = 1; | ||
550 | 1094 | ||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 1095 | dirty_ratelimit = bdi->dirty_ratelimit; |
552 | * Unstable writes are a feature of certain networked | 1096 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
553 | * filesystems (i.e. NFS) in which data may have been | 1097 | background_thresh, nr_dirty, |
554 | * written to the server's write cache, but has not yet | 1098 | bdi_thresh, bdi_dirty); |
555 | * been flushed to permanent storage. | 1099 | if (unlikely(pos_ratio == 0)) { |
556 | * Only move pages to writeback if this bdi is over its | 1100 | pause = max_pause; |
557 | * threshold otherwise wait until the disk writes catch | 1101 | goto pause; |
558 | * up. | 1102 | } |
559 | */ | 1103 | task_ratelimit = (u64)dirty_ratelimit * |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 1104 | pos_ratio >> RATELIMIT_CALC_SHIFT; |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 1105 | pause = (HZ * pages_dirtied) / (task_ratelimit | 1); |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 1106 | if (unlikely(pause <= 0)) { |
563 | pages_written += write_chunk - wbc.nr_to_write; | 1107 | trace_balance_dirty_pages(bdi, |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 1108 | dirty_thresh, |
565 | if (pages_written >= write_chunk) | 1109 | background_thresh, |
566 | break; /* We've done our duty */ | 1110 | nr_dirty, |
1111 | bdi_thresh, | ||
1112 | bdi_dirty, | ||
1113 | dirty_ratelimit, | ||
1114 | task_ratelimit, | ||
1115 | pages_dirtied, | ||
1116 | pause, | ||
1117 | start_time); | ||
1118 | pause = 1; /* avoid resetting nr_dirtied_pause below */ | ||
1119 | break; | ||
567 | } | 1120 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | 1121 | pause = min(pause, max_pause); |
1122 | |||
1123 | pause: | ||
1124 | trace_balance_dirty_pages(bdi, | ||
1125 | dirty_thresh, | ||
1126 | background_thresh, | ||
1127 | nr_dirty, | ||
1128 | bdi_thresh, | ||
1129 | bdi_dirty, | ||
1130 | dirty_ratelimit, | ||
1131 | task_ratelimit, | ||
1132 | pages_dirtied, | ||
1133 | pause, | ||
1134 | start_time); | ||
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1135 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | 1136 | io_schedule_timeout(pause); |
571 | 1137 | ||
572 | /* | 1138 | /* |
573 | * Increase the delay for each loop, up to our previous | 1139 | * task_ratelimit being non-zero roughly corresponds to (nr_dirty < |
574 | * default of taking a 100ms nap. | 1140 | * dirty_thresh); this also keeps "1000+ dd on a slow USB stick" under control. |
575 | */ | 1141 | */ |
576 | pause <<= 1; | 1142 | if (task_ratelimit) |
577 | if (pause > HZ / 10) | 1143 | break; |
578 | pause = HZ / 10; | ||
579 | } | 1144 | } |
580 | 1145 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 1146 | if (!dirty_exceeded && bdi->dirty_exceeded) |
582 | bdi->dirty_exceeded = 0; | 1147 | bdi->dirty_exceeded = 0; |
583 | 1148 | ||
1149 | current->nr_dirtied = 0; | ||
1150 | if (pause == 0) { /* in freerun area */ | ||
1151 | current->nr_dirtied_pause = | ||
1152 | dirty_poll_interval(nr_dirty, dirty_thresh); | ||
1153 | } else if (pause <= max_pause / 4 && | ||
1154 | pages_dirtied >= current->nr_dirtied_pause) { | ||
1155 | current->nr_dirtied_pause = clamp_val( | ||
1156 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1157 | pages_dirtied + pages_dirtied / 8, | ||
1158 | pages_dirtied * 4); | ||
1159 | } else if (pause >= max_pause) { | ||
1160 | current->nr_dirtied_pause = 1 | clamp_val( | ||
1161 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1162 | pages_dirtied / 4, | ||
1163 | pages_dirtied - pages_dirtied / 8); | ||
1164 | } | ||
1165 | |||
584 | if (writeback_in_progress(bdi)) | 1166 | if (writeback_in_progress(bdi)) |
585 | return; | 1167 | return; |
586 | 1168 | ||
@@ -592,8 +1174,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
592 | * In normal mode, we start background writeout at the lower | 1174 | * In normal mode, we start background writeout at the lower |
593 | * background_thresh, to keep the amount of dirty memory low. | 1175 | * background_thresh, to keep the amount of dirty memory low. |
594 | */ | 1176 | */ |
595 | if ((laptop_mode && pages_written) || | 1177 | if (laptop_mode) |
596 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 1178 | return; |
1179 | |||
1180 | if (nr_reclaimable > background_thresh) | ||
597 | bdi_start_background_writeback(bdi); | 1181 | bdi_start_background_writeback(bdi); |
598 | } | 1182 | } |
599 | 1183 | ||
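The arithmetic at the heart of the throttling loop above is compact enough to model on its own: the bdi-wide dirty_ratelimit is scaled by pos_ratio (a fixed-point fraction), and the task then sleeps long enough to "pay for" the pages it just dirtied at that rate. The figures below are invented and RATELIMIT_CALC_SHIFT is assumed to be 10 fractional bits; this is a sketch of the formula, not the kernel's control loop.

#include <stdio.h>
#include <stdint.h>

#define HZ                   100
#define RATELIMIT_CALC_SHIFT 10         /* assumed fixed-point shift */

int main(void)
{
        unsigned long dirty_ratelimit = 2560;   /* pages/s the bdi can absorb */
        unsigned long pos_ratio = 512;          /* 0.5 in fixed point: above setpoint */
        unsigned long pages_dirtied = 32;       /* dirtied since the last throttle */

        uint64_t task_ratelimit = (uint64_t)dirty_ratelimit *
                                  pos_ratio >> RATELIMIT_CALC_SHIFT;
        long pause = (HZ * pages_dirtied) / (task_ratelimit | 1);

        /* 2560 * 0.5 = 1280 pages/s; 32 pages at that rate is ~25ms, i.e. 2 jiffies */
        printf("task_ratelimit=%llu pages/s, pause=%ld jiffies\n",
               (unsigned long long)task_ratelimit, pause);
        return 0;
}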
@@ -607,7 +1191,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
607 | } | 1191 | } |
608 | } | 1192 | } |
609 | 1193 | ||
610 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | 1194 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
611 | 1195 | ||
612 | /** | 1196 | /** |
613 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1197 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
@@ -626,28 +1210,40 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | |||
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 1210 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 1211 | unsigned long nr_pages_dirtied) |
628 | { | 1212 | { |
629 | unsigned long ratelimit; | 1213 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
630 | unsigned long *p; | 1214 | int ratelimit; |
1215 | int *p; | ||
1216 | |||
1217 | if (!bdi_cap_account_dirty(bdi)) | ||
1218 | return; | ||
631 | 1219 | ||
632 | ratelimit = ratelimit_pages; | 1220 | ratelimit = current->nr_dirtied_pause; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 1221 | if (bdi->dirty_exceeded) |
634 | ratelimit = 8; | 1222 | ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
635 | 1223 | ||
1224 | current->nr_dirtied += nr_pages_dirtied; | ||
1225 | |||
1226 | preempt_disable(); | ||
636 | /* | 1227 | /* |
637 | * Check the rate limiting. Also, we do not want to throttle real-time | 1228 | * This prevents one CPU from accumulating too many dirtied pages |
638 | * tasks in balance_dirty_pages(). Period. | 1229 | * without calling into balance_dirty_pages(), which can happen when |
1230 | * 1000+ tasks all start dirtying pages at exactly the same time and | ||
1231 | * hence all honour a too-large initial task->nr_dirtied_pause. | ||
639 | */ | 1232 | */ |
640 | preempt_disable(); | ||
641 | p = &__get_cpu_var(bdp_ratelimits); | 1233 | p = &__get_cpu_var(bdp_ratelimits); |
642 | *p += nr_pages_dirtied; | 1234 | if (unlikely(current->nr_dirtied >= ratelimit)) |
643 | if (unlikely(*p >= ratelimit)) { | ||
644 | ratelimit = sync_writeback_pages(*p); | ||
645 | *p = 0; | 1235 | *p = 0; |
646 | preempt_enable(); | 1236 | else { |
647 | balance_dirty_pages(mapping, ratelimit); | 1237 | *p += nr_pages_dirtied; |
648 | return; | 1238 | if (unlikely(*p >= ratelimit_pages)) { |
1239 | *p = 0; | ||
1240 | ratelimit = 0; | ||
1241 | } | ||
649 | } | 1242 | } |
650 | preempt_enable(); | 1243 | preempt_enable(); |
1244 | |||
1245 | if (unlikely(current->nr_dirtied >= ratelimit)) | ||
1246 | balance_dirty_pages(mapping, current->nr_dirtied); | ||
651 | } | 1247 | } |
652 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1248 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
653 | 1249 | ||
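The rewritten entry point above throttles on two levels: each task tracks nr_dirtied against its own nr_dirtied_pause quota, and a per-CPU bdp_ratelimits counter acts as a backstop so that a flood of tasks which all stay just under their individual quotas still ends up in balance_dirty_pages(). Below is a minimal user-space model of that double gate; struct task, should_balance() and the constants are stand-ins chosen for illustration, not kernel definitions.

#include <stdio.h>
#include <stdbool.h>

struct task {
        unsigned long nr_dirtied;
        unsigned long nr_dirtied_pause;
};

static unsigned long bdp_ratelimits;            /* per-CPU in the kernel, global here */
static const unsigned long ratelimit_pages = 64;

static bool should_balance(struct task *t, unsigned long nr_pages,
                           bool dirty_exceeded)
{
        unsigned long ratelimit = t->nr_dirtied_pause;

        if (dirty_exceeded && ratelimit > 8)
                ratelimit = 8;                  /* throttle hard when over the limits */

        t->nr_dirtied += nr_pages;

        if (t->nr_dirtied >= ratelimit) {
                bdp_ratelimits = 0;
                return true;
        }
        bdp_ratelimits += nr_pages;
        if (bdp_ratelimits >= ratelimit_pages) {
                bdp_ratelimits = 0;
                ratelimit = 0;                  /* backstop: force a balance call */
        }
        return t->nr_dirtied >= ratelimit;
}

int main(void)
{
        struct task t = { .nr_dirtied = 0, .nr_dirtied_pause = 32 };

        for (int i = 0; i < 6; i++) {
                bool balance = should_balance(&t, 10, false);

                printf("write %d: balance=%d\n", i, balance);
                if (balance)
                        t.nr_dirtied = 0;       /* balance_dirty_pages() resets this */
        }
        return 0;
}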
@@ -703,7 +1299,8 @@ void laptop_mode_timer_fn(unsigned long data) | |||
703 | * threshold | 1299 | * threshold |
704 | */ | 1300 | */ |
705 | if (bdi_has_dirty_io(&q->backing_dev_info)) | 1301 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
706 | bdi_start_writeback(&q->backing_dev_info, nr_pages); | 1302 | bdi_start_writeback(&q->backing_dev_info, nr_pages, |
1303 | WB_REASON_LAPTOP_TIMER); | ||
707 | } | 1304 | } |
708 | 1305 | ||
709 | /* | 1306 | /* |
@@ -742,22 +1339,17 @@ void laptop_sync_completion(void) | |||
742 | * | 1339 | * |
743 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are | 1340 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are |
744 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory | 1341 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
745 | * thresholds before writeback cuts in. | 1342 | * thresholds. |
746 | * | ||
747 | * But the limit should not be set too high. Because it also controls the | ||
748 | * amount of memory which the balance_dirty_pages() caller has to write back. | ||
749 | * If this is too large then the caller will block on the IO queue all the | ||
750 | * time. So limit it to four megabytes - the balance_dirty_pages() caller | ||
751 | * will write six megabyte chunks, max. | ||
752 | */ | 1343 | */ |
753 | 1344 | ||
754 | void writeback_set_ratelimit(void) | 1345 | void writeback_set_ratelimit(void) |
755 | { | 1346 | { |
756 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); | 1347 | unsigned long background_thresh; |
1348 | unsigned long dirty_thresh; | ||
1349 | global_dirty_limits(&background_thresh, &dirty_thresh); | ||
1350 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); | ||
757 | if (ratelimit_pages < 16) | 1351 | if (ratelimit_pages < 16) |
758 | ratelimit_pages = 16; | 1352 | ratelimit_pages = 16; |
759 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | ||
760 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | ||
761 | } | 1353 | } |
762 | 1354 | ||
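A quick worked example of the new sizing, with invented figures: on a machine whose global dirty threshold works out to 200000 pages (about 780MB of 4K pages) with 8 online CPUs, the per-CPU backstop becomes 200000 / (8 * 32) = 781 pages, rather than the old value derived from total memory and capped at 4MB.

#include <stdio.h>

int main(void)
{
        unsigned long dirty_thresh = 200000;    /* assumed global dirty limit, in pages */
        unsigned long num_online_cpus = 8;
        unsigned long ratelimit_pages =
                dirty_thresh / (num_online_cpus * 32);

        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
        printf("ratelimit_pages = %lu\n", ratelimit_pages);     /* 781 */
        return 0;
}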
763 | static int __cpuinit | 1355 | static int __cpuinit |
@@ -892,12 +1484,12 @@ int write_cache_pages(struct address_space *mapping, | |||
892 | range_whole = 1; | 1484 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1485 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1486 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1487 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1488 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1489 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1490 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1491 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1492 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1493 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1494 | done_index = index; |
903 | while (!done && (index <= end)) { | 1495 | while (!done && (index <= end)) { |
@@ -1127,6 +1719,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
1127 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1719 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1128 | __inc_zone_page_state(page, NR_DIRTIED); | 1720 | __inc_zone_page_state(page, NR_DIRTIED); |
1129 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 1721 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1722 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | ||
1130 | task_dirty_inc(current); | 1723 | task_dirty_inc(current); |
1131 | task_io_account_write(PAGE_CACHE_SIZE); | 1724 | task_io_account_write(PAGE_CACHE_SIZE); |
1132 | } | 1725 | } |
@@ -1141,7 +1734,6 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
1141 | void account_page_writeback(struct page *page) | 1734 | void account_page_writeback(struct page *page) |
1142 | { | 1735 | { |
1143 | inc_zone_page_state(page, NR_WRITEBACK); | 1736 | inc_zone_page_state(page, NR_WRITEBACK); |
1144 | inc_zone_page_state(page, NR_WRITTEN); | ||
1145 | } | 1737 | } |
1146 | EXPORT_SYMBOL(account_page_writeback); | 1738 | EXPORT_SYMBOL(account_page_writeback); |
1147 | 1739 | ||
@@ -1358,8 +1950,10 @@ int test_clear_page_writeback(struct page *page) | |||
1358 | } else { | 1950 | } else { |
1359 | ret = TestClearPageWriteback(page); | 1951 | ret = TestClearPageWriteback(page); |
1360 | } | 1952 | } |
1361 | if (ret) | 1953 | if (ret) { |
1362 | dec_zone_page_state(page, NR_WRITEBACK); | 1954 | dec_zone_page_state(page, NR_WRITEBACK); |
1955 | inc_zone_page_state(page, NR_WRITTEN); | ||
1956 | } | ||
1363 | return ret; | 1957 | return ret; |
1364 | } | 1958 | } |
1365 | 1959 | ||
@@ -1405,10 +1999,6 @@ EXPORT_SYMBOL(test_set_page_writeback); | |||
1405 | */ | 1999 | */ |
1406 | int mapping_tagged(struct address_space *mapping, int tag) | 2000 | int mapping_tagged(struct address_space *mapping, int tag) |
1407 | { | 2001 | { |
1408 | int ret; | 2002 | return radix_tree_tagged(&mapping->page_tree, tag); |
1409 | rcu_read_lock(); | ||
1410 | ret = radix_tree_tagged(&mapping->page_tree, tag); | ||
1411 | rcu_read_unlock(); | ||
1412 | return ret; | ||
1413 | } | 2003 | } |
1414 | EXPORT_SYMBOL(mapping_tagged); | 2004 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8985acdab..87b0a3f074e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void) | |||
127 | saved_gfp_mask = gfp_allowed_mask; | 127 | saved_gfp_mask = gfp_allowed_mask; |
128 | gfp_allowed_mask &= ~GFP_IOFS; | 128 | gfp_allowed_mask &= ~GFP_IOFS; |
129 | } | 129 | } |
130 | |||
131 | static bool pm_suspending(void) | ||
132 | { | ||
133 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | ||
134 | return false; | ||
135 | return true; | ||
136 | } | ||
137 | |||
138 | #else | ||
139 | |||
140 | static bool pm_suspending(void) | ||
141 | { | ||
142 | return false; | ||
143 | } | ||
130 | #endif /* CONFIG_PM_SLEEP */ | 144 | #endif /* CONFIG_PM_SLEEP */ |
131 | 145 | ||
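pm_suspending() above infers the suspend state indirectly: pm_restrict_gfp_mask() strips GFP_IOFS out of gfp_allowed_mask while suspend is in flight, so testing those bits is sufficient. The sketch below models just that idea; the bit values and the simplified restore are placeholders, not the kernel's GFP flags or its saved_gfp_mask handling.

#include <stdio.h>
#include <stdbool.h>

#define GFP_IO   0x1u                   /* placeholder bit values */
#define GFP_FS   0x2u
#define GFP_IOFS (GFP_IO | GFP_FS)

static unsigned int gfp_allowed_mask = ~0u;

static void pm_restrict_gfp_mask(void)  { gfp_allowed_mask &= ~GFP_IOFS; }
static void pm_restore_gfp_mask(void)   { gfp_allowed_mask |= GFP_IOFS; }

static bool pm_suspending(void)
{
        /* IO/FS allocations are forbidden only while suspending */
        return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}

int main(void)
{
        printf("running:    %d\n", pm_suspending());    /* 0 */
        pm_restrict_gfp_mask();
        printf("suspending: %d\n", pm_suspending());    /* 1 */
        pm_restore_gfp_mask();
        printf("resumed:    %d\n", pm_suspending());    /* 0 */
        return 0;
}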
132 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 146 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
176 | }; | 190 | }; |
177 | 191 | ||
178 | int min_free_kbytes = 1024; | 192 | int min_free_kbytes = 1024; |
193 | int min_free_order_shift = 1; | ||
179 | 194 | ||
180 | static unsigned long __meminitdata nr_kernel_pages; | 195 | static unsigned long __meminitdata nr_kernel_pages; |
181 | static unsigned long __meminitdata nr_all_pages; | 196 | static unsigned long __meminitdata nr_all_pages; |
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
355 | __SetPageHead(page); | 370 | __SetPageHead(page); |
356 | for (i = 1; i < nr_pages; i++) { | 371 | for (i = 1; i < nr_pages; i++) { |
357 | struct page *p = page + i; | 372 | struct page *p = page + i; |
358 | |||
359 | __SetPageTail(p); | 373 | __SetPageTail(p); |
374 | set_page_count(p, 0); | ||
360 | p->first_page = page; | 375 | p->first_page = page; |
361 | } | 376 | } |
362 | } | 377 | } |
@@ -1487,7 +1502,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1487 | free_pages -= z->free_area[o].nr_free << o; | 1502 | free_pages -= z->free_area[o].nr_free << o; |
1488 | 1503 | ||
1489 | /* Require fewer higher order pages to be free */ | 1504 | /* Require fewer higher order pages to be free */ |
1490 | min >>= 1; | 1505 | min >>= min_free_order_shift; |
1491 | 1506 | ||
1492 | if (free_pages <= min) | 1507 | if (free_pages <= min) |
1493 | return false; | 1508 | return false; |
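The hunk above is easiest to see with concrete numbers. In the order-by-order walk, pages below the requested order are discounted from the free count while the required minimum shrinks by min_free_order_shift per step, so raising the shift (the new tunable) relaxes the demand for high-order blocks. The model below uses invented free-area counts and a made-up watermark, and ignores lowmem_reserve entirely.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

static int min_free_order_shift = 1;

static bool zone_watermark_ok_model(int order, long mark, long free_pages,
                                    const unsigned long *nr_free)
{
        long min = mark;

        if (free_pages <= min)
                return false;
        for (int o = 0; o < order; o++) {
                /* pages of order o are too small to serve this request */
                free_pages -= nr_free[o] << o;
                /* require fewer higher order pages to be free */
                min >>= min_free_order_shift;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* mostly order-0 pages free, very few high-order blocks */
        unsigned long nr_free[MAX_ORDER] = { 3000, 200, 40, 8, 2, 1 };
        long free_pages = 3000 + 200 * 2 + 40 * 4 + 8 * 8 + 2 * 16 + 1 * 32;

        printf("order-3, shift=1: %d\n",
               zone_watermark_ok_model(3, 1024, free_pages, nr_free));
        min_free_order_shift = 2;       /* relax the higher-order requirement */
        printf("order-3, shift=2: %d\n",
               zone_watermark_ok_model(3, 1024, free_pages, nr_free));
        return 0;
}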
@@ -1616,6 +1631,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1616 | set_bit(i, zlc->fullzones); | 1631 | set_bit(i, zlc->fullzones); |
1617 | } | 1632 | } |
1618 | 1633 | ||
1634 | /* | ||
1635 | * Clear all zones' "full" bits: called after direct reclaim makes progress | ||
1636 | * so that a zone that was recently full is not skipped over for up to a second. | ||
1637 | */ | ||
1638 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1639 | { | ||
1640 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1641 | |||
1642 | zlc = zonelist->zlcache_ptr; | ||
1643 | if (!zlc) | ||
1644 | return; | ||
1645 | |||
1646 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1647 | } | ||
1648 | |||
1619 | #else /* CONFIG_NUMA */ | 1649 | #else /* CONFIG_NUMA */ |
1620 | 1650 | ||
1621 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1651 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1632,6 +1662,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | |||
1632 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1662 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1633 | { | 1663 | { |
1634 | } | 1664 | } |
1665 | |||
1666 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1667 | { | ||
1668 | } | ||
1635 | #endif /* CONFIG_NUMA */ | 1669 | #endif /* CONFIG_NUMA */ |
1636 | 1670 | ||
1637 | /* | 1671 | /* |
@@ -1664,7 +1698,7 @@ zonelist_scan: | |||
1664 | continue; | 1698 | continue; |
1665 | if ((alloc_flags & ALLOC_CPUSET) && | 1699 | if ((alloc_flags & ALLOC_CPUSET) && |
1666 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1700 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1667 | goto try_next_zone; | 1701 | continue; |
1668 | 1702 | ||
1669 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1703 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1670 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1704 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
@@ -1676,17 +1710,36 @@ zonelist_scan: | |||
1676 | classzone_idx, alloc_flags)) | 1710 | classzone_idx, alloc_flags)) |
1677 | goto try_this_zone; | 1711 | goto try_this_zone; |
1678 | 1712 | ||
1713 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1714 | /* | ||
1715 | * we do zlc_setup if there are multiple nodes | ||
1716 | * and before considering the first zone allowed | ||
1717 | * by the cpuset. | ||
1718 | */ | ||
1719 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1720 | zlc_active = 1; | ||
1721 | did_zlc_setup = 1; | ||
1722 | } | ||
1723 | |||
1679 | if (zone_reclaim_mode == 0) | 1724 | if (zone_reclaim_mode == 0) |
1680 | goto this_zone_full; | 1725 | goto this_zone_full; |
1681 | 1726 | ||
1727 | /* | ||
1728 | * As we may have just activated ZLC, check if the first | ||
1729 | * eligible zone has failed zone_reclaim recently. | ||
1730 | */ | ||
1731 | if (NUMA_BUILD && zlc_active && | ||
1732 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1733 | continue; | ||
1734 | |||
1682 | ret = zone_reclaim(zone, gfp_mask, order); | 1735 | ret = zone_reclaim(zone, gfp_mask, order); |
1683 | switch (ret) { | 1736 | switch (ret) { |
1684 | case ZONE_RECLAIM_NOSCAN: | 1737 | case ZONE_RECLAIM_NOSCAN: |
1685 | /* did not scan */ | 1738 | /* did not scan */ |
1686 | goto try_next_zone; | 1739 | continue; |
1687 | case ZONE_RECLAIM_FULL: | 1740 | case ZONE_RECLAIM_FULL: |
1688 | /* scanned but unreclaimable */ | 1741 | /* scanned but unreclaimable */ |
1689 | goto this_zone_full; | 1742 | continue; |
1690 | default: | 1743 | default: |
1691 | /* did we reclaim enough */ | 1744 | /* did we reclaim enough */ |
1692 | if (!zone_watermark_ok(zone, order, mark, | 1745 | if (!zone_watermark_ok(zone, order, mark, |
@@ -1703,16 +1756,6 @@ try_this_zone: | |||
1703 | this_zone_full: | 1756 | this_zone_full: |
1704 | if (NUMA_BUILD) | 1757 | if (NUMA_BUILD) |
1705 | zlc_mark_zone_full(zonelist, z); | 1758 | zlc_mark_zone_full(zonelist, z); |
1706 | try_next_zone: | ||
1707 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1708 | /* | ||
1709 | * we do zlc_setup after the first zone is tried but only | ||
1710 | * if there are multiple nodes make it worthwhile | ||
1711 | */ | ||
1712 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1713 | zlc_active = 1; | ||
1714 | did_zlc_setup = 1; | ||
1715 | } | ||
1716 | } | 1759 | } |
1717 | 1760 | ||
1718 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1761 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
@@ -1869,14 +1912,20 @@ static struct page * | |||
1869 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1912 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1870 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1913 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1871 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1914 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1872 | int migratetype, unsigned long *did_some_progress, | 1915 | int migratetype, bool sync_migration, |
1873 | bool sync_migration) | 1916 | bool *deferred_compaction, |
1917 | unsigned long *did_some_progress) | ||
1874 | { | 1918 | { |
1875 | struct page *page; | 1919 | struct page *page; |
1876 | 1920 | ||
1877 | if (!order || compaction_deferred(preferred_zone)) | 1921 | if (!order) |
1878 | return NULL; | 1922 | return NULL; |
1879 | 1923 | ||
1924 | if (compaction_deferred(preferred_zone)) { | ||
1925 | *deferred_compaction = true; | ||
1926 | return NULL; | ||
1927 | } | ||
1928 | |||
1880 | current->flags |= PF_MEMALLOC; | 1929 | current->flags |= PF_MEMALLOC; |
1881 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1930 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1882 | nodemask, sync_migration); | 1931 | nodemask, sync_migration); |
@@ -1904,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
1904 | * but not enough to satisfy watermarks. | 1953 | * but not enough to satisfy watermarks. |
1905 | */ | 1954 | */ |
1906 | count_vm_event(COMPACTFAIL); | 1955 | count_vm_event(COMPACTFAIL); |
1907 | defer_compaction(preferred_zone); | 1956 | |
1957 | /* | ||
1958 | * As async compaction considers a subset of pageblocks, only | ||
1959 | * defer if the failure was a sync compaction failure. | ||
1960 | */ | ||
1961 | if (sync_migration) | ||
1962 | defer_compaction(preferred_zone); | ||
1908 | 1963 | ||
1909 | cond_resched(); | 1964 | cond_resched(); |
1910 | } | 1965 | } |
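The defer_compaction()/compaction_deferred() pair referenced above implements a backoff: each recorded failure widens a window of attempts during which compaction is skipped for that zone. The sketch below is a loose user-space model of that behaviour, intended only to make the "defer on sync failure only" change easier to picture; the field names, the shift cap of 6 and the exact counting are illustrative rather than the kernel's implementation.

#include <stdio.h>
#include <stdbool.h>

struct zone_model {
        unsigned int considered;        /* attempts seen since the last failure */
        unsigned int defer_shift;       /* backoff window = 1 << defer_shift */
};

static void defer_compaction(struct zone_model *z)
{
        z->considered = 0;
        if (z->defer_shift < 6)
                z->defer_shift++;
}

static bool compaction_deferred(struct zone_model *z)
{
        unsigned int limit = 1u << z->defer_shift;

        if (++z->considered <= limit)
                return true;            /* still backing off: skip compaction */
        z->considered = 0;
        return false;
}

int main(void)
{
        struct zone_model z = { 0, 0 };

        defer_compaction(&z);           /* a sync compaction failure ... */
        defer_compaction(&z);           /* ... and another: window is now 4 */
        for (int attempt = 1; attempt <= 6; attempt++)
                printf("attempt %d: deferred=%d\n", attempt,
                       compaction_deferred(&z));
        return 0;
}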
@@ -1916,8 +1971,9 @@ static inline struct page * | |||
1916 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1971 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1917 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1972 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1918 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1973 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1919 | int migratetype, unsigned long *did_some_progress, | 1974 | int migratetype, bool sync_migration, |
1920 | bool sync_migration) | 1975 | bool *deferred_compaction, |
1976 | unsigned long *did_some_progress) | ||
1921 | { | 1977 | { |
1922 | return NULL; | 1978 | return NULL; |
1923 | } | 1979 | } |
@@ -1954,6 +2010,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1954 | if (unlikely(!(*did_some_progress))) | 2010 | if (unlikely(!(*did_some_progress))) |
1955 | return NULL; | 2011 | return NULL; |
1956 | 2012 | ||
2013 | /* After successful reclaim, reconsider all zones for allocation */ | ||
2014 | if (NUMA_BUILD) | ||
2015 | zlc_clear_zones_full(zonelist); | ||
2016 | |||
1957 | retry: | 2017 | retry: |
1958 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2018 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1959 | zonelist, high_zoneidx, | 2019 | zonelist, high_zoneidx, |
@@ -2063,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2063 | unsigned long pages_reclaimed = 0; | 2123 | unsigned long pages_reclaimed = 0; |
2064 | unsigned long did_some_progress; | 2124 | unsigned long did_some_progress; |
2065 | bool sync_migration = false; | 2125 | bool sync_migration = false; |
2126 | bool deferred_compaction = false; | ||
2066 | 2127 | ||
2067 | /* | 2128 | /* |
2068 | * In the slowpath, we sanity check order to avoid ever trying to | 2129 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2143,12 +2204,22 @@ rebalance: | |||
2143 | zonelist, high_zoneidx, | 2204 | zonelist, high_zoneidx, |
2144 | nodemask, | 2205 | nodemask, |
2145 | alloc_flags, preferred_zone, | 2206 | alloc_flags, preferred_zone, |
2146 | migratetype, &did_some_progress, | 2207 | migratetype, sync_migration, |
2147 | sync_migration); | 2208 | &deferred_compaction, |
2209 | &did_some_progress); | ||
2148 | if (page) | 2210 | if (page) |
2149 | goto got_pg; | 2211 | goto got_pg; |
2150 | sync_migration = true; | 2212 | sync_migration = true; |
2151 | 2213 | ||
2214 | /* | ||
2215 | * If compaction is deferred for high-order allocations, it is because | ||
2216 | * sync compaction recently failed. If this is the case and the caller | ||
2217 | * has requested the system not be heavily disrupted, fail the | ||
2218 | * allocation now instead of entering direct reclaim. | ||
2219 | */ | ||
2220 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | ||
2221 | goto nopage; | ||
2222 | |||
2152 | /* Try direct reclaim and then allocating */ | 2223 | /* Try direct reclaim and then allocating */ |
2153 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2224 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2154 | zonelist, high_zoneidx, | 2225 | zonelist, high_zoneidx, |
@@ -2193,6 +2264,14 @@ rebalance: | |||
2193 | 2264 | ||
2194 | goto restart; | 2265 | goto restart; |
2195 | } | 2266 | } |
2267 | |||
2268 | /* | ||
2269 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can | ||
2270 | * prevent reclaim from making forward progress without | ||
2271 | * invoking the OOM killer. Bail out if we are suspending. | ||
2272 | */ | ||
2273 | if (pm_suspending()) | ||
2274 | goto nopage; | ||
2196 | } | 2275 | } |
2197 | 2276 | ||
2198 | /* Check if we should retry the allocation */ | 2277 | /* Check if we should retry the allocation */ |
@@ -2211,8 +2290,9 @@ rebalance: | |||
2211 | zonelist, high_zoneidx, | 2290 | zonelist, high_zoneidx, |
2212 | nodemask, | 2291 | nodemask, |
2213 | alloc_flags, preferred_zone, | 2292 | alloc_flags, preferred_zone, |
2214 | migratetype, &did_some_progress, | 2293 | migratetype, sync_migration, |
2215 | sync_migration); | 2294 | &deferred_compaction, |
2295 | &did_some_progress); | ||
2216 | if (page) | 2296 | if (page) |
2217 | goto got_pg; | 2297 | goto got_pg; |
2218 | } | 2298 | } |
@@ -2236,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2236 | { | 2316 | { |
2237 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2317 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2238 | struct zone *preferred_zone; | 2318 | struct zone *preferred_zone; |
2239 | struct page *page; | 2319 | struct page *page = NULL; |
2240 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2320 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2321 | unsigned int cpuset_mems_cookie; | ||
2241 | 2322 | ||
2242 | gfp_mask &= gfp_allowed_mask; | 2323 | gfp_mask &= gfp_allowed_mask; |
2243 | 2324 | ||
@@ -2256,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2256 | if (unlikely(!zonelist->_zonerefs->zone)) | 2337 | if (unlikely(!zonelist->_zonerefs->zone)) |
2257 | return NULL; | 2338 | return NULL; |
2258 | 2339 | ||
2259 | get_mems_allowed(); | 2340 | retry_cpuset: |
2341 | cpuset_mems_cookie = get_mems_allowed(); | ||
2342 | |||
2260 | /* The preferred zone is used for statistics later */ | 2343 | /* The preferred zone is used for statistics later */ |
2261 | first_zones_zonelist(zonelist, high_zoneidx, | 2344 | first_zones_zonelist(zonelist, high_zoneidx, |
2262 | nodemask ? : &cpuset_current_mems_allowed, | 2345 | nodemask ? : &cpuset_current_mems_allowed, |
2263 | &preferred_zone); | 2346 | &preferred_zone); |
2264 | if (!preferred_zone) { | 2347 | if (!preferred_zone) |
2265 | put_mems_allowed(); | 2348 | goto out; |
2266 | return NULL; | ||
2267 | } | ||
2268 | 2349 | ||
2269 | /* First allocation attempt */ | 2350 | /* First allocation attempt */ |
2270 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2351 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -2274,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2274 | page = __alloc_pages_slowpath(gfp_mask, order, | 2355 | page = __alloc_pages_slowpath(gfp_mask, order, |
2275 | zonelist, high_zoneidx, nodemask, | 2356 | zonelist, high_zoneidx, nodemask, |
2276 | preferred_zone, migratetype); | 2357 | preferred_zone, migratetype); |
2277 | put_mems_allowed(); | ||
2278 | 2358 | ||
2279 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2359 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2360 | |||
2361 | out: | ||
2362 | /* | ||
2363 | * When updating a task's mems_allowed, it is possible to race with | ||
2364 | * parallel threads in such a way that an allocation can fail while | ||
2365 | * the mask is being updated. If a page allocation is about to fail, | ||
2366 | * check if the cpuset changed during allocation and if so, retry. | ||
2367 | */ | ||
2368 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
2369 | goto retry_cpuset; | ||
2370 | |||
2280 | return page; | 2371 | return page; |
2281 | } | 2372 | } |
2282 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2373 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
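The cpuset_mems_cookie changes above turn get_mems_allowed()/put_mems_allowed() into a seqcount-style read section: a cookie is sampled before the allocation, and a failure is only trusted if the task's mems_allowed did not change while the result was being produced. The sketch below models the retry pattern with a plain atomic counter; mems_seq, do_allocation() and the forced concurrent bump are inventions for illustration, not the kernel primitives.

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

static atomic_uint mems_seq;            /* bumped whenever mems_allowed changes */

static unsigned int get_mems_allowed(void)
{
        return atomic_load(&mems_seq);
}

static bool put_mems_allowed(unsigned int cookie)
{
        /* true means "nothing changed, the outcome can be trusted" */
        return atomic_load(&mems_seq) == cookie;
}

static void *do_allocation(void)
{
        static int calls;
        /* pretend the nodemask raced and the first attempt failed */
        return ++calls < 2 ? NULL : "page";
}

int main(void)
{
        void *page;
        unsigned int cookie;

retry_cpuset:
        cookie = get_mems_allowed();
        page = do_allocation();
        atomic_fetch_add(&mems_seq, 1); /* simulate a concurrent cpuset update */
        if (!put_mems_allowed(cookie) && !page)
                goto retry_cpuset;      /* only a suspect failure is retried */

        printf("got %s after retry\n", (char *)page);
        return 0;
}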
@@ -2500,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2500 | bool skip_free_areas_node(unsigned int flags, int nid) | 2591 | bool skip_free_areas_node(unsigned int flags, int nid) |
2501 | { | 2592 | { |
2502 | bool ret = false; | 2593 | bool ret = false; |
2594 | unsigned int cpuset_mems_cookie; | ||
2503 | 2595 | ||
2504 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2596 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2505 | goto out; | 2597 | goto out; |
2506 | 2598 | ||
2507 | get_mems_allowed(); | 2599 | do { |
2508 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2600 | cpuset_mems_cookie = get_mems_allowed(); |
2509 | put_mems_allowed(); | 2601 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2602 | } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
2510 | out: | 2603 | out: |
2511 | return ret; | 2604 | return ret; |
2512 | } | 2605 | } |
@@ -3356,9 +3449,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3356 | unsigned long block_migratetype; | 3449 | unsigned long block_migratetype; |
3357 | int reserve; | 3450 | int reserve; |
3358 | 3451 | ||
3359 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 3452 | /* |
3453 | * Get the start pfn, end pfn and the number of blocks to reserve. | ||
3454 | * We have to be careful to be aligned to pageblock_nr_pages to | ||
3455 | * make sure that we always check pfn_valid for the first page in | ||
3456 | * the block. | ||
3457 | */ | ||
3360 | start_pfn = zone->zone_start_pfn; | 3458 | start_pfn = zone->zone_start_pfn; |
3361 | end_pfn = start_pfn + zone->spanned_pages; | 3459 | end_pfn = start_pfn + zone->spanned_pages; |
3460 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | ||
3362 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3461 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3363 | pageblock_order; | 3462 | pageblock_order; |
3364 | 3463 | ||
@@ -3380,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3380 | if (page_to_nid(page) != zone_to_nid(zone)) | 3479 | if (page_to_nid(page) != zone_to_nid(zone)) |
3381 | continue; | 3480 | continue; |
3382 | 3481 | ||
3383 | /* Blocks with reserved pages will never free, skip them. */ | ||
3384 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3385 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3386 | continue; | ||
3387 | |||
3388 | block_migratetype = get_pageblock_migratetype(page); | 3482 | block_migratetype = get_pageblock_migratetype(page); |
3389 | 3483 | ||
3390 | /* If this block is reserved, account for it */ | 3484 | /* Only test what is necessary when the reserves are not met */ |
3391 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | 3485 | if (reserve > 0) { |
3392 | reserve--; | 3486 | /* |
3393 | continue; | 3487 | * Blocks with reserved pages will never free, skip |
3394 | } | 3488 | * them. |
3489 | */ | ||
3490 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3491 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3492 | continue; | ||
3395 | 3493 | ||
3396 | /* Suitable for reserving if this block is movable */ | 3494 | /* If this block is reserved, account for it */ |
3397 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | 3495 | if (block_migratetype == MIGRATE_RESERVE) { |
3398 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | 3496 | reserve--; |
3399 | move_freepages_block(zone, page, MIGRATE_RESERVE); | 3497 | continue; |
3400 | reserve--; | 3498 | } |
3401 | continue; | 3499 | |
3500 | /* Suitable for reserving if this block is movable */ | ||
3501 | if (block_migratetype == MIGRATE_MOVABLE) { | ||
3502 | set_pageblock_migratetype(page, | ||
3503 | MIGRATE_RESERVE); | ||
3504 | move_freepages_block(zone, page, | ||
3505 | MIGRATE_RESERVE); | ||
3506 | reserve--; | ||
3507 | continue; | ||
3508 | } | ||
3402 | } | 3509 | } |
3403 | 3510 | ||
3404 | /* | 3511 | /* |
@@ -5527,6 +5634,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5527 | bool is_pageblock_removable_nolock(struct page *page) | 5634 | bool is_pageblock_removable_nolock(struct page *page) |
5528 | { | 5635 | { |
5529 | struct zone *zone = page_zone(page); | 5636 | struct zone *zone = page_zone(page); |
5637 | unsigned long pfn = page_to_pfn(page); | ||
5638 | |||
5639 | /* | ||
5640 | * We have to be careful here because we are iterating over memory | ||
5641 | * sections which are not zone aware so we might end up outside of | ||
5642 | * the zone but still within the section. | ||
5643 | */ | ||
5644 | if (!zone || zone->zone_start_pfn > pfn || | ||
5645 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | ||
5646 | return false; | ||
5647 | |||
5530 | return __count_immobile_pages(zone, page, 0); | 5648 | return __count_immobile_pages(zone, page, 0); |
5531 | } | 5649 | } |
5532 | 5650 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d53361..87eac0ea2bf 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd(walk->mm, pmd); |
62 | if (pmd_none_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea534960a04..bfad7246665 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | |||
143 | int page_start, int page_end) | 143 | int page_start, int page_end) |
144 | { | 144 | { |
145 | flush_cache_vunmap( | 145 | flush_cache_vunmap( |
146 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 146 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
147 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 147 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
148 | } | 148 | } |
149 | 149 | ||
150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | 150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | |||
206 | int page_start, int page_end) | 206 | int page_start, int page_end) |
207 | { | 207 | { |
208 | flush_tlb_kernel_range( | 208 | flush_tlb_kernel_range( |
209 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 209 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
210 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 210 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
211 | } | 211 | } |
212 | 212 | ||
213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | 213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | |||
284 | int page_start, int page_end) | 284 | int page_start, int page_end) |
285 | { | 285 | { |
286 | flush_cache_vmap( | 286 | flush_cache_vmap( |
287 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 287 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
288 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 288 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
289 | } | 289 | } |
290 | 290 | ||
291 | /** | 291 | /** |
diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55dbed..af0cc7a58f9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; | |||
116 | static int pcpu_nr_slots __read_mostly; | 116 | static int pcpu_nr_slots __read_mostly; |
117 | static size_t pcpu_chunk_struct_size __read_mostly; | 117 | static size_t pcpu_chunk_struct_size __read_mostly; |
118 | 118 | ||
119 | /* cpus with the lowest and highest unit numbers */ | 119 | /* cpus with the lowest and highest unit addresses */ |
120 | static unsigned int pcpu_first_unit_cpu __read_mostly; | 120 | static unsigned int pcpu_low_unit_cpu __read_mostly; |
121 | static unsigned int pcpu_last_unit_cpu __read_mostly; | 121 | static unsigned int pcpu_high_unit_cpu __read_mostly; |
122 | 122 | ||
123 | /* the address of the first chunk which starts with the kernel static area */ | 123 | /* the address of the first chunk which starts with the kernel static area */ |
124 | void *pcpu_base_addr __read_mostly; | 124 | void *pcpu_base_addr __read_mostly; |
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) | |||
984 | { | 984 | { |
985 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); | 985 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); |
986 | bool in_first_chunk = false; | 986 | bool in_first_chunk = false; |
987 | unsigned long first_start, first_end; | 987 | unsigned long first_low, first_high; |
988 | unsigned int cpu; | 988 | unsigned int cpu; |
989 | 989 | ||
990 | /* | 990 | /* |
991 | * The following test on first_start/end isn't strictly | 991 | * The following test on first_low/high isn't strictly |
992 | * necessary but will speed up lookups of addresses which | 992 | * necessary but will speed up lookups of addresses which |
993 | * aren't in the first chunk. | 993 | * aren't in the first chunk. |
994 | */ | 994 | */ |
995 | first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); | 995 | first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); |
996 | first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, | 996 | first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, |
997 | pcpu_unit_pages); | 997 | pcpu_unit_pages); |
998 | if ((unsigned long)addr >= first_start && | 998 | if ((unsigned long)addr >= first_low && |
999 | (unsigned long)addr < first_end) { | 999 | (unsigned long)addr < first_high) { |
1000 | for_each_possible_cpu(cpu) { | 1000 | for_each_possible_cpu(cpu) { |
1001 | void *start = per_cpu_ptr(base, cpu); | 1001 | void *start = per_cpu_ptr(base, cpu); |
1002 | 1002 | ||
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) | |||
1011 | if (!is_vmalloc_addr(addr)) | 1011 | if (!is_vmalloc_addr(addr)) |
1012 | return __pa(addr); | 1012 | return __pa(addr); |
1013 | else | 1013 | else |
1014 | return page_to_phys(vmalloc_to_page(addr)); | 1014 | return page_to_phys(vmalloc_to_page(addr)) + |
1015 | offset_in_page(addr); | ||
1015 | } else | 1016 | } else |
1016 | return page_to_phys(pcpu_addr_to_page(addr)); | 1017 | return page_to_phys(pcpu_addr_to_page(addr)) + |
1018 | offset_in_page(addr); | ||
1017 | } | 1019 | } |
1018 | 1020 | ||
1019 | /** | 1021 | /** |
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1233 | 1235 | ||
1234 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1236 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1235 | unit_map[cpu] = UINT_MAX; | 1237 | unit_map[cpu] = UINT_MAX; |
1236 | pcpu_first_unit_cpu = NR_CPUS; | 1238 | |
1239 | pcpu_low_unit_cpu = NR_CPUS; | ||
1240 | pcpu_high_unit_cpu = NR_CPUS; | ||
1237 | 1241 | ||
1238 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { | 1242 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { |
1239 | const struct pcpu_group_info *gi = &ai->groups[group]; | 1243 | const struct pcpu_group_info *gi = &ai->groups[group]; |
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1253 | unit_map[cpu] = unit + i; | 1257 | unit_map[cpu] = unit + i; |
1254 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; | 1258 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; |
1255 | 1259 | ||
1256 | if (pcpu_first_unit_cpu == NR_CPUS) | 1260 | /* determine low/high unit_cpu */ |
1257 | pcpu_first_unit_cpu = cpu; | 1261 | if (pcpu_low_unit_cpu == NR_CPUS || |
1258 | pcpu_last_unit_cpu = cpu; | 1262 | unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) |
1263 | pcpu_low_unit_cpu = cpu; | ||
1264 | if (pcpu_high_unit_cpu == NR_CPUS || | ||
1265 | unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) | ||
1266 | pcpu_high_unit_cpu = cpu; | ||
1259 | } | 1267 | } |
1260 | } | 1268 | } |
1261 | pcpu_nr_units = unit; | 1269 | pcpu_nr_units = unit; |
@@ -1622,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1622 | areas[group] = ptr; | 1630 | areas[group] = ptr; |
1623 | 1631 | ||
1624 | base = min(ptr, base); | 1632 | base = min(ptr, base); |
1633 | } | ||
1634 | |||
1635 | /* | ||
1636 | * Copy data and free unused parts. This should happen after all | ||
1637 | * allocations are complete; otherwise, we may end up with | ||
1638 | * overlapping groups. | ||
1639 | */ | ||
1640 | for (group = 0; group < ai->nr_groups; group++) { | ||
1641 | struct pcpu_group_info *gi = &ai->groups[group]; | ||
1642 | void *ptr = areas[group]; | ||
1625 | 1643 | ||
1626 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { | 1644 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { |
1627 | if (gi->cpu_map[i] == NR_CPUS) { | 1645 | if (gi->cpu_map[i] == NR_CPUS) { |
diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb..b5a1b89b2d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/splice.h> | ||
54 | #include <linux/security.h> | 55 | #include <linux/security.h> |
55 | #include <linux/swapops.h> | 56 | #include <linux/swapops.h> |
56 | #include <linux/mempolicy.h> | 57 | #include <linux/mempolicy.h> |
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void) | |||
126 | } | 127 | } |
127 | #endif | 128 | #endif |
128 | 129 | ||
129 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 130 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
130 | struct page **pagep, enum sgp_type sgp, int *type); | 131 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
132 | |||
133 | static inline int shmem_getpage(struct inode *inode, pgoff_t index, | ||
134 | struct page **pagep, enum sgp_type sgp, int *fault_type) | ||
135 | { | ||
136 | return shmem_getpage_gfp(inode, index, pagep, sgp, | ||
137 | mapping_gfp_mask(inode->i_mapping), fault_type); | ||
138 | } | ||
131 | 139 | ||
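The split above is a defaulting-wrapper refactor: shmem_getpage_gfp() takes the allocation mask explicitly, while the inline shmem_getpage() keeps the old calling convention by passing the mapping's own mask, presumably so that other callers can supply a different gfp mask without touching every existing call site. A generic sketch of that shape follows; all names below are invented for illustration, not shmem internals.

#include <stdio.h>

typedef unsigned int gfp_t;

struct mapping { gfp_t gfp_mask; };
struct inode   { struct mapping *i_mapping; };

static gfp_t mapping_gfp_mask(struct mapping *m) { return m->gfp_mask; }

/* worker: the allocation mask is always an explicit argument */
static int getpage_gfp(struct inode *inode, unsigned long index, gfp_t gfp)
{
        (void)inode;
        printf("index %lu allocated with mask %#x\n", index, gfp);
        return 0;
}

/* wrapper: existing callers keep their old, mask-free calling convention */
static int getpage(struct inode *inode, unsigned long index)
{
        return getpage_gfp(inode, index,
                           mapping_gfp_mask(inode->i_mapping));
}

int main(void)
{
        struct mapping m = { .gfp_mask = 0x10u };
        struct inode ino = { .i_mapping = &m };

        getpage(&ino, 3);               /* uses the mapping's default mask */
        getpage_gfp(&ino, 3, 0x4u);     /* caller overrides the mask */
        return 0;
}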
132 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | 140 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) |
133 | { | 141 | { |
@@ -405,10 +413,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns | |||
405 | * @info: info structure for the inode | 413 | * @info: info structure for the inode |
406 | * @index: index of the page to find | 414 | * @index: index of the page to find |
407 | * @sgp: check and recheck i_size? skip allocation? | 415 | * @sgp: check and recheck i_size? skip allocation? |
416 | * @gfp: gfp mask to use for any page allocation | ||
408 | * | 417 | * |
409 | * If the entry does not exist, allocate it. | 418 | * If the entry does not exist, allocate it. |
410 | */ | 419 | */ |
411 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) | 420 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, |
421 | unsigned long index, enum sgp_type sgp, gfp_t gfp) | ||
412 | { | 422 | { |
413 | struct inode *inode = &info->vfs_inode; | 423 | struct inode *inode = &info->vfs_inode; |
414 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 424 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
@@ -438,7 +448,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
438 | } | 448 | } |
439 | 449 | ||
440 | spin_unlock(&info->lock); | 450 | spin_unlock(&info->lock); |
441 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); | 451 | page = shmem_dir_alloc(gfp); |
442 | spin_lock(&info->lock); | 452 | spin_lock(&info->lock); |
443 | 453 | ||
444 | if (!page) { | 454 | if (!page) { |
@@ -1228,92 +1238,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1228 | #endif | 1238 | #endif |
1229 | 1239 | ||
1230 | /* | 1240 | /* |
1231 | * shmem_getpage - either get the page from swap or allocate a new one | 1241 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
1232 | * | 1242 | * |
1233 | * If we allocate a new one we do not mark it dirty. That's up to the | 1243 | * If we allocate a new one we do not mark it dirty. That's up to the |
1234 | * vm. If we swap it in we mark it dirty, since we also free the swap | 1244 | * vm. If we swap it in we mark it dirty, since we also free the swap |
1235 | * entry: a page cannot live in both the swap and page cache. | 1245 | * entry: a page cannot live in both the swap and page cache. |
1236 | */ | 1246 | */ |
1237 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 1247 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, |
1238 | struct page **pagep, enum sgp_type sgp, int *type) | 1248 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1239 | { | 1249 | { |
1240 | struct address_space *mapping = inode->i_mapping; | 1250 | struct address_space *mapping = inode->i_mapping; |
1241 | struct shmem_inode_info *info = SHMEM_I(inode); | 1251 | struct shmem_inode_info *info = SHMEM_I(inode); |
1242 | struct shmem_sb_info *sbinfo; | 1252 | struct shmem_sb_info *sbinfo; |
1243 | struct page *filepage = *pagep; | 1253 | struct page *page; |
1244 | struct page *swappage; | ||
1245 | struct page *prealloc_page = NULL; | 1254 | struct page *prealloc_page = NULL; |
1246 | swp_entry_t *entry; | 1255 | swp_entry_t *entry; |
1247 | swp_entry_t swap; | 1256 | swp_entry_t swap; |
1248 | gfp_t gfp; | ||
1249 | int error; | 1257 | int error; |
1258 | int ret; | ||
1250 | 1259 | ||
1251 | if (idx >= SHMEM_MAX_INDEX) | 1260 | if (idx >= SHMEM_MAX_INDEX) |
1252 | return -EFBIG; | 1261 | return -EFBIG; |
1253 | |||
1254 | if (type) | ||
1255 | *type = 0; | ||
1256 | |||
1257 | /* | ||
1258 | * Normally, filepage is NULL on entry, and either found | ||
1259 | * uptodate immediately, or allocated and zeroed, or read | ||
1260 | * in under swappage, which is then assigned to filepage. | ||
1261 | * But shmem_readpage (required for splice) passes in a locked | ||
1262 | * filepage, which may be found not uptodate by other callers | ||
1263 | * too, and may need to be copied from the swappage read in. | ||
1264 | */ | ||
1265 | repeat: | 1262 | repeat: |
1266 | if (!filepage) | 1263 | page = find_lock_page(mapping, idx); |
1267 | filepage = find_lock_page(mapping, idx); | 1264 | if (page) { |
1268 | if (filepage && PageUptodate(filepage)) | ||
1269 | goto done; | ||
1270 | gfp = mapping_gfp_mask(mapping); | ||
1271 | if (!filepage) { | ||
1272 | /* | 1265 | /* |
1273 | * Try to preload while we can wait, to not make a habit of | 1266 | * Once we can get the page lock, it must be uptodate: |
1274 | * draining atomic reserves; but don't latch on to this cpu. | 1267 | * if there were an error in reading back from swap, |
1268 | * the page would not be inserted into the filecache. | ||
1275 | */ | 1269 | */ |
1276 | error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | 1270 | BUG_ON(!PageUptodate(page)); |
1277 | if (error) | 1271 | goto done; |
1278 | goto failed; | 1272 | } |
1279 | radix_tree_preload_end(); | 1273 | |
1280 | if (sgp != SGP_READ && !prealloc_page) { | 1274 | /* |
1281 | /* We don't care if this fails */ | 1275 | * Try to preload while we can wait, to not make a habit of |
1282 | prealloc_page = shmem_alloc_page(gfp, info, idx); | 1276 | * draining atomic reserves; but don't latch on to this cpu. |
1283 | if (prealloc_page) { | 1277 | */ |
1284 | if (mem_cgroup_cache_charge(prealloc_page, | 1278 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); |
1285 | current->mm, GFP_KERNEL)) { | 1279 | if (error) |
1286 | page_cache_release(prealloc_page); | 1280 | goto out; |
1287 | prealloc_page = NULL; | 1281 | radix_tree_preload_end(); |
1288 | } | 1282 | |
1283 | if (sgp != SGP_READ && !prealloc_page) { | ||
1284 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1285 | if (prealloc_page) { | ||
1286 | SetPageSwapBacked(prealloc_page); | ||
1287 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1288 | current->mm, GFP_KERNEL)) { | ||
1289 | page_cache_release(prealloc_page); | ||
1290 | prealloc_page = NULL; | ||
1289 | } | 1291 | } |
1290 | } | 1292 | } |
1291 | } | 1293 | } |
1292 | error = 0; | ||
1293 | 1294 | ||
1294 | spin_lock(&info->lock); | 1295 | spin_lock(&info->lock); |
1295 | shmem_recalc_inode(inode); | 1296 | shmem_recalc_inode(inode); |
1296 | entry = shmem_swp_alloc(info, idx, sgp); | 1297 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1297 | if (IS_ERR(entry)) { | 1298 | if (IS_ERR(entry)) { |
1298 | spin_unlock(&info->lock); | 1299 | spin_unlock(&info->lock); |
1299 | error = PTR_ERR(entry); | 1300 | error = PTR_ERR(entry); |
1300 | goto failed; | 1301 | goto out; |
1301 | } | 1302 | } |
1302 | swap = *entry; | 1303 | swap = *entry; |
1303 | 1304 | ||
1304 | if (swap.val) { | 1305 | if (swap.val) { |
1305 | /* Look it up and read it in.. */ | 1306 | /* Look it up and read it in.. */ |
1306 | swappage = lookup_swap_cache(swap); | 1307 | page = lookup_swap_cache(swap); |
1307 | if (!swappage) { | 1308 | if (!page) { |
1308 | shmem_swp_unmap(entry); | 1309 | shmem_swp_unmap(entry); |
1309 | spin_unlock(&info->lock); | 1310 | spin_unlock(&info->lock); |
1310 | /* here we actually do the io */ | 1311 | /* here we actually do the io */ |
1311 | if (type) | 1312 | if (fault_type) |
1312 | *type |= VM_FAULT_MAJOR; | 1313 | *fault_type |= VM_FAULT_MAJOR; |
1313 | swappage = shmem_swapin(swap, gfp, info, idx); | 1314 | page = shmem_swapin(swap, gfp, info, idx); |
1314 | if (!swappage) { | 1315 | if (!page) { |
1315 | spin_lock(&info->lock); | 1316 | spin_lock(&info->lock); |
1316 | entry = shmem_swp_alloc(info, idx, sgp); | 1317 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1317 | if (IS_ERR(entry)) | 1318 | if (IS_ERR(entry)) |
1318 | error = PTR_ERR(entry); | 1319 | error = PTR_ERR(entry); |
1319 | else { | 1320 | else { |
@@ -1323,62 +1324,42 @@ repeat: | |||
1323 | } | 1324 | } |
1324 | spin_unlock(&info->lock); | 1325 | spin_unlock(&info->lock); |
1325 | if (error) | 1326 | if (error) |
1326 | goto failed; | 1327 | goto out; |
1327 | goto repeat; | 1328 | goto repeat; |
1328 | } | 1329 | } |
1329 | wait_on_page_locked(swappage); | 1330 | wait_on_page_locked(page); |
1330 | page_cache_release(swappage); | 1331 | page_cache_release(page); |
1331 | goto repeat; | 1332 | goto repeat; |
1332 | } | 1333 | } |
1333 | 1334 | ||
1334 | /* We have to do this with page locked to prevent races */ | 1335 | /* We have to do this with page locked to prevent races */ |
1335 | if (!trylock_page(swappage)) { | 1336 | if (!trylock_page(page)) { |
1336 | shmem_swp_unmap(entry); | 1337 | shmem_swp_unmap(entry); |
1337 | spin_unlock(&info->lock); | 1338 | spin_unlock(&info->lock); |
1338 | wait_on_page_locked(swappage); | 1339 | wait_on_page_locked(page); |
1339 | page_cache_release(swappage); | 1340 | page_cache_release(page); |
1340 | goto repeat; | 1341 | goto repeat; |
1341 | } | 1342 | } |
1342 | if (PageWriteback(swappage)) { | 1343 | if (PageWriteback(page)) { |
1343 | shmem_swp_unmap(entry); | 1344 | shmem_swp_unmap(entry); |
1344 | spin_unlock(&info->lock); | 1345 | spin_unlock(&info->lock); |
1345 | wait_on_page_writeback(swappage); | 1346 | wait_on_page_writeback(page); |
1346 | unlock_page(swappage); | 1347 | unlock_page(page); |
1347 | page_cache_release(swappage); | 1348 | page_cache_release(page); |
1348 | goto repeat; | 1349 | goto repeat; |
1349 | } | 1350 | } |
1350 | if (!PageUptodate(swappage)) { | 1351 | if (!PageUptodate(page)) { |
1351 | shmem_swp_unmap(entry); | 1352 | shmem_swp_unmap(entry); |
1352 | spin_unlock(&info->lock); | 1353 | spin_unlock(&info->lock); |
1353 | unlock_page(swappage); | 1354 | unlock_page(page); |
1354 | page_cache_release(swappage); | 1355 | page_cache_release(page); |
1355 | error = -EIO; | 1356 | error = -EIO; |
1356 | goto failed; | 1357 | goto out; |
1357 | } | 1358 | } |
1358 | 1359 | ||
1359 | if (filepage) { | 1360 | error = add_to_page_cache_locked(page, mapping, |
1360 | shmem_swp_set(info, entry, 0); | 1361 | idx, GFP_NOWAIT); |
1361 | shmem_swp_unmap(entry); | 1362 | if (error) { |
1362 | delete_from_swap_cache(swappage); | ||
1363 | spin_unlock(&info->lock); | ||
1364 | copy_highpage(filepage, swappage); | ||
1365 | unlock_page(swappage); | ||
1366 | page_cache_release(swappage); | ||
1367 | flush_dcache_page(filepage); | ||
1368 | SetPageUptodate(filepage); | ||
1369 | set_page_dirty(filepage); | ||
1370 | swap_free(swap); | ||
1371 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, | ||
1372 | idx, GFP_NOWAIT))) { | ||
1373 | info->flags |= SHMEM_PAGEIN; | ||
1374 | shmem_swp_set(info, entry, 0); | ||
1375 | shmem_swp_unmap(entry); | ||
1376 | delete_from_swap_cache(swappage); | ||
1377 | spin_unlock(&info->lock); | ||
1378 | filepage = swappage; | ||
1379 | set_page_dirty(filepage); | ||
1380 | swap_free(swap); | ||
1381 | } else { | ||
1382 | shmem_swp_unmap(entry); | 1363 | shmem_swp_unmap(entry); |
1383 | spin_unlock(&info->lock); | 1364 | spin_unlock(&info->lock); |
1384 | if (error == -ENOMEM) { | 1365 | if (error == -ENOMEM) { |
@@ -1387,32 +1368,38 @@ repeat: | |||
1387 | * call memcg's OOM if needed. | 1368 | * call memcg's OOM if needed. |
1388 | */ | 1369 | */ |
1389 | error = mem_cgroup_shmem_charge_fallback( | 1370 | error = mem_cgroup_shmem_charge_fallback( |
1390 | swappage, | 1371 | page, current->mm, gfp); |
1391 | current->mm, | ||
1392 | gfp); | ||
1393 | if (error) { | 1372 | if (error) { |
1394 | unlock_page(swappage); | 1373 | unlock_page(page); |
1395 | page_cache_release(swappage); | 1374 | page_cache_release(page); |
1396 | goto failed; | 1375 | goto out; |
1397 | } | 1376 | } |
1398 | } | 1377 | } |
1399 | unlock_page(swappage); | 1378 | unlock_page(page); |
1400 | page_cache_release(swappage); | 1379 | page_cache_release(page); |
1401 | goto repeat; | 1380 | goto repeat; |
1402 | } | 1381 | } |
1403 | } else if (sgp == SGP_READ && !filepage) { | 1382 | |
1383 | info->flags |= SHMEM_PAGEIN; | ||
1384 | shmem_swp_set(info, entry, 0); | ||
1404 | shmem_swp_unmap(entry); | 1385 | shmem_swp_unmap(entry); |
1405 | filepage = find_get_page(mapping, idx); | 1386 | delete_from_swap_cache(page); |
1406 | if (filepage && | 1387 | spin_unlock(&info->lock); |
1407 | (!PageUptodate(filepage) || !trylock_page(filepage))) { | 1388 | set_page_dirty(page); |
1389 | swap_free(swap); | ||
1390 | |||
1391 | } else if (sgp == SGP_READ) { | ||
1392 | shmem_swp_unmap(entry); | ||
1393 | page = find_get_page(mapping, idx); | ||
1394 | if (page && !trylock_page(page)) { | ||
1408 | spin_unlock(&info->lock); | 1395 | spin_unlock(&info->lock); |
1409 | wait_on_page_locked(filepage); | 1396 | wait_on_page_locked(page); |
1410 | page_cache_release(filepage); | 1397 | page_cache_release(page); |
1411 | filepage = NULL; | ||
1412 | goto repeat; | 1398 | goto repeat; |
1413 | } | 1399 | } |
1414 | spin_unlock(&info->lock); | 1400 | spin_unlock(&info->lock); |
1415 | } else { | 1401 | |
1402 | } else if (prealloc_page) { | ||
1416 | shmem_swp_unmap(entry); | 1403 | shmem_swp_unmap(entry); |
1417 | sbinfo = SHMEM_SB(inode->i_sb); | 1404 | sbinfo = SHMEM_SB(inode->i_sb); |
1418 | if (sbinfo->max_blocks) { | 1405 | if (sbinfo->max_blocks) { |
@@ -1426,121 +1413,82 @@ repeat: | |||
1426 | spin_unlock(&inode->i_lock); | 1413 | spin_unlock(&inode->i_lock); |
1427 | } else if (shmem_acct_block(info->flags)) | 1414 | } else if (shmem_acct_block(info->flags)) |
1428 | goto nospace; | 1415 | goto nospace; |
1429 | 1416 | ||
1430 | if (!filepage) { | 1417 | page = prealloc_page; |
1431 | int ret; | 1418 | prealloc_page = NULL; |
1432 | 1419 | ||
1433 | if (!prealloc_page) { | 1420 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1434 | spin_unlock(&info->lock); | ||
1435 | filepage = shmem_alloc_page(gfp, info, idx); | ||
1436 | if (!filepage) { | ||
1437 | shmem_unacct_blocks(info->flags, 1); | ||
1438 | shmem_free_blocks(inode, 1); | ||
1439 | error = -ENOMEM; | ||
1440 | goto failed; | ||
1441 | } | ||
1442 | SetPageSwapBacked(filepage); | ||
1443 | |||
1444 | /* | ||
1445 | * Precharge page while we can wait, compensate | ||
1446 | * after | ||
1447 | */ | ||
1448 | error = mem_cgroup_cache_charge(filepage, | ||
1449 | current->mm, GFP_KERNEL); | ||
1450 | if (error) { | ||
1451 | page_cache_release(filepage); | ||
1452 | shmem_unacct_blocks(info->flags, 1); | ||
1453 | shmem_free_blocks(inode, 1); | ||
1454 | filepage = NULL; | ||
1455 | goto failed; | ||
1456 | } | ||
1457 | |||
1458 | spin_lock(&info->lock); | ||
1459 | } else { | ||
1460 | filepage = prealloc_page; | ||
1461 | prealloc_page = NULL; | ||
1462 | SetPageSwapBacked(filepage); | ||
1463 | } | ||
1464 | |||
1465 | entry = shmem_swp_alloc(info, idx, sgp); | ||
1466 | if (IS_ERR(entry)) | 1421 | if (IS_ERR(entry)) |
1467 | error = PTR_ERR(entry); | 1422 | error = PTR_ERR(entry); |
1468 | else { | 1423 | else { |
1469 | swap = *entry; | 1424 | swap = *entry; |
1470 | shmem_swp_unmap(entry); | 1425 | shmem_swp_unmap(entry); |
1471 | } | 1426 | } |
1472 | ret = error || swap.val; | 1427 | ret = error || swap.val; |
1473 | if (ret) | 1428 | if (ret) |
1474 | mem_cgroup_uncharge_cache_page(filepage); | 1429 | mem_cgroup_uncharge_cache_page(page); |
1475 | else | 1430 | else |
1476 | ret = add_to_page_cache_lru(filepage, mapping, | 1431 | ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT); |
1477 | idx, GFP_NOWAIT); | 1432 | /* |
1478 | /* | 1433 | * At add_to_page_cache_lru() failure, |
1479 | * At add_to_page_cache_lru() failure, uncharge will | 1434 | * uncharge will be done automatically. |
1480 | * be done automatically. | 1435 | */ |
1481 | */ | 1436 | if (ret) { |
1482 | if (ret) { | 1437 | shmem_unacct_blocks(info->flags, 1); |
1483 | spin_unlock(&info->lock); | 1438 | shmem_free_blocks(inode, 1); |
1484 | page_cache_release(filepage); | 1439 | spin_unlock(&info->lock); |
1485 | shmem_unacct_blocks(info->flags, 1); | 1440 | page_cache_release(page); |
1486 | shmem_free_blocks(inode, 1); | 1441 | if (error) |
1487 | filepage = NULL; | 1442 | goto out; |
1488 | if (error) | 1443 | goto repeat; |
1489 | goto failed; | ||
1490 | goto repeat; | ||
1491 | } | ||
1492 | info->flags |= SHMEM_PAGEIN; | ||
1493 | } | 1444 | } |
1494 | 1445 | ||
1446 | info->flags |= SHMEM_PAGEIN; | ||
1495 | info->alloced++; | 1447 | info->alloced++; |
1496 | spin_unlock(&info->lock); | 1448 | spin_unlock(&info->lock); |
1497 | clear_highpage(filepage); | 1449 | clear_highpage(page); |
1498 | flush_dcache_page(filepage); | 1450 | flush_dcache_page(page); |
1499 | SetPageUptodate(filepage); | 1451 | SetPageUptodate(page); |
1500 | if (sgp == SGP_DIRTY) | 1452 | if (sgp == SGP_DIRTY) |
1501 | set_page_dirty(filepage); | 1453 | set_page_dirty(page); |
1454 | |||
1455 | } else { | ||
1456 | spin_unlock(&info->lock); | ||
1457 | error = -ENOMEM; | ||
1458 | goto out; | ||
1502 | } | 1459 | } |
1503 | done: | 1460 | done: |
1504 | *pagep = filepage; | 1461 | *pagep = page; |
1505 | error = 0; | 1462 | error = 0; |
1506 | goto out; | 1463 | out: |
1464 | if (prealloc_page) { | ||
1465 | mem_cgroup_uncharge_cache_page(prealloc_page); | ||
1466 | page_cache_release(prealloc_page); | ||
1467 | } | ||
1468 | return error; | ||
1507 | 1469 | ||
1508 | nospace: | 1470 | nospace: |
1509 | /* | 1471 | /* |
1510 | * Perhaps the page was brought in from swap between find_lock_page | 1472 | * Perhaps the page was brought in from swap between find_lock_page |
1511 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | 1473 | * and taking info->lock? We allow for that at add_to_page_cache_lru, |
1512 | * but must also avoid reporting a spurious ENOSPC while working on a | 1474 | * but must also avoid reporting a spurious ENOSPC while working on a |
1513 | * full tmpfs. (When filepage has been passed in to shmem_getpage, it | 1475 | * full tmpfs. |
1514 | * is already in page cache, which prevents this race from occurring.) | ||
1515 | */ | 1476 | */ |
1516 | if (!filepage) { | 1477 | page = find_get_page(mapping, idx); |
1517 | struct page *page = find_get_page(mapping, idx); | ||
1518 | if (page) { | ||
1519 | spin_unlock(&info->lock); | ||
1520 | page_cache_release(page); | ||
1521 | goto repeat; | ||
1522 | } | ||
1523 | } | ||
1524 | spin_unlock(&info->lock); | 1478 | spin_unlock(&info->lock); |
1525 | error = -ENOSPC; | 1479 | if (page) { |
1526 | failed: | 1480 | page_cache_release(page); |
1527 | if (*pagep != filepage) { | 1481 | goto repeat; |
1528 | unlock_page(filepage); | ||
1529 | page_cache_release(filepage); | ||
1530 | } | ||
1531 | out: | ||
1532 | if (prealloc_page) { | ||
1533 | mem_cgroup_uncharge_cache_page(prealloc_page); | ||
1534 | page_cache_release(prealloc_page); | ||
1535 | } | 1482 | } |
1536 | return error; | 1483 | error = -ENOSPC; |
1484 | goto out; | ||
1537 | } | 1485 | } |
1538 | 1486 | ||
1539 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1487 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1540 | { | 1488 | { |
1541 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1489 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1542 | int error; | 1490 | int error; |
1543 | int ret; | 1491 | int ret = VM_FAULT_LOCKED; |
1544 | 1492 | ||
1545 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 1493 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) |
1546 | return VM_FAULT_SIGBUS; | 1494 | return VM_FAULT_SIGBUS; |
@@ -1548,11 +1496,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1548 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1496 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1549 | if (error) | 1497 | if (error) |
1550 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1498 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1499 | |||
1551 | if (ret & VM_FAULT_MAJOR) { | 1500 | if (ret & VM_FAULT_MAJOR) { |
1552 | count_vm_event(PGMAJFAULT); | 1501 | count_vm_event(PGMAJFAULT); |
1553 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1502 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1554 | } | 1503 | } |
1555 | return ret | VM_FAULT_LOCKED; | 1504 | return ret; |
1556 | } | 1505 | } |
1557 | 1506 | ||
1558 | #ifdef CONFIG_NUMA | 1507 | #ifdef CONFIG_NUMA |
@@ -1669,19 +1618,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1669 | static const struct inode_operations shmem_symlink_inode_operations; | 1618 | static const struct inode_operations shmem_symlink_inode_operations; |
1670 | static const struct inode_operations shmem_symlink_inline_operations; | 1619 | static const struct inode_operations shmem_symlink_inline_operations; |
1671 | 1620 | ||
1672 | /* | ||
1673 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; | ||
1674 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | ||
1675 | * below the loop driver, in the generic fashion that many filesystems support. | ||
1676 | */ | ||
1677 | static int shmem_readpage(struct file *file, struct page *page) | ||
1678 | { | ||
1679 | struct inode *inode = page->mapping->host; | ||
1680 | int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); | ||
1681 | unlock_page(page); | ||
1682 | return error; | ||
1683 | } | ||
1684 | |||
1685 | static int | 1621 | static int |
1686 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1622 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1687 | loff_t pos, unsigned len, unsigned flags, | 1623 | loff_t pos, unsigned len, unsigned flags, |
@@ -1689,7 +1625,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1689 | { | 1625 | { |
1690 | struct inode *inode = mapping->host; | 1626 | struct inode *inode = mapping->host; |
1691 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1627 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1692 | *pagep = NULL; | ||
1693 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1628 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1694 | } | 1629 | } |
1695 | 1630 | ||
@@ -1846,6 +1781,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, | |||
1846 | return retval; | 1781 | return retval; |
1847 | } | 1782 | } |
1848 | 1783 | ||
1784 | static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | ||
1785 | struct pipe_inode_info *pipe, size_t len, | ||
1786 | unsigned int flags) | ||
1787 | { | ||
1788 | struct address_space *mapping = in->f_mapping; | ||
1789 | struct inode *inode = mapping->host; | ||
1790 | unsigned int loff, nr_pages, req_pages; | ||
1791 | struct page *pages[PIPE_DEF_BUFFERS]; | ||
1792 | struct partial_page partial[PIPE_DEF_BUFFERS]; | ||
1793 | struct page *page; | ||
1794 | pgoff_t index, end_index; | ||
1795 | loff_t isize, left; | ||
1796 | int error, page_nr; | ||
1797 | struct splice_pipe_desc spd = { | ||
1798 | .pages = pages, | ||
1799 | .partial = partial, | ||
1800 | .flags = flags, | ||
1801 | .ops = &page_cache_pipe_buf_ops, | ||
1802 | .spd_release = spd_release_page, | ||
1803 | }; | ||
1804 | |||
1805 | isize = i_size_read(inode); | ||
1806 | if (unlikely(*ppos >= isize)) | ||
1807 | return 0; | ||
1808 | |||
1809 | left = isize - *ppos; | ||
1810 | if (unlikely(left < len)) | ||
1811 | len = left; | ||
1812 | |||
1813 | if (splice_grow_spd(pipe, &spd)) | ||
1814 | return -ENOMEM; | ||
1815 | |||
1816 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1817 | loff = *ppos & ~PAGE_CACHE_MASK; | ||
1818 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1819 | nr_pages = min(req_pages, pipe->buffers); | ||
1820 | |||
1821 | spd.nr_pages = find_get_pages_contig(mapping, index, | ||
1822 | nr_pages, spd.pages); | ||
1823 | index += spd.nr_pages; | ||
1824 | error = 0; | ||
1825 | |||
1826 | while (spd.nr_pages < nr_pages) { | ||
1827 | error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); | ||
1828 | if (error) | ||
1829 | break; | ||
1830 | unlock_page(page); | ||
1831 | spd.pages[spd.nr_pages++] = page; | ||
1832 | index++; | ||
1833 | } | ||
1834 | |||
1835 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1836 | nr_pages = spd.nr_pages; | ||
1837 | spd.nr_pages = 0; | ||
1838 | |||
1839 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | ||
1840 | unsigned int this_len; | ||
1841 | |||
1842 | if (!len) | ||
1843 | break; | ||
1844 | |||
1845 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | ||
1846 | page = spd.pages[page_nr]; | ||
1847 | |||
1848 | if (!PageUptodate(page) || page->mapping != mapping) { | ||
1849 | error = shmem_getpage(inode, index, &page, | ||
1850 | SGP_CACHE, NULL); | ||
1851 | if (error) | ||
1852 | break; | ||
1853 | unlock_page(page); | ||
1854 | page_cache_release(spd.pages[page_nr]); | ||
1855 | spd.pages[page_nr] = page; | ||
1856 | } | ||
1857 | |||
1858 | isize = i_size_read(inode); | ||
1859 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
1860 | if (unlikely(!isize || index > end_index)) | ||
1861 | break; | ||
1862 | |||
1863 | if (end_index == index) { | ||
1864 | unsigned int plen; | ||
1865 | |||
1866 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
1867 | if (plen <= loff) | ||
1868 | break; | ||
1869 | |||
1870 | this_len = min(this_len, plen - loff); | ||
1871 | len = this_len; | ||
1872 | } | ||
1873 | |||
1874 | spd.partial[page_nr].offset = loff; | ||
1875 | spd.partial[page_nr].len = this_len; | ||
1876 | len -= this_len; | ||
1877 | loff = 0; | ||
1878 | spd.nr_pages++; | ||
1879 | index++; | ||
1880 | } | ||
1881 | |||
1882 | while (page_nr < nr_pages) | ||
1883 | page_cache_release(spd.pages[page_nr++]); | ||
1884 | |||
1885 | if (spd.nr_pages) | ||
1886 | error = splice_to_pipe(pipe, &spd); | ||
1887 | |||
1888 | splice_shrink_spd(pipe, &spd); | ||
1889 | |||
1890 | if (error > 0) { | ||
1891 | *ppos += error; | ||
1892 | file_accessed(in); | ||
1893 | } | ||
1894 | return error; | ||
1895 | } | ||
1896 | |||
1849 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1897 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1850 | { | 1898 | { |
1851 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1899 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
@@ -2006,7 +2054,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2006 | int error; | 2054 | int error; |
2007 | int len; | 2055 | int len; |
2008 | struct inode *inode; | 2056 | struct inode *inode; |
2009 | struct page *page = NULL; | 2057 | struct page *page; |
2010 | char *kaddr; | 2058 | char *kaddr; |
2011 | struct shmem_inode_info *info; | 2059 | struct shmem_inode_info *info; |
2012 | 2060 | ||
@@ -2684,7 +2732,6 @@ static const struct address_space_operations shmem_aops = { | |||
2684 | .writepage = shmem_writepage, | 2732 | .writepage = shmem_writepage, |
2685 | .set_page_dirty = __set_page_dirty_no_writeback, | 2733 | .set_page_dirty = __set_page_dirty_no_writeback, |
2686 | #ifdef CONFIG_TMPFS | 2734 | #ifdef CONFIG_TMPFS |
2687 | .readpage = shmem_readpage, | ||
2688 | .write_begin = shmem_write_begin, | 2735 | .write_begin = shmem_write_begin, |
2689 | .write_end = shmem_write_end, | 2736 | .write_end = shmem_write_end, |
2690 | #endif | 2737 | #endif |
@@ -2701,7 +2748,7 @@ static const struct file_operations shmem_file_operations = { | |||
2701 | .aio_read = shmem_file_aio_read, | 2748 | .aio_read = shmem_file_aio_read, |
2702 | .aio_write = generic_file_aio_write, | 2749 | .aio_write = generic_file_aio_write, |
2703 | .fsync = noop_fsync, | 2750 | .fsync = noop_fsync, |
2704 | .splice_read = generic_file_splice_read, | 2751 | .splice_read = shmem_file_splice_read, |
2705 | .splice_write = generic_file_splice_write, | 2752 | .splice_write = generic_file_splice_write, |
2706 | #endif | 2753 | #endif |
2707 | }; | 2754 | }; |
@@ -3015,6 +3062,15 @@ put_memory: | |||
3015 | } | 3062 | } |
3016 | EXPORT_SYMBOL_GPL(shmem_file_setup); | 3063 | EXPORT_SYMBOL_GPL(shmem_file_setup); |
3017 | 3064 | ||
3065 | void shmem_set_file(struct vm_area_struct *vma, struct file *file) | ||
3066 | { | ||
3067 | if (vma->vm_file) | ||
3068 | fput(vma->vm_file); | ||
3069 | vma->vm_file = file; | ||
3070 | vma->vm_ops = &shmem_vm_ops; | ||
3071 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
3072 | } | ||
3073 | |||
3018 | /** | 3074 | /** |
3019 | * shmem_zero_setup - setup a shared anonymous mapping | 3075 | * shmem_zero_setup - setup a shared anonymous mapping |
3020 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | 3076 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff |
@@ -3028,11 +3084,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3028 | if (IS_ERR(file)) | 3084 | if (IS_ERR(file)) |
3029 | return PTR_ERR(file); | 3085 | return PTR_ERR(file); |
3030 | 3086 | ||
3031 | if (vma->vm_file) | 3087 | shmem_set_file(vma, file); |
3032 | fput(vma->vm_file); | ||
3033 | vma->vm_file = file; | ||
3034 | vma->vm_ops = &shmem_vm_ops; | ||
3035 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
3036 | return 0; | 3088 | return 0; |
3037 | } | 3089 | } |
3038 | 3090 | ||
@@ -3048,13 +3100,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3048 | * suit tmpfs, since it may have pages in swapcache, and needs to find those | 3100 | * suit tmpfs, since it may have pages in swapcache, and needs to find those |
3049 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. | 3101 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. |
3050 | * | 3102 | * |
3051 | * Provide a stub for those callers to start using now, then later | 3103 | * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in |
3052 | * flesh it out to call shmem_getpage() with additional gfp mask, when | 3104 | * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. |
3053 | * shmem_file_splice_read() is added and shmem_readpage() is removed. | ||
3054 | */ | 3105 | */ |
3055 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | 3106 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, |
3056 | pgoff_t index, gfp_t gfp) | 3107 | pgoff_t index, gfp_t gfp) |
3057 | { | 3108 | { |
3109 | #ifdef CONFIG_SHMEM | ||
3110 | struct inode *inode = mapping->host; | ||
3111 | struct page *page; | ||
3112 | int error; | ||
3113 | |||
3114 | BUG_ON(mapping->a_ops != &shmem_aops); | ||
3115 | error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); | ||
3116 | if (error) | ||
3117 | page = ERR_PTR(error); | ||
3118 | else | ||
3119 | unlock_page(page); | ||
3120 | return page; | ||
3121 | #else | ||
3122 | /* | ||
3123 | * The tiny !SHMEM case uses ramfs without swap | ||
3124 | */ | ||
3058 | return read_cache_page_gfp(mapping, index, gfp); | 3125 | return read_cache_page_gfp(mapping, index, gfp); |
3126 | #endif | ||
3059 | } | 3127 | } |
3060 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | 3128 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3218 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3218 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3219 | return NULL; | 3219 | return NULL; |
3220 | nid_alloc = nid_here = numa_mem_id(); | 3220 | nid_alloc = nid_here = numa_mem_id(); |
3221 | get_mems_allowed(); | ||
3222 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3221 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3223 | nid_alloc = cpuset_slab_spread_node(); | 3222 | nid_alloc = cpuset_slab_spread_node(); |
3224 | else if (current->mempolicy) | 3223 | else if (current->mempolicy) |
3225 | nid_alloc = slab_node(current->mempolicy); | 3224 | nid_alloc = slab_node(current->mempolicy); |
3226 | put_mems_allowed(); | ||
3227 | if (nid_alloc != nid_here) | 3225 | if (nid_alloc != nid_here) |
3228 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3226 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3229 | return NULL; | 3227 | return NULL; |
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3246 | enum zone_type high_zoneidx = gfp_zone(flags); | 3244 | enum zone_type high_zoneidx = gfp_zone(flags); |
3247 | void *obj = NULL; | 3245 | void *obj = NULL; |
3248 | int nid; | 3246 | int nid; |
3247 | unsigned int cpuset_mems_cookie; | ||
3249 | 3248 | ||
3250 | if (flags & __GFP_THISNODE) | 3249 | if (flags & __GFP_THISNODE) |
3251 | return NULL; | 3250 | return NULL; |
3252 | 3251 | ||
3253 | get_mems_allowed(); | ||
3254 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3255 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3252 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3256 | 3253 | ||
3254 | retry_cpuset: | ||
3255 | cpuset_mems_cookie = get_mems_allowed(); | ||
3256 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3257 | |||
3257 | retry: | 3258 | retry: |
3258 | /* | 3259 | /* |
3259 | * Look through allowed nodes for objects available | 3260 | * Look through allowed nodes for objects available |
@@ -3306,7 +3307,9 @@ retry: | |||
3306 | } | 3307 | } |
3307 | } | 3308 | } |
3308 | } | 3309 | } |
3309 | put_mems_allowed(); | 3310 | |
3311 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
3312 | goto retry_cpuset; | ||
3310 | return obj; | 3313 | return obj; |
3311 | } | 3314 | } |
3312 | 3315 | ||
diff --git a/mm/slqb.c b/mm/slqb.c new file mode 100644 index 00000000000..fbd2ebde3c3 --- /dev/null +++ b/mm/slqb.c | |||
@@ -0,0 +1,3816 @@ | |||
1 | /* | ||
2 | * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance | ||
3 | * with order-0 allocations. Fastpath emphasis is placed on local allocation | ||
4 | * and freeing, but with a secondary goal of good remote freeing (freeing on | ||
5 | * another CPU from that which allocated). | ||
6 | * | ||
7 | * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c. | ||
8 | */ | ||
9 | |||
10 | #include <linux/mm.h> | ||
11 | #include <linux/swap.h> /* struct reclaim_state */ | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/cpu.h> | ||
17 | #include <linux/cpuset.h> | ||
18 | #include <linux/mempolicy.h> | ||
19 | #include <linux/ctype.h> | ||
20 | #include <linux/kallsyms.h> | ||
21 | #include <linux/memory.h> | ||
22 | #include <linux/fault-inject.h> | ||
23 | |||
24 | /* | ||
25 | * TODO | ||
26 | * - fix up releasing of offlined data structures. Not a big deal because | ||
27 | * they don't get cumulatively leaked with successive online/offline cycles | ||
28 | * - allow OOM conditions to flush back per-CPU pages to common lists to be | ||
29 | * reused by other CPUs. | ||
30 | * - investigate performance with memoryless nodes. Perhaps CPUs can be given | ||
31 | * a default closest home node via which it can use fastpath functions. | ||
32 | * Perhaps it is not a big problem. | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | * slqb_page overloads struct page, and is used to manage some slab allocation | ||
37 | * aspects; however, to avoid the horrible mess in include/linux/mm_types.h, | ||
38 | * we'll just define our own struct slqb_page type variant here. | ||
39 | */ | ||
40 | struct slqb_page { | ||
41 | union { | ||
42 | struct { | ||
43 | unsigned long flags; /* mandatory */ | ||
44 | atomic_t _count; /* mandatory */ | ||
45 | unsigned int inuse; /* Nr of objects */ | ||
46 | struct kmem_cache_list *list; /* Pointer to list */ | ||
47 | void **freelist; /* LIFO freelist */ | ||
48 | union { | ||
49 | struct list_head lru; /* misc. list */ | ||
50 | struct rcu_head rcu_head; /* for rcu freeing */ | ||
51 | }; | ||
52 | }; | ||
53 | struct page page; | ||
54 | }; | ||
55 | }; | ||
56 | static inline void struct_slqb_page_wrong_size(void) | ||
57 | { BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); } | ||
58 | |||
59 | #define PG_SLQB_BIT (1 << PG_slab) | ||
60 | |||
61 | /* | ||
62 | * slqb_min_order: minimum allocation order for slabs | ||
63 | */ | ||
64 | static int slqb_min_order; | ||
65 | |||
66 | /* | ||
67 | * slqb_min_objects: minimum number of objects per slab. Increasing this | ||
68 | * will increase the allocation order for slabs with larger objects | ||
69 | */ | ||
70 | static int slqb_min_objects = 1; | ||
71 | |||
72 | #ifdef CONFIG_NUMA | ||
73 | static inline int slab_numa(struct kmem_cache *s) | ||
74 | { | ||
75 | return s->flags & SLAB_NUMA; | ||
76 | } | ||
77 | #else | ||
78 | static inline int slab_numa(struct kmem_cache *s) | ||
79 | { | ||
80 | return 0; | ||
81 | } | ||
82 | #endif | ||
83 | |||
84 | static inline int slab_hiwater(struct kmem_cache *s) | ||
85 | { | ||
86 | return s->hiwater; | ||
87 | } | ||
88 | |||
89 | static inline int slab_freebatch(struct kmem_cache *s) | ||
90 | { | ||
91 | return s->freebatch; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Lock order: | ||
96 | * kmem_cache_node->list_lock | ||
97 | * kmem_cache_remote_free->lock | ||
98 | * | ||
99 | * Data structures: | ||
100 | * SLQB is primarily per-cpu. For each kmem_cache, each CPU has: | ||
101 | * | ||
102 | * - A LIFO list of node-local objects. Allocation and freeing of node local | ||
103 | * objects goes first to this list. | ||
104 | * | ||
105 | * - 2 Lists of slab pages, free and partial pages. If an allocation misses | ||
106 | * the object list, it tries from the partial list, then the free list. | ||
107 | * After freeing an object to the object list, if it is over a watermark, | ||
108 | * some objects are freed back to pages. If an allocation misses these lists, | ||
109 | * a new slab page is allocated from the page allocator. If the free list | ||
110 | * reaches a watermark, some of its pages are returned to the page allocator. | ||
111 | * | ||
112 | * - A remote free queue, where objects freed that did not come from the local | ||
113 | * node are queued to. When this reaches a watermark, the objects are | ||
114 | * flushed. | ||
115 | * | ||
116 | * - A remotely freed queue, where objects allocated from this CPU are flushed | ||
117 | * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is | ||
118 | * used to protect access to this queue. | ||
119 | * | ||
120 | * When the remotely freed queue reaches a watermark, a flag is set to tell | ||
121 | * the owner CPU to check it. The owner CPU will then check the queue on the | ||
122 | * next allocation that misses the object list. It will move all objects from | ||
123 | * this list onto the object list and then allocate one. | ||
124 | * | ||
125 | * This system of remote queueing is intended to reduce lock and remote | ||
126 | * cacheline acquisitions, and give a cooling off period for remotely freed | ||
127 | * objects before they are re-allocated. | ||
128 | * | ||
129 | * node specific allocations from somewhere other than the local node are | ||
130 | * handled by a per-node list which is the same as the above per-CPU data | ||
131 | * structures except for the following differences: | ||
132 | * | ||
133 | * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to | ||
134 | * allocate from a given node. | ||
135 | * | ||
136 | * - There is no remote free queue. Nodes don't free objects, CPUs do. | ||
137 | */ | ||
138 | |||
139 | static inline void slqb_stat_inc(struct kmem_cache_list *list, | ||
140 | enum stat_item si) | ||
141 | { | ||
142 | #ifdef CONFIG_SLQB_STATS | ||
143 | list->stats[si]++; | ||
144 | #endif | ||
145 | } | ||
146 | |||
147 | static inline void slqb_stat_add(struct kmem_cache_list *list, | ||
148 | enum stat_item si, unsigned long nr) | ||
149 | { | ||
150 | #ifdef CONFIG_SLQB_STATS | ||
151 | list->stats[si] += nr; | ||
152 | #endif | ||
153 | } | ||
154 | |||
155 | static inline int slqb_page_to_nid(struct slqb_page *page) | ||
156 | { | ||
157 | return page_to_nid(&page->page); | ||
158 | } | ||
159 | |||
160 | static inline void *slqb_page_address(struct slqb_page *page) | ||
161 | { | ||
162 | return page_address(&page->page); | ||
163 | } | ||
164 | |||
165 | static inline struct zone *slqb_page_zone(struct slqb_page *page) | ||
166 | { | ||
167 | return page_zone(&page->page); | ||
168 | } | ||
169 | |||
170 | static inline int virt_to_nid(const void *addr) | ||
171 | { | ||
172 | return page_to_nid(virt_to_page(addr)); | ||
173 | } | ||
174 | |||
175 | static inline struct slqb_page *virt_to_head_slqb_page(const void *addr) | ||
176 | { | ||
177 | struct page *p; | ||
178 | |||
179 | p = virt_to_head_page(addr); | ||
180 | return (struct slqb_page *)p; | ||
181 | } | ||
182 | |||
183 | static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order, | ||
184 | int pages) | ||
185 | { | ||
186 | struct page *p = &page->page; | ||
187 | |||
188 | reset_page_mapcount(p); | ||
189 | p->mapping = NULL; | ||
190 | VM_BUG_ON(!(p->flags & PG_SLQB_BIT)); | ||
191 | p->flags &= ~PG_SLQB_BIT; | ||
192 | |||
193 | if (current->reclaim_state) | ||
194 | current->reclaim_state->reclaimed_slab += pages; | ||
195 | __free_pages(p, order); | ||
196 | } | ||
197 | |||
198 | #ifdef CONFIG_SLQB_DEBUG | ||
199 | static inline int slab_debug(struct kmem_cache *s) | ||
200 | { | ||
201 | return s->flags & | ||
202 | (SLAB_DEBUG_FREE | | ||
203 | SLAB_RED_ZONE | | ||
204 | SLAB_POISON | | ||
205 | SLAB_STORE_USER | | ||
206 | SLAB_TRACE); | ||
207 | } | ||
208 | static inline int slab_poison(struct kmem_cache *s) | ||
209 | { | ||
210 | return s->flags & SLAB_POISON; | ||
211 | } | ||
212 | #else | ||
213 | static inline int slab_debug(struct kmem_cache *s) | ||
214 | { | ||
215 | return 0; | ||
216 | } | ||
217 | static inline int slab_poison(struct kmem_cache *s) | ||
218 | { | ||
219 | return 0; | ||
220 | } | ||
221 | #endif | ||
222 | |||
223 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | ||
224 | SLAB_POISON | SLAB_STORE_USER) | ||
225 | |||
226 | /* Internal SLQB flags */ | ||
227 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | ||
228 | |||
229 | /* Not all arches define cache_line_size */ | ||
230 | #ifndef cache_line_size | ||
231 | #define cache_line_size() L1_CACHE_BYTES | ||
232 | #endif | ||
233 | |||
234 | #ifdef CONFIG_SMP | ||
235 | static struct notifier_block slab_notifier; | ||
236 | #endif | ||
237 | |||
238 | /* | ||
239 | * slqb_lock protects slab_caches list and serialises hotplug operations. | ||
240 | * hotplug operations take lock for write, other operations can hold off | ||
241 | * hotplug by taking it for read (or write). | ||
242 | */ | ||
243 | static DECLARE_RWSEM(slqb_lock); | ||
244 | |||
245 | /* | ||
246 | * A list of all slab caches on the system | ||
247 | */ | ||
248 | static LIST_HEAD(slab_caches); | ||
249 | |||
250 | /* | ||
251 | * Tracking user of a slab. | ||
252 | */ | ||
253 | struct track { | ||
254 | unsigned long addr; /* Called from address */ | ||
255 | int cpu; /* Was running on cpu */ | ||
256 | int pid; /* Pid context */ | ||
257 | unsigned long when; /* When did the operation occur */ | ||
258 | }; | ||
259 | |||
260 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
261 | |||
262 | static struct kmem_cache kmem_cache_cache; | ||
263 | |||
264 | #ifdef CONFIG_SLQB_SYSFS | ||
265 | static int sysfs_slab_add(struct kmem_cache *s); | ||
266 | static void sysfs_slab_remove(struct kmem_cache *s); | ||
267 | #else | ||
268 | static inline int sysfs_slab_add(struct kmem_cache *s) | ||
269 | { | ||
270 | return 0; | ||
271 | } | ||
272 | static inline void sysfs_slab_remove(struct kmem_cache *s) | ||
273 | { | ||
274 | kmem_cache_free(&kmem_cache_cache, s); | ||
275 | } | ||
276 | #endif | ||
277 | |||
278 | /******************************************************************** | ||
279 | * Core slab cache functions | ||
280 | *******************************************************************/ | ||
281 | |||
282 | static int __slab_is_available __read_mostly; | ||
283 | int slab_is_available(void) | ||
284 | { | ||
285 | return __slab_is_available; | ||
286 | } | ||
287 | |||
288 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
289 | { | ||
290 | #ifdef CONFIG_SMP | ||
291 | VM_BUG_ON(!s->cpu_slab[cpu]); | ||
292 | return s->cpu_slab[cpu]; | ||
293 | #else | ||
294 | return &s->cpu_slab; | ||
295 | #endif | ||
296 | } | ||
297 | |||
298 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
299 | struct slqb_page *page, const void *object) | ||
300 | { | ||
301 | void *base; | ||
302 | |||
303 | base = slqb_page_address(page); | ||
304 | if (object < base || object >= base + s->objects * s->size || | ||
305 | (object - base) % s->size) { | ||
306 | return 0; | ||
307 | } | ||
308 | |||
309 | return 1; | ||
310 | } | ||
311 | |||
312 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | ||
313 | { | ||
314 | return *(void **)(object + s->offset); | ||
315 | } | ||
316 | |||
317 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
318 | { | ||
319 | *(void **)(object + s->offset) = fp; | ||
320 | } | ||
321 | |||
322 | /* Loop over all objects in a slab */ | ||
323 | #define for_each_object(__p, __s, __addr) \ | ||
324 | for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ | ||
325 | __p += (__s)->size) | ||
326 | |||
327 | /* Scan freelist */ | ||
328 | #define for_each_free_object(__p, __s, __free) \ | ||
329 | for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\ | ||
330 | __p)) | ||
331 | |||
332 | #ifdef CONFIG_SLQB_DEBUG | ||
333 | /* | ||
334 | * Debug settings: | ||
335 | */ | ||
336 | #ifdef CONFIG_SLQB_DEBUG_ON | ||
337 | static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS; | ||
338 | #else | ||
339 | static int slqb_debug __read_mostly; | ||
340 | #endif | ||
341 | |||
342 | static char *slqb_debug_slabs; | ||
343 | |||
344 | /* | ||
345 | * Object debugging | ||
346 | */ | ||
347 | static void print_section(char *text, u8 *addr, unsigned int length) | ||
348 | { | ||
349 | int i, offset; | ||
350 | int newline = 1; | ||
351 | char ascii[17]; | ||
352 | |||
353 | ascii[16] = 0; | ||
354 | |||
355 | for (i = 0; i < length; i++) { | ||
356 | if (newline) { | ||
357 | printk(KERN_ERR "%8s 0x%p: ", text, addr + i); | ||
358 | newline = 0; | ||
359 | } | ||
360 | printk(KERN_CONT " %02x", addr[i]); | ||
361 | offset = i % 16; | ||
362 | ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; | ||
363 | if (offset == 15) { | ||
364 | printk(KERN_CONT " %s\n", ascii); | ||
365 | newline = 1; | ||
366 | } | ||
367 | } | ||
368 | if (!newline) { | ||
369 | i %= 16; | ||
370 | while (i < 16) { | ||
371 | printk(KERN_CONT " "); | ||
372 | ascii[i] = ' '; | ||
373 | i++; | ||
374 | } | ||
375 | printk(KERN_CONT " %s\n", ascii); | ||
376 | } | ||
377 | } | ||
378 | |||
379 | static struct track *get_track(struct kmem_cache *s, void *object, | ||
380 | enum track_item alloc) | ||
381 | { | ||
382 | struct track *p; | ||
383 | |||
384 | if (s->offset) | ||
385 | p = object + s->offset + sizeof(void *); | ||
386 | else | ||
387 | p = object + s->inuse; | ||
388 | |||
389 | return p + alloc; | ||
390 | } | ||
391 | |||
392 | static void set_track(struct kmem_cache *s, void *object, | ||
393 | enum track_item alloc, unsigned long addr) | ||
394 | { | ||
395 | struct track *p; | ||
396 | |||
397 | if (s->offset) | ||
398 | p = object + s->offset + sizeof(void *); | ||
399 | else | ||
400 | p = object + s->inuse; | ||
401 | |||
402 | p += alloc; | ||
403 | if (addr) { | ||
404 | p->addr = addr; | ||
405 | p->cpu = raw_smp_processor_id(); | ||
406 | p->pid = current ? current->pid : -1; | ||
407 | p->when = jiffies; | ||
408 | } else | ||
409 | memset(p, 0, sizeof(struct track)); | ||
410 | } | ||
411 | |||
412 | static void init_tracking(struct kmem_cache *s, void *object) | ||
413 | { | ||
414 | if (!(s->flags & SLAB_STORE_USER)) | ||
415 | return; | ||
416 | |||
417 | set_track(s, object, TRACK_FREE, 0UL); | ||
418 | set_track(s, object, TRACK_ALLOC, 0UL); | ||
419 | } | ||
420 | |||
421 | static void print_track(const char *s, struct track *t) | ||
422 | { | ||
423 | if (!t->addr) | ||
424 | return; | ||
425 | |||
426 | printk(KERN_ERR "INFO: %s in ", s); | ||
427 | __print_symbol("%s", (unsigned long)t->addr); | ||
428 | printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); | ||
429 | } | ||
430 | |||
431 | static void print_tracking(struct kmem_cache *s, void *object) | ||
432 | { | ||
433 | if (!(s->flags & SLAB_STORE_USER)) | ||
434 | return; | ||
435 | |||
436 | print_track("Allocated", get_track(s, object, TRACK_ALLOC)); | ||
437 | print_track("Freed", get_track(s, object, TRACK_FREE)); | ||
438 | } | ||
439 | |||
440 | static void print_page_info(struct slqb_page *page) | ||
441 | { | ||
442 | printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", | ||
443 | page, page->inuse, page->freelist, page->flags); | ||
444 | |||
445 | } | ||
446 | |||
447 | #define MAX_ERR_STR 100 | ||
448 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) | ||
449 | { | ||
450 | va_list args; | ||
451 | char buf[MAX_ERR_STR]; | ||
452 | |||
453 | va_start(args, fmt); | ||
454 | vsnprintf(buf, sizeof(buf), fmt, args); | ||
455 | va_end(args); | ||
456 | printk(KERN_ERR "========================================" | ||
457 | "=====================================\n"); | ||
458 | printk(KERN_ERR "BUG %s: %s\n", s->name, buf); | ||
459 | printk(KERN_ERR "----------------------------------------" | ||
460 | "-------------------------------------\n\n"); | ||
461 | } | ||
462 | |||
463 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | ||
464 | { | ||
465 | va_list args; | ||
466 | char buf[MAX_ERR_STR]; | ||
467 | |||
468 | va_start(args, fmt); | ||
469 | vsnprintf(buf, sizeof(buf), fmt, args); | ||
470 | va_end(args); | ||
471 | printk(KERN_ERR "FIX %s: %s\n", s->name, buf); | ||
472 | } | ||
473 | |||
474 | static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p) | ||
475 | { | ||
476 | unsigned int off; /* Offset of last byte */ | ||
477 | u8 *addr = slqb_page_address(page); | ||
478 | |||
479 | print_tracking(s, p); | ||
480 | |||
481 | print_page_info(page); | ||
482 | |||
483 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", | ||
484 | p, p - addr, get_freepointer(s, p)); | ||
485 | |||
486 | if (p > addr + 16) | ||
487 | print_section("Bytes b4", p - 16, 16); | ||
488 | |||
489 | print_section("Object", p, min(s->objsize, 128)); | ||
490 | |||
491 | if (s->flags & SLAB_RED_ZONE) | ||
492 | print_section("Redzone", p + s->objsize, s->inuse - s->objsize); | ||
493 | |||
494 | if (s->offset) | ||
495 | off = s->offset + sizeof(void *); | ||
496 | else | ||
497 | off = s->inuse; | ||
498 | |||
499 | if (s->flags & SLAB_STORE_USER) | ||
500 | off += 2 * sizeof(struct track); | ||
501 | |||
502 | if (off != s->size) { | ||
503 | /* Beginning of the filler is the free pointer */ | ||
504 | print_section("Padding", p + off, s->size - off); | ||
505 | } | ||
506 | |||
507 | dump_stack(); | ||
508 | } | ||
509 | |||
510 | static void object_err(struct kmem_cache *s, struct slqb_page *page, | ||
511 | u8 *object, char *reason) | ||
512 | { | ||
513 | slab_bug(s, reason); | ||
514 | print_trailer(s, page, object); | ||
515 | } | ||
516 | |||
517 | static void slab_err(struct kmem_cache *s, struct slqb_page *page, | ||
518 | char *fmt, ...) | ||
519 | { | ||
520 | slab_bug(s, fmt); | ||
521 | print_page_info(page); | ||
522 | dump_stack(); | ||
523 | } | ||
524 | |||
525 | static void init_object(struct kmem_cache *s, void *object, int active) | ||
526 | { | ||
527 | u8 *p = object; | ||
528 | |||
529 | if (s->flags & __OBJECT_POISON) { | ||
530 | memset(p, POISON_FREE, s->objsize - 1); | ||
531 | p[s->objsize - 1] = POISON_END; | ||
532 | } | ||
533 | |||
534 | if (s->flags & SLAB_RED_ZONE) { | ||
535 | memset(p + s->objsize, | ||
536 | active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, | ||
537 | s->inuse - s->objsize); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | ||
542 | { | ||
543 | while (bytes) { | ||
544 | if (*start != (u8)value) | ||
545 | return start; | ||
546 | start++; | ||
547 | bytes--; | ||
548 | } | ||
549 | return NULL; | ||
550 | } | ||
551 | |||
552 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | ||
553 | void *from, void *to) | ||
554 | { | ||
555 | slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); | ||
556 | memset(from, data, to - from); | ||
557 | } | ||
558 | |||
559 | static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page, | ||
560 | u8 *object, char *what, | ||
561 | u8 *start, unsigned int value, unsigned int bytes) | ||
562 | { | ||
563 | u8 *fault; | ||
564 | u8 *end; | ||
565 | |||
566 | fault = check_bytes(start, value, bytes); | ||
567 | if (!fault) | ||
568 | return 1; | ||
569 | |||
570 | end = start + bytes; | ||
571 | while (end > fault && end[-1] == value) | ||
572 | end--; | ||
573 | |||
574 | slab_bug(s, "%s overwritten", what); | ||
575 | printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", | ||
576 | fault, end - 1, fault[0], value); | ||
577 | print_trailer(s, page, object); | ||
578 | |||
579 | restore_bytes(s, what, value, fault, end); | ||
580 | return 0; | ||
581 | } | ||
582 | |||
583 | /* | ||
584 | * Object layout: | ||
585 | * | ||
586 | * object address | ||
587 | * Bytes of the object to be managed. | ||
588 | * If the freepointer may overlay the object then the free | ||
589 | * pointer is the first word of the object. | ||
590 | * | ||
591 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | ||
592 | * 0xa5 (POISON_END) | ||
593 | * | ||
594 | * object + s->objsize | ||
595 | * Padding to reach word boundary. This is also used for Redzoning. | ||
596 | * Padding is extended by another word if Redzoning is enabled and | ||
597 | * objsize == inuse. | ||
598 | * | ||
599 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | ||
600 | * 0xcc (RED_ACTIVE) for objects in use. | ||
601 | * | ||
602 | * object + s->inuse | ||
603 | * Meta data starts here. | ||
604 | * | ||
605 | * A. Free pointer (if we cannot overwrite object on free) | ||
606 | * B. Tracking data for SLAB_STORE_USER | ||
607 | * C. Padding to reach required alignment boundary or at mininum | ||
608 | * one word if debuggin is on to be able to detect writes | ||
609 | * before the word boundary. | ||
610 | * | ||
611 | * Padding is done using 0x5a (POISON_INUSE) | ||
612 | * | ||
613 | * object + s->size | ||
614 | * Nothing is used beyond s->size. | ||
615 | */ | ||
616 | |||
617 | static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p) | ||
618 | { | ||
619 | unsigned long off = s->inuse; /* The end of info */ | ||
620 | |||
621 | if (s->offset) { | ||
622 | /* Freepointer is placed after the object. */ | ||
623 | off += sizeof(void *); | ||
624 | } | ||
625 | |||
626 | if (s->flags & SLAB_STORE_USER) { | ||
627 | /* We also have user information there */ | ||
628 | off += 2 * sizeof(struct track); | ||
629 | } | ||
630 | |||
631 | if (s->size == off) | ||
632 | return 1; | ||
633 | |||
634 | return check_bytes_and_report(s, page, p, "Object padding", | ||
635 | p + off, POISON_INUSE, s->size - off); | ||
636 | } | ||
637 | |||
638 | static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page) | ||
639 | { | ||
640 | u8 *start; | ||
641 | u8 *fault; | ||
642 | u8 *end; | ||
643 | int length; | ||
644 | int remainder; | ||
645 | |||
646 | if (!(s->flags & SLAB_POISON)) | ||
647 | return 1; | ||
648 | |||
649 | start = slqb_page_address(page); | ||
650 | end = start + (PAGE_SIZE << s->order); | ||
651 | length = s->objects * s->size; | ||
652 | remainder = end - (start + length); | ||
653 | if (!remainder) | ||
654 | return 1; | ||
655 | |||
656 | fault = check_bytes(start + length, POISON_INUSE, remainder); | ||
657 | if (!fault) | ||
658 | return 1; | ||
659 | |||
660 | while (end > fault && end[-1] == POISON_INUSE) | ||
661 | end--; | ||
662 | |||
663 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); | ||
664 | print_section("Padding", start, length); | ||
665 | |||
666 | restore_bytes(s, "slab padding", POISON_INUSE, start, end); | ||
667 | return 0; | ||
668 | } | ||
669 | |||
670 | static int check_object(struct kmem_cache *s, struct slqb_page *page, | ||
671 | void *object, int active) | ||
672 | { | ||
673 | u8 *p = object; | ||
674 | u8 *endobject = object + s->objsize; | ||
675 | |||
676 | if (s->flags & SLAB_RED_ZONE) { | ||
677 | unsigned int red = | ||
678 | active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; | ||
679 | |||
680 | if (!check_bytes_and_report(s, page, object, "Redzone", | ||
681 | endobject, red, s->inuse - s->objsize)) | ||
682 | return 0; | ||
683 | } else { | ||
684 | if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { | ||
685 | check_bytes_and_report(s, page, p, "Alignment padding", | ||
686 | endobject, POISON_INUSE, s->inuse - s->objsize); | ||
687 | } | ||
688 | } | ||
689 | |||
690 | if (s->flags & SLAB_POISON) { | ||
691 | if (!active && (s->flags & __OBJECT_POISON)) { | ||
692 | if (!check_bytes_and_report(s, page, p, "Poison", p, | ||
693 | POISON_FREE, s->objsize - 1)) | ||
694 | return 0; | ||
695 | |||
696 | if (!check_bytes_and_report(s, page, p, "Poison", | ||
697 | p + s->objsize - 1, POISON_END, 1)) | ||
698 | return 0; | ||
699 | } | ||
700 | |||
701 | /* | ||
702 | * check_pad_bytes cleans up on its own. | ||
703 | */ | ||
704 | check_pad_bytes(s, page, p); | ||
705 | } | ||
706 | |||
707 | return 1; | ||
708 | } | ||
709 | |||
710 | static int check_slab(struct kmem_cache *s, struct slqb_page *page) | ||
711 | { | ||
712 | if (!(page->flags & PG_SLQB_BIT)) { | ||
713 | slab_err(s, page, "Not a valid slab page"); | ||
714 | return 0; | ||
715 | } | ||
716 | if (page->inuse == 0) { | ||
717 | slab_err(s, page, "inuse before free / after alloc", s->name); | ||
718 | return 0; | ||
719 | } | ||
720 | if (page->inuse > s->objects) { | ||
721 | slab_err(s, page, "inuse %u > max %u", | ||
722 | page->inuse, s->objects); | ||
723 | return 0; | ||
724 | } | ||
725 | /* Slab_pad_check fixes things up after itself */ | ||
726 | slab_pad_check(s, page); | ||
727 | return 1; | ||
728 | } | ||
729 | |||
730 | static void trace(struct kmem_cache *s, struct slqb_page *page, | ||
731 | void *object, int alloc) | ||
732 | { | ||
733 | if (s->flags & SLAB_TRACE) { | ||
734 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | ||
735 | s->name, | ||
736 | alloc ? "alloc" : "free", | ||
737 | object, page->inuse, | ||
738 | page->freelist); | ||
739 | |||
740 | if (!alloc) | ||
741 | print_section("Object", (void *)object, s->objsize); | ||
742 | |||
743 | dump_stack(); | ||
744 | } | ||
745 | } | ||
746 | |||
747 | static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page, | ||
748 | void *object) | ||
749 | { | ||
750 | if (!slab_debug(s)) | ||
751 | return; | ||
752 | |||
753 | if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) | ||
754 | return; | ||
755 | |||
756 | init_object(s, object, 0); | ||
757 | init_tracking(s, object); | ||
758 | } | ||
759 | |||
760 | static int alloc_debug_processing(struct kmem_cache *s, | ||
761 | void *object, unsigned long addr) | ||
762 | { | ||
763 | struct slqb_page *page; | ||
764 | page = virt_to_head_slqb_page(object); | ||
765 | |||
766 | if (!check_slab(s, page)) | ||
767 | goto bad; | ||
768 | |||
769 | if (!check_valid_pointer(s, page, object)) { | ||
770 | object_err(s, page, object, "Freelist Pointer check fails"); | ||
771 | goto bad; | ||
772 | } | ||
773 | |||
774 | if (object && !check_object(s, page, object, 0)) | ||
775 | goto bad; | ||
776 | |||
777 | /* Success: perform special debug activities for allocs */ | ||
778 | if (s->flags & SLAB_STORE_USER) | ||
779 | set_track(s, object, TRACK_ALLOC, addr); | ||
780 | trace(s, page, object, 1); | ||
781 | init_object(s, object, 1); | ||
782 | return 1; | ||
783 | |||
784 | bad: | ||
785 | return 0; | ||
786 | } | ||
787 | |||
788 | static int free_debug_processing(struct kmem_cache *s, | ||
789 | void *object, unsigned long addr) | ||
790 | { | ||
791 | struct slqb_page *page; | ||
792 | page = virt_to_head_slqb_page(object); | ||
793 | |||
794 | if (!check_slab(s, page)) | ||
795 | goto fail; | ||
796 | |||
797 | if (!check_valid_pointer(s, page, object)) { | ||
798 | slab_err(s, page, "Invalid object pointer 0x%p", object); | ||
799 | goto fail; | ||
800 | } | ||
801 | |||
802 | if (!check_object(s, page, object, 1)) | ||
803 | return 0; | ||
804 | |||
805 | /* Special debug activities for freeing objects */ | ||
806 | if (s->flags & SLAB_STORE_USER) | ||
807 | set_track(s, object, TRACK_FREE, addr); | ||
808 | trace(s, page, object, 0); | ||
809 | init_object(s, object, 0); | ||
810 | return 1; | ||
811 | |||
812 | fail: | ||
813 | slab_fix(s, "Object at 0x%p not freed", object); | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | static int __init setup_slqb_debug(char *str) | ||
818 | { | ||
819 | slqb_debug = DEBUG_DEFAULT_FLAGS; | ||
820 | if (*str++ != '=' || !*str) { | ||
821 | /* | ||
822 | * No options specified. Switch on full debugging. | ||
823 | */ | ||
824 | goto out; | ||
825 | } | ||
826 | |||
827 | if (*str == ',') { | ||
828 | /* | ||
829 | * No options but restriction on slabs. This means full | ||
830 | * debugging for slabs matching a pattern. | ||
831 | */ | ||
832 | goto check_slabs; | ||
833 | } | ||
834 | |||
835 | slqb_debug = 0; | ||
836 | if (*str == '-') { | ||
837 | /* | ||
838 | * Switch off all debugging measures. | ||
839 | */ | ||
840 | goto out; | ||
841 | } | ||
842 | |||
843 | /* | ||
844 | * Determine which debug features should be switched on | ||
845 | */ | ||
846 | for (; *str && *str != ','; str++) { | ||
847 | switch (tolower(*str)) { | ||
848 | case 'f': | ||
849 | slqb_debug |= SLAB_DEBUG_FREE; | ||
850 | break; | ||
851 | case 'z': | ||
852 | slqb_debug |= SLAB_RED_ZONE; | ||
853 | break; | ||
854 | case 'p': | ||
855 | slqb_debug |= SLAB_POISON; | ||
856 | break; | ||
857 | case 'u': | ||
858 | slqb_debug |= SLAB_STORE_USER; | ||
859 | break; | ||
860 | case 't': | ||
861 | slqb_debug |= SLAB_TRACE; | ||
862 | break; | ||
863 | case 'a': | ||
864 | slqb_debug |= SLAB_FAILSLAB; | ||
865 | break; | ||
866 | default: | ||
867 | printk(KERN_ERR "slqb_debug option '%c' " | ||
868 | "unknown. skipped\n", *str); | ||
869 | } | ||
870 | } | ||
871 | |||
872 | check_slabs: | ||
873 | if (*str == ',') | ||
874 | slqb_debug_slabs = str + 1; | ||
875 | out: | ||
876 | return 1; | ||
877 | } | ||
878 | __setup("slqb_debug", setup_slqb_debug); | ||
879 | |||
880 | static int __init setup_slqb_min_order(char *str) | ||
881 | { | ||
882 | get_option(&str, &slqb_min_order); | ||
883 | slqb_min_order = min(slqb_min_order, MAX_ORDER - 1); | ||
884 | |||
885 | return 1; | ||
886 | } | ||
887 | __setup("slqb_min_order=", setup_slqb_min_order); | ||
888 | |||
889 | static int __init setup_slqb_min_objects(char *str) | ||
890 | { | ||
891 | get_option(&str, &slqb_min_objects); | ||
892 | |||
893 | return 1; | ||
894 | } | ||
895 | |||
896 | __setup("slqb_min_objects=", setup_slqb_min_objects); | ||
897 | |||
898 | static unsigned long kmem_cache_flags(unsigned long objsize, | ||
899 | unsigned long flags, const char *name, | ||
900 | void (*ctor)(void *)) | ||
901 | { | ||
902 | /* | ||
903 | * Enable debugging if selected on the kernel commandline. | ||
904 | */ | ||
905 | if (slqb_debug && (!slqb_debug_slabs || | ||
906 | strncmp(slqb_debug_slabs, name, | ||
907 | strlen(slqb_debug_slabs)) == 0)) | ||
908 | flags |= slqb_debug; | ||
909 | |||
910 | if (num_possible_nodes() > 1) | ||
911 | flags |= SLAB_NUMA; | ||
912 | |||
913 | return flags; | ||
914 | } | ||
915 | #else | ||
916 | static inline void setup_object_debug(struct kmem_cache *s, | ||
917 | struct slqb_page *page, void *object) | ||
918 | { | ||
919 | } | ||
920 | |||
921 | static inline int alloc_debug_processing(struct kmem_cache *s, | ||
922 | void *object, unsigned long addr) | ||
923 | { | ||
924 | return 0; | ||
925 | } | ||
926 | |||
927 | static inline int free_debug_processing(struct kmem_cache *s, | ||
928 | void *object, unsigned long addr) | ||
929 | { | ||
930 | return 0; | ||
931 | } | ||
932 | |||
933 | static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page) | ||
934 | { | ||
935 | return 1; | ||
936 | } | ||
937 | |||
938 | static inline int check_object(struct kmem_cache *s, struct slqb_page *page, | ||
939 | void *object, int active) | ||
940 | { | ||
941 | return 1; | ||
942 | } | ||
943 | |||
944 | static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page) | ||
945 | { | ||
946 | } | ||
947 | |||
948 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | ||
949 | unsigned long flags, const char *name, void (*ctor)(void *)) | ||
950 | { | ||
951 | if (num_possible_nodes() > 1) | ||
952 | flags |= SLAB_NUMA; | ||
953 | return flags; | ||
954 | } | ||
955 | |||
956 | static const int slqb_debug; | ||
957 | #endif | ||
958 | |||
959 | /* | ||
960 | * allocate a new slab (return its corresponding struct slqb_page) | ||
961 | */ | ||
962 | static struct slqb_page *allocate_slab(struct kmem_cache *s, | ||
963 | gfp_t flags, int node) | ||
964 | { | ||
965 | struct slqb_page *page; | ||
966 | int pages = 1 << s->order; | ||
967 | |||
968 | flags |= s->allocflags; | ||
969 | |||
970 | page = (struct slqb_page *)alloc_pages_node(node, flags, s->order); | ||
971 | if (!page) | ||
972 | return NULL; | ||
973 | |||
974 | mod_zone_page_state(slqb_page_zone(page), | ||
975 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | ||
976 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | ||
977 | pages); | ||
978 | |||
979 | return page; | ||
980 | } | ||
981 | |||
982 | /* | ||
983 | * Called once for each object on a new slab page | ||
984 | */ | ||
985 | static void setup_object(struct kmem_cache *s, | ||
986 | struct slqb_page *page, void *object) | ||
987 | { | ||
988 | setup_object_debug(s, page, object); | ||
989 | if (unlikely(s->ctor)) | ||
990 | s->ctor(object); | ||
991 | } | ||
992 | |||
993 | /* | ||
994 | * Allocate a new slab, set up its object list. | ||
995 | */ | ||
996 | static struct slqb_page *new_slab_page(struct kmem_cache *s, | ||
997 | gfp_t flags, int node, unsigned int colour) | ||
998 | { | ||
999 | struct slqb_page *page; | ||
1000 | void *start; | ||
1001 | void *last; | ||
1002 | void *p; | ||
1003 | |||
1004 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | ||
1005 | |||
1006 | page = allocate_slab(s, | ||
1007 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
1008 | if (!page) | ||
1009 | goto out; | ||
1010 | |||
1011 | page->flags |= PG_SLQB_BIT; | ||
1012 | |||
1013 | start = page_address(&page->page); | ||
1014 | |||
1015 | if (unlikely(slab_poison(s))) | ||
1016 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); | ||
1017 | |||
1018 | start += colour; | ||
1019 | |||
1020 | last = start; | ||
1021 | for_each_object(p, s, start) { | ||
1022 | setup_object(s, page, p); | ||
1023 | set_freepointer(s, last, p); | ||
1024 | last = p; | ||
1025 | } | ||
1026 | set_freepointer(s, last, NULL); | ||
1027 | |||
1028 | page->freelist = start; | ||
1029 | page->inuse = 0; | ||
1030 | out: | ||
1031 | return page; | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Free a slab page back to the page allocator | ||
1036 | */ | ||
1037 | static void __free_slab(struct kmem_cache *s, struct slqb_page *page) | ||
1038 | { | ||
1039 | int pages = 1 << s->order; | ||
1040 | |||
1041 | if (unlikely(slab_debug(s))) { | ||
1042 | void *p; | ||
1043 | |||
1044 | slab_pad_check(s, page); | ||
1045 | for_each_free_object(p, s, page->freelist) | ||
1046 | check_object(s, page, p, 0); | ||
1047 | } | ||
1048 | |||
1049 | mod_zone_page_state(slqb_page_zone(page), | ||
1050 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | ||
1051 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | ||
1052 | -pages); | ||
1053 | |||
1054 | __free_slqb_pages(page, s->order, pages); | ||
1055 | } | ||
1056 | |||
1057 | static void rcu_free_slab(struct rcu_head *h) | ||
1058 | { | ||
1059 | struct slqb_page *page; | ||
1060 | |||
1061 | page = container_of(h, struct slqb_page, rcu_head); | ||
1062 | __free_slab(page->list->cache, page); | ||
1063 | } | ||
1064 | |||
1065 | static void free_slab(struct kmem_cache *s, struct slqb_page *page) | ||
1066 | { | ||
1067 | VM_BUG_ON(page->inuse); | ||
1068 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) | ||
1069 | call_rcu(&page->rcu_head, rcu_free_slab); | ||
1070 | else | ||
1071 | __free_slab(s, page); | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Return an object to its slab. | ||
1076 | * | ||
1077 | * Caller must be the owner CPU in the case of per-CPU list, or hold the node's | ||
1078 | * list_lock in the case of per-node list. | ||
1079 | */ | ||
1080 | static int free_object_to_page(struct kmem_cache *s, | ||
1081 | struct kmem_cache_list *l, struct slqb_page *page, | ||
1082 | void *object) | ||
1083 | { | ||
1084 | VM_BUG_ON(page->list != l); | ||
1085 | |||
1086 | set_freepointer(s, object, page->freelist); | ||
1087 | page->freelist = object; | ||
1088 | page->inuse--; | ||
1089 | |||
1090 | if (!page->inuse) { | ||
1091 | if (likely(s->objects > 1)) { | ||
1092 | l->nr_partial--; | ||
1093 | list_del(&page->lru); | ||
1094 | } | ||
1095 | l->nr_slabs--; | ||
1096 | free_slab(s, page); | ||
1097 | slqb_stat_inc(l, FLUSH_SLAB_FREE); | ||
1098 | return 1; | ||
1099 | |||
1100 | } else if (page->inuse + 1 == s->objects) { | ||
1101 | l->nr_partial++; | ||
1102 | list_add(&page->lru, &l->partial); | ||
1103 | slqb_stat_inc(l, FLUSH_SLAB_PARTIAL); | ||
1104 | return 0; | ||
1105 | } | ||
1106 | return 0; | ||
1107 | } | ||
1108 | |||
1109 | #ifdef CONFIG_SMP | ||
1110 | static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page, | ||
1111 | void *object, struct kmem_cache_cpu *c); | ||
1112 | #endif | ||
1113 | |||
1114 | /* | ||
1115 | * Flush the LIFO list of objects on a list. They are sent back to their pages | ||
1116 | * if the pages belong to this list, or to our CPU's remote-free list if | ||
1117 | * they do not. | ||
1118 | * | ||
1119 | * Doesn't flush the entire list. flush_free_list_all does. | ||
1120 | * | ||
1121 | * Caller must be the owner CPU in the case of per-CPU list, or hold the node's | ||
1122 | * list_lock in the case of per-node list. | ||
1123 | */ | ||
1124 | static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l) | ||
1125 | { | ||
1126 | void **head; | ||
1127 | int nr; | ||
1128 | int locked = 0; | ||
1129 | |||
1130 | nr = l->freelist.nr; | ||
1131 | if (unlikely(!nr)) | ||
1132 | return; | ||
1133 | |||
1134 | nr = min(slab_freebatch(s), nr); | ||
1135 | |||
1136 | slqb_stat_inc(l, FLUSH_FREE_LIST); | ||
1137 | slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr); | ||
1138 | |||
1139 | l->freelist.nr -= nr; | ||
1140 | head = l->freelist.head; | ||
1141 | |||
1142 | do { | ||
1143 | struct slqb_page *page; | ||
1144 | void **object; | ||
1145 | |||
1146 | object = head; | ||
1147 | VM_BUG_ON(!object); | ||
1148 | head = get_freepointer(s, object); | ||
1149 | page = virt_to_head_slqb_page(object); | ||
1150 | |||
1151 | #ifdef CONFIG_SMP | ||
1152 | if (page->list != l) { | ||
1153 | struct kmem_cache_cpu *c; | ||
1154 | |||
1155 | if (locked) { | ||
1156 | spin_unlock(&l->page_lock); | ||
1157 | locked = 0; | ||
1158 | } | ||
1159 | |||
1160 | c = get_cpu_slab(s, smp_processor_id()); | ||
1161 | |||
1162 | slab_free_to_remote(s, page, object, c); | ||
1163 | slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE); | ||
1164 | } else | ||
1165 | #endif | ||
1166 | { | ||
1167 | if (!locked) { | ||
1168 | spin_lock(&l->page_lock); | ||
1169 | locked = 1; | ||
1170 | } | ||
1171 | free_object_to_page(s, l, page, object); | ||
1172 | } | ||
1173 | |||
1174 | nr--; | ||
1175 | } while (nr); | ||
1176 | |||
1177 | if (locked) | ||
1178 | spin_unlock(&l->page_lock); | ||
1179 | |||
1180 | l->freelist.head = head; | ||
1181 | if (!l->freelist.nr) | ||
1182 | l->freelist.tail = NULL; | ||
1183 | } | ||
1184 | |||
1185 | static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l) | ||
1186 | { | ||
1187 | while (l->freelist.nr) | ||
1188 | flush_free_list(s, l); | ||
1189 | } | ||
1190 | |||
1191 | #ifdef CONFIG_SMP | ||
1192 | /* | ||
1193 | * If enough objects have been remotely freed back to this list, | ||
1194 | * remote_free_check will be set, in which case we'll eventually come here | ||
1195 | * to move those objects off our remote_free list and onto our LIFO freelist. | ||
1196 | * | ||
1197 | * Caller must be the owner CPU in the case of per-CPU list, or hold the node's | ||
1198 | * list_lock in the case of per-node list. | ||
1199 | */ | ||
1200 | static void claim_remote_free_list(struct kmem_cache *s, | ||
1201 | struct kmem_cache_list *l) | ||
1202 | { | ||
1203 | void **head, **tail; | ||
1204 | int nr; | ||
1205 | |||
1206 | if (!l->remote_free.list.nr) | ||
1207 | return; | ||
1208 | |||
1209 | spin_lock(&l->remote_free.lock); | ||
1210 | |||
1211 | l->remote_free_check = 0; | ||
1212 | head = l->remote_free.list.head; | ||
1213 | l->remote_free.list.head = NULL; | ||
1214 | tail = l->remote_free.list.tail; | ||
1215 | l->remote_free.list.tail = NULL; | ||
1216 | nr = l->remote_free.list.nr; | ||
1217 | l->remote_free.list.nr = 0; | ||
1218 | |||
1219 | spin_unlock(&l->remote_free.lock); | ||
1220 | |||
1221 | VM_BUG_ON(!nr); | ||
1222 | |||
1223 | if (!l->freelist.nr) { | ||
1224 | /* Get head hot for likely subsequent allocation or flush */ | ||
1225 | prefetchw(head); | ||
1226 | l->freelist.head = head; | ||
1227 | } else | ||
1228 | set_freepointer(s, l->freelist.tail, head); | ||
1229 | l->freelist.tail = tail; | ||
1230 | |||
1231 | l->freelist.nr += nr; | ||
1232 | |||
1233 | slqb_stat_inc(l, CLAIM_REMOTE_LIST); | ||
1234 | slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr); | ||
1235 | } | ||
1236 | #else | ||
1237 | static inline void claim_remote_free_list(struct kmem_cache *s, | ||
1238 | struct kmem_cache_list *l) | ||
1239 | { | ||
1240 | } | ||
1241 | #endif | ||
1242 | |||
1243 | /* | ||
1244 | * Allocation fastpath. Get an object from the list's LIFO freelist, or | ||
1245 | * return NULL if it is empty. | ||
1246 | * | ||
1247 | * Caller must be the owner CPU in the case of per-CPU list, or hold the node's | ||
1248 | * list_lock in the case of per-node list. | ||
1249 | */ | ||
1250 | static __always_inline void *__cache_list_get_object(struct kmem_cache *s, | ||
1251 | struct kmem_cache_list *l) | ||
1252 | { | ||
1253 | void *object; | ||
1254 | |||
1255 | object = l->freelist.head; | ||
1256 | if (likely(object)) { | ||
1257 | void *next = get_freepointer(s, object); | ||
1258 | |||
1259 | VM_BUG_ON(!l->freelist.nr); | ||
1260 | l->freelist.nr--; | ||
1261 | l->freelist.head = next; | ||
1262 | |||
1263 | return object; | ||
1264 | } | ||
1265 | VM_BUG_ON(l->freelist.nr); | ||
1266 | |||
1267 | #ifdef CONFIG_SMP | ||
1268 | if (unlikely(l->remote_free_check)) { | ||
1269 | claim_remote_free_list(s, l); | ||
1270 | |||
1271 | if (l->freelist.nr > slab_hiwater(s)) | ||
1272 | flush_free_list(s, l); | ||
1273 | |||
1274 | /* repetition here helps gcc :( */ | ||
1275 | object = l->freelist.head; | ||
1276 | if (likely(object)) { | ||
1277 | void *next = get_freepointer(s, object); | ||
1278 | |||
1279 | VM_BUG_ON(!l->freelist.nr); | ||
1280 | l->freelist.nr--; | ||
1281 | l->freelist.head = next; | ||
1282 | |||
1283 | return object; | ||
1284 | } | ||
1285 | VM_BUG_ON(l->freelist.nr); | ||
1286 | } | ||
1287 | #endif | ||
1288 | |||
1289 | return NULL; | ||
1290 | } | ||
1291 | |||
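/*
 * Illustrative, user-space sketch (not part of the kernel code) of the
 * embedded free-pointer scheme that set_freepointer()/get_freepointer()
 * and __cache_list_get_object() rely on: a free object stores the address
 * of the next free object inside itself, so the LIFO freelist needs no
 * external metadata.  Using offset 0 here is a simplification; the real
 * code uses s->offset.
 */
#include <stdio.h>
#include <stdlib.h>

#define OBJ_SIZE	64

static void *freelist_head;

/* analogue of freeing to the list head (set_freepointer + push) */
static void demo_free(void *object)
{
	*(void **)object = freelist_head;
	freelist_head = object;
}

/* analogue of __cache_list_get_object()'s fastpath (pop + get_freepointer) */
static void *demo_alloc(void)
{
	void *object = freelist_head;

	if (object)
		freelist_head = *(void **)object;
	return object;
}

int main(void)
{
	void *a = malloc(OBJ_SIZE), *b = malloc(OBJ_SIZE);

	demo_free(a);
	demo_free(b);
	/* LIFO: b comes back first, then a */
	printf("%p %p\n", demo_alloc(), demo_alloc());
	free(a);
	free(b);
	return 0;
}
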
1292 | /* | ||
1293 | * Slow(er) path. Get a page from this list's existing pages. Will be a | ||
1294 | * new empty page in the case that __slab_alloc_page has just been called | ||
1295 | * (empty pages otherwise never get queued up on the lists), or a partial page | ||
1296 | * already on the list. | ||
1297 | * | ||
1298 | * Caller must be the owner CPU in the case of per-CPU list, or hold the node's | ||
1299 | * list_lock in the case of per-node list. | ||
1300 | */ | ||
1301 | static noinline void *__cache_list_get_page(struct kmem_cache *s, | ||
1302 | struct kmem_cache_list *l) | ||
1303 | { | ||
1304 | struct slqb_page *page; | ||
1305 | void *object; | ||
1306 | |||
1307 | if (unlikely(!l->nr_partial)) | ||
1308 | return NULL; | ||
1309 | |||
1310 | page = list_first_entry(&l->partial, struct slqb_page, lru); | ||
1311 | VM_BUG_ON(page->inuse == s->objects); | ||
1312 | if (page->inuse + 1 == s->objects) { | ||
1313 | l->nr_partial--; | ||
1314 | list_del(&page->lru); | ||
1315 | } | ||
1316 | |||
1317 | VM_BUG_ON(!page->freelist); | ||
1318 | |||
1319 | page->inuse++; | ||
1320 | |||
1321 | object = page->freelist; | ||
1322 | page->freelist = get_freepointer(s, object); | ||
1323 | if (page->freelist) | ||
1324 | prefetchw(page->freelist); | ||
1325 | VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL)); | ||
1326 | slqb_stat_inc(l, ALLOC_SLAB_FILL); | ||
1327 | |||
1328 | return object; | ||
1329 | } | ||
1330 | |||
1331 | static void *cache_list_get_page(struct kmem_cache *s, | ||
1332 | struct kmem_cache_list *l) | ||
1333 | { | ||
1334 | void *object; | ||
1335 | |||
1336 | if (unlikely(!l->nr_partial)) | ||
1337 | return NULL; | ||
1338 | |||
1339 | spin_lock(&l->page_lock); | ||
1340 | object = __cache_list_get_page(s, l); | ||
1341 | spin_unlock(&l->page_lock); | ||
1342 | |||
1343 | return object; | ||
1344 | } | ||
1345 | |||
1346 | /* | ||
1347 | * Allocation slowpath. Allocate a new slab page from the page allocator, and | ||
1348 | * put it on the list's partial list. Must be followed by an allocation so | ||
1349 | * that we don't have dangling empty pages on the partial list. | ||
1350 | * | ||
1351 | * Returns 0 on allocation failure. | ||
1352 | * | ||
1353 | * Must be called with interrupts disabled. | ||
1354 | */ | ||
1355 | static noinline void *__slab_alloc_page(struct kmem_cache *s, | ||
1356 | gfp_t gfpflags, int node) | ||
1357 | { | ||
1358 | struct slqb_page *page; | ||
1359 | struct kmem_cache_list *l; | ||
1360 | struct kmem_cache_cpu *c; | ||
1361 | unsigned int colour; | ||
1362 | void *object; | ||
1363 | |||
1364 | c = get_cpu_slab(s, smp_processor_id()); | ||
1365 | colour = c->colour_next; | ||
1366 | c->colour_next += s->colour_off; | ||
1367 | if (c->colour_next >= s->colour_range) | ||
1368 | c->colour_next = 0; | ||
1369 | |||
1370 | /* Caller handles __GFP_ZERO */ | ||
1371 | gfpflags &= ~__GFP_ZERO; | ||
1372 | |||
1373 | if (gfpflags & __GFP_WAIT) | ||
1374 | local_irq_enable(); | ||
1375 | page = new_slab_page(s, gfpflags, node, colour); | ||
1376 | if (gfpflags & __GFP_WAIT) | ||
1377 | local_irq_disable(); | ||
1378 | if (unlikely(!page)) | ||
1379 | return page; | ||
1380 | |||
1381 | if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) { | ||
1382 | struct kmem_cache_cpu *c; | ||
1383 | int cpu = smp_processor_id(); | ||
1384 | |||
1385 | c = get_cpu_slab(s, cpu); | ||
1386 | l = &c->list; | ||
1387 | page->list = l; | ||
1388 | |||
1389 | spin_lock(&l->page_lock); | ||
1390 | l->nr_slabs++; | ||
1391 | l->nr_partial++; | ||
1392 | list_add(&page->lru, &l->partial); | ||
1393 | slqb_stat_inc(l, ALLOC); | ||
1394 | slqb_stat_inc(l, ALLOC_SLAB_NEW); | ||
1395 | object = __cache_list_get_page(s, l); | ||
1396 | spin_unlock(&l->page_lock); | ||
1397 | } else { | ||
1398 | #ifdef CONFIG_NUMA | ||
1399 | struct kmem_cache_node *n; | ||
1400 | |||
1401 | n = s->node_slab[slqb_page_to_nid(page)]; | ||
1402 | l = &n->list; | ||
1403 | page->list = l; | ||
1404 | |||
1405 | spin_lock(&n->list_lock); | ||
1406 | spin_lock(&l->page_lock); | ||
1407 | l->nr_slabs++; | ||
1408 | l->nr_partial++; | ||
1409 | list_add(&page->lru, &l->partial); | ||
1410 | slqb_stat_inc(l, ALLOC); | ||
1411 | slqb_stat_inc(l, ALLOC_SLAB_NEW); | ||
1412 | object = __cache_list_get_page(s, l); | ||
1413 | spin_unlock(&l->page_lock); | ||
1414 | spin_unlock(&n->list_lock); | ||
1415 | #endif | ||
1416 | } | ||
1417 | VM_BUG_ON(!object); | ||
1418 | return object; | ||
1419 | } | ||
1420 | |||
1421 | #ifdef CONFIG_NUMA | ||
1422 | static noinline int alternate_nid(struct kmem_cache *s, | ||
1423 | gfp_t gfpflags, int node) | ||
1424 | { | ||
1425 | if (in_interrupt() || (gfpflags & __GFP_THISNODE)) | ||
1426 | return node; | ||
1427 | if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD)) | ||
1428 | return cpuset_mem_spread_node(); | ||
1429 | else if (current->mempolicy) | ||
1430 | return slab_node(current->mempolicy); | ||
1431 | return node; | ||
1432 | } | ||
1433 | |||
1434 | /* | ||
1435 | * Allocate an object from a remote node. Return NULL if none could be found | ||
1436 | * (in which case, caller should allocate a new slab) | ||
1437 | * | ||
1438 | * Must be called with interrupts disabled. | ||
1439 | */ | ||
1440 | static void *__remote_slab_alloc_node(struct kmem_cache *s, | ||
1441 | gfp_t gfpflags, int node) | ||
1442 | { | ||
1443 | struct kmem_cache_node *n; | ||
1444 | struct kmem_cache_list *l; | ||
1445 | void *object; | ||
1446 | |||
1447 | n = s->node_slab[node]; | ||
1448 | if (unlikely(!n)) /* node has no memory */ | ||
1449 | return NULL; | ||
1450 | l = &n->list; | ||
1451 | |||
1452 | spin_lock(&n->list_lock); | ||
1453 | |||
1454 | object = __cache_list_get_object(s, l); | ||
1455 | if (unlikely(!object)) { | ||
1456 | object = cache_list_get_page(s, l); | ||
1457 | if (unlikely(!object)) { | ||
1458 | spin_unlock(&n->list_lock); | ||
1459 | return __slab_alloc_page(s, gfpflags, node); | ||
1460 | } | ||
1461 | } | ||
1462 | if (likely(object)) | ||
1463 | slqb_stat_inc(l, ALLOC); | ||
1464 | spin_unlock(&n->list_lock); | ||
1465 | return object; | ||
1466 | } | ||
1467 | |||
1468 | static noinline void *__remote_slab_alloc(struct kmem_cache *s, | ||
1469 | gfp_t gfpflags, int node) | ||
1470 | { | ||
1471 | void *object; | ||
1472 | struct zonelist *zonelist; | ||
1473 | struct zoneref *z; | ||
1474 | struct zone *zone; | ||
1475 | enum zone_type high_zoneidx = gfp_zone(gfpflags); | ||
1476 | |||
1477 | object = __remote_slab_alloc_node(s, gfpflags, node); | ||
1478 | if (likely(object || (gfpflags & __GFP_THISNODE))) | ||
1479 | return object; | ||
1480 | |||
1481 | zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags); | ||
1482 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | ||
1483 | if (!cpuset_zone_allowed_hardwall(zone, gfpflags)) | ||
1484 | continue; | ||
1485 | |||
1486 | node = zone_to_nid(zone); | ||
1487 | object = __remote_slab_alloc_node(s, gfpflags, node); | ||
1488 | if (likely(object)) | ||
1489 | return object; | ||
1490 | } | ||
1491 | return NULL; | ||
1492 | } | ||
1493 | #endif | ||
1494 | |||
1495 | /* | ||
1496 | * Main allocation path. Return an object, or NULL on allocation failure. | ||
1497 | * | ||
1498 | * Must be called with interrupts disabled. | ||
1499 | */ | ||
1500 | static __always_inline void *__slab_alloc(struct kmem_cache *s, | ||
1501 | gfp_t gfpflags, int node) | ||
1502 | { | ||
1503 | void *object; | ||
1504 | struct kmem_cache_cpu *c; | ||
1505 | struct kmem_cache_list *l; | ||
1506 | |||
1507 | #ifdef CONFIG_NUMA | ||
1508 | if (unlikely(node != -1) && unlikely(node != numa_node_id())) { | ||
1509 | try_remote: | ||
1510 | return __remote_slab_alloc(s, gfpflags, node); | ||
1511 | } | ||
1512 | #endif | ||
1513 | |||
1514 | c = get_cpu_slab(s, smp_processor_id()); | ||
1515 | VM_BUG_ON(!c); | ||
1516 | l = &c->list; | ||
1517 | object = __cache_list_get_object(s, l); | ||
1518 | if (unlikely(!object)) { | ||
1519 | #ifdef CONFIG_NUMA | ||
1520 | int thisnode = numa_node_id(); | ||
1521 | |||
1522 | /* | ||
1523 | * If the local node is memoryless, try remote alloc before | ||
1524 | * trying the page allocator. Otherwise objects are always | ||
1525 | * freed to remote lists while the allocation side keeps | ||
1526 | * allocating new pages with only one object used in each | ||
1527 | * page. | ||
1528 | */ | ||
1529 | if (unlikely(!node_state(thisnode, N_HIGH_MEMORY))) | ||
1530 | object = __remote_slab_alloc(s, gfpflags, thisnode); | ||
1531 | #endif | ||
1532 | |||
1533 | if (!object) { | ||
1534 | object = cache_list_get_page(s, l); | ||
1535 | if (unlikely(!object)) { | ||
1536 | object = __slab_alloc_page(s, gfpflags, node); | ||
1537 | #ifdef CONFIG_NUMA | ||
1538 | if (unlikely(!object)) { | ||
1539 | node = numa_node_id(); | ||
1540 | goto try_remote; | ||
1541 | } | ||
1542 | #endif | ||
1543 | return object; | ||
1544 | } | ||
1545 | } | ||
1546 | } | ||
1547 | if (likely(object)) | ||
1548 | slqb_stat_inc(l, ALLOC); | ||
1549 | return object; | ||
1550 | } | ||
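/*
 * Illustrative summary (not kernel code) of the order in which
 * __slab_alloc() above looks for an object:
 *
 *   1. this CPU's LIFO freelist, including any remotely freed objects
 *      claimed back via claim_remote_free_list()
 *   2. a partially used page already on this CPU's list
 *      (cache_list_get_page)
 *   3. a brand new page from the page allocator (__slab_alloc_page)
 *   4. on NUMA, other nodes' lists (__remote_slab_alloc), which is also
 *      tried first when an explicit remote node was requested
 */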
1551 | |||
1552 | /* | ||
1553 | * Perform some interrupts-on processing around the main allocation path | ||
1554 | * (debug checking and memset()ing). | ||
1555 | */ | ||
1556 | static __always_inline void *slab_alloc(struct kmem_cache *s, | ||
1557 | gfp_t gfpflags, int node, unsigned long addr) | ||
1558 | { | ||
1559 | void *object; | ||
1560 | unsigned long flags; | ||
1561 | |||
1562 | gfpflags &= gfp_allowed_mask; | ||
1563 | |||
1564 | lockdep_trace_alloc(gfpflags); | ||
1565 | might_sleep_if(gfpflags & __GFP_WAIT); | ||
1566 | |||
1567 | if (should_failslab(s->objsize, gfpflags, s->flags)) | ||
1568 | return NULL; | ||
1569 | |||
1570 | again: | ||
1571 | local_irq_save(flags); | ||
1572 | object = __slab_alloc(s, gfpflags, node); | ||
1573 | local_irq_restore(flags); | ||
1574 | |||
1575 | if (unlikely(slab_debug(s)) && likely(object)) { | ||
1576 | if (unlikely(!alloc_debug_processing(s, object, addr))) | ||
1577 | goto again; | ||
1578 | } | ||
1579 | |||
1580 | if (unlikely(gfpflags & __GFP_ZERO) && likely(object)) | ||
1581 | memset(object, 0, s->objsize); | ||
1582 | |||
1583 | return object; | ||
1584 | } | ||
1585 | |||
1586 | static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s, | ||
1587 | gfp_t gfpflags, unsigned long caller) | ||
1588 | { | ||
1589 | int node = -1; | ||
1590 | |||
1591 | #ifdef CONFIG_NUMA | ||
1592 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) | ||
1593 | node = alternate_nid(s, gfpflags, node); | ||
1594 | #endif | ||
1595 | return slab_alloc(s, gfpflags, node, caller); | ||
1596 | } | ||
1597 | |||
1598 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | ||
1599 | { | ||
1600 | return __kmem_cache_alloc(s, gfpflags, _RET_IP_); | ||
1601 | } | ||
1602 | EXPORT_SYMBOL(kmem_cache_alloc); | ||
1603 | |||
1604 | #ifdef CONFIG_NUMA | ||
1605 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | ||
1606 | { | ||
1607 | return slab_alloc(s, gfpflags, node, _RET_IP_); | ||
1608 | } | ||
1609 | EXPORT_SYMBOL(kmem_cache_alloc_node); | ||
1610 | #endif | ||
1611 | |||
1612 | #ifdef CONFIG_SMP | ||
1613 | /* | ||
1614 | * Flush this CPU's remote free list of objects back to the list from where | ||
1615 | * they originate. They end up on that list's remotely freed list, and | ||
1616 | * eventually we set its remote_free_check if there are enough objects on it. | ||
1617 | * | ||
1618 | * This seems convoluted, but it keeps us from stomping on the target CPU's | ||
1619 | * fastpath cachelines. | ||
1620 | * | ||
1621 | * Must be called with interrupts disabled. | ||
1622 | */ | ||
1623 | static void flush_remote_free_cache(struct kmem_cache *s, | ||
1624 | struct kmem_cache_cpu *c) | ||
1625 | { | ||
1626 | struct kmlist *src; | ||
1627 | struct kmem_cache_list *dst; | ||
1628 | unsigned int nr; | ||
1629 | int set; | ||
1630 | |||
1631 | src = &c->rlist; | ||
1632 | nr = src->nr; | ||
1633 | if (unlikely(!nr)) | ||
1634 | return; | ||
1635 | |||
1636 | #ifdef CONFIG_SLQB_STATS | ||
1637 | { | ||
1638 | struct kmem_cache_list *l = &c->list; | ||
1639 | |||
1640 | slqb_stat_inc(l, FLUSH_RFREE_LIST); | ||
1641 | slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr); | ||
1642 | } | ||
1643 | #endif | ||
1644 | |||
1645 | dst = c->remote_cache_list; | ||
1646 | |||
1647 | /* | ||
1648 | * Less common case, dst is filling up so free synchronously. | ||
1649 | * No point in having the remote CPU free these as it will just | ||
1650 | * free them back to the page list anyway. | ||
1651 | */ | ||
1652 | if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) { | ||
1653 | void **head; | ||
1654 | |||
1655 | head = src->head; | ||
1656 | spin_lock(&dst->page_lock); | ||
1657 | do { | ||
1658 | struct slqb_page *page; | ||
1659 | void **object; | ||
1660 | |||
1661 | object = head; | ||
1662 | VM_BUG_ON(!object); | ||
1663 | head = get_freepointer(s, object); | ||
1664 | page = virt_to_head_slqb_page(object); | ||
1665 | |||
1666 | free_object_to_page(s, dst, page, object); | ||
1667 | nr--; | ||
1668 | } while (nr); | ||
1669 | spin_unlock(&dst->page_lock); | ||
1670 | |||
1671 | src->head = NULL; | ||
1672 | src->tail = NULL; | ||
1673 | src->nr = 0; | ||
1674 | |||
1675 | return; | ||
1676 | } | ||
1677 | |||
1678 | spin_lock(&dst->remote_free.lock); | ||
1679 | |||
1680 | if (!dst->remote_free.list.head) | ||
1681 | dst->remote_free.list.head = src->head; | ||
1682 | else | ||
1683 | set_freepointer(s, dst->remote_free.list.tail, src->head); | ||
1684 | dst->remote_free.list.tail = src->tail; | ||
1685 | |||
1686 | src->head = NULL; | ||
1687 | src->tail = NULL; | ||
1688 | src->nr = 0; | ||
1689 | |||
1690 | if (dst->remote_free.list.nr < slab_freebatch(s)) | ||
1691 | set = 1; | ||
1692 | else | ||
1693 | set = 0; | ||
1694 | |||
1695 | dst->remote_free.list.nr += nr; | ||
1696 | |||
1697 | if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set)) | ||
1698 | dst->remote_free_check = 1; | ||
1699 | |||
1700 | spin_unlock(&dst->remote_free.lock); | ||
1701 | } | ||
1702 | |||
1703 | /* | ||
1704 | * Free an object to this CPU's remote free list. | ||
1705 | * | ||
1706 | * Must be called with interrupts disabled. | ||
1707 | */ | ||
1708 | static noinline void slab_free_to_remote(struct kmem_cache *s, | ||
1709 | struct slqb_page *page, void *object, | ||
1710 | struct kmem_cache_cpu *c) | ||
1711 | { | ||
1712 | struct kmlist *r; | ||
1713 | |||
1714 | /* | ||
1715 | * Our remote free list corresponds to a different list. Must | ||
1716 | * flush it and switch. | ||
1717 | */ | ||
1718 | if (page->list != c->remote_cache_list) { | ||
1719 | flush_remote_free_cache(s, c); | ||
1720 | c->remote_cache_list = page->list; | ||
1721 | } | ||
1722 | |||
1723 | r = &c->rlist; | ||
1724 | if (!r->head) | ||
1725 | r->head = object; | ||
1726 | else | ||
1727 | set_freepointer(s, r->tail, object); | ||
1728 | set_freepointer(s, object, NULL); | ||
1729 | r->tail = object; | ||
1730 | r->nr++; | ||
1731 | |||
1732 | if (unlikely(r->nr >= slab_freebatch(s))) | ||
1733 | flush_remote_free_cache(s, c); | ||
1734 | } | ||
1735 | #endif | ||
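/*
 * Illustrative summary (not kernel code) of the cross-CPU free path
 * implemented above:
 *
 *   CPU A frees an object that belongs to CPU B's list
 *     -> slab_free_to_remote() queues it on CPU A's private c->rlist
 *     -> flush_remote_free_cache() later splices that batch onto B's
 *        list->remote_free and may set remote_free_check
 *     -> CPU B eventually runs claim_remote_free_list() and moves the
 *        batch onto its own LIFO freelist
 */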
1736 | |||
1737 | /* | ||
1738 | * Main freeing path. | ||
1739 | * | ||
1740 | * Must be called with interrupts disabled. | ||
1741 | */ | ||
1742 | static __always_inline void __slab_free(struct kmem_cache *s, | ||
1743 | struct slqb_page *page, void *object) | ||
1744 | { | ||
1745 | struct kmem_cache_cpu *c; | ||
1746 | struct kmem_cache_list *l; | ||
1747 | int thiscpu = smp_processor_id(); | ||
1748 | |||
1749 | c = get_cpu_slab(s, thiscpu); | ||
1750 | l = &c->list; | ||
1751 | |||
1752 | slqb_stat_inc(l, FREE); | ||
1753 | |||
1754 | if (!NUMA_BUILD || !slab_numa(s) || | ||
1755 | likely(slqb_page_to_nid(page) == numa_node_id())) { | ||
1756 | /* | ||
1757 | * Freeing fastpath. Collects all local-node objects, not | ||
1758 | * just those allocated from our per-CPU list. This allows | ||
1759 | * fast transfer of objects from one CPU to another within | ||
1760 | * a given node. | ||
1761 | */ | ||
1762 | set_freepointer(s, object, l->freelist.head); | ||
1763 | l->freelist.head = object; | ||
1764 | if (!l->freelist.nr) | ||
1765 | l->freelist.tail = object; | ||
1766 | l->freelist.nr++; | ||
1767 | |||
1768 | if (unlikely(l->freelist.nr > slab_hiwater(s))) | ||
1769 | flush_free_list(s, l); | ||
1770 | |||
1771 | } else { | ||
1772 | #ifdef CONFIG_SMP | ||
1773 | /* | ||
1774 | * Freeing an object that was allocated on a remote node. | ||
1775 | */ | ||
1776 | slab_free_to_remote(s, page, object, c); | ||
1777 | slqb_stat_inc(l, FREE_REMOTE); | ||
1778 | #endif | ||
1779 | } | ||
1780 | } | ||
1781 | |||
1782 | /* | ||
1783 | * Perform some interrupts-on processing around the main freeing path | ||
1784 | * (debug checking). | ||
1785 | */ | ||
1786 | static __always_inline void slab_free(struct kmem_cache *s, | ||
1787 | struct slqb_page *page, void *object) | ||
1788 | { | ||
1789 | unsigned long flags; | ||
1790 | |||
1791 | prefetchw(object); | ||
1792 | |||
1793 | debug_check_no_locks_freed(object, s->objsize); | ||
1794 | if (likely(object) && unlikely(slab_debug(s))) { | ||
1795 | if (unlikely(!free_debug_processing(s, object, _RET_IP_))) | ||
1796 | return; | ||
1797 | } | ||
1798 | |||
1799 | local_irq_save(flags); | ||
1800 | __slab_free(s, page, object); | ||
1801 | local_irq_restore(flags); | ||
1802 | } | ||
1803 | |||
1804 | void kmem_cache_free(struct kmem_cache *s, void *object) | ||
1805 | { | ||
1806 | struct slqb_page *page = NULL; | ||
1807 | |||
1808 | if (slab_numa(s)) | ||
1809 | page = virt_to_head_slqb_page(object); | ||
1810 | slab_free(s, page, object); | ||
1811 | } | ||
1812 | EXPORT_SYMBOL(kmem_cache_free); | ||
1813 | |||
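/*
 * Illustrative driver-side usage of the allocation/free API exported
 * above.  kmem_cache_create() is not shown in this hunk but is the
 * standard cache creation interface; the "foo" names are made up for
 * the example.
 */
#include <linux/errno.h>
#include <linux/slab.h>

struct foo {
	int a, b;
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static void foo_cache_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

	if (f) {
		f->a = f->b = 0;
		kmem_cache_free(foo_cachep, f);
	}
}
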
1814 | /* | ||
1815 | * Calculate the order of allocation given a slab object size. | ||
1816 | * | ||
1817 | * Order 0 allocations are preferred since order 0 does not cause fragmentation | ||
1818 | * in the page allocator, and they have fastpaths in the page allocator. But | ||
1819 | * we also want to minimise external fragmentation with large objects. | ||
1820 | */ | ||
1821 | static int slab_order(int size, int max_order, int frac) | ||
1822 | { | ||
1823 | int order; | ||
1824 | |||
1825 | if (fls(size - 1) <= PAGE_SHIFT) | ||
1826 | order = 0; | ||
1827 | else | ||
1828 | order = fls(size - 1) - PAGE_SHIFT; | ||
1829 | if (order < slqb_min_order) | ||
1830 | order = slqb_min_order; | ||
1831 | |||
1832 | while (order <= max_order) { | ||
1833 | unsigned long slab_size = PAGE_SIZE << order; | ||
1834 | unsigned long objects; | ||
1835 | unsigned long waste; | ||
1836 | |||
1837 | objects = slab_size / size; | ||
1838 | if (!objects) | ||
1839 | goto next; | ||
1840 | |||
1841 | if (order < MAX_ORDER && objects < slqb_min_objects) { | ||
1842 | /* | ||
1843 | * If we don't have enough objects for slqb_min_objects, | ||
1844 | * try the next size up, unless we have reached our | ||
1845 | * maximum possible page size. | ||
1846 | */ | ||
1847 | goto next; | ||
1848 | } | ||
1849 | |||
1850 | waste = slab_size - (objects * size); | ||
1851 | |||
1852 | if (waste * frac <= slab_size) | ||
1853 | break; | ||
1854 | |||
1855 | next: | ||
1856 | order++; | ||
1857 | } | ||
1858 | |||
1859 | return order; | ||
1860 | } | ||
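/*
 * Worked example (illustrative, assuming 4K pages and that slqb_min_order
 * and slqb_min_objects are left at their defaults and do not force a
 * larger order):
 *
 *   size = 192:  fls(191) = 8 <= PAGE_SHIFT, so start at order 0.  An
 *                order-0 slab fits 4096/192 = 21 objects and wastes 64
 *                bytes; with frac = 4, 64 * 4 <= 4096, so order 0 is used.
 *
 *   size = 1400: order 0 fits only 2 objects and wastes 1296 bytes
 *                (1296 * 4 > 4096), so the loop moves on to order 1,
 *                which fits 5 objects, wastes 1192 bytes
 *                (1192 * 4 <= 8192) and is accepted.
 */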
1861 | |||
1862 | static int calculate_order(int size) | ||
1863 | { | ||
1864 | int order; | ||
1865 | |||
1866 | /* | ||
1867 | * Attempt to find best configuration for a slab. This | ||
1868 | * works by first attempting to generate a layout with | ||
1869 | * the best configuration and backing off gradually. | ||
1870 | */ | ||
1871 | order = slab_order(size, 1, 4); | ||
1872 | if (order <= 1) | ||
1873 | return order; | ||
1874 | |||
1875 | /* | ||
1876 | * This size cannot fit in order-1. Allow bigger orders, but | ||
1877 | * forget about trying to save space. | ||
1878 | */ | ||
1879 | order = slab_order(size, MAX_ORDER - 1, 0); | ||
1880 | if (order < MAX_ORDER) | ||
1881 | return order; | ||
1882 | |||
1883 | return -ENOSYS; | ||
1884 | } | ||
1885 | |||
1886 | /* | ||
1887 | * Figure out what the alignment of the objects will be. | ||
1888 | */ | ||
1889 | static unsigned long calculate_alignment(unsigned long flags, | ||
1890 | unsigned long align, unsigned long size) | ||
1891 | { | ||
1892 | /* | ||
1893 | * If the user wants hardware cache aligned objects then follow that | ||
1894 | * suggestion if the object is sufficiently large. | ||
1895 | * | ||
1896 | * The hardware cache alignment cannot override the specified | ||
1897 | * alignment though. If that is greater, then use it. | ||
1898 | */ | ||
1899 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
1900 | unsigned long ralign = cache_line_size(); | ||
1901 | |||
1902 | while (size <= ralign / 2) | ||
1903 | ralign /= 2; | ||
1904 | align = max(align, ralign); | ||
1905 | } | ||
1906 | |||
1907 | if (align < ARCH_SLAB_MINALIGN) | ||
1908 | align = ARCH_SLAB_MINALIGN; | ||
1909 | |||
1910 | return ALIGN(align, sizeof(void *)); | ||
1911 | } | ||
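/*
 * Worked example (illustrative): with SLAB_HWCACHE_ALIGN, a 64-byte cache
 * line and a 20-byte object, ralign is halved while the object still fits
 * in half of it (64 -> 32, since 20 <= 32 but 20 > 16), so the object ends
 * up aligned to 32 bytes rather than to a full cache line.
 */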
1912 | |||
1913 | static void init_kmem_cache_list(struct kmem_cache *s, | ||
1914 | struct kmem_cache_list *l) | ||
1915 | { | ||
1916 | l->cache = s; | ||
1917 | l->freelist.nr = 0; | ||
1918 | l->freelist.head = NULL; | ||
1919 | l->freelist.tail = NULL; | ||
1920 | l->nr_partial = 0; | ||
1921 | l->nr_slabs = 0; | ||
1922 | INIT_LIST_HEAD(&l->partial); | ||
1923 | spin_lock_init(&l->page_lock); | ||
1924 | |||
1925 | #ifdef CONFIG_SMP | ||
1926 | l->remote_free_check = 0; | ||
1927 | spin_lock_init(&l->remote_free.lock); | ||
1928 | l->remote_free.list.nr = 0; | ||
1929 | l->remote_free.list.head = NULL; | ||
1930 | l->remote_free.list.tail = NULL; | ||
1931 | #endif | ||
1932 | |||
1933 | #ifdef CONFIG_SLQB_STATS | ||
1934 | memset(l->stats, 0, sizeof(l->stats)); | ||
1935 | #endif | ||
1936 | } | ||
1937 | |||
1938 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
1939 | struct kmem_cache_cpu *c) | ||
1940 | { | ||
1941 | init_kmem_cache_list(s, &c->list); | ||
1942 | |||
1943 | c->colour_next = 0; | ||
1944 | #ifdef CONFIG_SMP | ||
1945 | c->rlist.nr = 0; | ||
1946 | c->rlist.head = NULL; | ||
1947 | c->rlist.tail = NULL; | ||
1948 | c->remote_cache_list = NULL; | ||
1949 | #endif | ||
1950 | } | ||
1951 | |||
1952 | #ifdef CONFIG_NUMA | ||
1953 | static void init_kmem_cache_node(struct kmem_cache *s, | ||
1954 | struct kmem_cache_node *n) | ||
1955 | { | ||
1956 | spin_lock_init(&n->list_lock); | ||
1957 | init_kmem_cache_list(s, &n->list); | ||
1958 | } | ||
1959 | #endif | ||
1960 | |||
1961 | /* Initial slabs. */ | ||
1962 | #ifdef CONFIG_SMP | ||
1963 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus); | ||
1964 | #endif | ||
1965 | #ifdef CONFIG_NUMA | ||
1966 | /* XXX: really need a DEFINE_PER_NODE for per-node data because a static | ||
1967 | * array is wasteful */ | ||
1968 | static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES]; | ||
1969 | #endif | ||
1970 | |||
1971 | #ifdef CONFIG_SMP | ||
1972 | static struct kmem_cache kmem_cpu_cache; | ||
1973 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus); | ||
1974 | #ifdef CONFIG_NUMA | ||
1975 | static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */ | ||
1976 | #endif | ||
1977 | #endif | ||
1978 | |||
1979 | #ifdef CONFIG_NUMA | ||
1980 | static struct kmem_cache kmem_node_cache; | ||
1981 | #ifdef CONFIG_SMP | ||
1982 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus); | ||
1983 | #endif | ||
1984 | static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */ | ||
1985 | #endif | ||
1986 | |||
1987 | #ifdef CONFIG_SMP | ||
1988 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
1989 | int cpu) | ||
1990 | { | ||
1991 | struct kmem_cache_cpu *c; | ||
1992 | int node; | ||
1993 | |||
1994 | node = cpu_to_node(cpu); | ||
1995 | |||
1996 | c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node); | ||
1997 | if (!c) | ||
1998 | return NULL; | ||
1999 | |||
2000 | init_kmem_cache_cpu(s, c); | ||
2001 | return c; | ||
2002 | } | ||
2003 | |||
2004 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
2005 | { | ||
2006 | int cpu; | ||
2007 | |||
2008 | for_each_online_cpu(cpu) { | ||
2009 | struct kmem_cache_cpu *c; | ||
2010 | |||
2011 | c = s->cpu_slab[cpu]; | ||
2012 | if (c) { | ||
2013 | kmem_cache_free(&kmem_cpu_cache, c); | ||
2014 | s->cpu_slab[cpu] = NULL; | ||
2015 | } | ||
2016 | } | ||
2017 | } | ||
2018 | |||
2019 | static int alloc_kmem_cache_cpus(struct kmem_cache *s) | ||
2020 | { | ||
2021 | int cpu; | ||
2022 | |||
2023 | for_each_online_cpu(cpu) { | ||
2024 | struct kmem_cache_cpu *c; | ||
2025 | |||
2026 | c = s->cpu_slab[cpu]; | ||
2027 | if (c) | ||
2028 | continue; | ||
2029 | |||
2030 | c = alloc_kmem_cache_cpu(s, cpu); | ||
2031 | if (!c) { | ||
2032 | free_kmem_cache_cpus(s); | ||
2033 | return 0; | ||
2034 | } | ||
2035 | s->cpu_slab[cpu] = c; | ||
2036 | } | ||
2037 | return 1; | ||
2038 | } | ||
2039 | |||
2040 | #else | ||
2041 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) | ||
2042 | { | ||
2043 | } | ||
2044 | |||
2045 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) | ||
2046 | { | ||
2047 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
2048 | return 1; | ||
2049 | } | ||
2050 | #endif | ||
2051 | |||
2052 | #ifdef CONFIG_NUMA | ||
2053 | static void free_kmem_cache_nodes(struct kmem_cache *s) | ||
2054 | { | ||
2055 | int node; | ||
2056 | |||
2057 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2058 | struct kmem_cache_node *n; | ||
2059 | |||
2060 | n = s->node_slab[node]; | ||
2061 | if (n) { | ||
2062 | kmem_cache_free(&kmem_node_cache, n); | ||
2063 | s->node_slab[node] = NULL; | ||
2064 | } | ||
2065 | } | ||
2066 | } | ||
2067 | |||
2068 | static int alloc_kmem_cache_nodes(struct kmem_cache *s) | ||
2069 | { | ||
2070 | int node; | ||
2071 | |||
2072 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2073 | struct kmem_cache_node *n; | ||
2074 | |||
2075 | n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node); | ||
2076 | if (!n) { | ||
2077 | free_kmem_cache_nodes(s); | ||
2078 | return 0; | ||
2079 | } | ||
2080 | init_kmem_cache_node(s, n); | ||
2081 | s->node_slab[node] = n; | ||
2082 | } | ||
2083 | return 1; | ||
2084 | } | ||
2085 | #else | ||
2086 | static void free_kmem_cache_nodes(struct kmem_cache *s) | ||
2087 | { | ||
2088 | } | ||
2089 | |||
2090 | static int alloc_kmem_cache_nodes(struct kmem_cache *s) | ||
2091 | { | ||
2092 | return 1; | ||
2093 | } | ||
2094 | #endif | ||
2095 | |||
2096 | /* | ||
2097 | * calculate_sizes() determines the order and the distribution of data within | ||
2098 | * a slab object. | ||
2099 | */ | ||
2100 | static int calculate_sizes(struct kmem_cache *s) | ||
2101 | { | ||
2102 | unsigned long flags = s->flags; | ||
2103 | unsigned long size = s->objsize; | ||
2104 | unsigned long align = s->align; | ||
2105 | |||
2106 | /* | ||
2107 | * Determine if we can poison the object itself. If the user of | ||
2108 | * the slab may touch the object after free or before allocation | ||
2109 | * then we should never poison the object itself. | ||
2110 | */ | ||
2111 | if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor) | ||
2112 | s->flags |= __OBJECT_POISON; | ||
2113 | else | ||
2114 | s->flags &= ~__OBJECT_POISON; | ||
2115 | |||
2116 | /* | ||
2117 | * Round up object size to the next word boundary. We can only | ||
2118 | * place the free pointer at word boundaries and this determines | ||
2119 | * the possible location of the free pointer. | ||
2120 | */ | ||
2121 | size = ALIGN(size, sizeof(void *)); | ||
2122 | |||
2123 | #ifdef CONFIG_SLQB_DEBUG | ||
2124 | /* | ||
2125 | * If we are Redzoning then check if there is some space between the | ||
2126 | * end of the object and the free pointer. If not then add an | ||
2127 | * additional word to have some bytes to store Redzone information. | ||
2128 | */ | ||
2129 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | ||
2130 | size += sizeof(void *); | ||
2131 | #endif | ||
2132 | |||
2133 | /* | ||
2134 | * With that we have determined the number of bytes in actual use | ||
2135 | * by the object. This is the potential offset to the free pointer. | ||
2136 | */ | ||
2137 | s->inuse = size; | ||
2138 | |||
2139 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) { | ||
2140 | /* | ||
2141 | * Relocate free pointer after the object if it is not | ||
2142 | * permitted to overwrite the first word of the object on | ||
2143 | * kmem_cache_free. | ||
2144 | * | ||
2145 | * This is the case if we do RCU, have a constructor or | ||
2146 | * destructor or are poisoning the objects. | ||
2147 | */ | ||
2148 | s->offset = size; | ||
2149 | size += sizeof(void *); | ||
2150 | } | ||
2151 | |||
2152 | #ifdef CONFIG_SLQB_DEBUG | ||
2153 | if (flags & SLAB_STORE_USER) { | ||
2154 | /* | ||
2155 | * Need to store information about allocs and frees after | ||
2156 | * the object. | ||
2157 | */ | ||
2158 | size += 2 * sizeof(struct track); | ||
2159 | } | ||
2160 | |||
2161 | if (flags & SLAB_RED_ZONE) { | ||
2162 | /* | ||
2163 | * Add some empty padding so that we can catch | ||
2164 | * overwrites from earlier objects rather than let | ||
2165 | * tracking information or the free pointer be | ||
2166 | * corrupted if a user writes before the start | ||
2167 | * of the object. | ||
2168 | */ | ||
2169 | size += sizeof(void *); | ||
2170 | } | ||
2171 | #endif | ||
2172 | |||
2173 | /* | ||
2174 | * Determine the alignment based on various parameters that the | ||
2175 | * user specified and the dynamic determination of cache line size | ||
2176 | * on bootup. | ||
2177 | */ | ||
2178 | align = calculate_alignment(flags, align, s->objsize); | ||
2179 | |||
2180 | /* | ||
2181 | * SLQB stores one object immediately after another beginning from | ||
2182 | * offset 0. In order to align the objects we have to simply size | ||
2183 | * each object to conform to the alignment. | ||
2184 | */ | ||
2185 | size = ALIGN(size, align); | ||
2186 | s->size = size; | ||
2187 | s->order = calculate_order(size); | ||
2188 | |||
2189 | if (s->order < 0) | ||
2190 | return 0; | ||
2191 | |||
2192 | s->allocflags = 0; | ||
2193 | if (s->order) | ||
2194 | s->allocflags |= __GFP_COMP; | ||
2195 | |||
2196 | if (s->flags & SLAB_CACHE_DMA) | ||
2197 | s->allocflags |= SLQB_DMA; | ||
2198 | |||
2199 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | ||
2200 | s->allocflags |= __GFP_RECLAIMABLE; | ||
2201 | |||
2202 | /* | ||
2203 | * Determine the number of objects per slab | ||
2204 | */ | ||
2205 | s->objects = (PAGE_SIZE << s->order) / size; | ||
2206 | |||
2207 | s->freebatch = max(4UL*PAGE_SIZE / size, | ||
2208 | min(256UL, 64*PAGE_SIZE / size)); | ||
2209 | if (!s->freebatch) | ||
2210 | s->freebatch = 1; | ||
2211 | s->hiwater = s->freebatch << 2; | ||
2212 | |||
2213 | return !!s->objects; | ||
2214 | |||
2215 | } | ||
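/*
 * Worked example (illustrative, no debug flags, no constructor, 4K pages,
 * 64-byte cache lines): a 20-byte SLAB_HWCACHE_ALIGN object is first
 * rounded up to 24 bytes, the free pointer can live in the first word of a
 * free object (no relocation, so s->inuse = 24), calculate_alignment()
 * yields 32, and so s->size = 32, s->order = 0 and s->objects = 128.  The
 * batching parameters then come out as s->freebatch = 512 and
 * s->hiwater = 2048.
 */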
2216 | |||
2217 | #ifdef CONFIG_SMP | ||
2218 | /* | ||
2219 | * The per-cpu allocator can't be used because it itself uses the slab | ||
2220 | * allocator, and it can't do per-node allocations. | ||
2221 | */ | ||
2222 | static void *kmem_cache_dyn_array_alloc(int ids) | ||
2223 | { | ||
2224 | size_t size = sizeof(void *) * ids; | ||
2225 | |||
2226 | BUG_ON(!size); | ||
2227 | |||
2228 | if (unlikely(!slab_is_available())) { | ||
2229 | static void *nextmem; | ||
2230 | static size_t nextleft; | ||
2231 | void *ret; | ||
2232 | |||
2233 | /* | ||
2234 | * Special case for setting up initial caches. These will | ||
2235 | * never get freed by definition so we can do it rather | ||
2236 | * simply. | ||
2237 | */ | ||
2238 | if (size > nextleft) { | ||
2239 | nextmem = alloc_pages_exact(size, GFP_KERNEL); | ||
2240 | if (!nextmem) | ||
2241 | return NULL; | ||
2242 | nextleft = roundup(size, PAGE_SIZE); | ||
2243 | } | ||
2244 | |||
2245 | ret = nextmem; | ||
2246 | nextleft -= size; | ||
2247 | nextmem += size; | ||
2248 | memset(ret, 0, size); | ||
2249 | return ret; | ||
2250 | } else { | ||
2251 | return kzalloc(size, GFP_KERNEL); | ||
2252 | } | ||
2253 | } | ||
2254 | |||
2255 | static void kmem_cache_dyn_array_free(void *array) | ||
2256 | { | ||
2257 | if (unlikely(!slab_is_available())) | ||
2258 | return; /* error case without crashing here (will panic soon) */ | ||
2259 | kfree(array); | ||
2260 | } | ||
2261 | #endif | ||
2262 | |||
2263 | /* | ||
2264 | * Except in early boot, this should be called with slqb_lock held for write | ||
2265 | * to lock out hotplug, and protect list modifications. | ||
2266 | */ | ||
2267 | static int kmem_cache_open(struct kmem_cache *s, | ||
2268 | const char *name, size_t size, size_t align, | ||
2269 | unsigned long flags, void (*ctor)(void *), int alloc) | ||
2270 | { | ||
2271 | unsigned int left_over; | ||
2272 | |||
2273 | memset(s, 0, sizeof(struct kmem_cache)); | ||
2274 | s->name = name; | ||
2275 | s->ctor = ctor; | ||
2276 | s->objsize = size; | ||
2277 | s->align = align; | ||
2278 | s->flags = kmem_cache_flags(size, flags, name, ctor); | ||
2279 | |||
2280 | if (!calculate_sizes(s)) | ||
2281 | goto error; | ||
2282 | |||
2283 | if (!slab_debug(s)) { | ||
2284 | left_over = (PAGE_SIZE << s->order) - (s->objects * s->size); | ||
2285 | s->colour_off = max(cache_line_size(), s->align); | ||
2286 | s->colour_range = left_over; | ||
2287 | } else { | ||
2288 | s->colour_off = 0; | ||
2289 | s->colour_range = 0; | ||
2290 | } | ||
2291 | |||
2292 | #ifdef CONFIG_SMP | ||
2293 | s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids); | ||
2294 | if (!s->cpu_slab) | ||
2295 | goto error; | ||
2296 | # ifdef CONFIG_NUMA | ||
2297 | s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids); | ||
2298 | if (!s->node_slab) | ||
2299 | goto error_cpu_array; | ||
2300 | # endif | ||
2301 | #endif | ||
2302 | |||
2303 | if (likely(alloc)) { | ||
2304 | if (!alloc_kmem_cache_nodes(s)) | ||
2305 | goto error_node_array; | ||
2306 | |||
2307 | if (!alloc_kmem_cache_cpus(s)) | ||
2308 | goto error_nodes; | ||
2309 | } | ||
2310 | |||
2311 | sysfs_slab_add(s); | ||
2312 | list_add(&s->list, &slab_caches); | ||
2313 | |||
2314 | return 1; | ||
2315 | |||
2316 | error_nodes: | ||
2317 | free_kmem_cache_nodes(s); | ||
2318 | error_node_array: | ||
2319 | #if defined(CONFIG_NUMA) && defined(CONFIG_SMP) | ||
2320 | kmem_cache_dyn_array_free(s->node_slab); | ||
2321 | error_cpu_array: | ||
2322 | #endif | ||
2323 | #ifdef CONFIG_SMP | ||
2324 | kmem_cache_dyn_array_free(s->cpu_slab); | ||
2325 | #endif | ||
2326 | error: | ||
2327 | if (flags & SLAB_PANIC) | ||
2328 | panic("%s: failed to create slab `%s'\n", __func__, name); | ||
2329 | return 0; | ||
2330 | } | ||
2331 | |||
2332 | /** | ||
2333 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. | ||
2334 | * @s: the cache we're checking against | ||
2335 | * @ptr: pointer to validate | ||
2336 | * | ||
2337 | * This verifies that the untrusted pointer looks sane; | ||
2338 | * it is _not_ a guarantee that the pointer is actually | ||
2339 | * part of the slab cache in question, but it at least | ||
2340 | * validates that the pointer can be dereferenced and | ||
2341 | * looks half-way sane. | ||
2342 | * | ||
2343 | * Currently only used for dentry validation. | ||
2344 | */ | ||
2345 | int kmem_ptr_validate(struct kmem_cache *s, const void *ptr) | ||
2346 | { | ||
2347 | unsigned long addr = (unsigned long)ptr; | ||
2348 | struct slqb_page *page; | ||
2349 | |||
2350 | if (unlikely(addr < PAGE_OFFSET)) | ||
2351 | goto out; | ||
2352 | if (unlikely(addr > (unsigned long)high_memory - s->size)) | ||
2353 | goto out; | ||
2354 | if (unlikely(!IS_ALIGNED(addr, s->align))) | ||
2355 | goto out; | ||
2356 | if (unlikely(!kern_addr_valid(addr))) | ||
2357 | goto out; | ||
2358 | if (unlikely(!kern_addr_valid(addr + s->size - 1))) | ||
2359 | goto out; | ||
2360 | if (unlikely(!pfn_valid(addr >> PAGE_SHIFT))) | ||
2361 | goto out; | ||
2362 | page = virt_to_head_slqb_page(ptr); | ||
2363 | if (unlikely(!(page->flags & PG_SLQB_BIT))) | ||
2364 | goto out; | ||
2365 | if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */ | ||
2366 | goto out; | ||
2367 | return 1; | ||
2368 | out: | ||
2369 | return 0; | ||
2370 | } | ||
2371 | EXPORT_SYMBOL(kmem_ptr_validate); | ||
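/*
 * Illustrative caller-side sketch (not part of this patch): how code such
 * as the dcache can use kmem_ptr_validate() to sanity-check a possibly
 * stale pointer before dereferencing it.  'my_cache' and the surrounding
 * function are made up for the example.
 */
static int pointer_is_plausible(struct kmem_cache *my_cache, const void *p)
{
	if (!kmem_ptr_validate(my_cache, p))
		return 0;	/* not even a plausible object of my_cache */

	/* Only now is it reasonably safe to read the object's fields. */
	return 1;
}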
2372 | |||
2373 | /* | ||
2374 | * Determine the size of a slab object | ||
2375 | */ | ||
2376 | unsigned int kmem_cache_size(struct kmem_cache *s) | ||
2377 | { | ||
2378 | return s->objsize; | ||
2379 | } | ||
2380 | EXPORT_SYMBOL(kmem_cache_size); | ||
2381 | |||
2382 | const char *kmem_cache_name(struct kmem_cache *s) | ||
2383 | { | ||
2384 | return s->name; | ||
2385 | } | ||
2386 | EXPORT_SYMBOL(kmem_cache_name); | ||
2387 | |||
2388 | /* | ||
2389 | * Release all resources used by a slab cache. No more concurrency on the | ||
2390 | * slab, so we can touch remote kmem_cache_cpu structures. | ||
2391 | */ | ||
2392 | void kmem_cache_destroy(struct kmem_cache *s) | ||
2393 | { | ||
2394 | #ifdef CONFIG_NUMA | ||
2395 | int node; | ||
2396 | #endif | ||
2397 | int cpu; | ||
2398 | |||
2399 | down_write(&slqb_lock); | ||
2400 | list_del(&s->list); | ||
2401 | |||
2402 | local_irq_disable(); | ||
2403 | #ifdef CONFIG_SMP | ||
2404 | for_each_online_cpu(cpu) { | ||
2405 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2406 | struct kmem_cache_list *l = &c->list; | ||
2407 | |||
2408 | flush_free_list_all(s, l); | ||
2409 | flush_remote_free_cache(s, c); | ||
2410 | } | ||
2411 | #endif | ||
2412 | |||
2413 | for_each_online_cpu(cpu) { | ||
2414 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2415 | struct kmem_cache_list *l = &c->list; | ||
2416 | |||
2417 | claim_remote_free_list(s, l); | ||
2418 | flush_free_list_all(s, l); | ||
2419 | |||
2420 | WARN_ON(l->freelist.nr); | ||
2421 | WARN_ON(l->nr_slabs); | ||
2422 | WARN_ON(l->nr_partial); | ||
2423 | } | ||
2424 | |||
2425 | free_kmem_cache_cpus(s); | ||
2426 | |||
2427 | #ifdef CONFIG_NUMA | ||
2428 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2429 | struct kmem_cache_node *n; | ||
2430 | struct kmem_cache_list *l; | ||
2431 | |||
2432 | n = s->node_slab[node]; | ||
2433 | if (!n) | ||
2434 | continue; | ||
2435 | l = &n->list; | ||
2436 | |||
2437 | claim_remote_free_list(s, l); | ||
2438 | flush_free_list_all(s, l); | ||
2439 | |||
2440 | WARN_ON(l->freelist.nr); | ||
2441 | WARN_ON(l->nr_slabs); | ||
2442 | WARN_ON(l->nr_partial); | ||
2443 | } | ||
2444 | |||
2445 | free_kmem_cache_nodes(s); | ||
2446 | #endif | ||
2447 | local_irq_enable(); | ||
2448 | |||
2449 | sysfs_slab_remove(s); | ||
2450 | up_write(&slqb_lock); | ||
2451 | } | ||
2452 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
2453 | |||
2454 | /******************************************************************** | ||
2455 | * Kmalloc subsystem | ||
2456 | *******************************************************************/ | ||
2457 | |||
2458 | struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned; | ||
2459 | EXPORT_SYMBOL(kmalloc_caches); | ||
2460 | |||
2461 | #ifdef CONFIG_ZONE_DMA | ||
2462 | struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned; | ||
2463 | EXPORT_SYMBOL(kmalloc_caches_dma); | ||
2464 | #endif | ||
2465 | |||
2466 | #ifndef ARCH_KMALLOC_FLAGS | ||
2467 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | ||
2468 | #endif | ||
2469 | |||
2470 | static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s, | ||
2471 | const char *name, int size, gfp_t gfp_flags) | ||
2472 | { | ||
2473 | unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC; | ||
2474 | |||
2475 | if (gfp_flags & SLQB_DMA) | ||
2476 | flags |= SLAB_CACHE_DMA; | ||
2477 | |||
2478 | kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1); | ||
2479 | |||
2480 | return s; | ||
2481 | } | ||
2482 | |||
2483 | /* | ||
2484 | * Conversion table from small slab sizes / 8 to the index in the | ||
2485 | * kmalloc array. This is necessary for slabs < 192 since we have non-power- | ||
2486 | * of-two cache sizes there. The size of larger slabs can be determined using | ||
2487 | * fls. | ||
2488 | */ | ||
2489 | static s8 size_index[24] __cacheline_aligned = { | ||
2490 | 3, /* 8 */ | ||
2491 | 4, /* 16 */ | ||
2492 | 5, /* 24 */ | ||
2493 | 5, /* 32 */ | ||
2494 | 6, /* 40 */ | ||
2495 | 6, /* 48 */ | ||
2496 | 6, /* 56 */ | ||
2497 | 6, /* 64 */ | ||
2498 | #if L1_CACHE_BYTES < 64 | ||
2499 | 1, /* 72 */ | ||
2500 | 1, /* 80 */ | ||
2501 | 1, /* 88 */ | ||
2502 | 1, /* 96 */ | ||
2503 | #else | ||
2504 | 7, | ||
2505 | 7, | ||
2506 | 7, | ||
2507 | 7, | ||
2508 | #endif | ||
2509 | 7, /* 104 */ | ||
2510 | 7, /* 112 */ | ||
2511 | 7, /* 120 */ | ||
2512 | 7, /* 128 */ | ||
2513 | #if L1_CACHE_BYTES < 128 | ||
2514 | 2, /* 136 */ | ||
2515 | 2, /* 144 */ | ||
2516 | 2, /* 152 */ | ||
2517 | 2, /* 160 */ | ||
2518 | 2, /* 168 */ | ||
2519 | 2, /* 176 */ | ||
2520 | 2, /* 184 */ | ||
2521 | 2 /* 192 */ | ||
2522 | #else | ||
2523 | -1, | ||
2524 | -1, | ||
2525 | -1, | ||
2526 | -1, | ||
2527 | -1, | ||
2528 | -1, | ||
2529 | -1, | ||
2530 | -1 | ||
2531 | #endif | ||
2532 | }; | ||
2533 | |||
2534 | static struct kmem_cache *get_slab(size_t size, gfp_t flags) | ||
2535 | { | ||
2536 | int index; | ||
2537 | |||
2538 | if (unlikely(size <= KMALLOC_MIN_SIZE)) { | ||
2539 | if (unlikely(!size)) | ||
2540 | return ZERO_SIZE_PTR; | ||
2541 | |||
2542 | index = KMALLOC_SHIFT_LOW; | ||
2543 | goto got_index; | ||
2544 | } | ||
2545 | |||
2546 | #if L1_CACHE_BYTES >= 128 | ||
2547 | if (size <= 128) { | ||
2548 | #else | ||
2549 | if (size <= 192) { | ||
2550 | #endif | ||
2551 | index = size_index[(size - 1) / 8]; | ||
2552 | } else { | ||
2553 | if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH)) | ||
2554 | return NULL; | ||
2555 | |||
2556 | index = fls(size - 1); | ||
2557 | } | ||
2558 | |||
2559 | got_index: | ||
2560 | if (unlikely((flags & SLQB_DMA))) | ||
2561 | return &kmalloc_caches_dma[index]; | ||
2562 | else | ||
2563 | return &kmalloc_caches[index]; | ||
2564 | } | ||
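/*
 * Worked example (illustrative): kmalloc(100) takes the table path,
 * (100 - 1) / 8 = 12 and size_index[12] = 7, i.e. the 128-byte cache;
 * kmalloc(300) is above the table, so fls(299) = 9 selects the 512-byte
 * cache.
 */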
2565 | |||
2566 | void *__kmalloc(size_t size, gfp_t flags) | ||
2567 | { | ||
2568 | struct kmem_cache *s; | ||
2569 | |||
2570 | s = get_slab(size, flags); | ||
2571 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2572 | return s; | ||
2573 | |||
2574 | return __kmem_cache_alloc(s, flags, _RET_IP_); | ||
2575 | } | ||
2576 | EXPORT_SYMBOL(__kmalloc); | ||
2577 | |||
2578 | #ifdef CONFIG_NUMA | ||
2579 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
2580 | { | ||
2581 | struct kmem_cache *s; | ||
2582 | |||
2583 | s = get_slab(size, flags); | ||
2584 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2585 | return s; | ||
2586 | |||
2587 | return kmem_cache_alloc_node(s, flags, node); | ||
2588 | } | ||
2589 | EXPORT_SYMBOL(__kmalloc_node); | ||
2590 | #endif | ||
2591 | |||
2592 | size_t ksize(const void *object) | ||
2593 | { | ||
2594 | struct slqb_page *page; | ||
2595 | struct kmem_cache *s; | ||
2596 | |||
2597 | BUG_ON(!object); | ||
2598 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
2599 | return 0; | ||
2600 | |||
2601 | page = virt_to_head_slqb_page(object); | ||
2602 | BUG_ON(!(page->flags & PG_SLQB_BIT)); | ||
2603 | |||
2604 | s = page->list->cache; | ||
2605 | |||
2606 | /* | ||
2607 | * Debugging requires use of the padding between object | ||
2608 | * and whatever may come after it. | ||
2609 | */ | ||
2610 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | ||
2611 | return s->objsize; | ||
2612 | |||
2613 | /* | ||
2614 | * If we have the need to store the freelist pointer | ||
2615 | * back there or track user information then we can | ||
2616 | * only use the space before that information. | ||
2617 | */ | ||
2618 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | ||
2619 | return s->inuse; | ||
2620 | |||
2621 | /* | ||
2622 | * Else we can use all the padding etc for the allocation | ||
2623 | */ | ||
2624 | return s->size; | ||
2625 | } | ||
2626 | EXPORT_SYMBOL(ksize); | ||
2627 | |||
2628 | void kfree(const void *object) | ||
2629 | { | ||
2630 | struct kmem_cache *s; | ||
2631 | struct slqb_page *page; | ||
2632 | |||
2633 | if (unlikely(ZERO_OR_NULL_PTR(object))) | ||
2634 | return; | ||
2635 | |||
2636 | page = virt_to_head_slqb_page(object); | ||
2637 | s = page->list->cache; | ||
2638 | |||
2639 | slab_free(s, page, (void *)object); | ||
2640 | } | ||
2641 | EXPORT_SYMBOL(kfree); | ||
2642 | |||
2643 | static void kmem_cache_trim_percpu(void *arg) | ||
2644 | { | ||
2645 | int cpu = smp_processor_id(); | ||
2646 | struct kmem_cache *s = arg; | ||
2647 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2648 | struct kmem_cache_list *l = &c->list; | ||
2649 | |||
2650 | claim_remote_free_list(s, l); | ||
2651 | flush_free_list(s, l); | ||
2652 | #ifdef CONFIG_SMP | ||
2653 | flush_remote_free_cache(s, c); | ||
2654 | #endif | ||
2655 | } | ||
2656 | |||
2657 | int kmem_cache_shrink(struct kmem_cache *s) | ||
2658 | { | ||
2659 | #ifdef CONFIG_NUMA | ||
2660 | int node; | ||
2661 | #endif | ||
2662 | |||
2663 | on_each_cpu(kmem_cache_trim_percpu, s, 1); | ||
2664 | |||
2665 | #ifdef CONFIG_NUMA | ||
2666 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2667 | struct kmem_cache_node *n; | ||
2668 | struct kmem_cache_list *l; | ||
2669 | |||
2670 | n = s->node_slab[node]; | ||
2671 | if (!n) | ||
2672 | continue; | ||
2673 | l = &n->list; | ||
2674 | |||
2675 | spin_lock_irq(&n->list_lock); | ||
2676 | claim_remote_free_list(s, l); | ||
2677 | flush_free_list(s, l); | ||
2678 | spin_unlock_irq(&n->list_lock); | ||
2679 | } | ||
2680 | #endif | ||
2681 | |||
2682 | return 0; | ||
2683 | } | ||
2684 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
2685 | |||
2686 | #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) | ||
2687 | static void kmem_cache_reap_percpu(void *arg) | ||
2688 | { | ||
2689 | int cpu = smp_processor_id(); | ||
2690 | struct kmem_cache *s; | ||
2691 | long phase = (long)arg; | ||
2692 | |||
2693 | list_for_each_entry(s, &slab_caches, list) { | ||
2694 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2695 | struct kmem_cache_list *l = &c->list; | ||
2696 | |||
2697 | if (phase == 0) { | ||
2698 | flush_free_list_all(s, l); | ||
2699 | flush_remote_free_cache(s, c); | ||
2700 | } | ||
2701 | |||
2702 | if (phase == 1) { | ||
2703 | claim_remote_free_list(s, l); | ||
2704 | flush_free_list_all(s, l); | ||
2705 | } | ||
2706 | } | ||
2707 | } | ||
2708 | |||
2709 | static void kmem_cache_reap(void) | ||
2710 | { | ||
2711 | struct kmem_cache *s; | ||
2712 | int node; | ||
2713 | |||
2714 | down_read(&slqb_lock); | ||
2715 | on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1); | ||
2716 | on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1); | ||
2717 | |||
2718 | list_for_each_entry(s, &slab_caches, list) { | ||
2719 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2720 | struct kmem_cache_node *n; | ||
2721 | struct kmem_cache_list *l; | ||
2722 | |||
2723 | n = s->node_slab[node]; | ||
2724 | if (!n) | ||
2725 | continue; | ||
2726 | l = &n->list; | ||
2727 | |||
2728 | spin_lock_irq(&n->list_lock); | ||
2729 | claim_remote_free_list(s, l); | ||
2730 | flush_free_list_all(s, l); | ||
2731 | spin_unlock_irq(&n->list_lock); | ||
2732 | } | ||
2733 | } | ||
2734 | up_read(&slqb_lock); | ||
2735 | } | ||
2736 | #endif | ||
2737 | |||
2738 | static void cache_trim_worker(struct work_struct *w) | ||
2739 | { | ||
2740 | struct delayed_work *work = | ||
2741 | container_of(w, struct delayed_work, work); | ||
2742 | struct kmem_cache *s; | ||
2743 | |||
2744 | if (!down_read_trylock(&slqb_lock)) | ||
2745 | goto out; | ||
2746 | |||
2747 | list_for_each_entry(s, &slab_caches, list) { | ||
2748 | #ifdef CONFIG_NUMA | ||
2749 | int node = numa_node_id(); | ||
2750 | struct kmem_cache_node *n = s->node_slab[node]; | ||
2751 | |||
2752 | if (n) { | ||
2753 | struct kmem_cache_list *l = &n->list; | ||
2754 | |||
2755 | spin_lock_irq(&n->list_lock); | ||
2756 | claim_remote_free_list(s, l); | ||
2757 | flush_free_list(s, l); | ||
2758 | spin_unlock_irq(&n->list_lock); | ||
2759 | } | ||
2760 | #endif | ||
2761 | |||
2762 | local_irq_disable(); | ||
2763 | kmem_cache_trim_percpu(s); | ||
2764 | local_irq_enable(); | ||
2765 | } | ||
2766 | |||
2767 | up_read(&slqb_lock); | ||
2768 | out: | ||
2769 | schedule_delayed_work(work, round_jiffies_relative(3*HZ)); | ||
2770 | } | ||
2771 | |||
2772 | static DEFINE_PER_CPU(struct delayed_work, slqb_cache_trim_work); | ||
2773 | |||
2774 | static void __cpuinit start_cpu_timer(int cpu) | ||
2775 | { | ||
2776 | struct delayed_work *cache_trim_work = &per_cpu(slqb_cache_trim_work, | ||
2777 | cpu); | ||
2778 | |||
2779 | /* | ||
2780 | * When this gets called from do_initcalls via cpucache_init(), | ||
2781 | * init_workqueues() has already run, so keventd will be set up | ||
2782 | * at that time. | ||
2783 | */ | ||
2784 | if (keventd_up() && cache_trim_work->work.func == NULL) { | ||
2785 | INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker); | ||
2786 | schedule_delayed_work_on(cpu, cache_trim_work, | ||
2787 | __round_jiffies_relative(HZ, cpu)); | ||
2788 | } | ||
2789 | } | ||
2790 | |||
2791 | static int __init cpucache_init(void) | ||
2792 | { | ||
2793 | int cpu; | ||
2794 | |||
2795 | for_each_online_cpu(cpu) | ||
2796 | start_cpu_timer(cpu); | ||
2797 | |||
2798 | return 0; | ||
2799 | } | ||
2800 | device_initcall(cpucache_init); | ||
2801 | |||
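The trimming above is driven by a self-rearming per-CPU delayed work. A reduced sketch of the rearm pattern (editorial; demo_* names invented, the 3*HZ interval mirrors cache_trim_worker()):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct delayed_work demo_work;

static void demo_worker(struct work_struct *w)
{
	struct delayed_work *dwork = to_delayed_work(w);

	/* ... periodic housekeeping goes here ... */

	/* Re-arm; round_jiffies_relative() batches timer wakeups system-wide. */
	schedule_delayed_work(dwork, round_jiffies_relative(3 * HZ));
}

static void demo_start(void)
{
	INIT_DELAYED_WORK(&demo_work, demo_worker);
	schedule_delayed_work(&demo_work, round_jiffies_relative(HZ));
}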
2802 | #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) | ||
2803 | static void slab_mem_going_offline_callback(void *arg) | ||
2804 | { | ||
2805 | kmem_cache_reap(); | ||
2806 | } | ||
2807 | |||
2808 | static void slab_mem_offline_callback(void *arg) | ||
2809 | { | ||
2810 | /* XXX: should release structures, see CPU offline comment */ | ||
2811 | } | ||
2812 | |||
2813 | static int slab_mem_going_online_callback(void *arg) | ||
2814 | { | ||
2815 | struct kmem_cache *s; | ||
2816 | struct kmem_cache_node *n; | ||
2817 | struct memory_notify *marg = arg; | ||
2818 | int nid = marg->status_change_nid; | ||
2819 | int ret = 0; | ||
2820 | |||
2821 | /* | ||
2822 | * If the node's memory is already available, then kmem_cache_node is | ||
2823 | * already created. Nothing to do. | ||
2824 | */ | ||
2825 | if (nid < 0) | ||
2826 | return 0; | ||
2827 | |||
2828 | /* | ||
2829 | * We are bringing a node online. No memory is available yet. We must | ||
2830 | * allocate a kmem_cache_node structure in order to bring the node | ||
2831 | * online. | ||
2832 | */ | ||
2833 | down_write(&slqb_lock); | ||
2834 | list_for_each_entry(s, &slab_caches, list) { | ||
2835 | /* | ||
2836 | * XXX: kmem_cache_alloc_node will fall back to other nodes | ||
2837 | * since memory is not yet available from the node that | ||
2838 | * is brought up. | ||
2839 | */ | ||
2840 | if (s->node_slab[nid]) /* could be leftover from last online */ | ||
2841 | continue; | ||
2842 | n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL); | ||
2843 | if (!n) { | ||
2844 | ret = -ENOMEM; | ||
2845 | goto out; | ||
2846 | } | ||
2847 | init_kmem_cache_node(s, n); | ||
2848 | s->node_slab[nid] = n; | ||
2849 | } | ||
2850 | out: | ||
2851 | up_write(&slqb_lock); | ||
2852 | return ret; | ||
2853 | } | ||
2854 | |||
2855 | static int slab_memory_callback(struct notifier_block *self, | ||
2856 | unsigned long action, void *arg) | ||
2857 | { | ||
2858 | int ret = 0; | ||
2859 | |||
2860 | switch (action) { | ||
2861 | case MEM_GOING_ONLINE: | ||
2862 | ret = slab_mem_going_online_callback(arg); | ||
2863 | break; | ||
2864 | case MEM_GOING_OFFLINE: | ||
2865 | slab_mem_going_offline_callback(arg); | ||
2866 | break; | ||
2867 | case MEM_OFFLINE: | ||
2868 | case MEM_CANCEL_ONLINE: | ||
2869 | slab_mem_offline_callback(arg); | ||
2870 | break; | ||
2871 | case MEM_ONLINE: | ||
2872 | case MEM_CANCEL_OFFLINE: | ||
2873 | break; | ||
2874 | } | ||
2875 | |||
2876 | if (ret) | ||
2877 | ret = notifier_from_errno(ret); | ||
2878 | else | ||
2879 | ret = NOTIFY_OK; | ||
2880 | return ret; | ||
2881 | } | ||
2882 | |||
2883 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
2884 | |||
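slab_memory_callback() above follows the usual memory-hotplug notifier shape: veto MEM_GOING_ONLINE by returning an errno wrapped with notifier_from_errno(), acknowledge everything else with NOTIFY_OK. A stripped-down registration sketch (editorial; the demo_* names and the priority value are placeholders):

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

static int demo_mem_callback(struct notifier_block *self,
			     unsigned long action, void *arg)
{
	int err = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		/* Allocate per-node state here; setting err aborts the online. */
		break;
	case MEM_GOING_OFFLINE:
		/* Drain per-node caches so the memory can really go away. */
		break;
	}

	return err ? notifier_from_errno(err) : NOTIFY_OK;
}

static int __init demo_hotplug_init(void)
{
	hotplug_memory_notifier(demo_mem_callback, 1);
	return 0;
}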
2885 | /******************************************************************** | ||
2886 | * Basic setup of slabs | ||
2887 | *******************************************************************/ | ||
2888 | |||
2889 | void __init kmem_cache_init(void) | ||
2890 | { | ||
2891 | int i; | ||
2892 | unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC; | ||
2893 | |||
2894 | /* | ||
2895 | * All the ifdefs are rather ugly here, but it's just the setup code, | ||
2896 | * so it doesn't have to be too readable :) | ||
2897 | */ | ||
2898 | |||
2899 | /* | ||
2900 | * No need to take slqb_lock here: there should be no concurrency | ||
2901 | * anyway, and spin_unlock_irq in rwsem code could enable interrupts | ||
2902 | * too early. | ||
2903 | */ | ||
2904 | kmem_cache_open(&kmem_cache_cache, "kmem_cache", | ||
2905 | sizeof(struct kmem_cache), 0, flags, NULL, 0); | ||
2906 | #ifdef CONFIG_SMP | ||
2907 | kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu", | ||
2908 | sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0); | ||
2909 | #endif | ||
2910 | #ifdef CONFIG_NUMA | ||
2911 | kmem_cache_open(&kmem_node_cache, "kmem_cache_node", | ||
2912 | sizeof(struct kmem_cache_node), 0, flags, NULL, 0); | ||
2913 | #endif | ||
2914 | |||
2915 | #ifdef CONFIG_SMP | ||
2916 | for_each_possible_cpu(i) { | ||
2917 | struct kmem_cache_cpu *c; | ||
2918 | |||
2919 | c = &per_cpu(kmem_cache_cpus, i); | ||
2920 | init_kmem_cache_cpu(&kmem_cache_cache, c); | ||
2921 | kmem_cache_cache.cpu_slab[i] = c; | ||
2922 | |||
2923 | c = &per_cpu(kmem_cpu_cpus, i); | ||
2924 | init_kmem_cache_cpu(&kmem_cpu_cache, c); | ||
2925 | kmem_cpu_cache.cpu_slab[i] = c; | ||
2926 | |||
2927 | #ifdef CONFIG_NUMA | ||
2928 | c = &per_cpu(kmem_node_cpus, i); | ||
2929 | init_kmem_cache_cpu(&kmem_node_cache, c); | ||
2930 | kmem_node_cache.cpu_slab[i] = c; | ||
2931 | #endif | ||
2932 | } | ||
2933 | #else | ||
2934 | init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab); | ||
2935 | #endif | ||
2936 | |||
2937 | #ifdef CONFIG_NUMA | ||
2938 | for_each_node_state(i, N_NORMAL_MEMORY) { | ||
2939 | struct kmem_cache_node *n; | ||
2940 | |||
2941 | n = &kmem_cache_nodes[i]; | ||
2942 | init_kmem_cache_node(&kmem_cache_cache, n); | ||
2943 | kmem_cache_cache.node_slab[i] = n; | ||
2944 | #ifdef CONFIG_SMP | ||
2945 | n = &kmem_cpu_nodes[i]; | ||
2946 | init_kmem_cache_node(&kmem_cpu_cache, n); | ||
2947 | kmem_cpu_cache.node_slab[i] = n; | ||
2948 | #endif | ||
2949 | n = &kmem_node_nodes[i]; | ||
2950 | init_kmem_cache_node(&kmem_node_cache, n); | ||
2951 | kmem_node_cache.node_slab[i] = n; | ||
2952 | } | ||
2953 | #endif | ||
2954 | |||
2955 | /* Caches that are not of the two-to-the-power-of size */ | ||
2956 | if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) { | ||
2957 | open_kmalloc_cache(&kmalloc_caches[1], | ||
2958 | "kmalloc-96", 96, GFP_KERNEL); | ||
2959 | #ifdef CONFIG_ZONE_DMA | ||
2960 | open_kmalloc_cache(&kmalloc_caches_dma[1], | ||
2961 | "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA); | ||
2962 | #endif | ||
2963 | } | ||
2964 | if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) { | ||
2965 | open_kmalloc_cache(&kmalloc_caches[2], | ||
2966 | "kmalloc-192", 192, GFP_KERNEL); | ||
2967 | #ifdef CONFIG_ZONE_DMA | ||
2968 | open_kmalloc_cache(&kmalloc_caches_dma[2], | ||
2969 | "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA); | ||
2970 | #endif | ||
2971 | } | ||
2972 | |||
2973 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) { | ||
2974 | open_kmalloc_cache(&kmalloc_caches[i], | ||
2975 | "kmalloc", 1 << i, GFP_KERNEL); | ||
2976 | #ifdef CONFIG_ZONE_DMA | ||
2977 | open_kmalloc_cache(&kmalloc_caches_dma[i], | ||
2978 | "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA); | ||
2979 | #endif | ||
2980 | } | ||
2981 | |||
2982 | /* | ||
2983 | * Patch up the size_index table if we have strange large alignment | ||
2984 | * requirements for the kmalloc array. This is only the case for | ||
2985 | * MIPS, it seems. The standard arches will not generate any code here. | ||
2986 | * | ||
2987 | * Largest permitted alignment is 256 bytes due to the way we | ||
2988 | * handle the index determination for the smaller caches. | ||
2989 | * | ||
2990 | * Make sure that nothing crazy happens if someone starts tinkering | ||
2991 | * around with ARCH_KMALLOC_MINALIGN | ||
2992 | */ | ||
2993 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || | ||
2994 | (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); | ||
2995 | |||
2996 | for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) | ||
2997 | size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; | ||
2998 | |||
2999 | /* Provide the correct kmalloc names now that the caches are up */ | ||
3000 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) { | ||
3001 | kmalloc_caches[i].name = | ||
3002 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | ||
3003 | #ifdef CONFIG_ZONE_DMA | ||
3004 | kmalloc_caches_dma[i].name = | ||
3005 | kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i); | ||
3006 | #endif | ||
3007 | } | ||
3008 | |||
3009 | #ifdef CONFIG_SMP | ||
3010 | register_cpu_notifier(&slab_notifier); | ||
3011 | #endif | ||
3012 | #ifdef CONFIG_NUMA | ||
3013 | hotplug_memory_notifier(slab_memory_callback, 1); | ||
3014 | #endif | ||
3015 | /* | ||
3016 | * smp_init() has not yet been called, so no worries about memory | ||
3017 | * ordering with __slab_is_available. | ||
3018 | */ | ||
3019 | __slab_is_available = 1; | ||
3020 | } | ||
3021 | |||
3022 | void __init kmem_cache_init_late(void) | ||
3023 | { | ||
3024 | } | ||
3025 | |||
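The size_index patching inside kmem_cache_init() only matters on architectures with a large ARCH_KMALLOC_MINALIGN. A worked illustration of the indexing it relies on (editorial sketch; the real lookup lives in get_slab(), the helper name is invented, and it assumes, as in SLUB, that KMALLOC_SHIFT_LOW selects the kmalloc-KMALLOC_MIN_SIZE cache):

/*
 * With KMALLOC_MIN_SIZE == 64 the loop above rewrites the small slots of
 * size_index, so requests that would otherwise land in kmalloc-8..32 are
 * redirected to the properly aligned kmalloc-64 cache.
 */
static unsigned int demo_small_kmalloc_index(size_t size)
{
	/* Only meaningful for the small, table-driven sizes. */
	return size_index[(size - 1) / 8];
}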
3026 | /* | ||
3027 | * Some basic slab creation sanity checks | ||
3028 | */ | ||
3029 | static int kmem_cache_create_ok(const char *name, size_t size, | ||
3030 | size_t align, unsigned long flags) | ||
3031 | { | ||
3032 | struct kmem_cache *tmp; | ||
3033 | |||
3034 | /* | ||
3035 | * Sanity checks... these are all serious usage bugs. | ||
3036 | */ | ||
3037 | if (!name || in_interrupt() || (size < sizeof(void *))) { | ||
3038 | printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n", | ||
3039 | name); | ||
3040 | dump_stack(); | ||
3041 | |||
3042 | return 0; | ||
3043 | } | ||
3044 | |||
3045 | list_for_each_entry(tmp, &slab_caches, list) { | ||
3046 | char x; | ||
3047 | int res; | ||
3048 | |||
3049 | /* | ||
3050 | * This happens when the module gets unloaded and doesn't | ||
3051 | * destroy its slab cache and no-one else reuses the vmalloc | ||
3052 | * area of the module. Print a warning. | ||
3053 | */ | ||
3054 | res = probe_kernel_address(tmp->name, x); | ||
3055 | if (res) { | ||
3056 | printk(KERN_ERR | ||
3057 | "SLAB: cache with size %d has lost its name\n", | ||
3058 | tmp->size); | ||
3059 | continue; | ||
3060 | } | ||
3061 | |||
3062 | if (!strcmp(tmp->name, name)) { | ||
3063 | printk(KERN_ERR | ||
3064 | "SLAB: duplicate cache %s\n", name); | ||
3065 | dump_stack(); | ||
3066 | |||
3067 | return 0; | ||
3068 | } | ||
3069 | } | ||
3070 | |||
3071 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
3072 | if (flags & SLAB_DESTROY_BY_RCU) | ||
3073 | WARN_ON(flags & SLAB_POISON); | ||
3074 | |||
3075 | return 1; | ||
3076 | } | ||
3077 | |||
3078 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
3079 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
3080 | { | ||
3081 | struct kmem_cache *s; | ||
3082 | |||
3083 | down_write(&slqb_lock); | ||
3084 | if (!kmem_cache_create_ok(name, size, align, flags)) | ||
3085 | goto err; | ||
3086 | |||
3087 | s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL); | ||
3088 | if (!s) | ||
3089 | goto err; | ||
3090 | |||
3091 | if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) { | ||
3092 | up_write(&slqb_lock); | ||
3093 | return s; | ||
3094 | } | ||
3095 | |||
3096 | kmem_cache_free(&kmem_cache_cache, s); | ||
3097 | |||
3098 | err: | ||
3099 | up_write(&slqb_lock); | ||
3100 | if (flags & SLAB_PANIC) | ||
3101 | panic("%s: failed to create slab `%s'\n", __func__, name); | ||
3102 | |||
3103 | return NULL; | ||
3104 | } | ||
3105 | EXPORT_SYMBOL(kmem_cache_create); | ||
3106 | |||
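A minimal consumer of the API above (editorial sketch; struct demo_obj and the demo_* names are invented). Duplicate names and SLAB_PANIC failures are handled inside kmem_cache_create() itself, so callers only need the NULL check:

#include <linux/init.h>
#include <linux/slab.h>

struct demo_obj {
	struct list_head list;
	int value;
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	demo_cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
					0, SLAB_HWCACHE_ALIGN, NULL);
	if (!demo_cachep)
		return -ENOMEM;
	return 0;
}

static void demo_cache_use(void)
{
	struct demo_obj *obj = kmem_cache_alloc(demo_cachep, GFP_KERNEL);

	if (!obj)
		return;
	obj->value = 42;
	kmem_cache_free(demo_cachep, obj);
}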
3107 | #ifdef CONFIG_SMP | ||
3108 | /* | ||
3109 | * Use the cpu notifier to ensure that the cpu slabs are flushed when | ||
3110 | * necessary. | ||
3111 | */ | ||
3112 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | ||
3113 | unsigned long action, void *hcpu) | ||
3114 | { | ||
3115 | long cpu = (long)hcpu; | ||
3116 | struct kmem_cache *s; | ||
3117 | |||
3118 | switch (action) { | ||
3119 | case CPU_UP_PREPARE: | ||
3120 | case CPU_UP_PREPARE_FROZEN: | ||
3121 | down_write(&slqb_lock); | ||
3122 | list_for_each_entry(s, &slab_caches, list) { | ||
3123 | if (s->cpu_slab[cpu]) /* could be leftover from last online */ | ||
3124 | continue; | ||
3125 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu); | ||
3126 | if (!s->cpu_slab[cpu]) { | ||
3127 | up_write(&slqb_lock); | ||
3128 | return NOTIFY_BAD; | ||
3129 | } | ||
3130 | } | ||
3131 | up_write(&slqb_lock); | ||
3132 | break; | ||
3133 | |||
3134 | case CPU_ONLINE: | ||
3135 | case CPU_ONLINE_FROZEN: | ||
3136 | case CPU_DOWN_FAILED: | ||
3137 | case CPU_DOWN_FAILED_FROZEN: | ||
3138 | start_cpu_timer(cpu); | ||
3139 | break; | ||
3140 | |||
3141 | case CPU_DOWN_PREPARE: | ||
3142 | case CPU_DOWN_PREPARE_FROZEN: | ||
3143 | cancel_delayed_work_sync(&per_cpu(slqb_cache_trim_work, | ||
3144 | cpu)); | ||
3145 | per_cpu(slqb_cache_trim_work, cpu).work.func = NULL; | ||
3146 | break; | ||
3147 | |||
3148 | case CPU_UP_CANCELED: | ||
3149 | case CPU_UP_CANCELED_FROZEN: | ||
3150 | case CPU_DEAD: | ||
3151 | case CPU_DEAD_FROZEN: | ||
3152 | /* | ||
3153 | * XXX: Freeing here doesn't work because objects can still be | ||
3154 | * on this CPU's list. The periodic timer needs to check if a CPU | ||
3155 | * is offline and then try to clean up from there. Same for node | ||
3156 | * offline. | ||
3157 | */ | ||
3158 | default: | ||
3159 | break; | ||
3160 | } | ||
3161 | return NOTIFY_OK; | ||
3162 | } | ||
3163 | |||
3164 | static struct notifier_block __cpuinitdata slab_notifier = { | ||
3165 | .notifier_call = slab_cpuup_callback | ||
3166 | }; | ||
3167 | |||
3168 | #endif | ||
3169 | |||
3170 | #ifdef CONFIG_SLQB_DEBUG | ||
3171 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | ||
3172 | { | ||
3173 | struct kmem_cache *s; | ||
3174 | int node = -1; | ||
3175 | |||
3176 | s = get_slab(size, flags); | ||
3177 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
3178 | return s; | ||
3179 | |||
3180 | #ifdef CONFIG_NUMA | ||
3181 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) | ||
3182 | node = alternate_nid(s, flags, node); | ||
3183 | #endif | ||
3184 | return slab_alloc(s, flags, node, caller); | ||
3185 | } | ||
3186 | |||
3187 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, | ||
3188 | unsigned long caller) | ||
3189 | { | ||
3190 | struct kmem_cache *s; | ||
3191 | |||
3192 | s = get_slab(size, flags); | ||
3193 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
3194 | return s; | ||
3195 | |||
3196 | return slab_alloc(s, flags, node, caller); | ||
3197 | } | ||
3198 | #endif | ||
3199 | |||
3200 | #if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO) | ||
3201 | struct stats_gather { | ||
3202 | struct kmem_cache *s; | ||
3203 | spinlock_t lock; | ||
3204 | unsigned long nr_slabs; | ||
3205 | unsigned long nr_partial; | ||
3206 | unsigned long nr_inuse; | ||
3207 | unsigned long nr_objects; | ||
3208 | |||
3209 | #ifdef CONFIG_SLQB_STATS | ||
3210 | unsigned long stats[NR_SLQB_STAT_ITEMS]; | ||
3211 | #endif | ||
3212 | }; | ||
3213 | |||
3214 | static void __gather_stats(void *arg) | ||
3215 | { | ||
3216 | unsigned long nr_slabs; | ||
3217 | unsigned long nr_partial; | ||
3218 | unsigned long nr_inuse; | ||
3219 | struct stats_gather *gather = arg; | ||
3220 | int cpu = smp_processor_id(); | ||
3221 | struct kmem_cache *s = gather->s; | ||
3222 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3223 | struct kmem_cache_list *l = &c->list; | ||
3224 | struct slqb_page *page; | ||
3225 | #ifdef CONFIG_SLQB_STATS | ||
3226 | int i; | ||
3227 | #endif | ||
3228 | |||
3229 | spin_lock(&l->page_lock); | ||
3230 | nr_slabs = l->nr_slabs; | ||
3231 | nr_partial = l->nr_partial; | ||
3232 | nr_inuse = (nr_slabs - nr_partial) * s->objects; | ||
3233 | |||
3234 | list_for_each_entry(page, &l->partial, lru) { | ||
3235 | nr_inuse += page->inuse; | ||
3236 | } | ||
3237 | spin_unlock(&l->page_lock); | ||
3238 | |||
3239 | spin_lock(&gather->lock); | ||
3240 | gather->nr_slabs += nr_slabs; | ||
3241 | gather->nr_partial += nr_partial; | ||
3242 | gather->nr_inuse += nr_inuse; | ||
3243 | #ifdef CONFIG_SLQB_STATS | ||
3244 | for (i = 0; i < NR_SLQB_STAT_ITEMS; i++) | ||
3245 | gather->stats[i] += l->stats[i]; | ||
3246 | #endif | ||
3247 | spin_unlock(&gather->lock); | ||
3248 | } | ||
3249 | |||
3250 | /* must be called with slqb_lock held */ | ||
3251 | static void gather_stats_locked(struct kmem_cache *s, | ||
3252 | struct stats_gather *stats) | ||
3253 | { | ||
3254 | #ifdef CONFIG_NUMA | ||
3255 | int node; | ||
3256 | #endif | ||
3257 | |||
3258 | memset(stats, 0, sizeof(struct stats_gather)); | ||
3259 | stats->s = s; | ||
3260 | spin_lock_init(&stats->lock); | ||
3261 | |||
3262 | on_each_cpu(__gather_stats, stats, 1); | ||
3263 | |||
3264 | #ifdef CONFIG_NUMA | ||
3265 | for_each_online_node(node) { | ||
3266 | struct kmem_cache_node *n = s->node_slab[node]; | ||
3267 | struct kmem_cache_list *l = &n->list; | ||
3268 | struct slqb_page *page; | ||
3269 | unsigned long flags; | ||
3270 | #ifdef CONFIG_SLQB_STATS | ||
3271 | int i; | ||
3272 | #endif | ||
3273 | |||
3274 | spin_lock_irqsave(&n->list_lock, flags); | ||
3275 | #ifdef CONFIG_SLQB_STATS | ||
3276 | for (i = 0; i < NR_SLQB_STAT_ITEMS; i++) | ||
3277 | stats->stats[i] += l->stats[i]; | ||
3278 | #endif | ||
3279 | stats->nr_slabs += l->nr_slabs; | ||
3280 | stats->nr_partial += l->nr_partial; | ||
3281 | stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects; | ||
3282 | |||
3283 | list_for_each_entry(page, &l->partial, lru) { | ||
3284 | stats->nr_inuse += page->inuse; | ||
3285 | } | ||
3286 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
3287 | } | ||
3288 | #endif | ||
3289 | |||
3290 | stats->nr_objects = stats->nr_slabs * s->objects; | ||
3291 | } | ||
3292 | |||
3293 | #ifdef CONFIG_SLQB_SYSFS | ||
3294 | static void gather_stats(struct kmem_cache *s, struct stats_gather *stats) | ||
3295 | { | ||
3296 | down_read(&slqb_lock); /* hold off hotplug */ | ||
3297 | gather_stats_locked(s, stats); | ||
3298 | up_read(&slqb_lock); | ||
3299 | } | ||
3300 | #endif | ||
3301 | #endif | ||
3302 | |||
3303 | /* | ||
3304 | * The /proc/slabinfo ABI | ||
3305 | */ | ||
3306 | #ifdef CONFIG_SLABINFO | ||
3307 | #include <linux/proc_fs.h> | ||
3308 | ssize_t slabinfo_write(struct file *file, const char __user * buffer, | ||
3309 | size_t count, loff_t *ppos) | ||
3310 | { | ||
3311 | return -EINVAL; | ||
3312 | } | ||
3313 | |||
3314 | static void print_slabinfo_header(struct seq_file *m) | ||
3315 | { | ||
3316 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
3317 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | ||
3318 | "<objperslab> <pagesperslab>"); | ||
3319 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
3320 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
3321 | seq_putc(m, '\n'); | ||
3322 | } | ||
3323 | |||
3324 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
3325 | { | ||
3326 | loff_t n = *pos; | ||
3327 | |||
3328 | down_read(&slqb_lock); | ||
3329 | if (!n) | ||
3330 | print_slabinfo_header(m); | ||
3331 | |||
3332 | return seq_list_start(&slab_caches, *pos); | ||
3333 | } | ||
3334 | |||
3335 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
3336 | { | ||
3337 | return seq_list_next(p, &slab_caches, pos); | ||
3338 | } | ||
3339 | |||
3340 | static void s_stop(struct seq_file *m, void *p) | ||
3341 | { | ||
3342 | up_read(&slqb_lock); | ||
3343 | } | ||
3344 | |||
3345 | static int s_show(struct seq_file *m, void *p) | ||
3346 | { | ||
3347 | struct stats_gather stats; | ||
3348 | struct kmem_cache *s; | ||
3349 | |||
3350 | s = list_entry(p, struct kmem_cache, list); | ||
3351 | |||
3352 | gather_stats_locked(s, &stats); | ||
3353 | |||
3354 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse, | ||
3355 | stats.nr_objects, s->size, s->objects, (1 << s->order)); | ||
3356 | seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s), | ||
3357 | slab_freebatch(s), 0); | ||
3358 | seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs, | ||
3359 | stats.nr_slabs, 0UL); | ||
3360 | seq_putc(m, '\n'); | ||
3361 | return 0; | ||
3362 | } | ||
3363 | |||
3364 | static const struct seq_operations slabinfo_op = { | ||
3365 | .start = s_start, | ||
3366 | .next = s_next, | ||
3367 | .stop = s_stop, | ||
3368 | .show = s_show, | ||
3369 | }; | ||
3370 | |||
3371 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
3372 | { | ||
3373 | return seq_open(file, &slabinfo_op); | ||
3374 | } | ||
3375 | |||
3376 | static const struct file_operations proc_slabinfo_operations = { | ||
3377 | .open = slabinfo_open, | ||
3378 | .read = seq_read, | ||
3379 | .llseek = seq_lseek, | ||
3380 | .release = seq_release, | ||
3381 | }; | ||
3382 | |||
3383 | static int __init slab_proc_init(void) | ||
3384 | { | ||
3385 | proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL, | ||
3386 | &proc_slabinfo_operations); | ||
3387 | return 0; | ||
3388 | } | ||
3389 | module_init(slab_proc_init); | ||
3390 | #endif /* CONFIG_SLABINFO */ | ||
3391 | |||
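s_show() above emits one line per cache in the slabinfo 2.1 format. A small userspace reader of that format (editorial sketch, plain C, not kernel code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512], name[64];
	unsigned long active, total;
	unsigned int objsize;
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		/* Skip the version banner and the '#' column header. */
		if (line[0] == '#' || !strncmp(line, "slabinfo", 8))
			continue;
		if (sscanf(line, "%63s %lu %lu %u", name, &active, &total, &objsize) == 4)
			printf("%-20s %8lu / %8lu objects, %u bytes each\n",
			       name, active, total, objsize);
	}

	fclose(f);
	return 0;
}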
3392 | #ifdef CONFIG_SLQB_SYSFS | ||
3393 | /* | ||
3394 | * sysfs API | ||
3395 | */ | ||
3396 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) | ||
3397 | #define to_slab(n) container_of(n, struct kmem_cache, kobj) | ||
3398 | |||
3399 | struct slab_attribute { | ||
3400 | struct attribute attr; | ||
3401 | ssize_t (*show)(struct kmem_cache *s, char *buf); | ||
3402 | ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); | ||
3403 | }; | ||
3404 | |||
3405 | #define SLAB_ATTR_RO(_name) \ | ||
3406 | static struct slab_attribute _name##_attr = __ATTR_RO(_name) | ||
3407 | |||
3408 | #define SLAB_ATTR(_name) \ | ||
3409 | static struct slab_attribute _name##_attr = \ | ||
3410 | __ATTR(_name, 0644, _name##_show, _name##_store) | ||
3411 | |||
3412 | static ssize_t slab_size_show(struct kmem_cache *s, char *buf) | ||
3413 | { | ||
3414 | return sprintf(buf, "%d\n", s->size); | ||
3415 | } | ||
3416 | SLAB_ATTR_RO(slab_size); | ||
3417 | |||
3418 | static ssize_t align_show(struct kmem_cache *s, char *buf) | ||
3419 | { | ||
3420 | return sprintf(buf, "%d\n", s->align); | ||
3421 | } | ||
3422 | SLAB_ATTR_RO(align); | ||
3423 | |||
3424 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) | ||
3425 | { | ||
3426 | return sprintf(buf, "%d\n", s->objsize); | ||
3427 | } | ||
3428 | SLAB_ATTR_RO(object_size); | ||
3429 | |||
3430 | static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) | ||
3431 | { | ||
3432 | return sprintf(buf, "%d\n", s->objects); | ||
3433 | } | ||
3434 | SLAB_ATTR_RO(objs_per_slab); | ||
3435 | |||
3436 | static ssize_t order_show(struct kmem_cache *s, char *buf) | ||
3437 | { | ||
3438 | return sprintf(buf, "%d\n", s->order); | ||
3439 | } | ||
3440 | SLAB_ATTR_RO(order); | ||
3441 | |||
3442 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | ||
3443 | { | ||
3444 | if (s->ctor) { | ||
3445 | int n = sprint_symbol(buf, (unsigned long)s->ctor); | ||
3446 | |||
3447 | return n + sprintf(buf + n, "\n"); | ||
3448 | } | ||
3449 | return 0; | ||
3450 | } | ||
3451 | SLAB_ATTR_RO(ctor); | ||
3452 | |||
3453 | static ssize_t slabs_show(struct kmem_cache *s, char *buf) | ||
3454 | { | ||
3455 | struct stats_gather stats; | ||
3456 | |||
3457 | gather_stats(s, &stats); | ||
3458 | |||
3459 | return sprintf(buf, "%lu\n", stats.nr_slabs); | ||
3460 | } | ||
3461 | SLAB_ATTR_RO(slabs); | ||
3462 | |||
3463 | static ssize_t objects_show(struct kmem_cache *s, char *buf) | ||
3464 | { | ||
3465 | struct stats_gather stats; | ||
3466 | |||
3467 | gather_stats(s, &stats); | ||
3468 | |||
3469 | return sprintf(buf, "%lu\n", stats.nr_inuse); | ||
3470 | } | ||
3471 | SLAB_ATTR_RO(objects); | ||
3472 | |||
3473 | static ssize_t total_objects_show(struct kmem_cache *s, char *buf) | ||
3474 | { | ||
3475 | struct stats_gather stats; | ||
3476 | |||
3477 | gather_stats(s, &stats); | ||
3478 | |||
3479 | return sprintf(buf, "%lu\n", stats.nr_objects); | ||
3480 | } | ||
3481 | SLAB_ATTR_RO(total_objects); | ||
3482 | |||
3483 | #ifdef CONFIG_FAILSLAB | ||
3484 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
3485 | { | ||
3486 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
3487 | } | ||
3488 | |||
3489 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
3490 | size_t length) | ||
3491 | { | ||
3492 | s->flags &= ~SLAB_FAILSLAB; | ||
3493 | if (buf[0] == '1') | ||
3494 | s->flags |= SLAB_FAILSLAB; | ||
3495 | return length; | ||
3496 | } | ||
3497 | SLAB_ATTR(failslab); | ||
3498 | #endif | ||
3499 | |||
3500 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | ||
3501 | { | ||
3502 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | ||
3503 | } | ||
3504 | SLAB_ATTR_RO(reclaim_account); | ||
3505 | |||
3506 | static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) | ||
3507 | { | ||
3508 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); | ||
3509 | } | ||
3510 | SLAB_ATTR_RO(hwcache_align); | ||
3511 | |||
3512 | #ifdef CONFIG_ZONE_DMA | ||
3513 | static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) | ||
3514 | { | ||
3515 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); | ||
3516 | } | ||
3517 | SLAB_ATTR_RO(cache_dma); | ||
3518 | #endif | ||
3519 | |||
3520 | static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) | ||
3521 | { | ||
3522 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); | ||
3523 | } | ||
3524 | SLAB_ATTR_RO(destroy_by_rcu); | ||
3525 | |||
3526 | static ssize_t red_zone_show(struct kmem_cache *s, char *buf) | ||
3527 | { | ||
3528 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); | ||
3529 | } | ||
3530 | SLAB_ATTR_RO(red_zone); | ||
3531 | |||
3532 | static ssize_t poison_show(struct kmem_cache *s, char *buf) | ||
3533 | { | ||
3534 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); | ||
3535 | } | ||
3536 | SLAB_ATTR_RO(poison); | ||
3537 | |||
3538 | static ssize_t store_user_show(struct kmem_cache *s, char *buf) | ||
3539 | { | ||
3540 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); | ||
3541 | } | ||
3542 | SLAB_ATTR_RO(store_user); | ||
3543 | |||
3544 | static ssize_t hiwater_store(struct kmem_cache *s, | ||
3545 | const char *buf, size_t length) | ||
3546 | { | ||
3547 | long hiwater; | ||
3548 | int err; | ||
3549 | |||
3550 | err = strict_strtol(buf, 10, &hiwater); | ||
3551 | if (err) | ||
3552 | return err; | ||
3553 | |||
3554 | if (hiwater < 0) | ||
3555 | return -EINVAL; | ||
3556 | |||
3557 | s->hiwater = hiwater; | ||
3558 | |||
3559 | return length; | ||
3560 | } | ||
3561 | |||
3562 | static ssize_t hiwater_show(struct kmem_cache *s, char *buf) | ||
3563 | { | ||
3564 | return sprintf(buf, "%d\n", slab_hiwater(s)); | ||
3565 | } | ||
3566 | SLAB_ATTR(hiwater); | ||
3567 | |||
3568 | static ssize_t freebatch_store(struct kmem_cache *s, | ||
3569 | const char *buf, size_t length) | ||
3570 | { | ||
3571 | long freebatch; | ||
3572 | int err; | ||
3573 | |||
3574 | err = strict_strtol(buf, 10, &freebatch); | ||
3575 | if (err) | ||
3576 | return err; | ||
3577 | |||
3578 | if (freebatch <= 0 || freebatch - 1 > s->hiwater) | ||
3579 | return -EINVAL; | ||
3580 | |||
3581 | s->freebatch = freebatch; | ||
3582 | |||
3583 | return length; | ||
3584 | } | ||
3585 | |||
3586 | static ssize_t freebatch_show(struct kmem_cache *s, char *buf) | ||
3587 | { | ||
3588 | return sprintf(buf, "%d\n", slab_freebatch(s)); | ||
3589 | } | ||
3590 | SLAB_ATTR(freebatch); | ||
3591 | |||
3592 | #ifdef CONFIG_SLQB_STATS | ||
3593 | static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | ||
3594 | { | ||
3595 | struct stats_gather stats; | ||
3596 | int len; | ||
3597 | #ifdef CONFIG_SMP | ||
3598 | int cpu; | ||
3599 | #endif | ||
3600 | |||
3601 | gather_stats(s, &stats); | ||
3602 | |||
3603 | len = sprintf(buf, "%lu", stats.stats[si]); | ||
3604 | |||
3605 | #ifdef CONFIG_SMP | ||
3606 | for_each_online_cpu(cpu) { | ||
3607 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3608 | struct kmem_cache_list *l = &c->list; | ||
3609 | |||
3610 | if (len < PAGE_SIZE - 20) | ||
3611 | len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]); | ||
3612 | } | ||
3613 | #endif | ||
3614 | return len + sprintf(buf + len, "\n"); | ||
3615 | } | ||
3616 | |||
3617 | #define STAT_ATTR(si, text) \ | ||
3618 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ | ||
3619 | { \ | ||
3620 | return show_stat(s, buf, si); \ | ||
3621 | } \ | ||
3622 | SLAB_ATTR_RO(text); \ | ||
3623 | |||
3624 | STAT_ATTR(ALLOC, alloc); | ||
3625 | STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill); | ||
3626 | STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new); | ||
3627 | STAT_ATTR(FREE, free); | ||
3628 | STAT_ATTR(FREE_REMOTE, free_remote); | ||
3629 | STAT_ATTR(FLUSH_FREE_LIST, flush_free_list); | ||
3630 | STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects); | ||
3631 | STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote); | ||
3632 | STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial); | ||
3633 | STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free); | ||
3634 | STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list); | ||
3635 | STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects); | ||
3636 | STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list); | ||
3637 | STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects); | ||
3638 | #endif | ||
3639 | |||
3640 | static struct attribute *slab_attrs[] = { | ||
3641 | &slab_size_attr.attr, | ||
3642 | &object_size_attr.attr, | ||
3643 | &objs_per_slab_attr.attr, | ||
3644 | &order_attr.attr, | ||
3645 | &objects_attr.attr, | ||
3646 | &total_objects_attr.attr, | ||
3647 | &slabs_attr.attr, | ||
3648 | &ctor_attr.attr, | ||
3649 | &align_attr.attr, | ||
3650 | &hwcache_align_attr.attr, | ||
3651 | &reclaim_account_attr.attr, | ||
3652 | &destroy_by_rcu_attr.attr, | ||
3653 | &red_zone_attr.attr, | ||
3654 | &poison_attr.attr, | ||
3655 | &store_user_attr.attr, | ||
3656 | &hiwater_attr.attr, | ||
3657 | &freebatch_attr.attr, | ||
3658 | #ifdef CONFIG_ZONE_DMA | ||
3659 | &cache_dma_attr.attr, | ||
3660 | #endif | ||
3661 | #ifdef CONFIG_SLQB_STATS | ||
3662 | &alloc_attr.attr, | ||
3663 | &alloc_slab_fill_attr.attr, | ||
3664 | &alloc_slab_new_attr.attr, | ||
3665 | &free_attr.attr, | ||
3666 | &free_remote_attr.attr, | ||
3667 | &flush_free_list_attr.attr, | ||
3668 | &flush_free_list_objects_attr.attr, | ||
3669 | &flush_free_list_remote_attr.attr, | ||
3670 | &flush_slab_partial_attr.attr, | ||
3671 | &flush_slab_free_attr.attr, | ||
3672 | &flush_rfree_list_attr.attr, | ||
3673 | &flush_rfree_list_objects_attr.attr, | ||
3674 | &claim_remote_list_attr.attr, | ||
3675 | &claim_remote_list_objects_attr.attr, | ||
3676 | #endif | ||
3677 | #ifdef CONFIG_FAILSLAB | ||
3678 | &failslab_attr.attr, | ||
3679 | #endif | ||
3680 | |||
3681 | NULL | ||
3682 | }; | ||
3683 | |||
3684 | static struct attribute_group slab_attr_group = { | ||
3685 | .attrs = slab_attrs, | ||
3686 | }; | ||
3687 | |||
3688 | static ssize_t slab_attr_show(struct kobject *kobj, | ||
3689 | struct attribute *attr, char *buf) | ||
3690 | { | ||
3691 | struct slab_attribute *attribute; | ||
3692 | struct kmem_cache *s; | ||
3693 | int err; | ||
3694 | |||
3695 | attribute = to_slab_attr(attr); | ||
3696 | s = to_slab(kobj); | ||
3697 | |||
3698 | if (!attribute->show) | ||
3699 | return -EIO; | ||
3700 | |||
3701 | err = attribute->show(s, buf); | ||
3702 | |||
3703 | return err; | ||
3704 | } | ||
3705 | |||
3706 | static ssize_t slab_attr_store(struct kobject *kobj, | ||
3707 | struct attribute *attr, const char *buf, size_t len) | ||
3708 | { | ||
3709 | struct slab_attribute *attribute; | ||
3710 | struct kmem_cache *s; | ||
3711 | int err; | ||
3712 | |||
3713 | attribute = to_slab_attr(attr); | ||
3714 | s = to_slab(kobj); | ||
3715 | |||
3716 | if (!attribute->store) | ||
3717 | return -EIO; | ||
3718 | |||
3719 | err = attribute->store(s, buf, len); | ||
3720 | |||
3721 | return err; | ||
3722 | } | ||
3723 | |||
3724 | static void kmem_cache_release(struct kobject *kobj) | ||
3725 | { | ||
3726 | struct kmem_cache *s = to_slab(kobj); | ||
3727 | |||
3728 | kmem_cache_free(&kmem_cache_cache, s); | ||
3729 | } | ||
3730 | |||
3731 | static struct sysfs_ops slab_sysfs_ops = { | ||
3732 | .show = slab_attr_show, | ||
3733 | .store = slab_attr_store, | ||
3734 | }; | ||
3735 | |||
3736 | static struct kobj_type slab_ktype = { | ||
3737 | .sysfs_ops = &slab_sysfs_ops, | ||
3738 | .release = kmem_cache_release | ||
3739 | }; | ||
3740 | |||
3741 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | ||
3742 | { | ||
3743 | struct kobj_type *ktype = get_ktype(kobj); | ||
3744 | |||
3745 | if (ktype == &slab_ktype) | ||
3746 | return 1; | ||
3747 | return 0; | ||
3748 | } | ||
3749 | |||
3750 | static struct kset_uevent_ops slab_uevent_ops = { | ||
3751 | .filter = uevent_filter, | ||
3752 | }; | ||
3753 | |||
3754 | static struct kset *slab_kset; | ||
3755 | |||
3756 | static int sysfs_available __read_mostly; | ||
3757 | |||
3758 | static int sysfs_slab_add(struct kmem_cache *s) | ||
3759 | { | ||
3760 | int err; | ||
3761 | |||
3762 | if (!sysfs_available) | ||
3763 | return 0; | ||
3764 | |||
3765 | s->kobj.kset = slab_kset; | ||
3766 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name); | ||
3767 | if (err) { | ||
3768 | kobject_put(&s->kobj); | ||
3769 | return err; | ||
3770 | } | ||
3771 | |||
3772 | err = sysfs_create_group(&s->kobj, &slab_attr_group); | ||
3773 | if (err) | ||
3774 | return err; | ||
3775 | |||
3776 | kobject_uevent(&s->kobj, KOBJ_ADD); | ||
3777 | |||
3778 | return 0; | ||
3779 | } | ||
3780 | |||
3781 | static void sysfs_slab_remove(struct kmem_cache *s) | ||
3782 | { | ||
3783 | kobject_uevent(&s->kobj, KOBJ_REMOVE); | ||
3784 | kobject_del(&s->kobj); | ||
3785 | kobject_put(&s->kobj); | ||
3786 | } | ||
3787 | |||
3788 | static int __init slab_sysfs_init(void) | ||
3789 | { | ||
3790 | struct kmem_cache *s; | ||
3791 | int err; | ||
3792 | |||
3793 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | ||
3794 | if (!slab_kset) { | ||
3795 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | ||
3796 | return -ENOSYS; | ||
3797 | } | ||
3798 | |||
3799 | down_write(&slqb_lock); | ||
3800 | |||
3801 | sysfs_available = 1; | ||
3802 | |||
3803 | list_for_each_entry(s, &slab_caches, list) { | ||
3804 | err = sysfs_slab_add(s); | ||
3805 | if (err) | ||
3806 | printk(KERN_ERR "SLQB: Unable to add boot slab %s" | ||
3807 | " to sysfs\n", s->name); | ||
3808 | } | ||
3809 | |||
3810 | up_write(&slqb_lock); | ||
3811 | |||
3812 | return 0; | ||
3813 | } | ||
3814 | device_initcall(slab_sysfs_init); | ||
3815 | |||
3816 | #endif | ||
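With CONFIG_SLQB_SYSFS the kset registered above exposes each cache under /sys/kernel/slab/<name>/, and hiwater/freebatch are the only writable attributes. A userspace sketch that sets the hiwater of one of the kmalloc caches (editorial; the cache name and value are arbitrary examples):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/slab/kmalloc-128/hiwater", "w");

	if (!f)
		return 1;

	/* hiwater_store() parses a plain decimal and rejects negative values. */
	fprintf(f, "256\n");

	fclose(f);
	return 0;
}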
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1457 | struct zone *zone; | 1457 | struct zone *zone; |
1458 | enum zone_type high_zoneidx = gfp_zone(flags); | 1458 | enum zone_type high_zoneidx = gfp_zone(flags); |
1459 | struct page *page; | 1459 | struct page *page; |
1460 | unsigned int cpuset_mems_cookie; | ||
1460 | 1461 | ||
1461 | /* | 1462 | /* |
1462 | * The defrag ratio allows a configuration of the tradeoffs between | 1463 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1480 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1481 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1481 | return NULL; | 1482 | return NULL; |
1482 | 1483 | ||
1483 | get_mems_allowed(); | 1484 | do { |
1484 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1485 | cpuset_mems_cookie = get_mems_allowed(); |
1485 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1486 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1486 | struct kmem_cache_node *n; | 1487 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1487 | 1488 | struct kmem_cache_node *n; | |
1488 | n = get_node(s, zone_to_nid(zone)); | 1489 | |
1489 | 1490 | n = get_node(s, zone_to_nid(zone)); | |
1490 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1491 | |
1491 | n->nr_partial > s->min_partial) { | 1492 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1492 | page = get_partial_node(n); | 1493 | n->nr_partial > s->min_partial) { |
1493 | if (page) { | 1494 | page = get_partial_node(n); |
1494 | put_mems_allowed(); | 1495 | if (page) { |
1495 | return page; | 1496 | /* |
1497 | * Return the object even if | ||
1498 | * put_mems_allowed indicated that | ||
1499 | * the cpuset mems_allowed was | ||
1500 | * updated in parallel. It's a | ||
1501 | * harmless race between the alloc | ||
1502 | * and the cpuset update. | ||
1503 | */ | ||
1504 | put_mems_allowed(cpuset_mems_cookie); | ||
1505 | return page; | ||
1506 | } | ||
1496 | } | 1507 | } |
1497 | } | 1508 | } |
1498 | } | 1509 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
1499 | put_mems_allowed(); | ||
1500 | #endif | 1510 | #endif |
1501 | return NULL; | 1511 | return NULL; |
1502 | } | 1512 | } |
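The slub.c hunk above converts get_any_partial() to the seqcount-style cpuset API: sample a cookie, walk the zonelist, and retry when put_mems_allowed() reports that mems_allowed changed during the walk. The bare pattern looks roughly like this (editorial sketch; the demo_* names are invented):

#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *demo_alloc_in_allowed_nodes(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page = NULL;

	do {
		cpuset_mems_cookie = get_mems_allowed();
		/* ... walk the zonelist here and try to set 'page' ... */
	} while (!page && !put_mems_allowed(cpuset_mems_cookie));

	return page;
}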
@@ -1818,6 +1828,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1818 | if (unlikely(!node_match(c, node))) | 1828 | if (unlikely(!node_match(c, node))) |
1819 | goto another_slab; | 1829 | goto another_slab; |
1820 | 1830 | ||
1831 | /* must check again c->freelist in case of cpu migration or IRQ */ | ||
1832 | object = c->freelist; | ||
1833 | if (object) | ||
1834 | goto update_freelist; | ||
1835 | |||
1821 | stat(s, ALLOC_REFILL); | 1836 | stat(s, ALLOC_REFILL); |
1822 | 1837 | ||
1823 | load_freelist: | 1838 | load_freelist: |
@@ -1827,6 +1842,7 @@ load_freelist: | |||
1827 | if (kmem_cache_debug(s)) | 1842 | if (kmem_cache_debug(s)) |
1828 | goto debug; | 1843 | goto debug; |
1829 | 1844 | ||
1845 | update_freelist: | ||
1830 | c->freelist = get_freepointer(s, object); | 1846 | c->freelist = get_freepointer(s, object); |
1831 | page->inuse = page->objects; | 1847 | page->inuse = page->objects; |
1832 | page->freelist = NULL; | 1848 | page->freelist = NULL; |
@@ -2163,7 +2179,7 @@ EXPORT_SYMBOL(kmem_cache_free); | |||
2163 | * take the list_lock. | 2179 | * take the list_lock. |
2164 | */ | 2180 | */ |
2165 | static int slub_min_order; | 2181 | static int slub_min_order; |
2166 | static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; | 2182 | static int slub_max_order; |
2167 | static int slub_min_objects; | 2183 | static int slub_min_objects; |
2168 | 2184 | ||
2169 | /* | 2185 | /* |
@@ -3433,13 +3449,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3433 | if (kmem_cache_open(s, n, | 3449 | if (kmem_cache_open(s, n, |
3434 | size, align, flags, ctor)) { | 3450 | size, align, flags, ctor)) { |
3435 | list_add(&s->list, &slab_caches); | 3451 | list_add(&s->list, &slab_caches); |
3452 | up_write(&slub_lock); | ||
3436 | if (sysfs_slab_add(s)) { | 3453 | if (sysfs_slab_add(s)) { |
3454 | down_write(&slub_lock); | ||
3437 | list_del(&s->list); | 3455 | list_del(&s->list); |
3438 | kfree(n); | 3456 | kfree(n); |
3439 | kfree(s); | 3457 | kfree(s); |
3440 | goto err; | 3458 | goto err; |
3441 | } | 3459 | } |
3442 | up_write(&slub_lock); | ||
3443 | return s; | 3460 | return s; |
3444 | } | 3461 | } |
3445 | kfree(n); | 3462 | kfree(n); |
diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a..4cd05e5f2f4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
353 | 353 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 355 | usemap_count); |
356 | if (usemap) { | 356 | if (!usemap) { |
357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
358 | if (!present_section_nr(pnum)) | 358 | if (!usemap) { |
359 | continue; | 359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
360 | usemap_map[pnum] = usemap; | 360 | return; |
361 | usemap += size; | ||
362 | } | 361 | } |
363 | return; | ||
364 | } | 362 | } |
365 | 363 | ||
366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
367 | if (usemap) { | 365 | if (!present_section_nr(pnum)) |
368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 366 | continue; |
369 | if (!present_section_nr(pnum)) | 367 | usemap_map[pnum] = usemap; |
370 | continue; | 368 | usemap += size; |
371 | usemap_map[pnum] = usemap; | 369 | check_usemap_section_nr(nodeid, usemap_map[pnum]); |
372 | usemap += size; | ||
373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
374 | } | ||
375 | return; | ||
376 | } | 370 | } |
377 | |||
378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
379 | } | 371 | } |
380 | 372 | ||
381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 373 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) | |||
78 | { | 78 | { |
79 | if (unlikely(PageTail(page))) { | 79 | if (unlikely(PageTail(page))) { |
80 | /* __split_huge_page_refcount can run under us */ | 80 | /* __split_huge_page_refcount can run under us */ |
81 | struct page *page_head = page->first_page; | 81 | struct page *page_head = compound_trans_head(page); |
82 | smp_rmb(); | 82 | |
83 | /* | 83 | if (likely(page != page_head && |
84 | * If PageTail is still set after smp_rmb() we can be sure | 84 | get_page_unless_zero(page_head))) { |
85 | * that the page->first_page we read wasn't a dangling pointer. | ||
86 | * See __split_huge_page_refcount() smp_wmb(). | ||
87 | */ | ||
88 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
89 | unsigned long flags; | 85 | unsigned long flags; |
90 | /* | 86 | /* |
91 | * Verify that our page_head wasn't converted | 87 | * page_head wasn't a dangling pointer but it |
92 | * to a a regular page before we got a | 88 | * may not be a head page anymore by the time |
93 | * reference on it. | 89 | * we obtain the lock. That is ok as long as it |
90 | * can't be freed from under us. | ||
94 | */ | 91 | */ |
95 | if (unlikely(!PageHead(page_head))) { | ||
96 | /* PageHead is cleared after PageTail */ | ||
97 | smp_rmb(); | ||
98 | VM_BUG_ON(PageTail(page)); | ||
99 | goto out_put_head; | ||
100 | } | ||
101 | /* | ||
102 | * Only run compound_lock on a valid PageHead, | ||
103 | * after having it pinned with | ||
104 | * get_page_unless_zero() above. | ||
105 | */ | ||
106 | smp_mb(); | ||
107 | /* page_head wasn't a dangling pointer */ | ||
108 | flags = compound_lock_irqsave(page_head); | 92 | flags = compound_lock_irqsave(page_head); |
109 | if (unlikely(!PageTail(page))) { | 93 | if (unlikely(!PageTail(page))) { |
110 | /* __split_huge_page_refcount run before us */ | 94 | /* __split_huge_page_refcount run before us */ |
111 | compound_unlock_irqrestore(page_head, flags); | 95 | compound_unlock_irqrestore(page_head, flags); |
112 | VM_BUG_ON(PageHead(page_head)); | 96 | VM_BUG_ON(PageHead(page_head)); |
113 | out_put_head: | ||
114 | if (put_page_testzero(page_head)) | 97 | if (put_page_testzero(page_head)) |
115 | __put_single_page(page_head); | 98 | __put_single_page(page_head); |
116 | out_put_single: | 99 | out_put_single: |
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) | |||
121 | VM_BUG_ON(page_head != page->first_page); | 104 | VM_BUG_ON(page_head != page->first_page); |
122 | /* | 105 | /* |
123 | * We can release the refcount taken by | 106 | * We can release the refcount taken by |
124 | * get_page_unless_zero now that | 107 | * get_page_unless_zero() now that |
125 | * split_huge_page_refcount is blocked on the | 108 | * __split_huge_page_refcount() is blocked on |
126 | * compound_lock. | 109 | * the compound_lock. |
127 | */ | 110 | */ |
128 | if (put_page_testzero(page_head)) | 111 | if (put_page_testzero(page_head)) |
129 | VM_BUG_ON(1); | 112 | VM_BUG_ON(1); |
130 | /* __split_huge_page_refcount will wait now */ | 113 | /* __split_huge_page_refcount will wait now */ |
131 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 114 | VM_BUG_ON(page_mapcount(page) <= 0); |
132 | atomic_dec(&page->_count); | 115 | atomic_dec(&page->_mapcount); |
133 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 116 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
117 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
134 | compound_unlock_irqrestore(page_head, flags); | 118 | compound_unlock_irqrestore(page_head, flags); |
135 | if (put_page_testzero(page_head)) { | 119 | if (put_page_testzero(page_head)) { |
136 | if (PageHead(page_head)) | 120 | if (PageHead(page_head)) |
@@ -160,6 +144,45 @@ void put_page(struct page *page) | |||
160 | } | 144 | } |
161 | EXPORT_SYMBOL(put_page); | 145 | EXPORT_SYMBOL(put_page); |
162 | 146 | ||
147 | /* | ||
148 | * This function is exported but must not be called by anything other | ||
149 | * than get_page(). It implements the slow path of get_page(). | ||
150 | */ | ||
151 | bool __get_page_tail(struct page *page) | ||
152 | { | ||
153 | /* | ||
154 | * This takes care of get_page() if run on a tail page | ||
155 | * returned by one of the get_user_pages/follow_page variants. | ||
156 | * get_user_pages/follow_page itself doesn't need the compound | ||
157 | * lock because it runs __get_page_tail_foll() under the | ||
158 | * proper PT lock that already serializes against | ||
159 | * split_huge_page(). | ||
160 | */ | ||
161 | unsigned long flags; | ||
162 | bool got = false; | ||
163 | struct page *page_head = compound_trans_head(page); | ||
164 | |||
165 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
166 | /* | ||
167 | * page_head wasn't a dangling pointer but it | ||
168 | * may not be a head page anymore by the time | ||
169 | * we obtain the lock. That is ok as long as it | ||
170 | * can't be freed from under us. | ||
171 | */ | ||
172 | flags = compound_lock_irqsave(page_head); | ||
173 | /* here __split_huge_page_refcount won't run anymore */ | ||
174 | if (likely(PageTail(page))) { | ||
175 | __get_page_tail_foll(page, false); | ||
176 | got = true; | ||
177 | } | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
179 | if (unlikely(!got)) | ||
180 | put_page(page_head); | ||
181 | } | ||
182 | return got; | ||
183 | } | ||
184 | EXPORT_SYMBOL(__get_page_tail); | ||
185 | |||
163 | /** | 186 | /** |
164 | * put_pages_list() - release a list of pages | 187 | * put_pages_list() - release a list of pages |
165 | * @pages: list of pages threaded on page->lru | 188 | * @pages: list of pages threaded on page->lru |
@@ -644,7 +667,7 @@ void lru_add_page_tail(struct zone* zone, | |||
644 | VM_BUG_ON(!PageHead(page)); | 667 | VM_BUG_ON(!PageHead(page)); |
645 | VM_BUG_ON(PageCompound(page_tail)); | 668 | VM_BUG_ON(PageCompound(page_tail)); |
646 | VM_BUG_ON(PageLRU(page_tail)); | 669 | VM_BUG_ON(PageLRU(page_tail)); |
647 | VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); | 670 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); |
648 | 671 | ||
649 | SetPageLRU(page_tail); | 672 | SetPageLRU(page_tail); |
650 | 673 | ||
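__get_page_tail() above exists only to back the get_page() fast path. Roughly how the inline caller in include/linux/mm.h uses it (editorial sketch; the real inline may differ in detail):

static inline void demo_get_page(struct page *page)
{
	if (unlikely(PageTail(page))) {
		/* Slow path: pin the head page under the compound lock. */
		if (likely(__get_page_tail(page)))
			return;
	}
	/* Head or order-0 page: the caller must already hold a reference. */
	VM_BUG_ON(atomic_read(&page->_count) <= 0);
	atomic_inc(&page->_count);
}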
diff --git a/mm/swap_state.c b/mm/swap_state.c index 46680461785..10e9198778c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -28,7 +28,7 @@ | |||
28 | */ | 28 | */ |
29 | static const struct address_space_operations swap_aops = { | 29 | static const struct address_space_operations swap_aops = { |
30 | .writepage = swap_writepage, | 30 | .writepage = swap_writepage, |
31 | .set_page_dirty = __set_page_dirty_nobuffers, | 31 | .set_page_dirty = __set_page_dirty_no_writeback, |
32 | .migratepage = migrate_page, | 32 | .migratepage = migrate_page, |
33 | }; | 33 | }; |
34 | 34 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a18cb..c8f4338848d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
932 | pmd = pmd_offset(pud, addr); | 932 | pmd = pmd_offset(pud, addr); |
933 | do { | 933 | do { |
934 | next = pmd_addr_end(addr, end); | 934 | next = pmd_addr_end(addr, end); |
935 | if (unlikely(pmd_trans_huge(*pmd))) | 935 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
936 | continue; | ||
937 | if (pmd_none_or_clear_bad(pmd)) | ||
938 | continue; | 936 | continue; |
939 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 937 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
940 | if (ret) | 938 | if (ret) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a..bdb70042c12 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -256,7 +256,7 @@ struct vmap_area { | |||
256 | struct rb_node rb_node; /* address sorted rbtree */ | 256 | struct rb_node rb_node; /* address sorted rbtree */ |
257 | struct list_head list; /* address sorted list */ | 257 | struct list_head list; /* address sorted list */ |
258 | struct list_head purge_list; /* "lazy purge" list */ | 258 | struct list_head purge_list; /* "lazy purge" list */ |
259 | void *private; | 259 | struct vm_struct *vm; |
260 | struct rcu_head rcu_head; | 260 | struct rcu_head rcu_head; |
261 | }; | 261 | }; |
262 | 262 | ||
@@ -732,9 +732,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) | |||
732 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | 732 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) |
733 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | 733 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ |
734 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | 734 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ |
735 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | 735 | #define VMAP_BBMAP_BITS \ |
736 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | 736 | VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ |
737 | VMALLOC_PAGES / NR_CPUS / 16)) | 737 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ |
738 | VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) | ||
738 | 739 | ||
739 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | 740 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
740 | 741 | ||
@@ -1173,9 +1174,10 @@ void __init vmalloc_init(void) | |||
1173 | /* Import existing vmlist entries. */ | 1174 | /* Import existing vmlist entries. */ |
1174 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1175 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1175 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); | 1176 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1176 | va->flags = tmp->flags | VM_VM_AREA; | 1177 | va->flags = VM_VM_AREA; |
1177 | va->va_start = (unsigned long)tmp->addr; | 1178 | va->va_start = (unsigned long)tmp->addr; |
1178 | va->va_end = va->va_start + tmp->size; | 1179 | va->va_end = va->va_start + tmp->size; |
1180 | va->vm = tmp; | ||
1179 | __insert_vmap_area(va); | 1181 | __insert_vmap_area(va); |
1180 | } | 1182 | } |
1181 | 1183 | ||
@@ -1266,18 +1268,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); | |||
1266 | DEFINE_RWLOCK(vmlist_lock); | 1268 | DEFINE_RWLOCK(vmlist_lock); |
1267 | struct vm_struct *vmlist; | 1269 | struct vm_struct *vmlist; |
1268 | 1270 | ||
1269 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1271 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1270 | unsigned long flags, void *caller) | 1272 | unsigned long flags, void *caller) |
1271 | { | 1273 | { |
1272 | struct vm_struct *tmp, **p; | ||
1273 | |||
1274 | vm->flags = flags; | 1274 | vm->flags = flags; |
1275 | vm->addr = (void *)va->va_start; | 1275 | vm->addr = (void *)va->va_start; |
1276 | vm->size = va->va_end - va->va_start; | 1276 | vm->size = va->va_end - va->va_start; |
1277 | vm->caller = caller; | 1277 | vm->caller = caller; |
1278 | va->private = vm; | 1278 | va->vm = vm; |
1279 | va->flags |= VM_VM_AREA; | 1279 | va->flags |= VM_VM_AREA; |
1280 | } | ||
1280 | 1281 | ||
1282 | static void insert_vmalloc_vmlist(struct vm_struct *vm) | ||
1283 | { | ||
1284 | struct vm_struct *tmp, **p; | ||
1285 | |||
1286 | vm->flags &= ~VM_UNLIST; | ||
1281 | write_lock(&vmlist_lock); | 1287 | write_lock(&vmlist_lock); |
1282 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | 1288 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { |
1283 | if (tmp->addr >= vm->addr) | 1289 | if (tmp->addr >= vm->addr) |
@@ -1288,6 +1294,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1288 | write_unlock(&vmlist_lock); | 1294 | write_unlock(&vmlist_lock); |
1289 | } | 1295 | } |
1290 | 1296 | ||
1297 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1298 | unsigned long flags, void *caller) | ||
1299 | { | ||
1300 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1301 | insert_vmalloc_vmlist(vm); | ||
1302 | } | ||
1303 | |||
1291 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1304 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1292 | unsigned long align, unsigned long flags, unsigned long start, | 1305 | unsigned long align, unsigned long flags, unsigned long start, |
1293 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1306 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
@@ -1326,7 +1339,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1326 | return NULL; | 1339 | return NULL; |
1327 | } | 1340 | } |
1328 | 1341 | ||
1329 | insert_vmalloc_vm(area, va, flags, caller); | 1342 | /* |
1343 | * When this function is called from __vmalloc_node_range, | ||
1344 | * we do not add vm_struct to vmlist here to avoid | ||
1345 | * accessing uninitialized members of vm_struct such as | ||
1346 | * pages and nr_pages fields. They will be set later. | ||
1347 | * To distinguish it from others, we use a VM_UNLIST flag. | ||
1348 | */ | ||
1349 | if (flags & VM_UNLIST) | ||
1350 | setup_vmalloc_vm(area, va, flags, caller); | ||
1351 | else | ||
1352 | insert_vmalloc_vm(area, va, flags, caller); | ||
1353 | |||
1330 | return area; | 1354 | return area; |
1331 | } | 1355 | } |
1332 | 1356 | ||
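The VM_UNLIST comment above boils down to a two-phase publication pattern: finish initialising the vm_struct, then make it reachable through vmlist. A minimal userspace sketch of that pattern, assuming made-up names (OBJ_UNLISTED, obj_publish) rather than the kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    #define OBJ_UNLISTED 0x1   /* analogous to VM_UNLIST: not yet on the list */

    struct obj {
    	unsigned flags;
    	int payload;           /* stands in for pages/nr_pages etc. */
    	struct obj *next;
    };

    static struct obj *list_head;  /* analogous to vmlist (single-threaded demo) */

    /* Phase 1: create the object but keep it off the shared list. */
    static struct obj *obj_create_unlisted(void)
    {
    	struct obj *o = calloc(1, sizeof(*o));
    	if (!o)
    		return NULL;
    	o->flags = OBJ_UNLISTED;
    	return o;
    }

    /* Phase 2: publish only after every field a list walker may touch is valid. */
    static void obj_publish(struct obj *o)
    {
    	o->flags &= ~OBJ_UNLISTED;
    	o->next = list_head;
    	list_head = o;
    }

    int main(void)
    {
    	struct obj *o = obj_create_unlisted();
    	if (!o)
    		return 1;
    	o->payload = 42;        /* late initialisation, like pages/nr_pages */
    	obj_publish(o);         /* now visible to list walkers */
    	printf("published payload=%d\n", list_head->payload);
    	list_head = NULL;
    	free(o);
    	return 0;
    }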
@@ -1374,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr) | |||
1374 | 1398 | ||
1375 | va = find_vmap_area((unsigned long)addr); | 1399 | va = find_vmap_area((unsigned long)addr); |
1376 | if (va && va->flags & VM_VM_AREA) | 1400 | if (va && va->flags & VM_VM_AREA) |
1377 | return va->private; | 1401 | return va->vm; |
1378 | 1402 | ||
1379 | return NULL; | 1403 | return NULL; |
1380 | } | 1404 | } |
@@ -1393,18 +1417,21 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1393 | 1417 | ||
1394 | va = find_vmap_area((unsigned long)addr); | 1418 | va = find_vmap_area((unsigned long)addr); |
1395 | if (va && va->flags & VM_VM_AREA) { | 1419 | if (va && va->flags & VM_VM_AREA) { |
1396 | struct vm_struct *vm = va->private; | 1420 | struct vm_struct *vm = va->vm; |
1397 | struct vm_struct *tmp, **p; | 1421 | |
1398 | /* | 1422 | if (!(vm->flags & VM_UNLIST)) { |
1399 | * remove from list and disallow access to this vm_struct | 1423 | struct vm_struct *tmp, **p; |
1400 | * before unmap. (address range confliction is maintained by | 1424 | /* |
1401 | * vmap.) | 1425 | * remove from list and disallow access to |
1402 | */ | 1426 | * this vm_struct before unmap. (address range |
1403 | write_lock(&vmlist_lock); | 1427 | * confliction is maintained by vmap.) |
1404 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | 1428 | */ |
1405 | ; | 1429 | write_lock(&vmlist_lock); |
1406 | *p = tmp->next; | 1430 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) |
1407 | write_unlock(&vmlist_lock); | 1431 | ; |
1432 | *p = tmp->next; | ||
1433 | write_unlock(&vmlist_lock); | ||
1434 | } | ||
1408 | 1435 | ||
1409 | vmap_debug_free_range(va->va_start, va->va_end); | 1436 | vmap_debug_free_range(va->va_start, va->va_end); |
1410 | free_unmap_vmap_area(va); | 1437 | free_unmap_vmap_area(va); |
@@ -1615,13 +1642,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1615 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1642 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1616 | return NULL; | 1643 | return NULL; |
1617 | 1644 | ||
1618 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, | 1645 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, |
1619 | gfp_mask, caller); | 1646 | start, end, node, gfp_mask, caller); |
1620 | 1647 | ||
1621 | if (!area) | 1648 | if (!area) |
1622 | return NULL; | 1649 | return NULL; |
1623 | 1650 | ||
1624 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1651 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1652 | if (!addr) | ||
1653 | return NULL; | ||
1654 | |||
1655 | /* | ||
1656 | * In this function, newly allocated vm_struct is not added | ||
1657 | * to vmlist at __get_vm_area_node(), so it is added here. | ||
1658 | */ | ||
1659 | insert_vmalloc_vmlist(area); | ||
1625 | 1660 | ||
1626 | /* | 1661 | /* |
1627 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1662 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
@@ -2153,6 +2188,14 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
2153 | return NULL; | 2188 | return NULL; |
2154 | } | 2189 | } |
2155 | 2190 | ||
2191 | /* | ||
2192 | * If the allocated address space is passed to a hypercall | ||
2193 | * before being used then we cannot rely on a page fault to | ||
2194 | * trigger an update of the page tables. So sync all the page | ||
2195 | * tables here. | ||
2196 | */ | ||
2197 | vmalloc_sync_all(); | ||
2198 | |||
2156 | return area; | 2199 | return area; |
2157 | } | 2200 | } |
2158 | EXPORT_SYMBOL_GPL(alloc_vm_area); | 2201 | EXPORT_SYMBOL_GPL(alloc_vm_area); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d036e59d302..1eb3edf7920 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
248 | 248 | ||
249 | list_for_each_entry(shrinker, &shrinker_list, list) { | 249 | list_for_each_entry(shrinker, &shrinker_list, list) { |
250 | unsigned long long delta; | 250 | unsigned long long delta; |
251 | unsigned long total_scan; | 251 | long total_scan; |
252 | unsigned long max_pass; | 252 | long max_pass; |
253 | int shrink_ret = 0; | ||
254 | long nr; | ||
255 | long new_nr; | ||
253 | 256 | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 257 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
258 | if (max_pass <= 0) | ||
259 | continue; | ||
260 | |||
261 | /* | ||
262 | * copy the current shrinker scan count into a local variable | ||
263 | * and zero it so that other concurrent shrinker invocations | ||
264 | * don't also do this scanning work. | ||
265 | */ | ||
266 | do { | ||
267 | nr = shrinker->nr; | ||
268 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
269 | |||
270 | total_scan = nr; | ||
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 271 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 272 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 273 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 274 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 275 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 276 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 277 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 278 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 279 | total_scan = max_pass; |
264 | } | 280 | } |
265 | 281 | ||
266 | /* | 282 | /* |
283 | * We need to avoid excessive windup on filesystem shrinkers | ||
284 | * due to large numbers of GFP_NOFS allocations causing the | ||
285 | * shrinkers to return -1 all the time. This results in a large | ||
286 | * nr being built up so when a shrink that can do some work | ||
287 | * comes along it empties the entire cache due to nr >>> | ||
288 | * max_pass. This is bad for sustaining a working set in | ||
289 | * memory. | ||
290 | * | ||
291 | * Hence only allow the shrinker to scan the entire cache when | ||
292 | * a large delta change is calculated directly. | ||
293 | */ | ||
294 | if (delta < max_pass / 4) | ||
295 | total_scan = min(total_scan, max_pass / 2); | ||
296 | |||
297 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 298 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimated number of | 299 | * never try to free more than twice the estimated number of |
269 | * freeable entries. | 300 | * freeable entries. |
270 | */ | 301 | */ |
271 | if (shrinker->nr > max_pass * 2) | 302 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 303 | total_scan = max_pass * 2; |
273 | 304 | ||
274 | total_scan = shrinker->nr; | 305 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 306 | nr_pages_scanned, lru_pages, |
307 | max_pass, delta, total_scan); | ||
276 | 308 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 309 | while (total_scan >= SHRINK_BATCH) { |
278 | long this_scan = SHRINK_BATCH; | 310 | long this_scan = SHRINK_BATCH; |
279 | int shrink_ret; | ||
280 | int nr_before; | 311 | int nr_before; |
281 | 312 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 313 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
292 | cond_resched(); | 323 | cond_resched(); |
293 | } | 324 | } |
294 | 325 | ||
295 | shrinker->nr += total_scan; | 326 | /* |
327 | * move the unused scan count back into the shrinker in a | ||
328 | * manner that handles concurrent updates. If we exhausted the | ||
329 | * scan, there is no need to do an update. | ||
330 | */ | ||
331 | do { | ||
332 | nr = shrinker->nr; | ||
333 | new_nr = total_scan + nr; | ||
334 | if (total_scan <= 0) | ||
335 | break; | ||
336 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
337 | |||
338 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 339 | } |
297 | up_read(&shrinker_rwsem); | 340 | up_read(&shrinker_rwsem); |
298 | out: | 341 | out: |
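The two comments above (claiming shrinker->nr up front, then returning the unused remainder) describe a lock-free hand-off plus the windup caps applied to total_scan. A hedged userspace model of the same flow, using C11 atomic_exchange in place of the kernel's cmpxchg loop; BATCH and the sample numbers are assumptions:

    #include <stdatomic.h>
    #include <stdio.h>

    #define BATCH 128

    static _Atomic long pending;    /* models shrinker->nr */

    /* Claim all pending work so concurrent callers do not rescan it. */
    static long claim_pending(void)
    {
    	return atomic_exchange(&pending, 0);
    }

    /* Hand back the unused remainder so the deferred work is not lost. */
    static void return_unused(long unused)
    {
    	if (unused > 0)
    		atomic_fetch_add(&pending, unused);
    }

    static long shrink_once(long delta, long max_pass)
    {
    	long total = claim_pending() + delta;

    	/* avoid windup: only allow a full-cache scan on a large delta */
    	if (delta < max_pass / 4 && total > max_pass / 2)
    		total = max_pass / 2;
    	/* never scan more than twice the estimated freeable objects */
    	if (total > max_pass * 2)
    		total = max_pass * 2;

    	long done = 0;
    	while (total >= BATCH) {        /* "do work" in fixed batches */
    		done += BATCH;
    		total -= BATCH;
    	}
    	return_unused(total);
    	return done;
    }

    int main(void)
    {
    	atomic_store(&pending, 1000);
    	printf("scanned %ld objects\n", shrink_once(300, 2048));
    	printf("deferred: %ld\n", atomic_load(&pending));
    	return 0;
    }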
@@ -455,15 +498,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
455 | return PAGE_ACTIVATE; | 498 | return PAGE_ACTIVATE; |
456 | } | 499 | } |
457 | 500 | ||
458 | /* | ||
459 | * Wait on writeback if requested to. This happens when | ||
460 | * direct reclaiming a large contiguous area and the | ||
461 | * first attempt to free a range of pages fails. | ||
462 | */ | ||
463 | if (PageWriteback(page) && | ||
464 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | ||
465 | wait_on_page_writeback(page); | ||
466 | |||
467 | if (!PageWriteback(page)) { | 501 | if (!PageWriteback(page)) { |
468 | /* synchronous write or broken a_ops? */ | 502 | /* synchronous write or broken a_ops? */ |
469 | ClearPageReclaim(page); | 503 | ClearPageReclaim(page); |
@@ -581,6 +615,10 @@ void putback_lru_page(struct page *page) | |||
581 | int was_unevictable = PageUnevictable(page); | 615 | int was_unevictable = PageUnevictable(page); |
582 | 616 | ||
583 | VM_BUG_ON(PageLRU(page)); | 617 | VM_BUG_ON(PageLRU(page)); |
618 | #ifdef CONFIG_CLEANCACHE | ||
619 | if (active) | ||
620 | SetPageWasActive(page); | ||
621 | #endif | ||
584 | 622 | ||
585 | redo: | 623 | redo: |
586 | ClearPageUnevictable(page); | 624 | ClearPageUnevictable(page); |
@@ -665,7 +703,7 @@ static enum page_references page_check_references(struct page *page, | |||
665 | return PAGEREF_RECLAIM; | 703 | return PAGEREF_RECLAIM; |
666 | 704 | ||
667 | if (referenced_ptes) { | 705 | if (referenced_ptes) { |
668 | if (PageAnon(page)) | 706 | if (PageSwapBacked(page)) |
669 | return PAGEREF_ACTIVATE; | 707 | return PAGEREF_ACTIVATE; |
670 | /* | 708 | /* |
671 | * All mapped pages start out with page table | 709 | * All mapped pages start out with page table |
@@ -683,7 +721,13 @@ static enum page_references page_check_references(struct page *page, | |||
683 | */ | 721 | */ |
684 | SetPageReferenced(page); | 722 | SetPageReferenced(page); |
685 | 723 | ||
686 | if (referenced_page) | 724 | if (referenced_page || referenced_ptes > 1) |
725 | return PAGEREF_ACTIVATE; | ||
726 | |||
727 | /* | ||
728 | * Activate file-backed executable pages after first usage. | ||
729 | */ | ||
730 | if (vm_flags & VM_EXEC) | ||
687 | return PAGEREF_ACTIVATE; | 731 | return PAGEREF_ACTIVATE; |
688 | 732 | ||
689 | return PAGEREF_KEEP; | 733 | return PAGEREF_KEEP; |
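Taken together, the hunks above change page_check_references() so that any referenced swap-backed page is activated, file pages need the referenced flag or more than one referencing PTE, and executable file mappings are activated on first use. A simplified decision function illustrating that ordering (it omits the PAGEREF_RECLAIM_CLEAN path and uses stand-in parameter names):

    #include <stdbool.h>
    #include <stdio.h>

    enum page_ref { REF_RECLAIM, REF_ACTIVATE, REF_KEEP };

    /*
     * Simplified model of the post-patch ordering:
     *  - a referenced swap-backed page is always activated
     *  - file pages need the referenced flag, more than one referencing
     *    PTE, or VM_EXEC to be activated
     *  - a single fresh PTE reference only keeps the page one more round
     */
    static enum page_ref check_refs(int referenced_ptes, bool referenced_flag,
    				bool swap_backed, bool vm_exec)
    {
    	if (referenced_ptes) {
    		if (swap_backed)
    			return REF_ACTIVATE;
    		if (referenced_flag || referenced_ptes > 1)
    			return REF_ACTIVATE;
    		if (vm_exec)
    			return REF_ACTIVATE;
    		return REF_KEEP;
    	}
    	return REF_RECLAIM;
    }

    int main(void)
    {
    	printf("%d\n", check_refs(1, false, false, true));  /* exec file: activate */
    	printf("%d\n", check_refs(1, false, false, false)); /* fresh ref: keep */
    	printf("%d\n", check_refs(0, false, false, false)); /* unreferenced: reclaim */
    	return 0;
    }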
@@ -719,7 +763,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) | |||
719 | */ | 763 | */ |
720 | static unsigned long shrink_page_list(struct list_head *page_list, | 764 | static unsigned long shrink_page_list(struct list_head *page_list, |
721 | struct zone *zone, | 765 | struct zone *zone, |
722 | struct scan_control *sc) | 766 | struct scan_control *sc, |
767 | int priority, | ||
768 | unsigned long *ret_nr_dirty, | ||
769 | unsigned long *ret_nr_writeback) | ||
723 | { | 770 | { |
724 | LIST_HEAD(ret_pages); | 771 | LIST_HEAD(ret_pages); |
725 | LIST_HEAD(free_pages); | 772 | LIST_HEAD(free_pages); |
@@ -727,6 +774,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
727 | unsigned long nr_dirty = 0; | 774 | unsigned long nr_dirty = 0; |
728 | unsigned long nr_congested = 0; | 775 | unsigned long nr_congested = 0; |
729 | unsigned long nr_reclaimed = 0; | 776 | unsigned long nr_reclaimed = 0; |
777 | unsigned long nr_writeback = 0; | ||
730 | 778 | ||
731 | cond_resched(); | 779 | cond_resched(); |
732 | 780 | ||
@@ -763,13 +811,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
763 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 811 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
764 | 812 | ||
765 | if (PageWriteback(page)) { | 813 | if (PageWriteback(page)) { |
814 | nr_writeback++; | ||
766 | /* | 815 | /* |
767 | * Synchronous reclaim is performed in two passes, | 816 | * Synchronous reclaim cannot queue pages for |
768 | * first an asynchronous pass over the list to | 817 | * writeback due to the possibility of stack overflow |
769 | * start parallel writeback, and a second synchronous | 818 | * but if it encounters a page under writeback, wait |
770 | * pass to wait for the IO to complete. Wait here | 819 | * for the IO to complete. |
771 | * for any page for which writeback has already | ||
772 | * started. | ||
773 | */ | 820 | */ |
774 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | 821 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
775 | may_enter_fs) | 822 | may_enter_fs) |
@@ -825,6 +872,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
825 | if (PageDirty(page)) { | 872 | if (PageDirty(page)) { |
826 | nr_dirty++; | 873 | nr_dirty++; |
827 | 874 | ||
875 | /* | ||
876 | * Only kswapd can writeback filesystem pages to | ||
877 | * avoid risk of stack overflow but do not writeback | ||
878 | * unless under significant pressure. | ||
879 | */ | ||
880 | if (page_is_file_cache(page) && | ||
881 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | ||
882 | /* | ||
883 | * Immediately reclaim when written back. | ||
884 | * Similar in principle to deactivate_page() | ||
885 | * except we already have the page isolated | ||
886 | * and know it's dirty | ||
887 | */ | ||
888 | inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); | ||
889 | SetPageReclaim(page); | ||
890 | |||
891 | goto keep_locked; | ||
892 | } | ||
893 | |||
828 | if (references == PAGEREF_RECLAIM_CLEAN) | 894 | if (references == PAGEREF_RECLAIM_CLEAN) |
829 | goto keep_locked; | 895 | goto keep_locked; |
830 | if (!may_enter_fs) | 896 | if (!may_enter_fs) |
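The new test above restricts reclaim-driven writeback of file pages to kswapd, and only once priority has dropped below DEF_PRIORITY - 2. A small predicate expressing that rule; DEF_PRIORITY is taken as the usual value of 12 and the helper name is illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define DEF_PRIORITY 12   /* assumed: the usual kernel default */

    /*
     * Reclaim may queue a dirty file page for writeback only when the
     * caller is kswapd and pressure is already significant, i.e. the
     * scan priority has dropped below DEF_PRIORITY - 2.
     */
    static bool may_write_file_page(bool is_file, bool is_kswapd, int priority)
    {
    	if (!is_file)
    		return true;   /* anon/swap-backed pages keep the old behaviour */
    	return is_kswapd && priority < DEF_PRIORITY - 2;
    }

    int main(void)
    {
    	printf("%d\n", may_write_file_page(true, false, 10)); /* direct reclaim: 0 */
    	printf("%d\n", may_write_file_page(true, true, 12));  /* kswapd, light pressure: 0 */
    	printf("%d\n", may_write_file_page(true, true, 9));   /* kswapd, heavy pressure: 1 */
    	return 0;
    }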
@@ -959,6 +1025,8 @@ keep_lumpy: | |||
959 | 1025 | ||
960 | list_splice(&ret_pages, page_list); | 1026 | list_splice(&ret_pages, page_list); |
961 | count_vm_events(PGACTIVATE, pgactivate); | 1027 | count_vm_events(PGACTIVATE, pgactivate); |
1028 | *ret_nr_dirty += nr_dirty; | ||
1029 | *ret_nr_writeback += nr_writeback; | ||
962 | return nr_reclaimed; | 1030 | return nr_reclaimed; |
963 | } | 1031 | } |
964 | 1032 | ||
@@ -972,23 +1040,27 @@ keep_lumpy: | |||
972 | * | 1040 | * |
973 | * returns 0 on success, -ve errno on failure. | 1041 | * returns 0 on success, -ve errno on failure. |
974 | */ | 1042 | */ |
975 | int __isolate_lru_page(struct page *page, int mode, int file) | 1043 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) |
976 | { | 1044 | { |
1045 | bool all_lru_mode; | ||
977 | int ret = -EINVAL; | 1046 | int ret = -EINVAL; |
978 | 1047 | ||
979 | /* Only take pages on the LRU. */ | 1048 | /* Only take pages on the LRU. */ |
980 | if (!PageLRU(page)) | 1049 | if (!PageLRU(page)) |
981 | return ret; | 1050 | return ret; |
982 | 1051 | ||
1052 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | ||
1053 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1054 | |||
983 | /* | 1055 | /* |
984 | * When checking the active state, we need to be sure we are | 1056 | * When checking the active state, we need to be sure we are |
985 | * dealing with comparable boolean values. Take the logical not | 1057 | * dealing with comparable boolean values. Take the logical not |
986 | * of each. | 1058 | * of each. |
987 | */ | 1059 | */ |
988 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 1060 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) |
989 | return ret; | 1061 | return ret; |
990 | 1062 | ||
991 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) | 1063 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
992 | return ret; | 1064 | return ret; |
993 | 1065 | ||
994 | /* | 1066 | /* |
@@ -1001,6 +1073,43 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1001 | 1073 | ||
1002 | ret = -EBUSY; | 1074 | ret = -EBUSY; |
1003 | 1075 | ||
1076 | /* | ||
1077 | * To minimise LRU disruption, the caller can indicate that it only | ||
1078 | * wants to isolate pages it will be able to operate on without | ||
1079 | * blocking - clean pages for the most part. | ||
1080 | * | ||
1081 | * ISOLATE_CLEAN means that only clean pages should be isolated. This | ||
1082 | * is used by reclaim when it cannot write to backing storage | ||
1083 | * | ||
1084 | * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages | ||
1085 | * that it is possible to migrate without blocking | ||
1086 | */ | ||
1087 | if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { | ||
1088 | /* All the caller can do on PageWriteback is block */ | ||
1089 | if (PageWriteback(page)) | ||
1090 | return ret; | ||
1091 | |||
1092 | if (PageDirty(page)) { | ||
1093 | struct address_space *mapping; | ||
1094 | |||
1095 | /* ISOLATE_CLEAN means only clean pages */ | ||
1096 | if (mode & ISOLATE_CLEAN) | ||
1097 | return ret; | ||
1098 | |||
1099 | /* | ||
1100 | * Only pages without mappings or with a | ||
1101 | * ->migratepage callback can be migrated | ||
1102 | * without blocking | ||
1103 | */ | ||
1104 | mapping = page_mapping(page); | ||
1105 | if (mapping && !mapping->a_ops->migratepage) | ||
1106 | return ret; | ||
1107 | } | ||
1108 | } | ||
1109 | |||
1110 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | ||
1111 | return ret; | ||
1112 | |||
1004 | if (likely(get_page_unless_zero(page))) { | 1113 | if (likely(get_page_unless_zero(page))) { |
1005 | /* | 1114 | /* |
1006 | * Be careful not to clear PageLRU until after we're | 1115 | * Be careful not to clear PageLRU until after we're |
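The added block turns __isolate_lru_page() into a mode-driven filter: callers that cannot block skip writeback pages outright and skip dirty pages unless they can be migrated asynchronously. A compact model of that filter, with invented flag values and a simplified "migratable without blocking" field standing in for the page_mapping()/->migratepage check:

    #include <stdbool.h>
    #include <stdio.h>

    #define MODE_CLEAN          0x1  /* only clean pages: reclaim cannot write back */
    #define MODE_ASYNC_MIGRATE  0x2  /* only pages migratable without blocking */
    #define MODE_UNMAPPED       0x4  /* skip pages mapped into page tables */

    struct pg {
    	bool writeback;
    	bool dirty;
    	bool mapped;
    	/* true if the page has no mapping, or its mapping has ->migratepage */
    	bool migratable_async;
    };

    /* Return true if the page may be isolated under the given mode. */
    static bool can_isolate(const struct pg *p, unsigned mode)
    {
    	if (mode & (MODE_CLEAN | MODE_ASYNC_MIGRATE)) {
    		if (p->writeback)
    			return false;   /* all the caller could do is block */
    		if (p->dirty) {
    			if (mode & MODE_CLEAN)
    				return false;
    			if (!p->migratable_async)
    				return false;
    		}
    	}
    	if ((mode & MODE_UNMAPPED) && p->mapped)
    		return false;
    	return true;
    }

    int main(void)
    {
    	struct pg dirty_fs_page = { .dirty = true, .migratable_async = false };
    	struct pg clean_page = { .migratable_async = true };

    	printf("%d\n", can_isolate(&dirty_fs_page, MODE_ASYNC_MIGRATE));   /* 0 */
    	printf("%d\n", can_isolate(&clean_page, MODE_CLEAN | MODE_UNMAPPED)); /* 1 */
    	return 0;
    }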
@@ -1036,7 +1145,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1036 | */ | 1145 | */ |
1037 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1146 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1038 | struct list_head *src, struct list_head *dst, | 1147 | struct list_head *src, struct list_head *dst, |
1039 | unsigned long *scanned, int order, int mode, int file) | 1148 | unsigned long *scanned, int order, isolate_mode_t mode, |
1149 | int file) | ||
1040 | { | 1150 | { |
1041 | unsigned long nr_taken = 0; | 1151 | unsigned long nr_taken = 0; |
1042 | unsigned long nr_lumpy_taken = 0; | 1152 | unsigned long nr_lumpy_taken = 0; |
@@ -1111,7 +1221,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1111 | * anon page which don't already have a swap slot is | 1221 | * anon page which don't already have a swap slot is |
1112 | * pointless. | 1222 | * pointless. |
1113 | */ | 1223 | */ |
1114 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | 1224 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && |
1115 | !PageSwapCache(cursor_page)) | 1225 | !PageSwapCache(cursor_page)) |
1116 | break; | 1226 | break; |
1117 | 1227 | ||
@@ -1161,8 +1271,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1161 | static unsigned long isolate_pages_global(unsigned long nr, | 1271 | static unsigned long isolate_pages_global(unsigned long nr, |
1162 | struct list_head *dst, | 1272 | struct list_head *dst, |
1163 | unsigned long *scanned, int order, | 1273 | unsigned long *scanned, int order, |
1164 | int mode, struct zone *z, | 1274 | isolate_mode_t mode, |
1165 | int active, int file) | 1275 | struct zone *z, int active, int file) |
1166 | { | 1276 | { |
1167 | int lru = LRU_BASE; | 1277 | int lru = LRU_BASE; |
1168 | if (active) | 1278 | if (active) |
@@ -1190,6 +1300,9 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1190 | if (PageActive(page)) { | 1300 | if (PageActive(page)) { |
1191 | lru += LRU_ACTIVE; | 1301 | lru += LRU_ACTIVE; |
1192 | ClearPageActive(page); | 1302 | ClearPageActive(page); |
1303 | #ifdef CONFIG_CLEANCACHE | ||
1304 | SetPageWasActive(page); | ||
1305 | #endif | ||
1193 | nr_active += numpages; | 1306 | nr_active += numpages; |
1194 | } | 1307 | } |
1195 | if (count) | 1308 | if (count) |
@@ -1354,7 +1467,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1354 | } | 1467 | } |
1355 | 1468 | ||
1356 | /* | 1469 | /* |
1357 | * Returns true if the caller should wait to clean dirty/writeback pages. | 1470 | * Returns true if a direct reclaim should wait on pages under writeback. |
1358 | * | 1471 | * |
1359 | * If we are direct reclaiming for contiguous pages and we do not reclaim | 1472 | * If we are direct reclaiming for contiguous pages and we do not reclaim |
1360 | * everything in the list, try again and wait for writeback IO to complete. | 1473 | * everything in the list, try again and wait for writeback IO to complete. |
@@ -1408,6 +1521,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1408 | unsigned long nr_taken; | 1521 | unsigned long nr_taken; |
1409 | unsigned long nr_anon; | 1522 | unsigned long nr_anon; |
1410 | unsigned long nr_file; | 1523 | unsigned long nr_file; |
1524 | unsigned long nr_dirty = 0; | ||
1525 | unsigned long nr_writeback = 0; | ||
1526 | |||
1527 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | ||
1411 | 1528 | ||
1412 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1529 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1413 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1530 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1418,15 +1535,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1418 | } | 1535 | } |
1419 | 1536 | ||
1420 | set_reclaim_mode(priority, sc, false); | 1537 | set_reclaim_mode(priority, sc, false); |
1538 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1539 | reclaim_mode |= ISOLATE_ACTIVE; | ||
1540 | |||
1421 | lru_add_drain(); | 1541 | lru_add_drain(); |
1542 | |||
1543 | if (!sc->may_unmap) | ||
1544 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1545 | if (!sc->may_writepage) | ||
1546 | reclaim_mode |= ISOLATE_CLEAN; | ||
1547 | |||
1422 | spin_lock_irq(&zone->lru_lock); | 1548 | spin_lock_irq(&zone->lru_lock); |
1423 | 1549 | ||
1424 | if (scanning_global_lru(sc)) { | 1550 | if (scanning_global_lru(sc)) { |
1425 | nr_taken = isolate_pages_global(nr_to_scan, | 1551 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, |
1426 | &page_list, &nr_scanned, sc->order, | 1552 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); |
1427 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | ||
1428 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1429 | zone, 0, file); | ||
1430 | zone->pages_scanned += nr_scanned; | 1553 | zone->pages_scanned += nr_scanned; |
1431 | if (current_is_kswapd()) | 1554 | if (current_is_kswapd()) |
1432 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1555 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1435,12 +1558,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1435 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1558 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1436 | nr_scanned); | 1559 | nr_scanned); |
1437 | } else { | 1560 | } else { |
1438 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1561 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, |
1439 | &page_list, &nr_scanned, sc->order, | 1562 | &nr_scanned, sc->order, reclaim_mode, zone, |
1440 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | 1563 | sc->mem_cgroup, 0, file); |
1441 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1442 | zone, sc->mem_cgroup, | ||
1443 | 0, file); | ||
1444 | /* | 1564 | /* |
1445 | * mem_cgroup_isolate_pages() keeps track of | 1565 | * mem_cgroup_isolate_pages() keeps track of |
1446 | * scanned pages on its own. | 1566 | * scanned pages on its own. |
@@ -1456,12 +1576,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1456 | 1576 | ||
1457 | spin_unlock_irq(&zone->lru_lock); | 1577 | spin_unlock_irq(&zone->lru_lock); |
1458 | 1578 | ||
1459 | nr_reclaimed = shrink_page_list(&page_list, zone, sc); | 1579 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, |
1580 | &nr_dirty, &nr_writeback); | ||
1460 | 1581 | ||
1461 | /* Check if we should synchronously wait for writeback */ | 1582 | /* Check if we should synchronously wait for writeback */ |
1462 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1583 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1463 | set_reclaim_mode(priority, sc, true); | 1584 | set_reclaim_mode(priority, sc, true); |
1464 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1585 | nr_reclaimed += shrink_page_list(&page_list, zone, sc, |
1586 | priority, &nr_dirty, &nr_writeback); | ||
1465 | } | 1587 | } |
1466 | 1588 | ||
1467 | local_irq_disable(); | 1589 | local_irq_disable(); |
@@ -1471,6 +1593,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1471 | 1593 | ||
1472 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1594 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); |
1473 | 1595 | ||
1596 | /* | ||
1597 | * If reclaim is isolating dirty pages under writeback, it implies | ||
1598 | * that the long-lived page allocation rate is exceeding the page | ||
1599 | * laundering rate. Either the global limits are not being effective | ||
1600 | * at throttling processes due to the page distribution throughout | ||
1601 | * zones or there is heavy usage of a slow backing device. The | ||
1602 | * only option is to throttle from reclaim context which is not ideal | ||
1603 | * as there is no guarantee the dirtying process is throttled in the | ||
1604 | * same way balance_dirty_pages() manages. | ||
1605 | * | ||
1606 | * This scales the number of dirty pages that must be under writeback | ||
1607 | * before throttling depending on priority. It is a simple backoff | ||
1608 | * function that has the most effect in the range DEF_PRIORITY to | ||
1609 | * DEF_PRIORITY-2, which is the range in which reclaim is | ||
1610 | * considered to be in trouble. | ||
1611 | * | ||
1612 | * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle | ||
1613 | * DEF_PRIORITY-1 50% must be PageWriteback | ||
1614 | * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble | ||
1615 | * ... | ||
1616 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | ||
1617 | * isolated page is PageWriteback | ||
1618 | */ | ||
1619 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | ||
1620 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | ||
1621 | |||
1474 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1622 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1475 | zone_idx(zone), | 1623 | zone_idx(zone), |
1476 | nr_scanned, nr_reclaimed, | 1624 | nr_scanned, nr_reclaimed, |
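The backoff documented in the comment above is a plain right shift: the share of isolated pages that must be under writeback before reclaim throttles halves with each priority level below DEF_PRIORITY. A few lines of arithmetic reproduce the table in the comment (DEF_PRIORITY assumed to be 12, nr_taken set to a SWAP_CLUSTER_MAX-sized batch):

    #include <stdio.h>

    #define DEF_PRIORITY 12   /* assumed default scan priority */

    /*
     * Minimum number of PageWriteback pages, out of nr_taken isolated
     * pages, that triggers throttling. A result of 0 means any single
     * writeback page throttles, because the caller also checks that
     * nr_writeback is non-zero.
     */
    static unsigned long writeback_threshold(unsigned long nr_taken, int priority)
    {
    	return nr_taken >> (DEF_PRIORITY - priority);
    }

    int main(void)
    {
    	unsigned long nr_taken = 32;   /* SWAP_CLUSTER_MAX-sized batch */

    	for (int prio = DEF_PRIORITY; prio >= DEF_PRIORITY - 6; prio--)
    		printf("priority %2d: throttle when >= %2lu of %lu are under writeback\n",
    		       prio, writeback_threshold(nr_taken, prio), nr_taken);
    	return 0;
    }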
@@ -1542,19 +1690,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1542 | struct page *page; | 1690 | struct page *page; |
1543 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1691 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1544 | unsigned long nr_rotated = 0; | 1692 | unsigned long nr_rotated = 0; |
1693 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | ||
1545 | 1694 | ||
1546 | lru_add_drain(); | 1695 | lru_add_drain(); |
1696 | |||
1697 | if (!sc->may_unmap) | ||
1698 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1699 | if (!sc->may_writepage) | ||
1700 | reclaim_mode |= ISOLATE_CLEAN; | ||
1701 | |||
1547 | spin_lock_irq(&zone->lru_lock); | 1702 | spin_lock_irq(&zone->lru_lock); |
1548 | if (scanning_global_lru(sc)) { | 1703 | if (scanning_global_lru(sc)) { |
1549 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1704 | nr_taken = isolate_pages_global(nr_pages, &l_hold, |
1550 | &pgscanned, sc->order, | 1705 | &pgscanned, sc->order, |
1551 | ISOLATE_ACTIVE, zone, | 1706 | reclaim_mode, zone, |
1552 | 1, file); | 1707 | 1, file); |
1553 | zone->pages_scanned += pgscanned; | 1708 | zone->pages_scanned += pgscanned; |
1554 | } else { | 1709 | } else { |
1555 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | 1710 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, |
1556 | &pgscanned, sc->order, | 1711 | &pgscanned, sc->order, |
1557 | ISOLATE_ACTIVE, zone, | 1712 | reclaim_mode, zone, |
1558 | sc->mem_cgroup, 1, file); | 1713 | sc->mem_cgroup, 1, file); |
1559 | /* | 1714 | /* |
1560 | * mem_cgroup_isolate_pages() keeps track of | 1715 | * mem_cgroup_isolate_pages() keeps track of |
@@ -1600,6 +1755,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1600 | } | 1755 | } |
1601 | 1756 | ||
1602 | ClearPageActive(page); /* we are de-activating */ | 1757 | ClearPageActive(page); /* we are de-activating */ |
1758 | #ifdef CONFIG_CLEANCACHE | ||
1759 | SetPageWasActive(page); | ||
1760 | #endif | ||
1603 | list_add(&page->lru, &l_inactive); | 1761 | list_add(&page->lru, &l_inactive); |
1604 | } | 1762 | } |
1605 | 1763 | ||
@@ -1747,22 +1905,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1747 | u64 fraction[2], denominator; | 1905 | u64 fraction[2], denominator; |
1748 | enum lru_list l; | 1906 | enum lru_list l; |
1749 | int noswap = 0; | 1907 | int noswap = 0; |
1750 | int force_scan = 0; | 1908 | bool force_scan = false; |
1751 | 1909 | unsigned long nr_force_scan[2]; | |
1752 | 1910 | ||
1753 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1911 | /* kswapd does zone balancing and needs to scan this zone */ |
1754 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1912 | if (scanning_global_lru(sc) && current_is_kswapd() && |
1755 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1913 | zone->all_unreclaimable) |
1756 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1914 | force_scan = true; |
1757 | 1915 | /* memcg may have small limit and need to avoid priority drop */ | |
1758 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1916 | if (!scanning_global_lru(sc)) |
1759 | /* kswapd does zone balancing and need to scan this zone */ | 1917 | force_scan = true; |
1760 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1761 | force_scan = 1; | ||
1762 | /* memcg may have small limit and need to avoid priority drop */ | ||
1763 | if (!scanning_global_lru(sc)) | ||
1764 | force_scan = 1; | ||
1765 | } | ||
1766 | 1918 | ||
1767 | /* If we have no swap space, do not bother scanning anon pages. */ | 1919 | /* If we have no swap space, do not bother scanning anon pages. */ |
1768 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1920 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1770,9 +1922,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1770 | fraction[0] = 0; | 1922 | fraction[0] = 0; |
1771 | fraction[1] = 1; | 1923 | fraction[1] = 1; |
1772 | denominator = 1; | 1924 | denominator = 1; |
1925 | nr_force_scan[0] = 0; | ||
1926 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1773 | goto out; | 1927 | goto out; |
1774 | } | 1928 | } |
1775 | 1929 | ||
1930 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1931 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1932 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1933 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1934 | |||
1776 | if (scanning_global_lru(sc)) { | 1935 | if (scanning_global_lru(sc)) { |
1777 | free = zone_page_state(zone, NR_FREE_PAGES); | 1936 | free = zone_page_state(zone, NR_FREE_PAGES); |
1778 | /* If we have very few page cache pages, | 1937 | /* If we have very few page cache pages, |
@@ -1781,6 +1940,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1781 | fraction[0] = 1; | 1940 | fraction[0] = 1; |
1782 | fraction[1] = 0; | 1941 | fraction[1] = 0; |
1783 | denominator = 1; | 1942 | denominator = 1; |
1943 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1944 | nr_force_scan[1] = 0; | ||
1784 | goto out; | 1945 | goto out; |
1785 | } | 1946 | } |
1786 | } | 1947 | } |
@@ -1829,6 +1990,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1829 | fraction[0] = ap; | 1990 | fraction[0] = ap; |
1830 | fraction[1] = fp; | 1991 | fraction[1] = fp; |
1831 | denominator = ap + fp + 1; | 1992 | denominator = ap + fp + 1; |
1993 | if (force_scan) { | ||
1994 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1995 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1996 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1997 | } | ||
1832 | out: | 1998 | out: |
1833 | for_each_evictable_lru(l) { | 1999 | for_each_evictable_lru(l) { |
1834 | int file = is_file_lru(l); | 2000 | int file = is_file_lru(l); |
@@ -1849,12 +2015,8 @@ out: | |||
1849 | * memcg, priority drop can cause big latency. So, it's better | 2015 | * memcg, priority drop can cause big latency. So, it's better |
1850 | * to scan small amount. See may_noscan above. | 2016 | * to scan small amount. See may_noscan above. |
1851 | */ | 2017 | */ |
1852 | if (!scan && force_scan) { | 2018 | if (!scan && force_scan) |
1853 | if (file) | 2019 | scan = nr_force_scan[file]; |
1854 | scan = SWAP_CLUSTER_MAX; | ||
1855 | else if (!noswap) | ||
1856 | scan = SWAP_CLUSTER_MAX; | ||
1857 | } | ||
1858 | nr[l] = scan; | 2020 | nr[l] = scan; |
1859 | } | 2021 | } |
1860 | } | 2022 | } |
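With nr_force_scan[], the forced minimum of SWAP_CLUSTER_MAX pages is now split between the anon and file lists in the same ap:fp proportion as the normal scan targets, instead of a flat SWAP_CLUSTER_MAX per list. A sketch of that split, with plain 64-bit division standing in for div64_u64:

    #include <stdint.h>
    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32

    /*
     * Split the forced minimum scan between anon (index 0) and file
     * (index 1) in proportion to the pressure weights ap:fp.
     */
    static void force_scan_split(uint64_t ap, uint64_t fp, uint64_t out[2])
    {
    	uint64_t denom = ap + fp + 1;   /* +1 guards against division by zero */

    	out[0] = SWAP_CLUSTER_MAX * ap / denom;
    	out[1] = SWAP_CLUSTER_MAX * fp / denom;
    }

    int main(void)
    {
    	uint64_t nr[2];

    	force_scan_split(100, 300, nr);   /* file pressure three times anon */
    	printf("anon=%llu file=%llu\n",
    	       (unsigned long long)nr[0], (unsigned long long)nr[1]);
    	return 0;
    }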
@@ -1906,8 +2068,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
1906 | * inactive lists are large enough, continue reclaiming | 2068 | * inactive lists are large enough, continue reclaiming |
1907 | */ | 2069 | */ |
1908 | pages_for_compaction = (2UL << sc->order); | 2070 | pages_for_compaction = (2UL << sc->order); |
1909 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | 2071 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
1910 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 2072 | if (nr_swap_pages > 0) |
2073 | inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1911 | if (sc->nr_reclaimed < pages_for_compaction && | 2074 | if (sc->nr_reclaimed < pages_for_compaction && |
1912 | inactive_lru_pages > pages_for_compaction) | 2075 | inactive_lru_pages > pages_for_compaction) |
1913 | return true; | 2076 | return true; |
@@ -1979,6 +2142,42 @@ restart: | |||
1979 | throttle_vm_writeout(sc->gfp_mask); | 2142 | throttle_vm_writeout(sc->gfp_mask); |
1980 | } | 2143 | } |
1981 | 2144 | ||
2145 | /* Returns true if compaction should go ahead for a high-order request */ | ||
2146 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | ||
2147 | { | ||
2148 | unsigned long balance_gap, watermark; | ||
2149 | bool watermark_ok; | ||
2150 | |||
2151 | /* Do not consider compaction for orders reclaim is meant to satisfy */ | ||
2152 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) | ||
2153 | return false; | ||
2154 | |||
2155 | /* | ||
2156 | * Compaction takes time to run and there are potentially other | ||
2157 | * callers using the pages just freed. Continue reclaiming until | ||
2158 | * there is a buffer of free pages available to give compaction | ||
2159 | * a reasonable chance of completing and allocating the page | ||
2160 | */ | ||
2161 | balance_gap = min(low_wmark_pages(zone), | ||
2162 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2163 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2164 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | ||
2165 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | ||
2166 | |||
2167 | /* | ||
2168 | * If compaction is deferred, reclaim up to a point where | ||
2169 | * compaction will have a chance of success when re-enabled | ||
2170 | */ | ||
2171 | if (compaction_deferred(zone)) | ||
2172 | return watermark_ok; | ||
2173 | |||
2174 | /* If compaction is not ready to start, keep reclaiming */ | ||
2175 | if (!compaction_suitable(zone, sc->order)) | ||
2176 | return false; | ||
2177 | |||
2178 | return watermark_ok; | ||
2179 | } | ||
2180 | |||
1982 | /* | 2181 | /* |
1983 | * This is the direct reclaim path, for page-allocating processes. We only | 2182 | * This is the direct reclaim path, for page-allocating processes. We only |
1984 | * try to reclaim pages from zones which will satisfy the caller's allocation | 2183 | * try to reclaim pages from zones which will satisfy the caller's allocation |
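compaction_ready() above stops reclaiming once a zone holds enough free pages for compaction to have a fair chance: the high watermark, plus a balance gap, plus twice the request size. The arithmetic in isolation; the ratio value of 100 and the sample zone numbers are assumptions for the sketch:

    #include <stdio.h>

    #define BALANCE_GAP_RATIO 100   /* assumed, mirrors the kswapd balance gap ratio */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
    	return a < b ? a : b;
    }

    /* Free-page target above which reclaim stops and compaction is tried. */
    static unsigned long compaction_watermark(unsigned long present_pages,
    					  unsigned long low_wmark,
    					  unsigned long high_wmark,
    					  int order)
    {
    	unsigned long gap = min_ul(low_wmark,
    			(present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO);

    	return high_wmark + gap + (2UL << order);
    }

    int main(void)
    {
    	/* ~1 GiB zone with 4 KiB pages, order-9 (THP-sized) request */
    	printf("%lu pages\n", compaction_watermark(262144, 2048, 3072, 9));
    	return 0;
    }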
@@ -1994,14 +2193,20 @@ restart: | |||
1994 | * | 2193 | * |
1995 | * If a zone is deemed to be full of pinned pages then just give it a light | 2194 | * If a zone is deemed to be full of pinned pages then just give it a light |
1996 | * scan then give up on it. | 2195 | * scan then give up on it. |
2196 | * | ||
2197 | * This function returns true if a zone is being reclaimed for a costly | ||
2198 | * high-order allocation and compaction is ready to begin. This indicates to | ||
2199 | * the caller that it should consider retrying the allocation instead of | ||
2200 | * further reclaim. | ||
1997 | */ | 2201 | */ |
1998 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2202 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
1999 | struct scan_control *sc) | 2203 | struct scan_control *sc) |
2000 | { | 2204 | { |
2001 | struct zoneref *z; | 2205 | struct zoneref *z; |
2002 | struct zone *zone; | 2206 | struct zone *zone; |
2003 | unsigned long nr_soft_reclaimed; | 2207 | unsigned long nr_soft_reclaimed; |
2004 | unsigned long nr_soft_scanned; | 2208 | unsigned long nr_soft_scanned; |
2209 | bool aborted_reclaim = false; | ||
2005 | 2210 | ||
2006 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2211 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2007 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2212 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2016,6 +2221,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2016 | continue; | 2221 | continue; |
2017 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2222 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2018 | continue; /* Let kswapd poll it */ | 2223 | continue; /* Let kswapd poll it */ |
2224 | if (COMPACTION_BUILD) { | ||
2225 | /* | ||
2226 | * If we already have plenty of memory free for | ||
2227 | * compaction in this zone, don't free any more. | ||
2228 | * Even though compaction is invoked for any | ||
2229 | * non-zero order, only frequent costly order | ||
2230 | * reclamation is disruptive enough to become a | ||
2231 | * noticeable problem, like transparent huge page | ||
2232 | * allocations. | ||
2233 | */ | ||
2234 | if (compaction_ready(zone, sc)) { | ||
2235 | aborted_reclaim = true; | ||
2236 | continue; | ||
2237 | } | ||
2238 | } | ||
2019 | /* | 2239 | /* |
2020 | * This steals pages from memory cgroups over softlimit | 2240 | * This steals pages from memory cgroups over softlimit |
2021 | * and returns the number of reclaimed pages and | 2241 | * and returns the number of reclaimed pages and |
@@ -2033,6 +2253,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2033 | 2253 | ||
2034 | shrink_zone(priority, zone, sc); | 2254 | shrink_zone(priority, zone, sc); |
2035 | } | 2255 | } |
2256 | |||
2257 | return aborted_reclaim; | ||
2036 | } | 2258 | } |
2037 | 2259 | ||
2038 | static bool zone_reclaimable(struct zone *zone) | 2260 | static bool zone_reclaimable(struct zone *zone) |
@@ -2086,8 +2308,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2086 | struct zoneref *z; | 2308 | struct zoneref *z; |
2087 | struct zone *zone; | 2309 | struct zone *zone; |
2088 | unsigned long writeback_threshold; | 2310 | unsigned long writeback_threshold; |
2311 | bool aborted_reclaim; | ||
2089 | 2312 | ||
2090 | get_mems_allowed(); | ||
2091 | delayacct_freepages_start(); | 2313 | delayacct_freepages_start(); |
2092 | 2314 | ||
2093 | if (scanning_global_lru(sc)) | 2315 | if (scanning_global_lru(sc)) |
@@ -2097,7 +2319,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2097 | sc->nr_scanned = 0; | 2319 | sc->nr_scanned = 0; |
2098 | if (!priority) | 2320 | if (!priority) |
2099 | disable_swap_token(sc->mem_cgroup); | 2321 | disable_swap_token(sc->mem_cgroup); |
2100 | shrink_zones(priority, zonelist, sc); | 2322 | aborted_reclaim = shrink_zones(priority, zonelist, sc); |
2323 | |||
2101 | /* | 2324 | /* |
2102 | * Don't shrink slabs when reclaiming memory from | 2325 | * Don't shrink slabs when reclaiming memory from |
2103 | * over limit cgroups | 2326 | * over limit cgroups |
@@ -2131,7 +2354,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2131 | */ | 2354 | */ |
2132 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; | 2355 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
2133 | if (total_scanned > writeback_threshold) { | 2356 | if (total_scanned > writeback_threshold) { |
2134 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 2357 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, |
2358 | WB_REASON_TRY_TO_FREE_PAGES); | ||
2135 | sc->may_writepage = 1; | 2359 | sc->may_writepage = 1; |
2136 | } | 2360 | } |
2137 | 2361 | ||
@@ -2149,7 +2373,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2149 | 2373 | ||
2150 | out: | 2374 | out: |
2151 | delayacct_freepages_end(); | 2375 | delayacct_freepages_end(); |
2152 | put_mems_allowed(); | ||
2153 | 2376 | ||
2154 | if (sc->nr_reclaimed) | 2377 | if (sc->nr_reclaimed) |
2155 | return sc->nr_reclaimed; | 2378 | return sc->nr_reclaimed; |
@@ -2162,6 +2385,10 @@ out: | |||
2162 | if (oom_killer_disabled) | 2385 | if (oom_killer_disabled) |
2163 | return 0; | 2386 | return 0; |
2164 | 2387 | ||
2388 | /* Aborted reclaim to try compaction? don't OOM, then */ | ||
2389 | if (aborted_reclaim) | ||
2390 | return 1; | ||
2391 | |||
2165 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2392 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
2166 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2393 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) |
2167 | return 1; | 2394 | return 1; |
@@ -2453,6 +2680,9 @@ loop_again: | |||
2453 | high_wmark_pages(zone), 0, 0)) { | 2680 | high_wmark_pages(zone), 0, 0)) { |
2454 | end_zone = i; | 2681 | end_zone = i; |
2455 | break; | 2682 | break; |
2683 | } else { | ||
2684 | /* If balanced, clear the congested flag */ | ||
2685 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2456 | } | 2686 | } |
2457 | } | 2687 | } |
2458 | if (i < 0) | 2688 | if (i < 0) |
@@ -2689,7 +2919,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2689 | * them before going back to sleep. | 2919 | * them before going back to sleep. |
2690 | */ | 2920 | */ |
2691 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2921 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2692 | schedule(); | 2922 | |
2923 | if (!kthread_should_stop()) | ||
2924 | schedule(); | ||
2925 | |||
2693 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2926 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
2694 | } else { | 2927 | } else { |
2695 | if (remaining) | 2928 | if (remaining) |
@@ -2716,7 +2949,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2716 | static int kswapd(void *p) | 2949 | static int kswapd(void *p) |
2717 | { | 2950 | { |
2718 | unsigned long order, new_order; | 2951 | unsigned long order, new_order; |
2952 | unsigned balanced_order; | ||
2719 | int classzone_idx, new_classzone_idx; | 2953 | int classzone_idx, new_classzone_idx; |
2954 | int balanced_classzone_idx; | ||
2720 | pg_data_t *pgdat = (pg_data_t*)p; | 2955 | pg_data_t *pgdat = (pg_data_t*)p; |
2721 | struct task_struct *tsk = current; | 2956 | struct task_struct *tsk = current; |
2722 | 2957 | ||
@@ -2747,7 +2982,9 @@ static int kswapd(void *p) | |||
2747 | set_freezable(); | 2982 | set_freezable(); |
2748 | 2983 | ||
2749 | order = new_order = 0; | 2984 | order = new_order = 0; |
2985 | balanced_order = 0; | ||
2750 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2986 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2987 | balanced_classzone_idx = classzone_idx; | ||
2751 | for ( ; ; ) { | 2988 | for ( ; ; ) { |
2752 | int ret; | 2989 | int ret; |
2753 | 2990 | ||
@@ -2756,7 +2993,8 @@ static int kswapd(void *p) | |||
2756 | * new request of a similar or harder type will succeed soon | 2993 | * new request of a similar or harder type will succeed soon |
2757 | * so consider going to sleep on the basis we reclaimed at | 2994 | * so consider going to sleep on the basis we reclaimed at |
2758 | */ | 2995 | */ |
2759 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2996 | if (balanced_classzone_idx >= new_classzone_idx && |
2997 | balanced_order == new_order) { | ||
2760 | new_order = pgdat->kswapd_max_order; | 2998 | new_order = pgdat->kswapd_max_order; |
2761 | new_classzone_idx = pgdat->classzone_idx; | 2999 | new_classzone_idx = pgdat->classzone_idx; |
2762 | pgdat->kswapd_max_order = 0; | 3000 | pgdat->kswapd_max_order = 0; |
@@ -2771,9 +3009,12 @@ static int kswapd(void *p) | |||
2771 | order = new_order; | 3009 | order = new_order; |
2772 | classzone_idx = new_classzone_idx; | 3010 | classzone_idx = new_classzone_idx; |
2773 | } else { | 3011 | } else { |
2774 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 3012 | kswapd_try_to_sleep(pgdat, balanced_order, |
3013 | balanced_classzone_idx); | ||
2775 | order = pgdat->kswapd_max_order; | 3014 | order = pgdat->kswapd_max_order; |
2776 | classzone_idx = pgdat->classzone_idx; | 3015 | classzone_idx = pgdat->classzone_idx; |
3016 | new_order = order; | ||
3017 | new_classzone_idx = classzone_idx; | ||
2777 | pgdat->kswapd_max_order = 0; | 3018 | pgdat->kswapd_max_order = 0; |
2778 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 3019 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2779 | } | 3020 | } |
@@ -2788,7 +3029,9 @@ static int kswapd(void *p) | |||
2788 | */ | 3029 | */ |
2789 | if (!ret) { | 3030 | if (!ret) { |
2790 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 3031 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2791 | order = balance_pgdat(pgdat, order, &classzone_idx); | 3032 | balanced_classzone_idx = classzone_idx; |
3033 | balanced_order = balance_pgdat(pgdat, order, | ||
3034 | &balanced_classzone_idx); | ||
2792 | } | 3035 | } |
2793 | } | 3036 | } |
2794 | return 0; | 3037 | return 0; |
@@ -2946,14 +3189,17 @@ int kswapd_run(int nid) | |||
2946 | } | 3189 | } |
2947 | 3190 | ||
2948 | /* | 3191 | /* |
2949 | * Called by memory hotplug when all memory in a node is offlined. | 3192 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3193 | * hold lock_memory_hotplug(). | ||
2950 | */ | 3194 | */ |
2951 | void kswapd_stop(int nid) | 3195 | void kswapd_stop(int nid) |
2952 | { | 3196 | { |
2953 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 3197 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
2954 | 3198 | ||
2955 | if (kswapd) | 3199 | if (kswapd) { |
2956 | kthread_stop(kswapd); | 3200 | kthread_stop(kswapd); |
3201 | NODE_DATA(nid)->kswapd = NULL; | ||
3202 | } | ||
2957 | } | 3203 | } |
2958 | 3204 | ||
2959 | static int __init kswapd_init(void) | 3205 | static int __init kswapd_init(void) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b..9c001a268ab 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) | |||
78 | * | 78 | * |
79 | * vm_stat contains the global counters | 79 | * vm_stat contains the global counters |
80 | */ | 80 | */ |
81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; |
82 | EXPORT_SYMBOL(vm_stat); | 82 | EXPORT_SYMBOL(vm_stat); |
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = { | |||
702 | "nr_unstable", | 702 | "nr_unstable", |
703 | "nr_bounce", | 703 | "nr_bounce", |
704 | "nr_vmscan_write", | 704 | "nr_vmscan_write", |
705 | "nr_vmscan_immediate_reclaim", | ||
705 | "nr_writeback_temp", | 706 | "nr_writeback_temp", |
706 | "nr_isolated_anon", | 707 | "nr_isolated_anon", |
707 | "nr_isolated_file", | 708 | "nr_isolated_file", |