Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   52
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/ashmem.c          |  748
-rw-r--r--  mm/backing-dev.c     |  128
-rw-r--r--  mm/bootmem.c         |    5
-rw-r--r--  mm/cma-best-fit.c    |  408
-rw-r--r--  mm/cma.c             | 1413
-rw-r--r--  mm/compaction.c      |   55
-rw-r--r--  mm/filemap.c         |   44
-rw-r--r--  mm/filemap_xip.c     |    7
-rw-r--r--  mm/huge_memory.c     |   47
-rw-r--r--  mm/hugetlb.c         |   75
-rw-r--r--  mm/internal.h        |   46
-rw-r--r--  mm/ksm.c             |    6
-rw-r--r--  mm/madvise.c         |   16
-rw-r--r--  mm/memcontrol.c      |   67
-rw-r--r--  mm/memory-failure.c  |    6
-rw-r--r--  mm/memory.c          |   76
-rw-r--r--  mm/memory_hotplug.c  |    2
-rw-r--r--  mm/mempolicy.c       |   73
-rw-r--r--  mm/migrate.c         |  248
-rw-r--r--  mm/mincore.c         |    2
-rw-r--r--  mm/mmu_notifier.c    |   45
-rw-r--r--  mm/nobootmem.c       |    3
-rw-r--r--  mm/nommu.c           |    9
-rw-r--r--  mm/oom_kill.c        |    6
-rw-r--r--  mm/page-writeback.c  |  894
-rw-r--r--  mm/page_alloc.c      |  222
-rw-r--r--  mm/pagewalk.c        |    2
-rw-r--r--  mm/percpu-vm.c       |   12
-rw-r--r--  mm/percpu.c          |   50
-rw-r--r--  mm/shmem.c           |  516
-rw-r--r--  mm/slab.c            |   13
-rw-r--r--  mm/slqb.c            | 3816
-rw-r--r--  mm/slub.c            |   51
-rw-r--r--  mm/sparse.c          |   30
-rw-r--r--  mm/swap.c            |   85
-rw-r--r--  mm/swap_state.c      |    2
-rw-r--r--  mm/swapfile.c        |    4
-rw-r--r--  mm/vmalloc.c         |   93
-rw-r--r--  mm/vmscan.c          |  422
-rw-r--r--  mm/vmstat.c          |    3
42 files changed, 8873 insertions, 933 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c..3c2b6739c87 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -370,3 +370,55 @@ config CLEANCACHE
370 in a negligible performance hit. 370 in a negligible performance hit.
371 371
372 If unsure, say Y to enable cleancache 372 If unsure, say Y to enable cleancache
373
374config CMA
375 bool "Contiguous Memory Allocator framework"
376 # Currently there is only one allocator so force it on
377 select CMA_BEST_FIT
378 help
379 This enables the Contiguous Memory Allocator framework which
380 allows drivers to allocate big physically-contiguous blocks of
381 memory for use with hardware components that do not support
382 I/O mapping or scatter-gather.
383
384 If you select this option you will also have to select at least
385 one allocator algorithm below.
386
387 To make use of CMA you need to specify the regions and
388 driver->region mapping on command line when booting the kernel.
389
390config CMA_DEVELOPEMENT
391 bool "Include CMA development features"
392 depends on CMA
393 help
394 This lets you enable some development features of the CMA
395 framework.
396
397config CMA_DEBUG
398 bool "CMA debug messages"
399 depends on CMA_DEVELOPEMENT
400 help
401 Enable debug messages in CMA code.
402
403config CMA_SYSFS
404 bool "CMA SysFS interface support"
405 depends on CMA_DEVELOPEMENT
406 help
407 Enable support for SysFS interface.
408
409config CMA_CMDLINE
410 bool "CMA command line parameters support"
411 depends on CMA_DEVELOPEMENT
412 help
413 Enable support for cma, cma.map and cma.asterisk command line
414 parameters.
415
416config CMA_BEST_FIT
417 bool "CMA best-fit allocator"
418 depends on CMA
419 help
420 This is a best-fit algorithm running in O(n log n) time where
421 n is the number of existing holes (which is never greater than
422 the number of allocated regions and usually much smaller). It
423 allocates an area from the smallest hole that is big enough for
424 the allocation in question.
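As an illustration of the boot-time configuration the help texts above refer to (hypothetical region and device names, following the cma= and cma.map= grammar defined in mm/cma.c later in this patch):

    cma=reg1=64M:bf;reg2=32M@0x40000000:bf
    cma.map=video=reg1;camera=reg2

The first parameter reserves two regions at boot, a 64 MiB region placed anywhere and a 32 MiB region at a fixed address, both handed to the "bf" best-fit allocator; the second maps the devices named "video" and "camera" onto those regions.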
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1b..f846ad087a1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
30obj-$(CONFIG_NUMA) += mempolicy.o 30obj-$(CONFIG_NUMA) += mempolicy.o
31obj-$(CONFIG_SPARSEMEM) += sparse.o 31obj-$(CONFIG_SPARSEMEM) += sparse.o
32obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 32obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
33obj-$(CONFIG_ASHMEM) += ashmem.o
33obj-$(CONFIG_SLOB) += slob.o 34obj-$(CONFIG_SLOB) += slob.o
34obj-$(CONFIG_COMPACTION) += compaction.o 35obj-$(CONFIG_COMPACTION) += compaction.o
35obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 36obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
@@ -37,6 +38,7 @@ obj-$(CONFIG_KSM) += ksm.o
37obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 38obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
38obj-$(CONFIG_SLAB) += slab.o 39obj-$(CONFIG_SLAB) += slab.o
39obj-$(CONFIG_SLUB) += slub.o 40obj-$(CONFIG_SLUB) += slub.o
41obj-$(CONFIG_SLQB) += slqb.o
40obj-$(CONFIG_KMEMCHECK) += kmemcheck.o 42obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
41obj-$(CONFIG_FAILSLAB) += failslab.o 43obj-$(CONFIG_FAILSLAB) += failslab.o
42obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 44obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
@@ -50,3 +52,5 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 52obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 53obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o 54obj-$(CONFIG_CLEANCACHE) += cleancache.o
55obj-$(CONFIG_CMA) += cma.o
56obj-$(CONFIG_CMA_BEST_FIT) += cma-best-fit.o
diff --git a/mm/ashmem.c b/mm/ashmem.c
new file mode 100644
index 00000000000..66e3f23ee33
--- /dev/null
+++ b/mm/ashmem.c
@@ -0,0 +1,748 @@
1/* mm/ashmem.c
2**
3** Anonymous Shared Memory Subsystem, ashmem
4**
5** Copyright (C) 2008 Google, Inc.
6**
7** Robert Love <rlove@google.com>
8**
9** This software is licensed under the terms of the GNU General Public
10** License version 2, as published by the Free Software Foundation, and
11** may be copied, distributed, and modified under those terms.
12**
13** This program is distributed in the hope that it will be useful,
14** but WITHOUT ANY WARRANTY; without even the implied warranty of
15** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16** GNU General Public License for more details.
17*/
18
19#include <linux/module.h>
20#include <linux/file.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/security.h>
24#include <linux/mm.h>
25#include <linux/mman.h>
26#include <linux/uaccess.h>
27#include <linux/personality.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/shmem_fs.h>
31#include <linux/ashmem.h>
32
33#define ASHMEM_NAME_PREFIX "dev/ashmem/"
34#define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1)
35#define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN)
36
37/*
38 * ashmem_area - anonymous shared memory area
39 * Lifecycle: From our parent file's open() until its release()
40 * Locking: Protected by `ashmem_mutex'
41 * Big Note: Mappings do NOT pin this structure; it dies on close()
42 */
43struct ashmem_area {
44 char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */
45 struct list_head unpinned_list; /* list of all ashmem areas */
46 struct file *file; /* the shmem-based backing file */
47 size_t size; /* size of the mapping, in bytes */
48 unsigned long prot_mask; /* allowed prot bits, as vm_flags */
49};
50
51/*
52 * ashmem_range - represents an interval of unpinned (evictable) pages
53 * Lifecycle: From unpin to pin
54 * Locking: Protected by `ashmem_mutex'
55 */
56struct ashmem_range {
57 struct list_head lru; /* entry in LRU list */
58 struct list_head unpinned; /* entry in its area's unpinned list */
59 struct ashmem_area *asma; /* associated area */
60 size_t pgstart; /* starting page, inclusive */
61 size_t pgend; /* ending page, inclusive */
62 unsigned int purged; /* ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED */
63};
64
65/* LRU list of unpinned pages, protected by ashmem_mutex */
66static LIST_HEAD(ashmem_lru_list);
67
68/* Count of pages on our LRU list, protected by ashmem_mutex */
69static unsigned long lru_count;
70
71/*
72 * ashmem_mutex - protects the list of and each individual ashmem_area
73 *
74 * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem
75 */
76static DEFINE_MUTEX(ashmem_mutex);
77
78static struct kmem_cache *ashmem_area_cachep __read_mostly;
79static struct kmem_cache *ashmem_range_cachep __read_mostly;
80
81#define range_size(range) \
82 ((range)->pgend - (range)->pgstart + 1)
83
84#define range_on_lru(range) \
85 ((range)->purged == ASHMEM_NOT_PURGED)
86
87#define page_range_subsumes_range(range, start, end) \
88 (((range)->pgstart >= (start)) && ((range)->pgend <= (end)))
89
90#define page_range_subsumed_by_range(range, start, end) \
91 (((range)->pgstart <= (start)) && ((range)->pgend >= (end)))
92
93#define page_in_range(range, page) \
94 (((range)->pgstart <= (page)) && ((range)->pgend >= (page)))
95
96#define page_range_in_range(range, start, end) \
97 (page_in_range(range, start) || page_in_range(range, end) || \
98 page_range_subsumes_range(range, start, end))
99
100#define range_before_page(range, page) \
101 ((range)->pgend < (page))
102
103#define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE)
104
105static inline void lru_add(struct ashmem_range *range)
106{
107 list_add_tail(&range->lru, &ashmem_lru_list);
108 lru_count += range_size(range);
109}
110
111static inline void lru_del(struct ashmem_range *range)
112{
113 list_del(&range->lru);
114 lru_count -= range_size(range);
115}
116
117/*
118 * range_alloc - allocate and initialize a new ashmem_range structure
119 *
120 * 'asma' - associated ashmem_area
121 * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list
122 * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED)
123 * 'start' - starting page, inclusive
124 * 'end' - ending page, inclusive
125 *
126 * Caller must hold ashmem_mutex.
127 */
128static int range_alloc(struct ashmem_area *asma,
129 struct ashmem_range *prev_range, unsigned int purged,
130 size_t start, size_t end)
131{
132 struct ashmem_range *range;
133
134 range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL);
135 if (unlikely(!range))
136 return -ENOMEM;
137
138 range->asma = asma;
139 range->pgstart = start;
140 range->pgend = end;
141 range->purged = purged;
142
143 list_add_tail(&range->unpinned, &prev_range->unpinned);
144
145 if (range_on_lru(range))
146 lru_add(range);
147
148 return 0;
149}
150
151static void range_del(struct ashmem_range *range)
152{
153 list_del(&range->unpinned);
154 if (range_on_lru(range))
155 lru_del(range);
156 kmem_cache_free(ashmem_range_cachep, range);
157}
158
159/*
160 * range_shrink - shrinks a range
161 *
162 * Caller must hold ashmem_mutex.
163 */
164static inline void range_shrink(struct ashmem_range *range,
165 size_t start, size_t end)
166{
167 size_t pre = range_size(range);
168
169 range->pgstart = start;
170 range->pgend = end;
171
172 if (range_on_lru(range))
173 lru_count -= pre - range_size(range);
174}
175
176static int ashmem_open(struct inode *inode, struct file *file)
177{
178 struct ashmem_area *asma;
179 int ret;
180
181 ret = generic_file_open(inode, file);
182 if (unlikely(ret))
183 return ret;
184
185 asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL);
186 if (unlikely(!asma))
187 return -ENOMEM;
188
189 INIT_LIST_HEAD(&asma->unpinned_list);
190 memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN);
191 asma->prot_mask = PROT_MASK;
192 file->private_data = asma;
193
194 return 0;
195}
196
197static int ashmem_release(struct inode *ignored, struct file *file)
198{
199 struct ashmem_area *asma = file->private_data;
200 struct ashmem_range *range, *next;
201
202 mutex_lock(&ashmem_mutex);
203 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned)
204 range_del(range);
205 mutex_unlock(&ashmem_mutex);
206
207 if (asma->file)
208 fput(asma->file);
209 kmem_cache_free(ashmem_area_cachep, asma);
210
211 return 0;
212}
213
214static ssize_t ashmem_read(struct file *file, char __user *buf,
215 size_t len, loff_t *pos)
216{
217 struct ashmem_area *asma = file->private_data;
218 int ret = 0;
219
220 mutex_lock(&ashmem_mutex);
221
222 /* If size is not set, or set to 0, always return EOF. */
223 if (asma->size == 0) {
224 goto out;
225 }
226
227 if (!asma->file) {
228 ret = -EBADF;
229 goto out;
230 }
231
232 ret = asma->file->f_op->read(asma->file, buf, len, pos);
233 if (ret < 0) {
234 goto out;
235 }
236
237 /** Update backing file pos, since f_ops->read() doesn't */
238 asma->file->f_pos = *pos;
239
240out:
241 mutex_unlock(&ashmem_mutex);
242 return ret;
243}
244
245static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
246{
247 struct ashmem_area *asma = file->private_data;
248 int ret;
249
250 mutex_lock(&ashmem_mutex);
251
252 if (asma->size == 0) {
253 ret = -EINVAL;
254 goto out;
255 }
256
257 if (!asma->file) {
258 ret = -EBADF;
259 goto out;
260 }
261
262 ret = asma->file->f_op->llseek(asma->file, offset, origin);
263 if (ret < 0) {
264 goto out;
265 }
266
267 /** Copy f_pos from backing file, since f_ops->llseek() sets it */
268 file->f_pos = asma->file->f_pos;
269
270out:
271 mutex_unlock(&ashmem_mutex);
272 return ret;
273}
274
275static inline unsigned long
276calc_vm_may_flags(unsigned long prot)
277{
278 return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) |
279 _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) |
280 _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC);
281}
282
283static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
284{
285 struct ashmem_area *asma = file->private_data;
286 int ret = 0;
287
288 mutex_lock(&ashmem_mutex);
289
290 /* user needs to SET_SIZE before mapping */
291 if (unlikely(!asma->size)) {
292 ret = -EINVAL;
293 goto out;
294 }
295
296 /* requested protection bits must match our allowed protection mask */
297 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
298 calc_vm_prot_bits(PROT_MASK))) {
299 ret = -EPERM;
300 goto out;
301 }
302 vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask);
303
304 if (!asma->file) {
305 char *name = ASHMEM_NAME_DEF;
306 struct file *vmfile;
307
308 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0')
309 name = asma->name;
310
311 /* ... and allocate the backing shmem file */
312 vmfile = shmem_file_setup(name, asma->size, vma->vm_flags);
313 if (unlikely(IS_ERR(vmfile))) {
314 ret = PTR_ERR(vmfile);
315 goto out;
316 }
317 asma->file = vmfile;
318 }
319 get_file(asma->file);
320
321 if (vma->vm_flags & VM_SHARED)
322 shmem_set_file(vma, asma->file);
323 else {
324 if (vma->vm_file)
325 fput(vma->vm_file);
326 vma->vm_file = asma->file;
327 }
328 vma->vm_flags |= VM_CAN_NONLINEAR;
329
330out:
331 mutex_unlock(&ashmem_mutex);
332 return ret;
333}
334
335/*
336 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
337 *
338 * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
339 * many objects (pages) we have in total.
340 *
341 * 'gfp_mask' is the mask of the allocation that got us into this mess.
342 *
343 * Return value is the number of objects (pages) remaining, or -1 if we cannot
344 * proceed without risk of deadlock (due to gfp_mask).
345 *
346 * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
347 * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
348 * pages freed.
349 */
350static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
351{
352 struct ashmem_range *range, *next;
353
354 /* We might recurse into filesystem code, so bail out if necessary */
355 if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
356 return -1;
357 if (!sc->nr_to_scan)
358 return lru_count;
359
360 mutex_lock(&ashmem_mutex);
361 list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
362 struct inode *inode = range->asma->file->f_dentry->d_inode;
363 loff_t start = range->pgstart * PAGE_SIZE;
364 loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
365
366 vmtruncate_range(inode, start, end);
367 range->purged = ASHMEM_WAS_PURGED;
368 lru_del(range);
369
370 sc->nr_to_scan -= range_size(range);
371 if (sc->nr_to_scan <= 0)
372 break;
373 }
374 mutex_unlock(&ashmem_mutex);
375
376 return lru_count;
377}
378
379static struct shrinker ashmem_shrinker = {
380 .shrink = ashmem_shrink,
381 .seeks = DEFAULT_SEEKS * 4,
382};
383
384static int set_prot_mask(struct ashmem_area *asma, unsigned long prot)
385{
386 int ret = 0;
387
388 mutex_lock(&ashmem_mutex);
389
390 /* the user can only remove, not add, protection bits */
391 if (unlikely((asma->prot_mask & prot) != prot)) {
392 ret = -EINVAL;
393 goto out;
394 }
395
396 /* does the application expect PROT_READ to imply PROT_EXEC? */
397 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
398 prot |= PROT_EXEC;
399
400 asma->prot_mask = prot;
401
402out:
403 mutex_unlock(&ashmem_mutex);
404 return ret;
405}
406
407static int set_name(struct ashmem_area *asma, void __user *name)
408{
409 int ret = 0;
410
411 mutex_lock(&ashmem_mutex);
412
413 /* cannot change an existing mapping's name */
414 if (unlikely(asma->file)) {
415 ret = -EINVAL;
416 goto out;
417 }
418
419 if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN,
420 name, ASHMEM_NAME_LEN)))
421 ret = -EFAULT;
422 asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0';
423
424out:
425 mutex_unlock(&ashmem_mutex);
426
427 return ret;
428}
429
430static int get_name(struct ashmem_area *asma, void __user *name)
431{
432 int ret = 0;
433
434 mutex_lock(&ashmem_mutex);
435 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') {
436 size_t len;
437
438 /*
439 * Copying only `len', instead of ASHMEM_NAME_LEN, bytes
440 * prevents us from revealing one user's stack to another.
441 */
442 len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1;
443 if (unlikely(copy_to_user(name,
444 asma->name + ASHMEM_NAME_PREFIX_LEN, len)))
445 ret = -EFAULT;
446 } else {
447 if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF,
448 sizeof(ASHMEM_NAME_DEF))))
449 ret = -EFAULT;
450 }
451 mutex_unlock(&ashmem_mutex);
452
453 return ret;
454}
455
456/*
457 * ashmem_pin - pin the given ashmem region, returning whether it was
458 * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED).
459 *
460 * Caller must hold ashmem_mutex.
461 */
462static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
463{
464 struct ashmem_range *range, *next;
465 int ret = ASHMEM_NOT_PURGED;
466
467 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
468 /* moved past last applicable page; we can short circuit */
469 if (range_before_page(range, pgstart))
470 break;
471
472 /*
473 * The user can ask us to pin pages that span multiple ranges,
474 * or to pin pages that aren't even unpinned, so this is messy.
475 *
476 * Four cases:
477 * 1. The requested range subsumes an existing range, so we
478 * just remove the entire matching range.
479 * 2. The requested range overlaps the start of an existing
480 * range, so we just update that range.
481 * 3. The requested range overlaps the end of an existing
482 * range, so we just update that range.
483 * 4. The requested range punches a hole in an existing range,
484 * so we have to update one side of the range and then
485 * create a new range for the other side.
486 */
487 if (page_range_in_range(range, pgstart, pgend)) {
488 ret |= range->purged;
489
490 /* Case #1: Easy. Just nuke the whole thing. */
491 if (page_range_subsumes_range(range, pgstart, pgend)) {
492 range_del(range);
493 continue;
494 }
495
496 /* Case #2: We overlap from the start, so adjust it */
497 if (range->pgstart >= pgstart) {
498 range_shrink(range, pgend + 1, range->pgend);
499 continue;
500 }
501
502 /* Case #3: We overlap from the rear, so adjust it */
503 if (range->pgend <= pgend) {
504 range_shrink(range, range->pgstart, pgstart-1);
505 continue;
506 }
507
508 /*
509 * Case #4: We eat a chunk out of the middle. A bit
510 * more complicated, we allocate a new range for the
511 * second half and adjust the first chunk's endpoint.
512 */
513 range_alloc(asma, range, range->purged,
514 pgend + 1, range->pgend);
515 range_shrink(range, range->pgstart, pgstart - 1);
516 break;
517 }
518 }
519
520 return ret;
521}
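A worked example of case #4 above (illustration, not part of the patch): pinning pages 3-4 of a region whose pages 0-9 are currently unpinned shrinks the existing 0-9 range to 0-2 and allocates a new unpinned range 5-9 for the remainder.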
522
523/*
524 * ashmem_unpin - unpin the given range of pages. Returns zero on success.
525 *
526 * Caller must hold ashmem_mutex.
527 */
528static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
529{
530 struct ashmem_range *range, *next;
531 unsigned int purged = ASHMEM_NOT_PURGED;
532
533restart:
534 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
535 /* short circuit: this is our insertion point */
536 if (range_before_page(range, pgstart))
537 break;
538
539 /*
540 * The user can ask us to unpin pages that are already entirely
541 * or partially pinned. We handle those two cases here.
542 */
543 if (page_range_subsumed_by_range(range, pgstart, pgend))
544 return 0;
545 if (page_range_in_range(range, pgstart, pgend)) {
546 pgstart = min_t(size_t, range->pgstart, pgstart),
547 pgend = max_t(size_t, range->pgend, pgend);
548 purged |= range->purged;
549 range_del(range);
550 goto restart;
551 }
552 }
553
554 return range_alloc(asma, range, purged, pgstart, pgend);
555}
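Conversely (illustration, not part of the patch): unpinning pages 5-10 while pages 7-8 are already unpinned takes the overlap branch above, widens the request to the union of the two intervals, deletes the old 7-8 range, and finally inserts a single unpinned range covering 5-10.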
556
557/*
558 * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the
559 * given interval are unpinned and ASHMEM_IS_PINNED otherwise.
560 *
561 * Caller must hold ashmem_mutex.
562 */
563static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart,
564 size_t pgend)
565{
566 struct ashmem_range *range;
567 int ret = ASHMEM_IS_PINNED;
568
569 list_for_each_entry(range, &asma->unpinned_list, unpinned) {
570 if (range_before_page(range, pgstart))
571 break;
572 if (page_range_in_range(range, pgstart, pgend)) {
573 ret = ASHMEM_IS_UNPINNED;
574 break;
575 }
576 }
577
578 return ret;
579}
580
581static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd,
582 void __user *p)
583{
584 struct ashmem_pin pin;
585 size_t pgstart, pgend;
586 int ret = -EINVAL;
587
588 if (unlikely(!asma->file))
589 return -EINVAL;
590
591 if (unlikely(copy_from_user(&pin, p, sizeof(pin))))
592 return -EFAULT;
593
594 /* By convention, you can pass zero for len to mean "everything onward" */
595 if (!pin.len)
596 pin.len = PAGE_ALIGN(asma->size) - pin.offset;
597
598 if (unlikely((pin.offset | pin.len) & ~PAGE_MASK))
599 return -EINVAL;
600
601 if (unlikely(((__u32) -1) - pin.offset < pin.len))
602 return -EINVAL;
603
604 if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len))
605 return -EINVAL;
606
607 pgstart = pin.offset / PAGE_SIZE;
608 pgend = pgstart + (pin.len / PAGE_SIZE) - 1;
609
610 mutex_lock(&ashmem_mutex);
611
612 switch (cmd) {
613 case ASHMEM_PIN:
614 ret = ashmem_pin(asma, pgstart, pgend);
615 break;
616 case ASHMEM_UNPIN:
617 ret = ashmem_unpin(asma, pgstart, pgend);
618 break;
619 case ASHMEM_GET_PIN_STATUS:
620 ret = ashmem_get_pin_status(asma, pgstart, pgend);
621 break;
622 }
623
624 mutex_unlock(&ashmem_mutex);
625
626 return ret;
627}
628
629static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
630{
631 struct ashmem_area *asma = file->private_data;
632 long ret = -ENOTTY;
633
634 switch (cmd) {
635 case ASHMEM_SET_NAME:
636 ret = set_name(asma, (void __user *) arg);
637 break;
638 case ASHMEM_GET_NAME:
639 ret = get_name(asma, (void __user *) arg);
640 break;
641 case ASHMEM_SET_SIZE:
642 ret = -EINVAL;
643 if (!asma->file) {
644 ret = 0;
645 asma->size = (size_t) arg;
646 }
647 break;
648 case ASHMEM_GET_SIZE:
649 ret = asma->size;
650 break;
651 case ASHMEM_SET_PROT_MASK:
652 ret = set_prot_mask(asma, arg);
653 break;
654 case ASHMEM_GET_PROT_MASK:
655 ret = asma->prot_mask;
656 break;
657 case ASHMEM_PIN:
658 case ASHMEM_UNPIN:
659 case ASHMEM_GET_PIN_STATUS:
660 ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg);
661 break;
662 case ASHMEM_PURGE_ALL_CACHES:
663 ret = -EPERM;
664 if (capable(CAP_SYS_ADMIN)) {
665 struct shrink_control sc = {
666 .gfp_mask = GFP_KERNEL,
667 .nr_to_scan = 0,
668 };
669 ret = ashmem_shrink(&ashmem_shrinker, &sc);
670 sc.nr_to_scan = ret;
671 ashmem_shrink(&ashmem_shrinker, &sc);
672 }
673 break;
674 }
675
676 return ret;
677}
678
679static struct file_operations ashmem_fops = {
680 .owner = THIS_MODULE,
681 .open = ashmem_open,
682 .release = ashmem_release,
683 .read = ashmem_read,
684 .llseek = ashmem_llseek,
685 .mmap = ashmem_mmap,
686 .unlocked_ioctl = ashmem_ioctl,
687 .compat_ioctl = ashmem_ioctl,
688};
689
690static struct miscdevice ashmem_misc = {
691 .minor = MISC_DYNAMIC_MINOR,
692 .name = "ashmem",
693 .fops = &ashmem_fops,
694};
695
696static int __init ashmem_init(void)
697{
698 int ret;
699
700 ashmem_area_cachep = kmem_cache_create("ashmem_area_cache",
701 sizeof(struct ashmem_area),
702 0, 0, NULL);
703 if (unlikely(!ashmem_area_cachep)) {
704 printk(KERN_ERR "ashmem: failed to create slab cache\n");
705 return -ENOMEM;
706 }
707
708 ashmem_range_cachep = kmem_cache_create("ashmem_range_cache",
709 sizeof(struct ashmem_range),
710 0, 0, NULL);
711 if (unlikely(!ashmem_range_cachep)) {
712 printk(KERN_ERR "ashmem: failed to create slab cache\n");
713 return -ENOMEM;
714 }
715
716 ret = misc_register(&ashmem_misc);
717 if (unlikely(ret)) {
718 printk(KERN_ERR "ashmem: failed to register misc device!\n");
719 return ret;
720 }
721
722 register_shrinker(&ashmem_shrinker);
723
724 printk(KERN_INFO "ashmem: initialized\n");
725
726 return 0;
727}
728
729static void __exit ashmem_exit(void)
730{
731 int ret;
732
733 unregister_shrinker(&ashmem_shrinker);
734
735 ret = misc_deregister(&ashmem_misc);
736 if (unlikely(ret))
737 printk(KERN_ERR "ashmem: failed to unregister misc device!\n");
738
739 kmem_cache_destroy(ashmem_range_cachep);
740 kmem_cache_destroy(ashmem_area_cachep);
741
742 printk(KERN_INFO "ashmem: unloaded\n");
743}
744
745module_init(ashmem_init);
746module_exit(ashmem_exit);
747
748MODULE_LICENSE("GPL");
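To make the interface added above concrete, here is a minimal user-space sketch (not part of the patch; it assumes <linux/ashmem.h> is exported to user space, a 4 KiB page size, and that the misc device shows up as /dev/ashmem):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/ashmem.h>

	int main(void)
	{
		char name[ASHMEM_NAME_LEN] = "example-region";
		struct ashmem_pin pin = { .offset = 4096, .len = 4096 };
		size_t size = 4 * 4096;	/* four pages */
		void *p;
		int fd, purged;

		fd = open("/dev/ashmem", O_RDWR);
		if (fd < 0)
			return 1;

		/* Name and size must be set before the first mmap(). */
		ioctl(fd, ASHMEM_SET_NAME, name);
		ioctl(fd, ASHMEM_SET_SIZE, size);

		p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		/* Let the kernel reclaim the second page under memory pressure. */
		ioctl(fd, ASHMEM_UNPIN, &pin);

		/* Pin it again; the return value says whether it was purged. */
		purged = ioctl(fd, ASHMEM_PIN, &pin);
		printf("second page purged: %s\n",
		       purged == ASHMEM_WAS_PURGED ? "yes" : "no");

		munmap(p, size);
		close(fd);
		return 0;
	}

The pin/unpin offsets and lengths must be page aligned, matching the PAGE_MASK check in ashmem_pin_unpin() above.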
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09..cb9f1c2d01a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
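Taking the two list_locks in address order means that two concurrent callers passing the same pair of writebacks in opposite order still acquire the locks in the same order, which avoids an ABBA deadlock; the spin_lock_nested(..., 1) annotation tells lockdep that the second lock of the same class is taken deliberately.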
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
@@ -67,34 +78,44 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiDirtied: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWritten: %10lu kB\n"
91 "b_more_io: %8lu\n" 102 "BdiWriteBandwidth: %10lu kBps\n"
92 "bdi_list: %8u\n" 103 "b_dirty: %10lu\n"
93 "state: %8lx\n", 104 "b_io: %10lu\n"
105 "b_more_io: %10lu\n"
106 "bdi_list: %10u\n"
107 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 110 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 111 K(dirty_thresh),
112 K(background_thresh),
113 (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
114 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
115 (unsigned long) K(bdi->write_bandwidth),
116 nr_dirty,
117 nr_io,
118 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 119 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 120#undef K
100 121
@@ -249,18 +270,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 270 return wb_has_dirty_io(&bdi->wb);
250} 271}
251 272
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 273/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 275 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -352,6 +361,17 @@ static unsigned long bdi_longest_inactive(void)
352 return max(5UL * 60 * HZ, interval); 361 return max(5UL * 60 * HZ, interval);
353} 362}
354 363
364/*
365 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
366 * shutdown
367 */
368static void bdi_clear_pending(struct backing_dev_info *bdi)
369{
370 clear_bit(BDI_pending, &bdi->state);
371 smp_mb__after_clear_bit();
372 wake_up_bit(&bdi->state, BDI_pending);
373}
374
355static int bdi_forker_thread(void *ptr) 375static int bdi_forker_thread(void *ptr)
356{ 376{
357 struct bdi_writeback *me = ptr; 377 struct bdi_writeback *me = ptr;
@@ -383,6 +403,13 @@ static int bdi_forker_thread(void *ptr)
383 } 403 }
384 404
385 spin_lock_bh(&bdi_lock); 405 spin_lock_bh(&bdi_lock);
406 /*
407 * In the following loop we are going to check whether we have
408 * some work to do without any synchronization with tasks
409 * waking us up to do work for them. So we have to set task
410 * state already here so that we don't miss wakeups coming
411 * after we verify some condition.
412 */
386 set_current_state(TASK_INTERRUPTIBLE); 413 set_current_state(TASK_INTERRUPTIBLE);
387 414
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 415 list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -446,9 +473,11 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 473 if (IS_ERR(task)) {
447 /* 474 /*
448 * If thread creation fails, force writeout of 475 * If thread creation fails, force writeout of
449 * the bdi from the thread. 476 * the bdi from the thread. Hopefully 1024 is
477 * large enough for efficient IO.
450 */ 478 */
451 bdi_flush_io(bdi); 479 writeback_inodes_wb(&bdi->wb, 1024,
480 WB_REASON_FORKER_THREAD);
452 } else { 481 } else {
453 /* 482 /*
454 * The spinlock makes sure we do not lose 483 * The spinlock makes sure we do not lose
@@ -461,11 +490,13 @@ static int bdi_forker_thread(void *ptr)
461 spin_unlock_bh(&bdi->wb_lock); 490 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 491 wake_up_process(task);
463 } 492 }
493 bdi_clear_pending(bdi);
464 break; 494 break;
465 495
466 case KILL_THREAD: 496 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 497 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 498 kthread_stop(task);
499 bdi_clear_pending(bdi);
469 break; 500 break;
470 501
471 case NO_ACTION: 502 case NO_ACTION:
@@ -481,16 +512,8 @@ static int bdi_forker_thread(void *ptr)
481 else 512 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 513 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 514 try_to_freeze();
484 /* Back to the main loop */ 515 break;
485 continue;
486 } 516 }
487
488 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */
491 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending);
494 } 517 }
495 518
496 return 0; 519 return 0;
@@ -505,7 +528,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 528 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 529 spin_unlock_bh(&bdi_lock);
507 530
508 synchronize_rcu(); 531 synchronize_rcu_expedited();
509} 532}
510 533
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 534int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +629,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 629void bdi_unregister(struct backing_dev_info *bdi)
607{ 630{
608 if (bdi->dev) { 631 if (bdi->dev) {
632 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 633 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 634 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 635 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +652,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 652 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 653 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 654 INIT_LIST_HEAD(&wb->b_more_io);
655 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 656 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 657}
633 658
659/*
660 * Initial write bandwidth: 100 MB/s
661 */
662#define INIT_BW (100 << (20 - PAGE_SHIFT))
663
634int bdi_init(struct backing_dev_info *bdi) 664int bdi_init(struct backing_dev_info *bdi)
635{ 665{
636 int i, err; 666 int i, err;
@@ -653,6 +683,15 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 683 }
654 684
655 bdi->dirty_exceeded = 0; 685 bdi->dirty_exceeded = 0;
686
687 bdi->bw_time_stamp = jiffies;
688 bdi->written_stamp = 0;
689
690 bdi->balanced_dirty_ratelimit = INIT_BW;
691 bdi->dirty_ratelimit = INIT_BW;
692 bdi->write_bandwidth = INIT_BW;
693 bdi->avg_write_bandwidth = INIT_BW;
694
656 err = prop_local_init_percpu(&bdi->completions); 695 err = prop_local_init_percpu(&bdi->completions);
657 696
658 if (err) { 697 if (err) {
@@ -676,15 +715,24 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 715 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 716 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 717
679 spin_lock(&inode_wb_list_lock); 718 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 719 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 720 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 721 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 722 spin_unlock(&bdi->wb.list_lock);
723 spin_unlock(&dst->list_lock);
684 } 724 }
685 725
686 bdi_unregister(bdi); 726 bdi_unregister(bdi);
687 727
728 /*
729 * If bdi_unregister() had already been called earlier, the
730 * wakeup_timer could still be armed because bdi_prune_sb()
731 * can race with the bdi_wakeup_thread_delayed() calls from
732 * __mark_inode_dirty().
733 */
734 del_timer_sync(&bdi->wb.wakeup_timer);
735
688 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 736 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
689 percpu_counter_destroy(&bdi->bdi_stat[i]); 737 percpu_counter_destroy(&bdi->bdi_stat[i]);
690 738
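A note on the INIT_BW constant introduced above: bandwidth is tracked in pages per second, so 100 MB/s = (100 << 20) bytes/s divided by the page size (1 << PAGE_SHIFT) gives 100 << (20 - PAGE_SHIFT) pages/s, i.e. 25600 pages per second with 4 KiB pages.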
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0..9686c4e3f80 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -768,14 +768,13 @@ void * __init alloc_bootmem_section(unsigned long size,
768 unsigned long section_nr) 768 unsigned long section_nr)
769{ 769{
770 bootmem_data_t *bdata; 770 bootmem_data_t *bdata;
771 unsigned long pfn, goal, limit; 771 unsigned long pfn, goal;
772 772
773 pfn = section_nr_to_pfn(section_nr); 773 pfn = section_nr_to_pfn(section_nr);
774 goal = pfn << PAGE_SHIFT; 774 goal = pfn << PAGE_SHIFT;
775 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
776 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 775 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
777 776
778 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 777 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
779} 778}
780#endif 779#endif
781 780
diff --git a/mm/cma-best-fit.c b/mm/cma-best-fit.c
new file mode 100644
index 00000000000..24c27c89cae
--- /dev/null
+++ b/mm/cma-best-fit.c
@@ -0,0 +1,408 @@
1/*
2 * Contiguous Memory Allocator framework: Best Fit allocator
3 * Copyright (c) 2010 by Samsung Electronics.
4 * Written by Michal Nazarewicz (m.nazarewicz@samsung.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License or (at your option) any later version of the license.
10 */
11
12#define pr_fmt(fmt) "cma: bf: " fmt
13
14#ifdef CONFIG_CMA_DEBUG
15# define DEBUG
16#endif
17
18#include <linux/errno.h> /* Error numbers */
19#include <linux/slab.h> /* kmalloc() */
20
21#include <linux/cma.h> /* CMA structures */
22
23
24/************************* Data Types *************************/
25
26struct cma_bf_item {
27 struct cma_chunk ch;
28 struct rb_node by_size;
29};
30
31struct cma_bf_private {
32 struct rb_root by_start_root;
33 struct rb_root by_size_root;
34};
35
36
37/************************* Prototypes *************************/
38
39/*
40 * These are only for holes. They must be called whenever a hole's
41 * properties change, but also whenever a chunk becomes a hole or a
42 * hole becomes a chunk.
43 */
44static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item);
45static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item);
46static int __must_check
47__cma_bf_hole_insert_by_start(struct cma_bf_item *item);
48static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item);
49
50/**
51 * __cma_bf_hole_take - takes a chunk of memory out of a hole.
52 * @hole: hole to take chunk from
53 * @size: chunk's size
54 * @alignment: chunk's starting address alignment (must be power of two)
55 *
56 * Takes a @size bytes large chunk from hole @hole which must be able
57 * to hold the chunk. The "must be able" includes also alignment
58 * constraint.
59 *
60 * Returns allocated item or NULL on error (if kmalloc() failed).
61 */
62static struct cma_bf_item *__must_check
63__cma_bf_hole_take(struct cma_bf_item *hole, size_t size, dma_addr_t alignment);
64
65/**
66 * __cma_bf_hole_merge_maybe - tries to merge hole with neighbours.
67 * @item: hole to try and merge
68 *
69 * Which items are preserved is undefined so you may not rely on it.
70 */
71static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item);
72
73
74/************************* Device API *************************/
75
76int cma_bf_init(struct cma_region *reg)
77{
78 struct cma_bf_private *prv;
79 struct cma_bf_item *item;
80
81 prv = kzalloc(sizeof *prv, GFP_KERNEL);
82 if (unlikely(!prv))
83 return -ENOMEM;
84
85 item = kzalloc(sizeof *item, GFP_KERNEL);
86 if (unlikely(!item)) {
87 kfree(prv);
88 return -ENOMEM;
89 }
90
91 item->ch.start = reg->start;
92 item->ch.size = reg->size;
93 item->ch.reg = reg;
94
95 rb_root_init(&prv->by_start_root, &item->ch.by_start);
96 rb_root_init(&prv->by_size_root, &item->by_size);
97
98 reg->private_data = prv;
99 return 0;
100}
101
102void cma_bf_cleanup(struct cma_region *reg)
103{
104 struct cma_bf_private *prv = reg->private_data;
105 struct cma_bf_item *item =
106 rb_entry(prv->by_size_root.rb_node,
107 struct cma_bf_item, by_size);
108
109 /* We can assume there is only a single hole in the tree. */
110 WARN_ON(item->by_size.rb_left || item->by_size.rb_right ||
111 item->ch.by_start.rb_left || item->ch.by_start.rb_right);
112
113 kfree(item);
114 kfree(prv);
115}
116
117struct cma_chunk *cma_bf_alloc(struct cma_region *reg,
118 size_t size, dma_addr_t alignment)
119{
120 struct cma_bf_private *prv = reg->private_data;
121 struct rb_node *node = prv->by_size_root.rb_node;
122 struct cma_bf_item *item = NULL;
123
124 /* First find hole that is large enough */
125 while (node) {
126 struct cma_bf_item *i =
127 rb_entry(node, struct cma_bf_item, by_size);
128
129 if (i->ch.size < size) {
130 node = node->rb_right;
131 } else if (i->ch.size >= size) {
132 node = node->rb_left;
133 item = i;
134 }
135 }
136 if (!item)
137 return NULL;
138
139 /* Now look for items which can satisfy alignment requirements */
140 node = &item->by_size;
141 for (;;) {
142 dma_addr_t start = ALIGN(item->ch.start, alignment);
143 dma_addr_t end = item->ch.start + item->ch.size;
144 if (start < end && end - start >= size) {
145 item = __cma_bf_hole_take(item, size, alignment);
146 return likely(item) ? &item->ch : NULL;
147 }
148
149 node = rb_next(node);
150 if (!node)
151 return NULL;
152
153 item = rb_entry(node, struct cma_bf_item, by_size);
154 }
155}
156
157void cma_bf_free(struct cma_chunk *chunk)
158{
159 struct cma_bf_item *item = container_of(chunk, struct cma_bf_item, ch);
160
161 /* Add new hole */
162 if (unlikely(__cma_bf_hole_insert_by_start(item))) {
163 /*
164 * We're screwed... Just free the item and forget
165 * about it. Things are broken beyond repair so no
166 * sense in trying to recover.
167 */
168 kfree(item);
169 } else {
170 __cma_bf_hole_insert_by_size(item);
171
172 /* Merge with prev and next sibling */
173 __cma_bf_hole_merge_maybe(item);
174 }
175}
176
177
178/************************* Basic Tree Manipulation *************************/
179
180static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item)
181{
182 struct cma_bf_private *prv = item->ch.reg->private_data;
183 struct rb_node **link = &prv->by_size_root.rb_node, *parent = NULL;
184 const typeof(item->ch.size) value = item->ch.size;
185
186 while (*link) {
187 struct cma_bf_item *i;
188 parent = *link;
189 i = rb_entry(parent, struct cma_bf_item, by_size);
190 link = value <= i->ch.size
191 ? &parent->rb_left
192 : &parent->rb_right;
193 }
194
195 rb_link_node(&item->by_size, parent, link);
196 rb_insert_color(&item->by_size, &prv->by_size_root);
197}
198
199static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item)
200{
201 struct cma_bf_private *prv = item->ch.reg->private_data;
202 rb_erase(&item->by_size, &prv->by_size_root);
203}
204
205static int __must_check
206__cma_bf_hole_insert_by_start(struct cma_bf_item *item)
207{
208 struct cma_bf_private *prv = item->ch.reg->private_data;
209 struct rb_node **link = &prv->by_start_root.rb_node, *parent = NULL;
210 const typeof(item->ch.start) value = item->ch.start;
211
212 while (*link) {
213 struct cma_bf_item *i;
214 parent = *link;
215 i = rb_entry(parent, struct cma_bf_item, ch.by_start);
216
217 if (WARN_ON(value == i->ch.start))
218 /*
219 * This should *never* happen. And I mean
220 * *never*. We could even BUG on it but
221 * hopefully things are only a bit broken,
222 * ie. system can still run. We produce
223 * a warning and return an error.
224 */
225 return -EBUSY;
226
227 link = value <= i->ch.start
228 ? &parent->rb_left
229 : &parent->rb_right;
230 }
231
232 rb_link_node(&item->ch.by_start, parent, link);
233 rb_insert_color(&item->ch.by_start, &prv->by_start_root);
234 return 0;
235}
236
237static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item)
238{
239 struct cma_bf_private *prv = item->ch.reg->private_data;
240 rb_erase(&item->ch.by_start, &prv->by_start_root);
241}
242
243
244/************************* More Tree Manipulation *************************/
245
246static struct cma_bf_item *__must_check
247__cma_bf_hole_take(struct cma_bf_item *hole, size_t size, size_t alignment)
248{
249 struct cma_bf_item *item;
250
251 /*
252 * There are three cases:
253 * 1. the chunk takes the whole hole,
254 * 2. the chunk is at the beginning or at the end of the hole, or
255 * 3. the chunk is in the middle of the hole.
256 */
257
258
259 /* Case 1, the whole hole */
260 if (size == hole->ch.size) {
261 __cma_bf_hole_erase_by_size(hole);
262 __cma_bf_hole_erase_by_start(hole);
263 return hole;
264 }
265
266
267 /* Allocate */
268 item = kmalloc(sizeof *item, GFP_KERNEL);
269 if (unlikely(!item))
270 return NULL;
271
272 item->ch.start = ALIGN(hole->ch.start, alignment);
273 item->ch.size = size;
274
275 /* Case 3, in the middle */
276 if (item->ch.start != hole->ch.start
277 && item->ch.start + item->ch.size !=
278 hole->ch.start + hole->ch.size) {
279 struct cma_bf_item *tail;
280
281 /*
282 * Space between the end of the chunk and the end of
283 * the region, ie. space left after the end of the
284 * chunk. If this is dividable by alignment we can
285 * move the chunk to the end of the hole.
286 */
287 size_t left =
288 hole->ch.start + hole->ch.size -
289 (item->ch.start + item->ch.size);
290 if (left % alignment == 0) {
291 item->ch.start += left;
292 goto case_2;
293 }
294
295 /*
296 * We are going to add a hole at the end. This way,
297 * we will reduce the problem to case 2 -- the chunk
298 * will be at the end of the hole.
299 */
300 tail = kmalloc(sizeof *tail, GFP_KERNEL);
301 if (unlikely(!tail)) {
302 kfree(item);
303 return NULL;
304 }
305
306 tail->ch.start = item->ch.start + item->ch.size;
307 tail->ch.size =
308 hole->ch.start + hole->ch.size - tail->ch.start;
309 tail->ch.reg = hole->ch.reg;
310
311 if (unlikely(__cma_bf_hole_insert_by_start(tail))) {
312 /*
313 * Things are broken beyond repair... Abort
314 * inserting the hole but still continue with
315 * allocation (seems like the best we can do).
316 */
317
318 hole->ch.size = tail->ch.start - hole->ch.start;
319 kfree(tail);
320 } else {
321 __cma_bf_hole_insert_by_size(tail);
322 /*
323 * It's important that we first insert the new
324 * hole in the tree sorted by size and later
325 * reduce the size of the old hole. We will
326 * update the position of the old hole in the
327 * rb tree in code that handles case 2.
328 */
329 hole->ch.size = tail->ch.start - hole->ch.start;
330 }
331
332 /* Go to case 2 */
333 }
334
335
336 /* Case 2, at the beginning or at the end */
337case_2:
338 /* No need to update the tree; order preserved. */
339 if (item->ch.start == hole->ch.start)
340 hole->ch.start += item->ch.size;
341
342 /* Alter hole's size */
343 hole->ch.size -= size;
344 __cma_bf_hole_erase_by_size(hole);
345 __cma_bf_hole_insert_by_size(hole);
346
347 return item;
348}
349
350
351static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item)
352{
353 struct cma_bf_item *prev;
354 struct rb_node *node;
355 int twice = 2;
356
357 node = rb_prev(&item->ch.by_start);
358 if (unlikely(!node))
359 goto next;
360 prev = rb_entry(node, struct cma_bf_item, ch.by_start);
361
362 for (;;) {
363 if (prev->ch.start + prev->ch.size == item->ch.start) {
364 /* Remove previous hole from trees */
365 __cma_bf_hole_erase_by_size(prev);
366 __cma_bf_hole_erase_by_start(prev);
367
368 /* Alter this hole */
369 item->ch.size += prev->ch.size;
370 item->ch.start = prev->ch.start;
371 __cma_bf_hole_erase_by_size(item);
372 __cma_bf_hole_insert_by_size(item);
373 /*
374 * No need to update by start trees as we do
375 * not break sequence order
376 */
377
378 /* Free prev hole */
379 kfree(prev);
380 }
381
382next:
383 if (!--twice)
384 break;
385
386 node = rb_next(&item->ch.by_start);
387 if (unlikely(!node))
388 break;
389 prev = item;
390 item = rb_entry(node, struct cma_bf_item, ch.by_start);
391 }
392}
393
394
395
396/************************* Register *************************/
397static int cma_bf_module_init(void)
398{
399 static struct cma_allocator alloc = {
400 .name = "bf",
401 .init = cma_bf_init,
402 .cleanup = cma_bf_cleanup,
403 .alloc = cma_bf_alloc,
404 .free = cma_bf_free,
405 };
406 return cma_allocator_register(&alloc);
407}
408module_init(cma_bf_module_init);
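A worked example of the allocator above (illustration, not part of the patch): with free holes of 1 MiB, 4 MiB and 16 MiB in the size-sorted rb-tree, a 2 MiB request descends the tree in cma_bf_alloc() and settles on the 4 MiB hole, the smallest one that is large enough; __cma_bf_hole_take() then carves the chunk out of it, leaving a 2 MiB hole behind (case 2), or splits it into two holes when alignment forces the chunk into the middle (case 3).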
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 00000000000..546dd861bdb
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,1413 @@
1/*
2 * Contiguous Memory Allocator framework
3 * Copyright (c) 2010 by Samsung Electronics.
4 * Written by Michal Nazarewicz (m.nazarewicz@samsung.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License or (at your option) any later version of the license.
10 */
11
12/*
13 * See Documentation/contiguous-memory.txt for details.
14 */
15
16#define pr_fmt(fmt) "cma: " fmt
17
18#ifdef CONFIG_CMA_DEBUG
19# define DEBUG
20#endif
21
22#ifndef CONFIG_NO_BOOTMEM
23# include <linux/bootmem.h> /* alloc_bootmem_pages_nopanic() */
24#endif
25#ifdef CONFIG_HAVE_MEMBLOCK
26# include <linux/memblock.h> /* memblock*() */
27#endif
28#include <linux/device.h> /* struct device, dev_name() */
29#include <linux/errno.h> /* Error numbers */
30#include <linux/err.h> /* IS_ERR, PTR_ERR, etc. */
31#include <linux/mm.h> /* PAGE_ALIGN() */
32#include <linux/module.h> /* EXPORT_SYMBOL_GPL() */
33#include <linux/mutex.h> /* mutex */
34#include <linux/slab.h> /* kmalloc() */
35#include <linux/string.h> /* str*() */
36
37#include <linux/cma.h>
38#include <linux/vmalloc.h>
39
40/*
41 * Protects cma_regions, cma_allocators, cma_map, cma_map_length,
42 * cma_kobj, cma_sysfs_regions and cma_chunks_by_start.
43 */
44static DEFINE_MUTEX(cma_mutex);
45
46
47
48/************************* Map attribute *************************/
49
50static const char *cma_map;
51static size_t cma_map_length;
52
53/*
54 * map-attr ::= [ rules [ ';' ] ]
55 * rules ::= rule [ ';' rules ]
56 * rule ::= patterns '=' regions
57 * patterns ::= pattern [ ',' patterns ]
58 * regions ::= REG-NAME [ ',' regions ]
59 * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME
60 *
61 * See Documentation/contiguous-memory.txt for details.
62 */
63static ssize_t cma_map_validate(const char *param)
64{
65 const char *ch = param;
66
67 if (*ch == '\0' || *ch == '\n')
68 return 0;
69
70 for (;;) {
71 const char *start = ch;
72
73 while (*ch && *ch != '\n' && *ch != ';' && *ch != '=')
74 ++ch;
75
76 if (*ch != '=' || start == ch) {
77 pr_err("map: expecting \"<patterns>=<regions>\" near %s\n",
78 start);
79 return -EINVAL;
80 }
81
82 while (*++ch != ';')
83 if (*ch == '\0' || *ch == '\n')
84 return ch - param;
85 if (ch[1] == '\0' || ch[1] == '\n')
86 return ch - param;
87 ++ch;
88 }
89}
90
91static int __init cma_map_param(char *param)
92{
93 ssize_t len;
94
95 pr_debug("param: map: %s\n", param);
96
97 len = cma_map_validate(param);
98 if (len < 0)
99 return len;
100
101 cma_map = param;
102 cma_map_length = len;
103 return 0;
104}
105
106#if defined CONFIG_CMA_CMDLINE
107
108early_param("cma.map", cma_map_param);
109
110#endif
111
112
113
114/************************* Early regions *************************/
115
116struct list_head cma_early_regions __initdata =
117 LIST_HEAD_INIT(cma_early_regions);
118
119#ifdef CONFIG_CMA_CMDLINE
120
121/*
122 * regions-attr ::= [ regions [ ';' ] ]
123 * regions ::= region [ ';' regions ]
124 *
125 * region ::= [ '-' ] reg-name
126 * '=' size
127 * [ '@' start ]
128 * [ '/' alignment ]
129 * [ ':' alloc-name ]
130 *
131 * See Documentation/contiguous-memory.txt for details.
132 *
133 * Example:
134 * cma=reg1=64M:bf;reg2=32M@0x100000:bf;reg3=64M/1M:bf
135 *
136 * If the allocator is omitted, the first available allocator will be used.
137 */
138
139#define NUMPARSE(cond_ch, type, cond) ({ \
140 unsigned long long v = 0; \
141 if (*param == (cond_ch)) { \
142 const char *const msg = param + 1; \
143 v = memparse(msg, &param); \
144 if (!v || v > ~(type)0 || !(cond)) { \
145 pr_err("param: invalid value near %s\n", msg); \
146 ret = -EINVAL; \
147 break; \
148 } \
149 } \
150 v; \
151 })
152
153static int __init cma_param_parse(char *param)
154{
155 static struct cma_region regions[16];
156
157 size_t left = ARRAY_SIZE(regions);
158 struct cma_region *reg = regions;
159 int ret = 0;
160
161 pr_debug("param: %s\n", param);
162
163 for (; *param; ++reg) {
164 dma_addr_t start, alignment;
165 size_t size;
166
167 if (unlikely(!--left)) {
168 pr_err("param: too many early regions\n");
169 return -ENOSPC;
170 }
171
172 /* Parse name */
173 reg->name = param;
174 param = strchr(param, '=');
175 if (!param || param == reg->name) {
176 pr_err("param: expected \"<name>=\" near %s\n",
177 reg->name);
178 ret = -EINVAL;
179 break;
180 }
181 *param = '\0';
182
183 /* Parse numbers */
184 size = NUMPARSE('\0', size_t, true);
185 start = NUMPARSE('@', dma_addr_t, true);
186 alignment = NUMPARSE('/', dma_addr_t, (v & (v - 1)) == 0);
187
188 alignment = max(alignment, (dma_addr_t)PAGE_SIZE);
189 start = ALIGN(start, alignment);
190 size = PAGE_ALIGN(size);
191 if (start + size < start) {
192 pr_err("param: invalid start, size combination\n");
193 ret = -EINVAL;
194 break;
195 }
196
197 /* Parse allocator */
198 if (*param == ':') {
199 reg->alloc_name = ++param;
200 while (*param && *param != ';')
201 ++param;
202 if (param == reg->alloc_name)
203 reg->alloc_name = NULL;
204 }
205
206 /* Go to next */
207 if (*param == ';') {
208 *param = '\0';
209 ++param;
210 } else if (*param) {
211 pr_err("param: expecting ';' or end of parameter near %s\n",
212 param);
213 ret = -EINVAL;
214 break;
215 }
216
217 /* Add */
218 reg->size = size;
219 reg->start = start;
220 reg->alignment = alignment;
221 reg->copy_name = 1;
222
223 list_add_tail(&reg->list, &cma_early_regions);
224
225 pr_debug("param: registering early region %s (%p@%p/%p)\n",
226 reg->name, (void *)reg->size, (void *)reg->start,
227 (void *)reg->alignment);
228 }
229
230 return ret;
231}
232early_param("cma", cma_param_parse);
233
234#undef NUMPARSE
235
236#endif
237
238
239int __init __must_check cma_early_region_register(struct cma_region *reg)
240{
241 dma_addr_t start, alignment;
242 size_t size;
243
244 if (reg->alignment & (reg->alignment - 1))
245 return -EINVAL;
246
247 alignment = max(reg->alignment, (dma_addr_t)PAGE_SIZE);
248 start = ALIGN(reg->start, alignment);
249 size = PAGE_ALIGN(reg->size);
250
251 if (start + size < start)
252 return -EINVAL;
253
254 reg->size = size;
255 reg->start = start;
256 reg->alignment = alignment;
257
258 list_add_tail(&reg->list, &cma_early_regions);
259
260 pr_debug("param: registering early region %s (%p@%p/%p)\n",
261 reg->name, (void *)reg->size, (void *)reg->start,
262 (void *)reg->alignment);
263
264 return 0;
265}
266
267
268
269/************************* Regions & Allocators *************************/
270
271static void __cma_sysfs_region_add(struct cma_region *reg);
272
273static int __cma_region_attach_alloc(struct cma_region *reg);
274static void __maybe_unused __cma_region_detach_alloc(struct cma_region *reg);
275
276
277/* List of all regions. Named regions are kept before unnamed. */
278static LIST_HEAD(cma_regions);
279
280#define cma_foreach_region(reg) \
281 list_for_each_entry(reg, &cma_regions, list)
282
283int __must_check cma_region_register(struct cma_region *reg)
284{
285 const char *name, *alloc_name;
286 struct cma_region *r;
287 char *ch = NULL;
288 int ret = 0;
289
290 if (!reg->size || reg->start + reg->size < reg->start)
291 return -EINVAL;
292
293 reg->users = 0;
294 reg->used = 0;
295 reg->private_data = NULL;
296 reg->registered = 0;
297 reg->free_space = reg->size;
298
299 /* Copy name and alloc_name */
300 name = reg->name;
301 alloc_name = reg->alloc_name;
302 if (reg->copy_name && (reg->name || reg->alloc_name)) {
303 size_t name_size, alloc_size;
304
305 name_size = reg->name ? strlen(reg->name) + 1 : 0;
306 alloc_size = reg->alloc_name ? strlen(reg->alloc_name) + 1 : 0;
307
308 ch = kmalloc(name_size + alloc_size, GFP_KERNEL);
309 if (!ch) {
310 pr_err("%s: not enough memory to allocate name\n",
311 reg->name ?: "(private)");
312 return -ENOMEM;
313 }
314
315 if (name_size) {
316 memcpy(ch, reg->name, name_size);
317 name = ch;
318 ch += name_size;
319 }
320
321 if (alloc_size) {
322 memcpy(ch, reg->alloc_name, alloc_size);
323 alloc_name = ch;
324 }
325 }
326
327 mutex_lock(&cma_mutex);
328
329 /* Don't let regions overlap */
330 cma_foreach_region(r)
331 if (r->start + r->size > reg->start &&
332 r->start < reg->start + reg->size) {
333 ret = -EADDRINUSE;
334 goto done;
335 }
336
337 if (reg->alloc) {
338 ret = __cma_region_attach_alloc(reg);
339 if (unlikely(ret < 0))
340 goto done;
341 }
342
343 reg->name = name;
344 reg->alloc_name = alloc_name;
345 reg->registered = 1;
346 ch = NULL;
347
348 /*
349 * Keep named at the beginning and unnamed (private) at the
350 * end. This helps in traversal when a named region is looked
351 * for.
352 */
353 if (name)
354 list_add(&reg->list, &cma_regions);
355 else
356 list_add_tail(&reg->list, &cma_regions);
357
358 __cma_sysfs_region_add(reg);
359
360done:
361 mutex_unlock(&cma_mutex);
362
363 pr_debug("%s: region %sregistered\n",
364 reg->name ?: "(private)", ret ? "not " : "");
365 kfree(ch);
366
367 return ret;
368}
369EXPORT_SYMBOL_GPL(cma_region_register);
370
371static struct cma_region *__must_check
372__cma_region_find(const char **namep)
373{
374 struct cma_region *reg;
375 const char *ch, *name;
376 size_t n;
377
378 ch = *namep;
379 while (*ch && *ch != ',' && *ch != ';')
380 ++ch;
381 name = *namep;
382 *namep = *ch == ',' ? ch + 1 : ch;
383 n = ch - name;
384
385 /*
386 * Named regions are kept in front of unnamed so if we
387 * encounter an unnamed region we can stop.
388 */
389 cma_foreach_region(reg)
390 if (!reg->name)
391 break;
392 else if (!strncmp(name, reg->name, n) && !reg->name[n])
393 return reg;
394
395 return NULL;
396}
397
398
399/* List of all allocators. */
400static LIST_HEAD(cma_allocators);
401
402#define cma_foreach_allocator(alloc) \
403 list_for_each_entry(alloc, &cma_allocators, list)
404
405int cma_allocator_register(struct cma_allocator *alloc)
406{
407 struct cma_region *reg;
408 int first;
409
410 if (!alloc->alloc || !alloc->free)
411 return -EINVAL;
412
413 mutex_lock(&cma_mutex);
414
415 first = list_empty(&cma_allocators);
416
417 list_add_tail(&alloc->list, &cma_allocators);
418
419 /*
420 * Attach this allocator to all allocator-less regions that
421 * request this particular allocator (reg->alloc_name equals
422 * alloc->name) or if the region wants the first available
423 * allocator and we are the first.
424 */
425 cma_foreach_region(reg) {
426 if (reg->alloc)
427 continue;
428 if (reg->alloc_name
429 ? !alloc->name || strcmp(alloc->name, reg->alloc_name)
430 : (reg->used || !first))
431 continue;
432
433 reg->alloc = alloc;
434 __cma_region_attach_alloc(reg);
435 }
436
437 mutex_unlock(&cma_mutex);
438
439 pr_debug("%s: allocator registered\n", alloc->name ?: "(unnamed)");
440
441 return 0;
442}
443EXPORT_SYMBOL_GPL(cma_allocator_register);
444
445static struct cma_allocator *__must_check
446__cma_allocator_find(const char *name)
447{
448 struct cma_allocator *alloc;
449
450 if (!name)
451 return list_empty(&cma_allocators)
452 ? NULL
453 : list_entry(cma_allocators.next,
454 struct cma_allocator, list);
455
456 cma_foreach_allocator(alloc)
457 if (alloc->name && !strcmp(name, alloc->name))
458 return alloc;
459
460 return NULL;
461}
462
463
464
465/************************* Initialise CMA *************************/
466
467int __init cma_set_defaults(struct cma_region *regions, const char *map)
468{
469 if (map) {
470 int ret = cma_map_param((char *)map);
471 if (unlikely(ret < 0))
472 return ret;
473 }
474
475 if (!regions)
476 return 0;
477
478 for (; regions->size; ++regions) {
479 int ret = cma_early_region_register(regions);
480 if (unlikely(ret < 0))
481 return ret;
482 }
483
484 return 0;
485}
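/*
 * A minimal sketch of how platform code might feed defaults to the
 * framework (illustrative only; the region names, sizes and the map
 * rule are assumptions, not taken from any real board).
 */
static struct cma_region example_regions[] = {
	{ .name = "common", .size = 16 << 20 },
	{ .name = "video",  .size = 32 << 20 },
	{ }	/* a zero size terminates the array, matching the loop above */
};

static void __init example_cma_defaults(void)
{
	/*
	 * The map rule sends allocations for a device named "video-dev"
	 * to the "video" region and lets every other device fall back to
	 * "common"; the grammar is documented at __cma_where_from()
	 * later in this file.
	 */
	if (cma_set_defaults(example_regions, "video-dev=video;*=common") < 0)
		pr_warn("example: cma_set_defaults() failed\n");
}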
486
487
488int __init cma_early_region_reserve(struct cma_region *reg)
489{
490 int tried = 0;
491
492 if (!reg->size || (reg->alignment & (reg->alignment - 1)) ||
493 reg->reserved)
494 return -EINVAL;
495
496#ifndef CONFIG_NO_BOOTMEM
497
498 tried = 1;
499
500 {
501 void *ptr = __alloc_bootmem_nopanic(reg->size, reg->alignment,
502 reg->start);
503 if (ptr) {
504 reg->start = virt_to_phys(ptr);
505 reg->reserved = 1;
506 return 0;
507 }
508 }
509
510#endif
511
512#ifdef CONFIG_HAVE_MEMBLOCK
513
514 tried = 1;
515
516 if (reg->start) {
517 if (!memblock_is_region_reserved(reg->start, reg->size) &&
518 memblock_reserve(reg->start, reg->size) >= 0) {
519 reg->reserved = 1;
520 return 0;
521 }
522 } else {
523 /*
524 * Use __memblock_alloc_base() since
525 * memblock_alloc_base() panic()s.
526 */
527 u64 ret = __memblock_alloc_base(reg->size, reg->alignment, 0);
528 if (ret &&
529 ret < ~(dma_addr_t)0 &&
530 ret + reg->size < ~(dma_addr_t)0 &&
531 ret + reg->size > ret) {
532 reg->start = ret;
533 reg->reserved = 1;
534 return 0;
535 }
536
537 if (ret)
538 memblock_free(ret, reg->size);
539 }
540
541#endif
542
543 return tried ? -ENOMEM : -EOPNOTSUPP;
544}
545
546void __init cma_early_regions_reserve(int (*reserve)(struct cma_region *reg))
547{
548 struct cma_region *reg;
549
550 pr_debug("init: reserving early regions\n");
551
552 if (!reserve)
553 reserve = cma_early_region_reserve;
554
555 list_for_each_entry(reg, &cma_early_regions, list) {
556 if (reg->reserved) {
557 /* nothing */
558 } else if (reserve(reg) >= 0) {
559 pr_debug("init: %s: reserved %p@%p\n",
560 reg->name ?: "(private)",
561 (void *)reg->size, (void *)reg->start);
562 reg->reserved = 1;
563 } else {
564 pr_warn("init: %s: unable to reserve %p@%p/%p\n",
565 reg->name ?: "(private)",
566 (void *)reg->size, (void *)reg->start,
567 (void *)reg->alignment);
568 }
569 }
570}
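/*
 * Illustrative only: an architecture's early reserve hook would
 * typically just make the call below so that every early region is
 * reserved with cma_early_region_reserve() as the default method.
 */
static void __init example_mach_reserve(void)
{
	cma_early_regions_reserve(NULL);
}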
571
572
573static int __init cma_init(void)
574{
575 struct cma_region *reg, *n;
576
577 pr_debug("init: initialising\n");
578
579 if (cma_map) {
580 char *val = kmemdup(cma_map, cma_map_length + 1, GFP_KERNEL);
581 cma_map = val;
582 if (!val)
583 return -ENOMEM;
584 val[cma_map_length] = '\0';
585 }
586
587 list_for_each_entry_safe(reg, n, &cma_early_regions, list) {
588 INIT_LIST_HEAD(&reg->list);
589 /*
590 * We don't care if there was an error. It's a pity
591 * but there's not much we can do about it anyway.
592 * If the error is on a region that was parsed from
593 * command line then it will stay and waste a bit of
594 * space; if it was registered using
595 * cma_early_region_register() it's the caller's
596 * responsibility to do something about it.
597 */
598 if (reg->reserved && cma_region_register(reg) < 0)
599 /* ignore error */;
600 }
601
602 INIT_LIST_HEAD(&cma_early_regions);
603
604 return 0;
605}
606/*
607 * We want to be initialised earlier than module_init/__initcall so
608 * that drivers that want to grab memory at boot time will find CMA
609 * ready. subsys_initcall() seems early enough and not too early at
610 * the same time.
611 */
612subsys_initcall(cma_init);
613
614
615
616/************************* SysFS *************************/
617
618#if defined CONFIG_CMA_SYSFS
619
620static struct kobject cma_sysfs_regions;
621static int cma_sysfs_regions_ready;
622
623
624#define CMA_ATTR_INLINE(_type, _name) \
625 (&((struct cma_ ## _type ## _attribute){ \
626 .attr = { \
627 .name = __stringify(_name), \
628 .mode = 0644, \
629 }, \
630 .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \
631 .store = cma_sysfs_ ## _type ## _ ## _name ## _store, \
632 }).attr)
633
634#define CMA_ATTR_RO_INLINE(_type, _name) \
635 (&((struct cma_ ## _type ## _attribute){ \
636 .attr = { \
637 .name = __stringify(_name), \
638 .mode = 0444, \
639 }, \
640 .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \
641 }).attr)
642
643
644struct cma_root_attribute {
645 struct attribute attr;
646 ssize_t (*show)(char *buf);
647 int (*store)(const char *buf);
648};
649
650static ssize_t cma_sysfs_root_map_show(char *page)
651{
652 ssize_t len;
653
654 len = cma_map_length;
655 if (!len) {
656 *page = 0;
657 len = 0;
658 } else {
659 if (len > (size_t)PAGE_SIZE - 1)
660 len = (size_t)PAGE_SIZE - 1;
661 memcpy(page, cma_map, len);
662 page[len++] = '\n';
663 }
664
665 return len;
666}
667
668static int cma_sysfs_root_map_store(const char *page)
669{
670 ssize_t len = cma_map_validate(page);
671 char *val = NULL;
672
673 if (len < 0)
674 return len;
675
676 if (len) {
677 val = kmemdup(page, len + 1, GFP_KERNEL);
678 if (!val)
679 return -ENOMEM;
680 val[len] = '\0';
681 }
682
683 kfree(cma_map);
684 cma_map = val;
685 cma_map_length = len;
686
687 return 0;
688}
689
690static ssize_t cma_sysfs_root_allocators_show(char *page)
691{
692 struct cma_allocator *alloc;
693 size_t left = PAGE_SIZE;
694 char *ch = page;
695
696 cma_foreach_allocator(alloc) {
697 ssize_t l = snprintf(ch, left, "%s ", alloc->name ?: "-");
698 ch += l;
699 left -= l;
700 }
701
702 if (ch != page)
703 ch[-1] = '\n';
704 return ch - page;
705}
706
707static ssize_t
708cma_sysfs_root_show(struct kobject *kobj, struct attribute *attr, char *buf)
709{
710 struct cma_root_attribute *rattr =
711 container_of(attr, struct cma_root_attribute, attr);
712 ssize_t ret;
713
714 mutex_lock(&cma_mutex);
715 ret = rattr->show(buf);
716 mutex_unlock(&cma_mutex);
717
718 return ret;
719}
720
721static ssize_t
722cma_sysfs_root_store(struct kobject *kobj, struct attribute *attr,
723 const char *buf, size_t count)
724{
725 struct cma_root_attribute *rattr =
726 container_of(attr, struct cma_root_attribute, attr);
727 int ret;
728
729 mutex_lock(&cma_mutex);
730 ret = rattr->store(buf);
731 mutex_unlock(&cma_mutex);
732
733 return ret < 0 ? ret : count;
734}
735
736static struct kobj_type cma_sysfs_root_type = {
737 .sysfs_ops = &(const struct sysfs_ops){
738 .show = cma_sysfs_root_show,
739 .store = cma_sysfs_root_store,
740 },
741 .default_attrs = (struct attribute * []) {
742 CMA_ATTR_INLINE(root, map),
743 CMA_ATTR_RO_INLINE(root, allocators),
744 NULL
745 },
746};
747
748static int __init cma_sysfs_init(void)
749{
750 static struct kobject root;
751 static struct kobj_type fake_type;
752
753 struct cma_region *reg;
754 int ret;
755
756 /* Root */
757 ret = kobject_init_and_add(&root, &cma_sysfs_root_type,
758 mm_kobj, "contiguous");
759 if (unlikely(ret < 0)) {
760 pr_err("init: unable to add root kobject: %d\n", ret);
761 return ret;
762 }
763
764 /* Regions */
765 ret = kobject_init_and_add(&cma_sysfs_regions, &fake_type,
766 &root, "regions");
767 if (unlikely(ret < 0)) {
768 pr_err("init: unable to add regions kobject: %d\n", ret);
769 return ret;
770 }
771
772 mutex_lock(&cma_mutex);
773 cma_sysfs_regions_ready = 1;
774 cma_foreach_region(reg)
775 __cma_sysfs_region_add(reg);
776 mutex_unlock(&cma_mutex);
777
778 return 0;
779}
780device_initcall(cma_sysfs_init);
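/*
 * For reference, the kobjects created by this SysFS section live under
 * /sys/kernel/mm (mm_kobj), so the resulting layout is roughly:
 *
 *	/sys/kernel/mm/contiguous/map          (rw, the cma_map rules)
 *	/sys/kernel/mm/contiguous/allocators   (ro, registered allocators)
 *	/sys/kernel/mm/contiguous/regions/<start-address>/
 *		name, start, size, free, users, alloc
 *	/sys/kernel/mm/contiguous/regions/<region-name>  (symlink, if named)
 */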
781
782
783
784struct cma_region_attribute {
785 struct attribute attr;
786 ssize_t (*show)(struct cma_region *reg, char *buf);
787 int (*store)(struct cma_region *reg, const char *buf);
788};
789
790
791static ssize_t cma_sysfs_region_name_show(struct cma_region *reg, char *page)
792{
793 return reg->name ? snprintf(page, PAGE_SIZE, "%s\n", reg->name) : 0;
794}
795
796static ssize_t cma_sysfs_region_start_show(struct cma_region *reg, char *page)
797{
798 return snprintf(page, PAGE_SIZE, "%p\n", (void *)reg->start);
799}
800
801static ssize_t cma_sysfs_region_size_show(struct cma_region *reg, char *page)
802{
803 return snprintf(page, PAGE_SIZE, "%zu\n", reg->size);
804}
805
806static ssize_t cma_sysfs_region_free_show(struct cma_region *reg, char *page)
807{
808 return snprintf(page, PAGE_SIZE, "%zu\n", reg->free_space);
809}
810
811static ssize_t cma_sysfs_region_users_show(struct cma_region *reg, char *page)
812{
813 return snprintf(page, PAGE_SIZE, "%u\n", reg->users);
814}
815
816static ssize_t cma_sysfs_region_alloc_show(struct cma_region *reg, char *page)
817{
818 if (reg->alloc)
819 return snprintf(page, PAGE_SIZE, "%s\n",
820 reg->alloc->name ?: "-");
821 else if (reg->alloc_name)
822 return snprintf(page, PAGE_SIZE, "[%s]\n", reg->alloc_name);
823 else
824 return 0;
825}
826
827static int
828cma_sysfs_region_alloc_store(struct cma_region *reg, const char *page)
829{
830 char *s;
831
832 if (reg->alloc && reg->users)
833 return -EBUSY;
834
835 if (!*page || *page == '\n') {
836 s = NULL;
837 } else {
838 size_t len;
839
840 for (s = (char *)page; *++s && *s != '\n'; )
841 /* nop */;
842
843 len = s - page;
844 s = kmemdup(page, len + 1, GFP_KERNEL);
845 if (!s)
846 return -ENOMEM;
847 s[len] = '\0';
848 }
849
850 if (reg->alloc)
851 __cma_region_detach_alloc(reg);
852
853 if (reg->free_alloc_name)
854 kfree(reg->alloc_name);
855
856 reg->alloc_name = s;
857 reg->free_alloc_name = !!s;
858
859 return 0;
860}
861
862
863static ssize_t
864cma_sysfs_region_show(struct kobject *kobj, struct attribute *attr,
865 char *buf)
866{
867 struct cma_region *reg = container_of(kobj, struct cma_region, kobj);
868 struct cma_region_attribute *rattr =
869 container_of(attr, struct cma_region_attribute, attr);
870 ssize_t ret;
871
872 mutex_lock(&cma_mutex);
873 ret = rattr->show(reg, buf);
874 mutex_unlock(&cma_mutex);
875
876 return ret;
877}
878
879static ssize_t
880cma_sysfs_region_store(struct kobject *kobj, struct attribute *attr,
881 const char *buf, size_t count)
882{
883 struct cma_region *reg = container_of(kobj, struct cma_region, kobj);
884 struct cma_region_attribute *rattr =
885 container_of(attr, struct cma_region_attribute, attr);
886 int ret;
887
888 mutex_lock(&cma_mutex);
889 ret = rattr->store(reg, buf);
890 mutex_unlock(&cma_mutex);
891
892 return ret < 0 ? ret : count;
893}
894
895static struct kobj_type cma_sysfs_region_type = {
896 .sysfs_ops = &(const struct sysfs_ops){
897 .show = cma_sysfs_region_show,
898 .store = cma_sysfs_region_store,
899 },
900 .default_attrs = (struct attribute * []) {
901 CMA_ATTR_RO_INLINE(region, name),
902 CMA_ATTR_RO_INLINE(region, start),
903 CMA_ATTR_RO_INLINE(region, size),
904 CMA_ATTR_RO_INLINE(region, free),
905 CMA_ATTR_RO_INLINE(region, users),
906 CMA_ATTR_INLINE(region, alloc),
907 NULL
908 },
909};
910
911static void __cma_sysfs_region_add(struct cma_region *reg)
912{
913 int ret;
914
915 if (!cma_sysfs_regions_ready)
916 return;
917
918 memset(&reg->kobj, 0, sizeof reg->kobj);
919
920 ret = kobject_init_and_add(&reg->kobj, &cma_sysfs_region_type,
921 &cma_sysfs_regions,
922 "%p", (void *)reg->start);
923
924 if (reg->name &&
925 sysfs_create_link(&cma_sysfs_regions, &reg->kobj, reg->name) < 0)
926 /* Ignore any errors. */;
927}
928
929#else
930
931static void __cma_sysfs_region_add(struct cma_region *reg)
932{
933 /* nop */
934}
935
936#endif
937
938
939/************************* Chunks *************************/
940
941/* All chunks sorted by start address. */
942static struct rb_root cma_chunks_by_start;
943
944static struct cma_chunk *__must_check __cma_chunk_find(dma_addr_t addr)
945{
946 struct cma_chunk *chunk;
947 struct rb_node *n;
948
949 for (n = cma_chunks_by_start.rb_node; n; ) {
950 chunk = rb_entry(n, struct cma_chunk, by_start);
951 if (addr < chunk->start)
952 n = n->rb_left;
953 else if (addr > chunk->start)
954 n = n->rb_right;
955 else
956 return chunk;
957 }
958 WARN(1, "no chunk starting at %p\n", (void *)addr);
959 return NULL;
960}
961
962static int __must_check __cma_chunk_insert(struct cma_chunk *chunk)
963{
964 struct rb_node **new, *parent = NULL;
965 typeof(chunk->start) addr = chunk->start;
966
967 for (new = &cma_chunks_by_start.rb_node; *new; ) {
968 struct cma_chunk *c =
969 container_of(*new, struct cma_chunk, by_start);
970
971 parent = *new;
972 if (addr < c->start) {
973 new = &(*new)->rb_left;
974 } else if (addr > c->start) {
975 new = &(*new)->rb_right;
976 } else {
977 /*
978 * We should never be here. If we are, it
979 * means the allocator gave us an invalid chunk
980 * (one that has already been allocated) so we
981 * refuse to accept it. Our caller will
982 * recover by freeing the chunk.
983 */
984 WARN_ON(1);
985 return -EADDRINUSE;
986 }
987 }
988
989 rb_link_node(&chunk->by_start, parent, new);
990 rb_insert_color(&chunk->by_start, &cma_chunks_by_start);
991
992 return 0;
993}
994
995static void __cma_chunk_free(struct cma_chunk *chunk)
996{
997 rb_erase(&chunk->by_start, &cma_chunks_by_start);
998
999 chunk->reg->free_space += chunk->size;
1000 --chunk->reg->users;
1001
1002 chunk->reg->alloc->free(chunk);
1003}
1004
1005
1006/************************* The Device API *************************/
1007
1008static const char *__must_check
1009__cma_where_from(const struct device *dev, const char *type);
1010
1011
1012/* Allocate. */
1013
1014static dma_addr_t __must_check
1015__cma_alloc_from_region(struct cma_region *reg,
1016 size_t size, dma_addr_t alignment)
1017{
1018 struct cma_chunk *chunk;
1019
1020 pr_debug("allocate %p/%p from %s\n",
1021 (void *)size, (void *)alignment,
1022 reg ? reg->name ?: "(private)" : "(null)");
1023
1024 if (!reg || reg->free_space < size)
1025 return -ENOMEM;
1026
1027 if (!reg->alloc) {
1028 if (!reg->used)
1029 __cma_region_attach_alloc(reg);
1030 if (!reg->alloc)
1031 return -ENOMEM;
1032 }
1033
1034 chunk = reg->alloc->alloc(reg, size, alignment);
1035 if (!chunk)
1036 return -ENOMEM;
1037 /* Set the owner before inserting so the error path frees it correctly. */
1038 chunk->reg = reg;
1039 if (unlikely(__cma_chunk_insert(chunk) < 0)) {
1040 /* We should *never* be here. */
1041 chunk->reg->alloc->free(chunk);
1042 kfree(chunk);
1043 return -EADDRINUSE;
1044 }
1045
1046 ++reg->users;
1047 reg->free_space -= chunk->size;
1048 pr_debug("allocated at %p\n", (void *)chunk->start);
1049 return chunk->start;
1050}
1051
1052dma_addr_t __must_check
1053cma_alloc_from_region(struct cma_region *reg,
1054 size_t size, dma_addr_t alignment)
1055{
1056 dma_addr_t addr;
1057
1058 pr_debug("allocate %p/%p from %s\n",
1059 (void *)size, (void *)alignment,
1060 reg ? reg->name ?: "(private)" : "(null)");
1061
1062 if (!size || alignment & (alignment - 1) || !reg)
1063 return -EINVAL;
1064
1065 mutex_lock(&cma_mutex);
1066
1067 addr = reg->registered ?
1068 __cma_alloc_from_region(reg, PAGE_ALIGN(size),
1069 max(alignment, (dma_addr_t)PAGE_SIZE)) :
1070 -EINVAL;
1071
1072 mutex_unlock(&cma_mutex);
1073
1074 return addr;
1075}
1076EXPORT_SYMBOL_GPL(cma_alloc_from_region);
1077
1078dma_addr_t __must_check
1079__cma_alloc(const struct device *dev, const char *type,
1080 dma_addr_t size, dma_addr_t alignment)
1081{
1082 struct cma_region *reg;
1083 const char *from;
1084 dma_addr_t addr;
1085
1086 if (dev)
1087 pr_debug("allocate %p/%p for %s/%s\n",
1088 (void *)size, (void *)alignment,
1089 dev_name(dev), type ?: "");
1090
1091 if (!size || (alignment & (alignment - 1)))
1092 return -EINVAL;
1093
1094 if (alignment < PAGE_SIZE)
1095 alignment = PAGE_SIZE;
1096
1097 if (!IS_ALIGNED(size, alignment))
1098 size = ALIGN(size, alignment);
1099
1100 mutex_lock(&cma_mutex);
1101
1102 from = __cma_where_from(dev, type);
1103 if (unlikely(IS_ERR(from))) {
1104 addr = PTR_ERR(from);
1105 goto done;
1106 }
1107
1108 pr_debug("allocate %p/%p from one of %s\n",
1109 (void *)size, (void *)alignment, from);
1110
1111 while (*from && *from != ';') {
1112 reg = __cma_region_find(&from);
1113 addr = __cma_alloc_from_region(reg, size, alignment);
1114 if (!IS_ERR_VALUE(addr))
1115 goto done;
1116 }
1117
1118 pr_debug("not enough memory\n");
1119 addr = -ENOMEM;
1120
1121done:
1122 mutex_unlock(&cma_mutex);
1123
1124 return addr;
1125}
1126EXPORT_SYMBOL_GPL(__cma_alloc);
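/*
 * A hedged driver-side sketch (not part of this file): allocate a
 * physically contiguous buffer for a device and release it again.
 * The 1 MiB size and PAGE_SIZE alignment are placeholders.
 */
static int example_use_cma(struct device *dev)
{
	dma_addr_t addr;

	/* A NULL type is treated as the "common" type by __cma_where_from(). */
	addr = __cma_alloc(dev, NULL, 1 << 20, PAGE_SIZE);
	if (IS_ERR_VALUE(addr))
		return (int)addr;

	/* ... hand the block at addr to the hardware ... */

	return cma_free(addr);
}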
1127
1128
1129void *cma_get_virt(dma_addr_t phys, dma_addr_t size, int noncached)
1130{
1131 unsigned long num_pages, i;
1132 struct page **pages;
1133 void *virt;
1134
1135 if (noncached) {
1136 num_pages = size >> PAGE_SHIFT;
1137 pages = kmalloc(num_pages * sizeof(struct page *), GFP_KERNEL);
1138
1139 if (!pages)
1140 return ERR_PTR(-ENOMEM);
1141
1142 for (i = 0; i < num_pages; i++)
1143 pages[i] = pfn_to_page((phys >> PAGE_SHIFT) + i);
1144
1145 virt = vmap(pages, num_pages, VM_MAP,
1146 pgprot_writecombine(PAGE_KERNEL));
1147
1148 if (!virt) {
1149 kfree(pages);
1150 return ERR_PTR(-ENOMEM);
1151 }
1152
1153 kfree(pages);
1154 } else {
1155 virt = phys_to_virt((unsigned long)phys);
1156 }
1157
1158 return virt;
1159}
1160EXPORT_SYMBOL_GPL(cma_get_virt);
1161
1162/* Query information about regions. */
1163static void __cma_info_add(struct cma_info *infop, struct cma_region *reg)
1164{
1165 infop->total_size += reg->size;
1166 infop->free_size += reg->free_space;
1167 if (infop->lower_bound > reg->start)
1168 infop->lower_bound = reg->start;
1169 if (infop->upper_bound < reg->start + reg->size)
1170 infop->upper_bound = reg->start + reg->size;
1171 ++infop->count;
1172}
1173
1174int
1175__cma_info(struct cma_info *infop, const struct device *dev, const char *type)
1176{
1177 struct cma_info info = { ~(dma_addr_t)0, 0, 0, 0, 0 };
1178 struct cma_region *reg;
1179 const char *from;
1180 int ret;
1181
1182 if (unlikely(!infop))
1183 return -EINVAL;
1184
1185 mutex_lock(&cma_mutex);
1186
1187 from = __cma_where_from(dev, type);
1188 if (IS_ERR(from)) {
1189 ret = PTR_ERR(from);
1190 info.lower_bound = 0;
1191 goto done;
1192 }
1193
1194 while (*from && *from != ';') {
1195 reg = __cma_region_find(&from);
1196 if (reg)
1197 __cma_info_add(&info, reg);
1198 }
1199
1200 ret = 0;
1201done:
1202 mutex_unlock(&cma_mutex);
1203
1204 memcpy(infop, &info, sizeof info);
1205 return ret;
1206}
1207EXPORT_SYMBOL_GPL(__cma_info);
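/*
 * Illustrative sketch of __cma_info(): report how much contiguous
 * memory the regions mapped to a device still have.  The casts are
 * only there to keep the example printf-safe.
 */
static void example_report_cma(const struct device *dev)
{
	struct cma_info info;

	if (!__cma_info(&info, dev, NULL))
		pr_info("cma: %u region(s), %lu of %lu bytes free\n",
			(unsigned int)info.count,
			(unsigned long)info.free_size,
			(unsigned long)info.total_size);
}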
1208
1209
1210/* Freeing. */
1211int cma_free(dma_addr_t addr)
1212{
1213 struct cma_chunk *c;
1214 int ret;
1215
1216 mutex_lock(&cma_mutex);
1217
1218 c = __cma_chunk_find(addr);
1219
1220 if (c) {
1221 __cma_chunk_free(c);
1222 ret = 0;
1223 } else {
1224 ret = -ENOENT;
1225 }
1226
1227 mutex_unlock(&cma_mutex);
1228
1229 if (c)
1230 pr_debug("free(%p): freed\n", (void *)addr);
1231 else
1232 pr_err("free(%p): not found\n", (void *)addr);
1233 return ret;
1234}
1235EXPORT_SYMBOL_GPL(cma_free);
1236
1237
1238/************************* Miscellaneous *************************/
1239
1240static int __cma_region_attach_alloc(struct cma_region *reg)
1241{
1242 struct cma_allocator *alloc;
1243 int ret;
1244
1245 /*
1246 * If reg->alloc is set then the caller wants us to use this
1247 * allocator. Otherwise we need to find one by name.
1248 */
1249 if (reg->alloc) {
1250 alloc = reg->alloc;
1251 } else {
1252 alloc = __cma_allocator_find(reg->alloc_name);
1253 if (!alloc) {
1254 pr_warn("init: %s: %s: no such allocator\n",
1255 reg->name ?: "(private)",
1256 reg->alloc_name ?: "(default)");
1257 reg->used = 1;
1258 return -ENOENT;
1259 }
1260 }
1261
1262 /* Try to initialise the allocator. */
1263 reg->private_data = NULL;
1264 ret = alloc->init ? alloc->init(reg) : 0;
1265 if (unlikely(ret < 0)) {
1266 pr_err("init: %s: %s: unable to initialise allocator\n",
1267 reg->name ?: "(private)", alloc->name ?: "(unnamed)");
1268 reg->alloc = NULL;
1269 reg->used = 1;
1270 } else {
1271 reg->alloc = alloc;
1272 pr_debug("init: %s: %s: initialised allocator\n",
1273 reg->name ?: "(private)", alloc->name ?: "(unnamed)");
1274 }
1275 return ret;
1276}
1277
1278static void __cma_region_detach_alloc(struct cma_region *reg)
1279{
1280 if (!reg->alloc)
1281 return;
1282
1283 if (reg->alloc->cleanup)
1284 reg->alloc->cleanup(reg);
1285
1286 reg->alloc = NULL;
1287 reg->used = 1;
1288}
1289
1290
1291/*
1292 * s ::= rules
1293 * rules ::= rule [ ';' rules ]
1294 * rule ::= patterns '=' regions
1295 * patterns ::= pattern [ ',' patterns ]
1296 * regions ::= REG-NAME [ ',' regions ]
1297 * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME
1298 */
1299static const char *__must_check
1300__cma_where_from(const struct device *dev, const char *type)
1301{
1302 /*
1303 * This function matches the pattern from the map attribute
1304 * against the given device name and type. The type may of
1305 * course be NULL or an empty string.
1306 */
1307
1308 const char *s, *name;
1309 int name_matched = 0;
1310
1311 /*
1312 * If dev is NULL we were called in alternative form where
1313 * type is the from string. All we have to do is return it.
1314 */
1315 if (!dev)
1316 return type ?: ERR_PTR(-EINVAL);
1317
1318 if (!cma_map)
1319 return ERR_PTR(-ENOENT);
1320
1321 name = dev_name(dev);
1322 if (WARN_ON(!name || !*name))
1323 return ERR_PTR(-EINVAL);
1324
1325 if (!type)
1326 type = "common";
1327
1328 /*
1329 * Now we go through the cma_map attribute.
1330 */
1331 for (s = cma_map; *s; ++s) {
1332 const char *c;
1333
1334 /*
1335 * If the pattern starts with a slash, the device part of the
1336 * pattern matches if it matched previously.
1337 */
1338 if (*s == '/') {
1339 if (!name_matched)
1340 goto look_for_next;
1341 goto match_type;
1342 }
1343
1344 /*
1345 * We are now trying to match the device name. This also
1346 * updates the name_matched variable. If, while reading the
1347 * spec, we encounter a comma, it means that the pattern does
1348 * not match and we need to start over with another pattern
1349 * (the one after the comma). If we encounter an equals sign,
1350 * we need to start over with another rule. If there is a
1351 * character that does not match, we need to look for a comma
1352 * (to get another pattern) or a semicolon (to get another
1353 * rule) and try again if there is one somewhere.
1354 */
1355
1356 name_matched = 0;
1357
1358 for (c = name; *s != '*' && *c; ++c, ++s)
1359 if (*s == '=')
1360 goto next_rule;
1361 else if (*s == ',')
1362 goto next_pattern;
1363 else if (*s != '?' && *c != *s)
1364 goto look_for_next;
1365 if (*s == '*')
1366 ++s;
1367
1368 name_matched = 1;
1369
1370 /*
1371 * Now we need to match the type part of the pattern. If the
1372 * pattern does not specify one, we match only if type points
1373 * to an empty string. Otherwise we try to match it like the name.
1374 */
1375 if (*s == '/') {
1376match_type: /* s points to '/' */
1377 ++s;
1378
1379 for (c = type; *s && *c; ++c, ++s)
1380 if (*s == '=')
1381 goto next_rule;
1382 else if (*s == ',')
1383 goto next_pattern;
1384 else if (*c != *s)
1385 goto look_for_next;
1386 }
1387
1388 /* Return the string behind the '=' sign of the rule. */
1389 if (*s == '=')
1390 return s + 1;
1391 else if (*s == ',')
1392 return strchr(s, '=') + 1;
1393
1394 /* Pattern did not match */
1395
1396look_for_next:
1397 do {
1398 ++s;
1399 } while (*s != ',' && *s != '=');
1400 if (*s == ',')
1401 continue;
1402
1403next_rule: /* s points to '=' */
1404 s = strchr(s, ';');
1405 if (!s)
1406 break;
1407
1408next_pattern:
1409 continue;
1410 }
1411
1412 return ERR_PTR(-ENOENT);
1413}
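/*
 * Worked example for the map grammar above (hypothetical device and
 * region names, traced against the matcher as implemented).  Only the
 * device-name part of a pattern understands the '*' and '?' wildcards.
 *
 *	cma_map = "camera,jpeg/raw=cam;video=vid,common;*=common"
 *
 *	device "camera" (any type)	-> try region "cam"
 *	device "jpeg", type "raw"	-> try region "cam"
 *	device "video" (any type)	-> try "vid", then "common"
 *	any other device		-> try region "common"
 */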
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd564..8ea7308601b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 /* Account for isolated anon and file pages */
39 unsigned long nr_anon;
40 unsigned long nr_file;
41
42 unsigned int order; /* order a direct compactor needs */ 38 unsigned int order; /* order a direct compactor needs */
43 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
44 struct zone *zone; 40 struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
223static void acct_isolated(struct zone *zone, struct compact_control *cc) 219static void acct_isolated(struct zone *zone, struct compact_control *cc)
224{ 220{
225 struct page *page; 221 struct page *page;
226 unsigned int count[NR_LRU_LISTS] = { 0, }; 222 unsigned int count[2] = { 0, };
227 223
228 list_for_each_entry(page, &cc->migratepages, lru) { 224 list_for_each_entry(page, &cc->migratepages, lru)
229 int lru = page_lru_base_type(page); 225 count[!!page_is_file_cache(page)]++;
230 count[lru]++;
231 }
232 226
233 cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
234 cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
235 __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
236 __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
237} 229}
238 230
239/* Similar to reclaim, but different enough that they don't share logic */ 231/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
269 unsigned long last_pageblock_nr = 0, pageblock_nr; 261 unsigned long last_pageblock_nr = 0, pageblock_nr;
270 unsigned long nr_scanned = 0, nr_isolated = 0; 262 unsigned long nr_scanned = 0, nr_isolated = 0;
271 struct list_head *migratelist = &cc->migratepages; 263 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
272 265
273 /* Do not scan outside zone boundaries */ 266 /* Do not scan outside zone boundaries */
274 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -320,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
320 } else if (!locked) 313 } else if (!locked)
321 spin_lock_irq(&zone->lru_lock); 314 spin_lock_irq(&zone->lru_lock);
322 315
316 /*
317 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving
319 * into a new MAX_ORDER_NR_PAGES range in case of large
320 * memory holes within the zone
321 */
322 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
323 if (!pfn_valid(low_pfn)) {
324 low_pfn += MAX_ORDER_NR_PAGES - 1;
325 continue;
326 }
327 }
328
323 if (!pfn_valid_within(low_pfn)) 329 if (!pfn_valid_within(low_pfn))
324 continue; 330 continue;
325 nr_scanned++; 331 nr_scanned++;
326 332
327 /* Get the page and skip if free */ 333 /*
334 * Get the page and ensure the page is within the same zone.
335 * See the comment in isolate_freepages about overlapping
336 * nodes. It is deliberate that the new zone lock is not taken
337 * as memory compaction should not move pages between nodes.
338 */
328 page = pfn_to_page(low_pfn); 339 page = pfn_to_page(low_pfn);
340 if (page_zone(page) != zone)
341 continue;
342
343 /* Skip if free */
329 if (PageBuddy(page)) 344 if (PageBuddy(page))
330 continue; 345 continue;
331 346
@@ -356,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
356 continue; 371 continue;
357 } 372 }
358 373
374 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE;
376
359 /* Try isolate the page */ 377 /* Try isolate the page */
360 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 378 if (__isolate_lru_page(page, mode, 0) != 0)
361 continue; 379 continue;
362 380
363 VM_BUG_ON(PageTransCompound(page)); 381 VM_BUG_ON(PageTransCompound(page));
@@ -559,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
559 nr_migrate = cc->nr_migratepages; 577 nr_migrate = cc->nr_migratepages;
560 err = migrate_pages(&cc->migratepages, compaction_alloc, 578 err = migrate_pages(&cc->migratepages, compaction_alloc,
561 (unsigned long)cc, false, 579 (unsigned long)cc, false,
562 cc->sync); 580 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
563 update_nr_listpages(cc); 581 update_nr_listpages(cc);
564 nr_remaining = cc->nr_migratepages; 582 nr_remaining = cc->nr_migratepages;
565 583
@@ -574,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
574 if (err) { 592 if (err) {
575 putback_lru_pages(&cc->migratepages); 593 putback_lru_pages(&cc->migratepages);
576 cc->nr_migratepages = 0; 594 cc->nr_migratepages = 0;
595 if (err == -ENOMEM) {
596 ret = COMPACT_PARTIAL;
597 goto out;
598 }
577 } 599 }
578
579 } 600 }
580 601
581out: 602out:
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d345..10481ebd96c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{ 397{
398 int error; 398 int error;
399 struct mem_cgroup *memcg = NULL;
400 399
401 VM_BUG_ON(!PageLocked(old)); 400 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new)); 401 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping); 402 VM_BUG_ON(new->mapping);
404 403
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 404 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) { 405 if (!error) {
419 struct address_space *mapping = old->mapping; 406 struct address_space *mapping = old->mapping;
@@ -435,13 +422,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
435 if (PageSwapBacked(new)) 422 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM); 423 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock); 424 spin_unlock_irq(&mapping->tree_lock);
425 /* mem_cgroup codes must not be called under tree_lock */
426 mem_cgroup_replace_page_cache(old, new);
438 radix_tree_preload_end(); 427 radix_tree_preload_end();
439 if (freepage) 428 if (freepage)
440 freepage(old); 429 freepage(old);
441 page_cache_release(old); 430 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 } 431 }
446 432
447 return error; 433 return error;
@@ -530,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
530 struct page *page; 516 struct page *page;
531 517
532 if (cpuset_do_page_mem_spread()) { 518 if (cpuset_do_page_mem_spread()) {
533 get_mems_allowed(); 519 unsigned int cpuset_mems_cookie;
534 n = cpuset_mem_spread_node(); 520 do {
535 page = alloc_pages_exact_node(n, gfp, 0); 521 cpuset_mems_cookie = get_mems_allowed();
536 put_mems_allowed(); 522 n = cpuset_mem_spread_node();
523 page = alloc_pages_exact_node(n, gfp, 0);
524 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
525
537 return page; 526 return page;
538 } 527 }
539 return alloc_pages(gfp, 0); 528 return alloc_pages(gfp, 0);
@@ -1393,15 +1382,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1393 unsigned long seg = 0; 1382 unsigned long seg = 0;
1394 size_t count; 1383 size_t count;
1395 loff_t *ppos = &iocb->ki_pos; 1384 loff_t *ppos = &iocb->ki_pos;
1396 struct blk_plug plug;
1397 1385
1398 count = 0; 1386 count = 0;
1399 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1387 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1400 if (retval) 1388 if (retval)
1401 return retval; 1389 return retval;
1402 1390
1403 blk_start_plug(&plug);
1404
1405 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1391 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1406 if (filp->f_flags & O_DIRECT) { 1392 if (filp->f_flags & O_DIRECT) {
1407 loff_t size; 1393 loff_t size;
@@ -1417,8 +1403,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1417 retval = filemap_write_and_wait_range(mapping, pos, 1403 retval = filemap_write_and_wait_range(mapping, pos,
1418 pos + iov_length(iov, nr_segs) - 1); 1404 pos + iov_length(iov, nr_segs) - 1);
1419 if (!retval) { 1405 if (!retval) {
1406 struct blk_plug plug;
1407
1408 blk_start_plug(&plug);
1420 retval = mapping->a_ops->direct_IO(READ, iocb, 1409 retval = mapping->a_ops->direct_IO(READ, iocb,
1421 iov, pos, nr_segs); 1410 iov, pos, nr_segs);
1411 blk_finish_plug(&plug);
1422 } 1412 }
1423 if (retval > 0) { 1413 if (retval > 0) {
1424 *ppos = pos + retval; 1414 *ppos = pos + retval;
@@ -1474,7 +1464,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1474 break; 1464 break;
1475 } 1465 }
1476out: 1466out:
1477 blk_finish_plug(&plug);
1478 return retval; 1467 return retval;
1479} 1468}
1480EXPORT_SYMBOL(generic_file_aio_read); 1469EXPORT_SYMBOL(generic_file_aio_read);
@@ -1807,7 +1796,7 @@ repeat:
1807 page = __page_cache_alloc(gfp | __GFP_COLD); 1796 page = __page_cache_alloc(gfp | __GFP_COLD);
1808 if (!page) 1797 if (!page)
1809 return ERR_PTR(-ENOMEM); 1798 return ERR_PTR(-ENOMEM);
1810 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1799 err = add_to_page_cache_lru(page, mapping, index, gfp);
1811 if (unlikely(err)) { 1800 if (unlikely(err)) {
1812 page_cache_release(page); 1801 page_cache_release(page);
1813 if (err == -EEXIST) 1802 if (err == -EEXIST)
@@ -1904,10 +1893,7 @@ static struct page *wait_on_page_read(struct page *page)
1904 * @gfp: the page allocator flags to use if allocating 1893 * @gfp: the page allocator flags to use if allocating
1905 * 1894 *
1906 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1895 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1907 * any new page allocations done using the specified allocation flags. Note 1896 * any new page allocations done using the specified allocation flags.
1908 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1909 * expect to do this atomically or anything like that - but you can pass in
1910 * other page requirements.
1911 * 1897 *
1912 * If the page does not get brought uptodate, return -EIO. 1898 * If the page does not get brought uptodate, return -EIO.
1913 */ 1899 */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd1282..dee94297f39 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -263,7 +263,12 @@ found:
263 xip_pfn); 263 xip_pfn);
264 if (err == -ENOMEM) 264 if (err == -ENOMEM)
265 return VM_FAULT_OOM; 265 return VM_FAULT_OOM;
266 BUG_ON(err); 266 /*
267 * err == -EBUSY is fine, we've raced against another thread
268 * that faulted-in the same page
269 */
270 if (err != -EBUSY)
271 BUG_ON(err);
267 return VM_FAULT_NOPAGE; 272 return VM_FAULT_NOPAGE;
268 } else { 273 } else {
269 int err, ret = VM_FAULT_OOM; 274 int err, ret = VM_FAULT_OOM;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd..8cc11dda6a7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -641,6 +641,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
641 set_pmd_at(mm, haddr, pmd, entry); 641 set_pmd_at(mm, haddr, pmd, entry);
642 prepare_pmd_huge_pte(pgtable, mm); 642 prepare_pmd_huge_pte(pgtable, mm);
643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
644 mm->nr_ptes++;
644 spin_unlock(&mm->page_table_lock); 645 spin_unlock(&mm->page_table_lock);
645 } 646 }
646 647
@@ -759,6 +760,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
759 pmd = pmd_mkold(pmd_wrprotect(pmd)); 760 pmd = pmd_mkold(pmd_wrprotect(pmd));
760 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 761 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
761 prepare_pmd_huge_pte(pgtable, dst_mm); 762 prepare_pmd_huge_pte(pgtable, dst_mm);
763 dst_mm->nr_ptes++;
762 764
763 ret = 0; 765 ret = 0;
764out_unlock: 766out_unlock:
@@ -857,7 +859,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
857 } 859 }
858 kfree(pages); 860 kfree(pages);
859 861
860 mm->nr_ptes++;
861 smp_wmb(); /* make pte visible before pmd */ 862 smp_wmb(); /* make pte visible before pmd */
862 pmd_populate(mm, pmd, pgtable); 863 pmd_populate(mm, pmd, pgtable);
863 page_remove_rmap(page); 864 page_remove_rmap(page);
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 990 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 991 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 992 if (flags & FOLL_GET)
992 get_page(page); 993 get_page_foll(page);
993 994
994out: 995out:
995 return page; 996 return page;
@@ -1016,6 +1017,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1016 VM_BUG_ON(page_mapcount(page) < 0); 1017 VM_BUG_ON(page_mapcount(page) < 0);
1017 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1018 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1018 VM_BUG_ON(!PageHead(page)); 1019 VM_BUG_ON(!PageHead(page));
1020 tlb->mm->nr_ptes--;
1019 spin_unlock(&tlb->mm->page_table_lock); 1021 spin_unlock(&tlb->mm->page_table_lock);
1020 tlb_remove_page(tlb, page); 1022 tlb_remove_page(tlb, page);
1021 pte_free(tlb->mm, pgtable); 1023 pte_free(tlb->mm, pgtable);
@@ -1156,6 +1158,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1158 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1159 struct zone *zone = page_zone(page);
1158 int zonestat; 1160 int zonestat;
1161 int tail_count = 0;
1159 1162
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1163 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1164 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1167,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1167 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1168 struct page *page_tail = page + i;
1166 1169
1167 /* tail_page->_count cannot change */ 1170 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1171 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1172 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1173 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1174 BUG_ON(tail_count < 0);
1175 BUG_ON(atomic_read(&page_tail->_count) != 0);
1176 /*
1177 * tail_page->_count is zero and not changing from
1178 * under us. But get_page_unless_zero() may be running
1179 * from under us on the tail_page. If we used
1180 * atomic_set() below instead of atomic_add(), we
1181 * would then run atomic_set() concurrently with
1182 * get_page_unless_zero(), and atomic_set() is
1183 * implemented in C not using locked ops. spin_unlock
1184 * on x86 sometimes uses locked ops because of PPro
1185 * errata 66, 92, so unless somebody can guarantee
1186 * atomic_set() here would be safe on all archs (and
1187 * not only on x86), it's safer to use atomic_add().
1188 */
1189 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1190 &page_tail->_count);
1172 1191
1173 /* after clearing PageTail the gup refcount can be released */ 1192 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1193 smp_mb();
@@ -1186,10 +1205,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1205 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1206 page_tail->flags |= (1L << PG_dirty);
1188 1207
1189 /* 1208 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1209 smp_wmb();
1194 1210
1195 /* 1211 /*
@@ -1206,7 +1222,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1222 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1223 * pmd, not by clearing the present bit.
1208 */ 1224 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1225 page_tail->_mapcount = page->_mapcount;
1211 1226
1212 BUG_ON(page_tail->mapping); 1227 BUG_ON(page_tail->mapping);
@@ -1223,6 +1238,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1238
1224 lru_add_page_tail(zone, page, page_tail); 1239 lru_add_page_tail(zone, page, page_tail);
1225 } 1240 }
1241 atomic_sub(tail_count, &page->_count);
1242 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1243
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1244 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1245 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1295,7 +1312,6 @@ static int __split_huge_page_map(struct page *page,
1295 pte_unmap(pte); 1312 pte_unmap(pte);
1296 } 1313 }
1297 1314
1298 mm->nr_ptes++;
1299 smp_wmb(); /* make pte visible before pmd */ 1315 smp_wmb(); /* make pte visible before pmd */
1300 /* 1316 /*
1301 * Up to this point the pmd is present and huge and 1317 * Up to this point the pmd is present and huge and
@@ -1910,7 +1926,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1910 set_pmd_at(mm, address, pmd, _pmd); 1926 set_pmd_at(mm, address, pmd, _pmd);
1911 update_mmu_cache(vma, address, entry); 1927 update_mmu_cache(vma, address, entry);
1912 prepare_pmd_huge_pte(pgtable, mm); 1928 prepare_pmd_huge_pte(pgtable, mm);
1913 mm->nr_ptes--;
1914 spin_unlock(&mm->page_table_lock); 1929 spin_unlock(&mm->page_table_lock);
1915 1930
1916#ifndef CONFIG_NUMA 1931#ifndef CONFIG_NUMA
@@ -2005,7 +2020,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2005{ 2020{
2006 struct mm_struct *mm = mm_slot->mm; 2021 struct mm_struct *mm = mm_slot->mm;
2007 2022
2008 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2023 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2009 2024
2010 if (khugepaged_test_exit(mm)) { 2025 if (khugepaged_test_exit(mm)) {
2011 /* free mm_slot */ 2026 /* free mm_slot */
@@ -2033,7 +2048,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2033 int progress = 0; 2048 int progress = 0;
2034 2049
2035 VM_BUG_ON(!pages); 2050 VM_BUG_ON(!pages);
2036 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2051 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2037 2052
2038 if (khugepaged_scan.mm_slot) 2053 if (khugepaged_scan.mm_slot)
2039 mm_slot = khugepaged_scan.mm_slot; 2054 mm_slot = khugepaged_scan.mm_slot;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc82..037f077b986 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
460 struct zonelist *zonelist; 460 struct zonelist *zonelist;
461 struct zone *zone; 461 struct zone *zone;
462 struct zoneref *z; 462 struct zoneref *z;
463 unsigned int cpuset_mems_cookie;
463 464
464 get_mems_allowed(); 465retry_cpuset:
466 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 467 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 468 htlb_alloc_mask, &mpol, &nodemask);
467 /* 469 /*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 490 }
489 } 491 }
490 } 492 }
491err: 493
492 mpol_cond_put(mpol); 494 mpol_cond_put(mpol);
493 put_mems_allowed(); 495 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
496 goto retry_cpuset;
494 return page; 497 return page;
498
499err:
500 mpol_cond_put(mpol);
501 return NULL;
495} 502}
496 503
497static void update_and_free_page(struct hstate *h, struct page *page) 504static void update_and_free_page(struct hstate *h, struct page *page)
@@ -575,6 +582,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 __SetPageHead(page); 582 __SetPageHead(page);
576 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 583 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
577 __SetPageTail(p); 584 __SetPageTail(p);
585 set_page_count(p, 0);
578 p->first_page = page; 586 p->first_page = page;
579 } 587 }
580} 588}
@@ -900,7 +908,6 @@ retry:
900 h->resv_huge_pages += delta; 908 h->resv_huge_pages += delta;
901 ret = 0; 909 ret = 0;
902 910
903 spin_unlock(&hugetlb_lock);
904 /* Free the needed pages to the hugetlb pool */ 911 /* Free the needed pages to the hugetlb pool */
905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 912 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
906 if ((--needed) < 0) 913 if ((--needed) < 0)
@@ -914,6 +921,7 @@ retry:
914 VM_BUG_ON(page_count(page)); 921 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 922 enqueue_huge_page(h, page);
916 } 923 }
924 spin_unlock(&hugetlb_lock);
917 925
918 /* Free unnecessary surplus pages to the buddy allocator */ 926 /* Free unnecessary surplus pages to the buddy allocator */
919free: 927free:
@@ -2059,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2059 kref_get(&reservations->refs); 2067 kref_get(&reservations->refs);
2060} 2068}
2061 2069
2070static void resv_map_put(struct vm_area_struct *vma)
2071{
2072 struct resv_map *reservations = vma_resv_map(vma);
2073
2074 if (!reservations)
2075 return;
2076 kref_put(&reservations->refs, resv_map_release);
2077}
2078
2062static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2079static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2063{ 2080{
2064 struct hstate *h = hstate_vma(vma); 2081 struct hstate *h = hstate_vma(vma);
@@ -2074,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2074 reserve = (end - start) - 2091 reserve = (end - start) -
2075 region_count(&reservations->regions, start, end); 2092 region_count(&reservations->regions, start, end);
2076 2093
2077 kref_put(&reservations->refs, resv_map_release); 2094 resv_map_put(vma);
2078 2095
2079 if (reserve) { 2096 if (reserve) {
2080 hugetlb_acct_memory(h, -reserve); 2097 hugetlb_acct_memory(h, -reserve);
@@ -2284,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2284{ 2301{
2285 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2302 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2286 __unmap_hugepage_range(vma, start, end, ref_page); 2303 __unmap_hugepage_range(vma, start, end, ref_page);
2304 /*
2305 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2306 * test will fail on a vma being torn down, and not grab a page table
2307 * on its way out. We're lucky that the flag has such an appropriate
2308 * name, and can in fact be safely cleared here. We could clear it
2309 * before the __unmap_hugepage_range above, but all that's necessary
2310 * is to clear it before releasing the i_mmap_mutex below.
2311 *
2312 * This works because in the contexts this is called, the VMA is
2313 * going to be destroyed. It is not vulnerable to madvise(DONTNEED)
2314 * because madvise is not supported on hugetlbfs. The same applies
2315 * for direct IO. unmap_hugepage_range() is only being called just
2316 * before free_pgtables() so clearing VM_MAYSHARE will not cause
2317 * surprises later.
2318 */
2319 vma->vm_flags &= ~VM_MAYSHARE;
2287 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2320 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2288} 2321}
2289 2322
@@ -2397,7 +2430,6 @@ retry_avoidcopy:
2397 if (outside_reserve) { 2430 if (outside_reserve) {
2398 BUG_ON(huge_pte_none(pte)); 2431 BUG_ON(huge_pte_none(pte));
2399 if (unmap_ref_private(mm, vma, old_page, address)) { 2432 if (unmap_ref_private(mm, vma, old_page, address)) {
2400 BUG_ON(page_count(old_page) != 1);
2401 BUG_ON(huge_pte_none(pte)); 2433 BUG_ON(huge_pte_none(pte));
2402 spin_lock(&mm->page_table_lock); 2434 spin_lock(&mm->page_table_lock);
2403 goto retry_avoidcopy; 2435 goto retry_avoidcopy;
@@ -2415,6 +2447,8 @@ retry_avoidcopy:
2415 * anon_vma prepared. 2447 * anon_vma prepared.
2416 */ 2448 */
2417 if (unlikely(anon_vma_prepare(vma))) { 2449 if (unlikely(anon_vma_prepare(vma))) {
2450 page_cache_release(new_page);
2451 page_cache_release(old_page);
2418 /* Caller expects lock to be held */ 2452 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock); 2453 spin_lock(&mm->page_table_lock);
2420 return VM_FAULT_OOM; 2454 return VM_FAULT_OOM;
@@ -2676,6 +2710,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2676 * so no worry about deadlock. 2710 * so no worry about deadlock.
2677 */ 2711 */
2678 page = pte_page(entry); 2712 page = pte_page(entry);
2713 get_page(page);
2679 if (page != pagecache_page) 2714 if (page != pagecache_page)
2680 lock_page(page); 2715 lock_page(page);
2681 2716
@@ -2707,6 +2742,7 @@ out_page_table_lock:
2707 } 2742 }
2708 if (page != pagecache_page) 2743 if (page != pagecache_page)
2709 unlock_page(page); 2744 unlock_page(page);
2745 put_page(page);
2710 2746
2711out_mutex: 2747out_mutex:
2712 mutex_unlock(&hugetlb_instantiation_mutex); 2748 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2833,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833 } 2869 }
2834 } 2870 }
2835 spin_unlock(&mm->page_table_lock); 2871 spin_unlock(&mm->page_table_lock);
2836 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2872 /*
2837 2873 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
2874 * may have cleared our pud entry and done put_page on the page table:
2875 * once we release i_mmap_mutex, another task can do the final put_page
2876 * and that page table be reused and filled with junk.
2877 */
2838 flush_tlb_range(vma, start, end); 2878 flush_tlb_range(vma, start, end);
2879 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2839} 2880}
2840 2881
2841int hugetlb_reserve_pages(struct inode *inode, 2882int hugetlb_reserve_pages(struct inode *inode,
@@ -2873,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2873 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 2914 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2874 } 2915 }
2875 2916
2876 if (chg < 0) 2917 if (chg < 0) {
2877 return chg; 2918 ret = chg;
2919 goto out_err;
2920 }
2878 2921
2879 /* There must be enough filesystem quota for the mapping */ 2922 /* There must be enough filesystem quota for the mapping */
2880 if (hugetlb_get_quota(inode->i_mapping, chg)) 2923 if (hugetlb_get_quota(inode->i_mapping, chg)) {
2881 return -ENOSPC; 2924 ret = -ENOSPC;
2925 goto out_err;
2926 }
2882 2927
2883 /* 2928 /*
2884 * Check enough hugepages are available for the reservation. 2929 * Check enough hugepages are available for the reservation.
@@ -2887,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2887 ret = hugetlb_acct_memory(h, chg); 2932 ret = hugetlb_acct_memory(h, chg);
2888 if (ret < 0) { 2933 if (ret < 0) {
2889 hugetlb_put_quota(inode->i_mapping, chg); 2934 hugetlb_put_quota(inode->i_mapping, chg);
2890 return ret; 2935 goto out_err;
2891 } 2936 }
2892 2937
2893 /* 2938 /*
@@ -2904,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
2904 if (!vma || vma->vm_flags & VM_MAYSHARE) 2949 if (!vma || vma->vm_flags & VM_MAYSHARE)
2905 region_add(&inode->i_mapping->private_list, from, to); 2950 region_add(&inode->i_mapping->private_list, from, to);
2906 return 0; 2951 return 0;
2952out_err:
2953 if (vma)
2954 resv_map_put(vma);
2955 return ret;
2907} 2956}
2908 2957
2909void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2958void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb4..2189af49178 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1..bf0d59a2c7b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -184,15 +184,15 @@ static unsigned long ksm_pages_unshared;
184static unsigned long ksm_rmap_items; 184static unsigned long ksm_rmap_items;
185 185
186/* Number of pages ksmd should scan in one batch */ 186/* Number of pages ksmd should scan in one batch */
187static unsigned int ksm_thread_pages_to_scan = 100; 187static unsigned int ksm_thread_pages_to_scan = 128;
188 188
189/* Milliseconds ksmd should sleep between batches */ 189/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 190static unsigned int ksm_thread_sleep_millisecs = 4000;
191 191
192#define KSM_RUN_STOP 0 192#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 193#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 194#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 195static unsigned int ksm_run = KSM_RUN_MERGE;
196 196
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 198static DEFINE_MUTEX(ksm_thread_mutex);
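[Editorial note, not part of the patch] The hunk above only changes ksmd's default tuning: a larger scan batch, a much longer sleep between batches, and starting in KSM_RUN_MERGE instead of stopped. The mechanism is unchanged, and the same knobs remain writable through /sys/kernel/mm/ksm/. As a reminder of how memory reaches ksmd at all, an illustrative userspace sketch: only ranges marked with madvise(MADV_MERGEABLE) are ever scanned.

#define _GNU_SOURCE		/* MADV_MERGEABLE via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0x5a, len);	/* identical page contents: merge candidates */

	/* ksmd only scans areas explicitly marked MADV_MERGEABLE. */
	if (madvise(buf, len, MADV_MERGEABLE) != 0)
		printf("MADV_MERGEABLE unavailable (CONFIG_KSM off?): %s\n",
		       strerror(errno));
	else
		printf("range registered; merging is governed by\n"
		       "/sys/kernel/mm/ksm/{run,pages_to_scan,sleep_millisecs}\n");

	munmap(buf, len);
	return 0;
}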
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..deabe5f603a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/ksm.h> 15#include <linux/ksm.h>
16#include <linux/file.h>
16 17
17/* 18/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 19 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
197 struct address_space *mapping; 198 struct address_space *mapping;
198 loff_t offset, endoff; 199 loff_t offset, endoff;
199 int error; 200 int error;
201 struct file *f;
200 202
201 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 203 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
202 204
203 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 205 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
204 return -EINVAL; 206 return -EINVAL;
205 207
206 if (!vma->vm_file || !vma->vm_file->f_mapping 208 f = vma->vm_file;
207 || !vma->vm_file->f_mapping->host) { 209
210 if (!f || !f->f_mapping || !f->f_mapping->host) {
208 return -EINVAL; 211 return -EINVAL;
209 } 212 }
210 213
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 221 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 223
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 224 /*
225 * vmtruncate_range may need to take i_mutex and i_alloc_sem.
226 * We need to explicitly grab a reference because the vma (and
227 * hence the vma's reference to the file) can go away as soon as
228 * we drop mmap_sem.
229 */
230 get_file(f);
222 up_read(&current->mm->mmap_sem); 231 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 232 error = vmtruncate_range(mapping->host, offset, endoff);
233 fput(f);
224 down_read(&current->mm->mmap_sem); 234 down_read(&current->mm->mmap_sem);
225 return error; 235 return error;
226} 236}
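[Editorial note, not part of the patch] The change above pins the backing file with get_file()/fput() because madvise_remove() drops mmap_sem around vmtruncate_range(), and the vma, the only thing holding a reference to the file, can disappear in that window. For context, an illustrative userspace example of what this path does: MADV_REMOVE punches a hole in a tmpfs/shmem-backed mapping, so later reads of the removed range see zeroes.

#define _GNU_SOURCE		/* MADV_REMOVE via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *buf;

	/*
	 * Shared anonymous memory is shmem-backed, so the vma has a
	 * vm_file and madvise_remove() can punch a hole in it.
	 */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 'x', len);

	if (madvise(buf, 4096, MADV_REMOVE) != 0) {
		printf("MADV_REMOVE failed: %s\n", strerror(errno));
	} else {
		printf("removed page now reads %d (expect 0)\n", buf[0]);
		printf("untouched page still reads '%c'\n", buf[4096]);
	}

	munmap(buf, len);
	return 0;
}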
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d2..57cdf5ad692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1251unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1251unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1252 struct list_head *dst, 1252 struct list_head *dst,
1253 unsigned long *scanned, int order, 1253 unsigned long *scanned, int order,
1254 int mode, struct zone *z, 1254 isolate_mode_t mode,
1255 struct zone *z,
1255 struct mem_cgroup *mem_cont, 1256 struct mem_cgroup *mem_cont,
1256 int active, int file) 1257 int active, int file)
1257{ 1258{
@@ -1730,7 +1731,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1731 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1731 1732
1732 /* If memsw_is_minimum==1, swap-out is of no use. */ 1733 /* If memsw_is_minimum==1, swap-out is of no use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1734 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1734 noswap = true; 1735 noswap = true;
1735 1736
1736 while (1) { 1737 while (1) {
@@ -3422,6 +3423,50 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
3422 return ret; 3423 return ret;
3423} 3424}
3424 3425
3426/*
3427 * At replace page cache, newpage is not under any memcg but it's on
3428 * LRU. So, this function doesn't touch res_counter but handles LRU
3429 * in correct way. Both pages are locked so we cannot race with uncharge.
3430 */
3431void mem_cgroup_replace_page_cache(struct page *oldpage,
3432 struct page *newpage)
3433{
3434 struct mem_cgroup *memcg;
3435 struct page_cgroup *pc;
3436 struct zone *zone;
3437 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3438 unsigned long flags;
3439
3440 if (mem_cgroup_disabled())
3441 return;
3442
3443 pc = lookup_page_cgroup(oldpage);
3444 /* fix accounting on old pages */
3445 lock_page_cgroup(pc);
3446 memcg = pc->mem_cgroup;
3447 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3448 ClearPageCgroupUsed(pc);
3449 unlock_page_cgroup(pc);
3450
3451 if (PageSwapBacked(oldpage))
3452 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3453
3454 zone = page_zone(newpage);
3455 pc = lookup_page_cgroup(newpage);
3456 /*
3457 * Even if newpage->mapping was NULL before starting replacement,
3458 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3459 * LRU while we overwrite pc->mem_cgroup.
3460 */
3461 spin_lock_irqsave(&zone->lru_lock, flags);
3462 if (PageLRU(newpage))
3463 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3464 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3465 if (PageLRU(newpage))
3466 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3467 spin_unlock_irqrestore(&zone->lru_lock, flags);
3468}
3469
3425#ifdef CONFIG_DEBUG_VM 3470#ifdef CONFIG_DEBUG_VM
3426static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3471static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3427{ 3472{
@@ -4514,6 +4559,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4514 */ 4559 */
4515 BUG_ON(!thresholds); 4560 BUG_ON(!thresholds);
4516 4561
4562 if (!thresholds->primary)
4563 goto unlock;
4564
4517 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4565 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4518 4566
4519 /* Check if a threshold crossed before removing */ 4567 /* Check if a threshold crossed before removing */
@@ -4558,11 +4606,17 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4558swap_buffers: 4606swap_buffers:
4559 /* Swap primary and spare array */ 4607 /* Swap primary and spare array */
4560 thresholds->spare = thresholds->primary; 4608 thresholds->spare = thresholds->primary;
4609 /* If all events are unregistered, free the spare array */
4610 if (!new) {
4611 kfree(thresholds->spare);
4612 thresholds->spare = NULL;
4613 }
4614
4561 rcu_assign_pointer(thresholds->primary, new); 4615 rcu_assign_pointer(thresholds->primary, new);
4562 4616
4563 /* To be sure that nobody uses thresholds */ 4617 /* To be sure that nobody uses thresholds */
4564 synchronize_rcu(); 4618 synchronize_rcu();
4565 4619unlock:
4566 mutex_unlock(&memcg->thresholds_lock); 4620 mutex_unlock(&memcg->thresholds_lock);
4567} 4621}
4568 4622
@@ -4963,9 +5017,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4963 int cpu; 5017 int cpu;
4964 enable_swap_cgroup(); 5018 enable_swap_cgroup();
4965 parent = NULL; 5019 parent = NULL;
4966 root_mem_cgroup = mem;
4967 if (mem_cgroup_soft_limit_tree_init()) 5020 if (mem_cgroup_soft_limit_tree_init())
4968 goto free_out; 5021 goto free_out;
5022 root_mem_cgroup = mem;
4969 for_each_possible_cpu(cpu) { 5023 for_each_possible_cpu(cpu) {
4970 struct memcg_stock_pcp *stock = 5024 struct memcg_stock_pcp *stock =
4971 &per_cpu(memcg_stock, cpu); 5025 &per_cpu(memcg_stock, cpu);
@@ -5004,7 +5058,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5004 return &mem->css; 5058 return &mem->css;
5005free_out: 5059free_out:
5006 __mem_cgroup_free(mem); 5060 __mem_cgroup_free(mem);
5007 root_mem_cgroup = NULL;
5008 return ERR_PTR(error); 5061 return ERR_PTR(error);
5009} 5062}
5010 5063
@@ -5244,6 +5297,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5244 spinlock_t *ptl; 5297 spinlock_t *ptl;
5245 5298
5246 split_huge_page_pmd(walk->mm, pmd); 5299 split_huge_page_pmd(walk->mm, pmd);
5300 if (pmd_trans_unstable(pmd))
5301 return 0;
5247 5302
5248 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5303 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5249 for (; addr != end; pte++, addr += PAGE_SIZE) 5304 for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5405,6 +5460,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5405 spinlock_t *ptl; 5460 spinlock_t *ptl;
5406 5461
5407 split_huge_page_pmd(walk->mm, pmd); 5462 split_huge_page_pmd(walk->mm, pmd);
5463 if (pmd_trans_unstable(pmd))
5464 return 0;
5408retry: 5465retry:
5409 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5466 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5410 for (; addr != end; addr += PAGE_SIZE) { 5467 for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2f49dcf4f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
1334 /* Keep page count to indicate a given hugepage is isolated. */ 1334 /* Keep page count to indicate a given hugepage is isolated. */
1335 1335
1336 list_add(&hpage->lru, &pagelist); 1336 list_add(&hpage->lru, &pagelist);
1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
1338 true); 1338 MIGRATE_SYNC);
1339 if (ret) { 1339 if (ret) {
1340 struct page *page1, *page2; 1340 struct page *page1, *page2;
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru) 1341 list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
1464 page_is_file_cache(page)); 1464 page_is_file_cache(page));
1465 list_add(&page->lru, &pagelist); 1465 list_add(&page->lru, &pagelist);
1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1467 0, true); 1467 false, MIGRATE_SYNC);
1468 if (ret) { 1468 if (ret) {
1469 putback_lru_pages(&pagelist); 1469 putback_lru_pages(&pagelist);
1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 36e889cca24..79ff0613449 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1228,16 +1228,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1228 do { 1228 do {
1229 next = pmd_addr_end(addr, end); 1229 next = pmd_addr_end(addr, end);
1230 if (pmd_trans_huge(*pmd)) { 1230 if (pmd_trans_huge(*pmd)) {
1231 if (next-addr != HPAGE_PMD_SIZE) { 1231 if (next - addr != HPAGE_PMD_SIZE) {
1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd); 1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd)) 1234 } else if (zap_huge_pmd(tlb, vma, pmd))
1235 continue; 1235 goto next;
1236 /* fall through */ 1236 /* fall through */
1237 } 1237 }
1238 if (pmd_none_or_clear_bad(pmd)) 1238 /*
1239 continue; 1239 * Here there can be other concurrent MADV_DONTNEED or
1240 * trans huge page faults running, and if the pmd is
1241 * none or trans huge it can change under us. This is
1242 * because MADV_DONTNEED holds the mmap_sem in read
1243 * mode.
1244 */
1245 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1246 goto next;
1240 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1247 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1248next:
1241 cond_resched(); 1249 cond_resched();
1242 } while (pmd++, addr = next, addr != end); 1250 } while (pmd++, addr = next, addr != end);
1243 1251
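[Editorial note, not part of the patch] The new comment in zap_pmd_range() names MADV_DONTNEED as the reason the pmd can change under the walker: madvise(MADV_DONTNEED) zaps page tables while holding mmap_sem only for read, so it can run concurrently with another zap or a transparent-hugepage fault. An illustrative userspace view of that operation:

#define _GNU_SOURCE		/* MADV_DONTNEED via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 'y', len);

	/*
	 * Zaps the range's page tables under mmap_sem held for read,
	 * which is exactly the concurrency zap_pmd_range() must now
	 * tolerate.
	 */
	madvise(buf, len, MADV_DONTNEED);

	/* Private anonymous memory reads back as zero after the zap. */
	printf("after MADV_DONTNEED: %d (expect 0)\n", buf[0]);

	munmap(buf, len);
	return 0;
}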
@@ -1514,7 +1522,7 @@ split_fallthrough:
1514 } 1522 }
1515 1523
1516 if (flags & FOLL_GET) 1524 if (flags & FOLL_GET)
1517 get_page(page); 1525 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1526 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1527 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1528 !pte_dirty(pte) && !PageDirty(page))
@@ -1816,7 +1824,63 @@ next_page:
1816} 1824}
1817EXPORT_SYMBOL(__get_user_pages); 1825EXPORT_SYMBOL(__get_user_pages);
1818 1826
1819/** 1827/*
1828 * fixup_user_fault() - manually resolve a user page fault
1829 * @tsk: the task_struct to use for page fault accounting, or
1830 * NULL if faults are not to be recorded.
1831 * @mm: mm_struct of target mm
1832 * @address: user address
1833 * @fault_flags: flags to pass down to handle_mm_fault()
1834 *
1835 * This is meant to be called in the specific scenario where for locking reasons
1836 * we try to access user memory in atomic context (within a pagefault_disable()
1837 * section), this returns -EFAULT, and we want to resolve the user fault before
1838 * trying again.
1839 *
1840 * Typically this is meant to be used by the futex code.
1841 *
1842 * The main difference with get_user_pages() is that this function will
1843 * unconditionally call handle_mm_fault() which will in turn perform all the
1844 * necessary SW fixup of the dirty and young bits in the PTE, while
1845 * get_user_pages() only guarantees to update these in the struct page.
1846 *
1847 * This is important for some architectures where those bits also gate the
1848 * access permission to the page because they are maintained in software. On
1849 * such architectures, gup() will not be enough to make a subsequent access
1850 * succeed.
1851 *
1852 * This should be called with the mmap_sem held for read.
1853 */
1854int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1855 unsigned long address, unsigned int fault_flags)
1856{
1857 struct vm_area_struct *vma;
1858 int ret;
1859
1860 vma = find_extend_vma(mm, address);
1861 if (!vma || address < vma->vm_start)
1862 return -EFAULT;
1863
1864 ret = handle_mm_fault(mm, vma, address, fault_flags);
1865 if (ret & VM_FAULT_ERROR) {
1866 if (ret & VM_FAULT_OOM)
1867 return -ENOMEM;
1868 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1869 return -EHWPOISON;
1870 if (ret & VM_FAULT_SIGBUS)
1871 return -EFAULT;
1872 BUG();
1873 }
1874 if (tsk) {
1875 if (ret & VM_FAULT_MAJOR)
1876 tsk->maj_flt++;
1877 else
1878 tsk->min_flt++;
1879 }
1880 return 0;
1881}
1882
1883/*
1820 * get_user_pages() - pin user pages in memory 1884 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1885 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1886 * NULL if faults are not to be recorded.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..ae5a3f21010 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
747 } 747 }
748 /* this function returns # of failed pages */ 748 /* this function returns # of failed pages */
749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
750 true, true); 750 true, MIGRATE_SYNC);
751 if (ret) 751 if (ret)
752 putback_lru_pages(&source); 752 putback_lru_pages(&source);
753 } 753 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54..cff919fe702 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 do { 511 do {
512 next = pmd_addr_end(addr, end); 512 next = pmd_addr_end(addr, end);
513 split_huge_page_pmd(vma->vm_mm, pmd); 513 split_huge_page_pmd(vma->vm_mm, pmd);
514 if (pmd_none_or_clear_bad(pmd)) 514 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
515 continue; 515 continue;
516 if (check_pte_range(vma, pmd, addr, next, nodes, 516 if (check_pte_range(vma, pmd, addr, next, nodes,
517 flags, private)) 517 flags, private))
@@ -606,27 +606,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
606 return first; 606 return first;
607} 607}
608 608
609/* Apply policy to a single VMA */
610static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
611{
612 int err = 0;
613 struct mempolicy *old = vma->vm_policy;
614
615 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
616 vma->vm_start, vma->vm_end, vma->vm_pgoff,
617 vma->vm_ops, vma->vm_file,
618 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
619
620 if (vma->vm_ops && vma->vm_ops->set_policy)
621 err = vma->vm_ops->set_policy(vma, new);
622 if (!err) {
623 mpol_get(new);
624 vma->vm_policy = new;
625 mpol_put(old);
626 }
627 return err;
628}
629
630/* Step 2: apply policy to a range and do splits. */ 609/* Step 2: apply policy to a range and do splits. */
631static int mbind_range(struct mm_struct *mm, unsigned long start, 610static int mbind_range(struct mm_struct *mm, unsigned long start,
632 unsigned long end, struct mempolicy *new_pol) 611 unsigned long end, struct mempolicy *new_pol)
@@ -666,9 +645,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
666 if (err) 645 if (err)
667 goto out; 646 goto out;
668 } 647 }
669 err = policy_vma(vma, new_pol); 648
670 if (err) 649 /*
671 goto out; 650 * Apply policy to a single VMA. The reference counting of
651 * policy for vma_policy linkages has already been handled by
652 * vma_merge and split_vma as necessary. If this is a shared
653 * policy then ->set_policy will increment the reference count
654 * for an sp node.
655 */
656 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
657 vma->vm_start, vma->vm_end, vma->vm_pgoff,
658 vma->vm_ops, vma->vm_file,
659 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
660 if (vma->vm_ops && vma->vm_ops->set_policy) {
661 err = vma->vm_ops->set_policy(vma, new_pol);
662 if (err)
663 goto out;
664 }
672 } 665 }
673 666
674 out: 667 out:
@@ -933,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
933 926
934 if (!list_empty(&pagelist)) { 927 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 928 err = migrate_pages(&pagelist, new_node_page, dest,
936 false, true); 929 false, MIGRATE_SYNC);
937 if (err) 930 if (err)
938 putback_lru_pages(&pagelist); 931 putback_lru_pages(&pagelist);
939 } 932 }
@@ -1817,18 +1810,24 @@ struct page *
1817alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1810alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818 unsigned long addr, int node) 1811 unsigned long addr, int node)
1819{ 1812{
1820 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1813 struct mempolicy *pol;
1821 struct zonelist *zl; 1814 struct zonelist *zl;
1822 struct page *page; 1815 struct page *page;
1816 unsigned int cpuset_mems_cookie;
1817
1818retry_cpuset:
1819 pol = get_vma_policy(current, vma, addr);
1820 cpuset_mems_cookie = get_mems_allowed();
1823 1821
1824 get_mems_allowed();
1825 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1822 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1826 unsigned nid; 1823 unsigned nid;
1827 1824
1828 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1825 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1829 mpol_cond_put(pol); 1826 mpol_cond_put(pol);
1830 page = alloc_page_interleave(gfp, order, nid); 1827 page = alloc_page_interleave(gfp, order, nid);
1831 put_mems_allowed(); 1828 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1829 goto retry_cpuset;
1830
1832 return page; 1831 return page;
1833 } 1832 }
1834 zl = policy_zonelist(gfp, pol, node); 1833 zl = policy_zonelist(gfp, pol, node);
@@ -1839,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1839 struct page *page = __alloc_pages_nodemask(gfp, order, 1838 struct page *page = __alloc_pages_nodemask(gfp, order,
1840 zl, policy_nodemask(gfp, pol)); 1839 zl, policy_nodemask(gfp, pol));
1841 __mpol_put(pol); 1840 __mpol_put(pol);
1842 put_mems_allowed(); 1841 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1842 goto retry_cpuset;
1843 return page; 1843 return page;
1844 } 1844 }
1845 /* 1845 /*
@@ -1847,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1847 */ 1847 */
1848 page = __alloc_pages_nodemask(gfp, order, zl, 1848 page = __alloc_pages_nodemask(gfp, order, zl,
1849 policy_nodemask(gfp, pol)); 1849 policy_nodemask(gfp, pol));
1850 put_mems_allowed(); 1850 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1851 goto retry_cpuset;
1851 return page; 1852 return page;
1852} 1853}
1853 1854
@@ -1874,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1874{ 1875{
1875 struct mempolicy *pol = current->mempolicy; 1876 struct mempolicy *pol = current->mempolicy;
1876 struct page *page; 1877 struct page *page;
1878 unsigned int cpuset_mems_cookie;
1877 1879
1878 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1880 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1879 pol = &default_policy; 1881 pol = &default_policy;
1880 1882
1881 get_mems_allowed(); 1883retry_cpuset:
1884 cpuset_mems_cookie = get_mems_allowed();
1885
1882 /* 1886 /*
1883 * No reference counting needed for current->mempolicy 1887 * No reference counting needed for current->mempolicy
1884 * nor system default_policy 1888 * nor system default_policy
@@ -1889,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1889 page = __alloc_pages_nodemask(gfp, order, 1893 page = __alloc_pages_nodemask(gfp, order,
1890 policy_zonelist(gfp, pol, numa_node_id()), 1894 policy_zonelist(gfp, pol, numa_node_id()),
1891 policy_nodemask(gfp, pol)); 1895 policy_nodemask(gfp, pol));
1892 put_mems_allowed(); 1896
1897 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1898 goto retry_cpuset;
1899
1893 return page; 1900 return page;
1894} 1901}
1895EXPORT_SYMBOL(alloc_pages_current); 1902EXPORT_SYMBOL(alloc_pages_current);
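[Editorial note, not part of the patch] The pattern introduced above is a seqcount-style read: take a cookie from get_mems_allowed(), attempt the allocation, and retry only if the allocation failed while put_mems_allowed() reports that the cpuset's mems_allowed changed underneath. A small userspace model of that shape; read_begin(), read_retry() and try_allocate() are made-up names standing in for the cpuset helpers and the page allocator:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint seq;			/* bumped by the "updater" side */

static unsigned int read_begin(void)	/* stands in for get_mems_allowed() */
{
	return atomic_load(&seq);
}

static bool read_retry(unsigned int cookie)	/* stands in for !put_mems_allowed() */
{
	return atomic_load(&seq) != cookie;	/* true if an update raced with us */
}

static bool try_allocate(void)		/* stands in for __alloc_pages_nodemask() */
{
	static int calls;
	return ++calls > 1;		/* pretend the first attempt fails */
}

int main(void)
{
	unsigned int cookie;
	bool ok;

	do {
		cookie = read_begin();
		ok = try_allocate();
		atomic_fetch_add(&seq, 1);	/* simulate a concurrent cpuset update */
	} while (!ok && read_retry(cookie));

	printf("allocation %s after retrying on a stale mems_allowed snapshot\n",
	       ok ? "succeeded" : "gave up");
	return 0;
}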
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e67741..480714b6f3f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
@@ -220,6 +220,56 @@ out:
220 pte_unmap_unlock(ptep, ptl); 220 pte_unmap_unlock(ptep, ptl);
221} 221}
222 222
223#ifdef CONFIG_BLOCK
224/* Returns true if all buffers are successfully locked */
225static bool buffer_migrate_lock_buffers(struct buffer_head *head,
226 enum migrate_mode mode)
227{
228 struct buffer_head *bh = head;
229
230 /* Simple case, sync compaction */
231 if (mode != MIGRATE_ASYNC) {
232 do {
233 get_bh(bh);
234 lock_buffer(bh);
235 bh = bh->b_this_page;
236
237 } while (bh != head);
238
239 return true;
240 }
241
242 /* async case, we cannot block on lock_buffer so use trylock_buffer */
243 do {
244 get_bh(bh);
245 if (!trylock_buffer(bh)) {
246 /*
247 * We failed to lock the buffer and cannot stall in
248 * async migration. Release the taken locks
249 */
250 struct buffer_head *failed_bh = bh;
251 put_bh(failed_bh);
252 bh = head;
253 while (bh != failed_bh) {
254 unlock_buffer(bh);
255 put_bh(bh);
256 bh = bh->b_this_page;
257 }
258 return false;
259 }
260
261 bh = bh->b_this_page;
262 } while (bh != head);
263 return true;
264}
265#else
266static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
267 enum migrate_mode mode)
268{
269 return true;
270}
271#endif /* CONFIG_BLOCK */
272
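[Editorial note, not part of the patch] buffer_migrate_lock_buffers() above is an instance of a general non-blocking locking pattern: in the MIGRATE_ASYNC case, try-lock every buffer and, on the first failure, unwind the locks already taken instead of sleeping. A userspace analogue with POSIX mutexes, illustrative only (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NLOCKS 4

static pthread_mutex_t locks[NLOCKS];

static bool trylock_all(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			/*
			 * Roll back: unlock everything taken so far, like
			 * the failed_bh loop in buffer_migrate_lock_buffers().
			 */
			while (--i >= 0)
				pthread_mutex_unlock(&locks[i]);
			return false;
		}
	}
	return true;
}

static void unlock_all(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_unlock(&locks[i]);
}

int main(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);

	if (trylock_all()) {
		printf("took all %d locks without blocking\n", NLOCKS);
		unlock_all();
	} else {
		printf("backed off without blocking\n");
	}
	return 0;
}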
223/* 273/*
224 * Replace the page in the mapping. 274 * Replace the page in the mapping.
225 * 275 *
@@ -229,7 +279,8 @@ out:
229 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 279 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
230 */ 280 */
231static int migrate_page_move_mapping(struct address_space *mapping, 281static int migrate_page_move_mapping(struct address_space *mapping,
232 struct page *newpage, struct page *page) 282 struct page *newpage, struct page *page,
283 struct buffer_head *head, enum migrate_mode mode)
233{ 284{
234 int expected_count; 285 int expected_count;
235 void **pslot; 286 void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
259 } 310 }
260 311
261 /* 312 /*
313 * In the async migration case of moving a page with buffers, lock the
314 * buffers using trylock before the mapping is moved. If the mapping
315 * were moved first and we then failed to lock the buffers, we could
316 * not move the mapping back due to an elevated page count, and would
317 * have to block waiting on other references to be dropped.
318 */
319 if (mode == MIGRATE_ASYNC && head &&
320 !buffer_migrate_lock_buffers(head, mode)) {
321 page_unfreeze_refs(page, expected_count);
322 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN;
324 }
325
326 /*
262 * Now we know that no one else is looking at the page. 327 * Now we know that no one else is looking at the page.
263 */ 328 */
264 get_page(newpage); /* add cache reference */ 329 get_page(newpage); /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
415 * Pages are locked upon entry and exit. 480 * Pages are locked upon entry and exit.
416 */ 481 */
417int migrate_page(struct address_space *mapping, 482int migrate_page(struct address_space *mapping,
418 struct page *newpage, struct page *page) 483 struct page *newpage, struct page *page,
484 enum migrate_mode mode)
419{ 485{
420 int rc; 486 int rc;
421 487
422 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 488 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
423 489
424 rc = migrate_page_move_mapping(mapping, newpage, page); 490 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
425 491
426 if (rc) 492 if (rc)
427 return rc; 493 return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
438 * exist. 504 * exist.
439 */ 505 */
440int buffer_migrate_page(struct address_space *mapping, 506int buffer_migrate_page(struct address_space *mapping,
441 struct page *newpage, struct page *page) 507 struct page *newpage, struct page *page, enum migrate_mode mode)
442{ 508{
443 struct buffer_head *bh, *head; 509 struct buffer_head *bh, *head;
444 int rc; 510 int rc;
445 511
446 if (!page_has_buffers(page)) 512 if (!page_has_buffers(page))
447 return migrate_page(mapping, newpage, page); 513 return migrate_page(mapping, newpage, page, mode);
448 514
449 head = page_buffers(page); 515 head = page_buffers(page);
450 516
451 rc = migrate_page_move_mapping(mapping, newpage, page); 517 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
452 518
453 if (rc) 519 if (rc)
454 return rc; 520 return rc;
455 521
456 bh = head; 522 /*
457 do { 523 * In the async case, migrate_page_move_mapping locked the buffers
458 get_bh(bh); 524 * with an IRQ-safe spinlock held. In the sync case, the buffers
459 lock_buffer(bh); 525 * need to be locked now
460 bh = bh->b_this_page; 526 */
461 527 if (mode != MIGRATE_ASYNC)
462 } while (bh != head); 528 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
463 529
464 ClearPagePrivate(page); 530 ClearPagePrivate(page);
465 set_page_private(newpage, page_private(page)); 531 set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
536 * Default handling if a filesystem does not provide a migration function. 602 * Default handling if a filesystem does not provide a migration function.
537 */ 603 */
538static int fallback_migrate_page(struct address_space *mapping, 604static int fallback_migrate_page(struct address_space *mapping,
539 struct page *newpage, struct page *page) 605 struct page *newpage, struct page *page, enum migrate_mode mode)
540{ 606{
541 if (PageDirty(page)) 607 if (PageDirty(page)) {
608 /* Only writeback pages in full synchronous migration */
609 if (mode != MIGRATE_SYNC)
610 return -EBUSY;
542 return writeout(mapping, page); 611 return writeout(mapping, page);
612 }
543 613
544 /* 614 /*
545 * Buffers may be managed in a filesystem specific way. 615 * Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
549 !try_to_release_page(page, GFP_KERNEL)) 619 !try_to_release_page(page, GFP_KERNEL))
550 return -EAGAIN; 620 return -EAGAIN;
551 621
552 return migrate_page(mapping, newpage, page); 622 return migrate_page(mapping, newpage, page, mode);
553} 623}
554 624
555/* 625/*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
564 * == 0 - success 634 * == 0 - success
565 */ 635 */
566static int move_to_new_page(struct page *newpage, struct page *page, 636static int move_to_new_page(struct page *newpage, struct page *page,
567 int remap_swapcache, bool sync) 637 int remap_swapcache, enum migrate_mode mode)
568{ 638{
569 struct address_space *mapping; 639 struct address_space *mapping;
570 int rc; 640 int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
585 655
586 mapping = page_mapping(page); 656 mapping = page_mapping(page);
587 if (!mapping) 657 if (!mapping)
588 rc = migrate_page(mapping, newpage, page); 658 rc = migrate_page(mapping, newpage, page, mode);
589 else { 659 else if (mapping->a_ops->migratepage)
590 /* 660 /*
591 * Do not writeback pages if !sync and migratepage is 661 * Most pages have a mapping and most filesystems provide a
592 * not pointing to migrate_page() which is nonblocking 662 * migratepage callback. Anonymous pages are part of swap
593 * (swapcache/tmpfs uses migratepage = migrate_page). 663 * space which also has its own migratepage callback. This
664 * is the most common path for page migration.
594 */ 665 */
595 if (PageDirty(page) && !sync && 666 rc = mapping->a_ops->migratepage(mapping,
596 mapping->a_ops->migratepage != migrate_page) 667 newpage, page, mode);
597 rc = -EBUSY; 668 else
598 else if (mapping->a_ops->migratepage) 669 rc = fallback_migrate_page(mapping, newpage, page, mode);
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
611 670
612 if (rc) { 671 if (rc) {
613 newpage->mapping = NULL; 672 newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
621 return rc; 680 return rc;
622} 681}
623 682
624/* 683static int __unmap_and_move(struct page *page, struct page *newpage,
625 * Obtain the lock on page, remove all ptes and migrate the page 684 int force, bool offlining, enum migrate_mode mode)
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629 struct page *page, int force, bool offlining, bool sync)
630{ 685{
631 int rc = 0; 686 int rc = -EAGAIN;
632 int *result = NULL;
633 struct page *newpage = get_new_page(page, private, &result);
634 int remap_swapcache = 1; 687 int remap_swapcache = 1;
635 int charge = 0; 688 int charge = 0;
636 struct mem_cgroup *mem; 689 struct mem_cgroup *mem;
637 struct anon_vma *anon_vma = NULL; 690 struct anon_vma *anon_vma = NULL;
638 691
639 if (!newpage)
640 return -ENOMEM;
641
642 if (page_count(page) == 1) {
643 /* page was freed from under us. So we are done. */
644 goto move_newpage;
645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
649
650 /* prepare cgroup just returns 0 or -ENOMEM */
651 rc = -EAGAIN;
652
653 if (!trylock_page(page)) { 692 if (!trylock_page(page)) {
654 if (!force || !sync) 693 if (!force || mode == MIGRATE_ASYNC)
655 goto move_newpage; 694 goto out;
656 695
657 /* 696 /*
658 * It's not safe for direct compaction to call lock_page. 697 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
668 * altogether. 707 * altogether.
669 */ 708 */
670 if (current->flags & PF_MEMALLOC) 709 if (current->flags & PF_MEMALLOC)
671 goto move_newpage; 710 goto out;
672 711
673 lock_page(page); 712 lock_page(page);
674 } 713 }
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
697 736
698 if (PageWriteback(page)) { 737 if (PageWriteback(page)) {
699 /* 738 /*
700 * For !sync, there is no point retrying as the retry loop 739 * Only in the case of a full syncronous migration is it
701 * is expected to be too short for PageWriteback to be cleared 740 * necessary to wait for PageWriteback. In the async case,
741 * the retry loop is too short and in the sync-light case,
742 * the overhead of stalling is too much
702 */ 743 */
703 if (!sync) { 744 if (mode != MIGRATE_SYNC) {
704 rc = -EBUSY; 745 rc = -EBUSY;
705 goto uncharge; 746 goto uncharge;
706 } 747 }
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
771 812
772skip_unmap: 813skip_unmap:
773 if (!page_mapped(page)) 814 if (!page_mapped(page))
774 rc = move_to_new_page(newpage, page, remap_swapcache, sync); 815 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
775 816
776 if (rc && remap_swapcache) 817 if (rc && remap_swapcache)
777 remove_migration_ptes(page, page); 818 remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
785 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 826 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock: 827unlock:
787 unlock_page(page); 828 unlock_page(page);
829out:
830 return rc;
831}
788 832
789move_newpage: 833/*
834 * Obtain the lock on page, remove all ptes and migrate the page
835 * to the newly allocated page in newpage.
836 */
837static int unmap_and_move(new_page_t get_new_page, unsigned long private,
838 struct page *page, int force, bool offlining,
839 enum migrate_mode mode)
840{
841 int rc = 0;
842 int *result = NULL;
843 struct page *newpage = get_new_page(page, private, &result);
844
845 if (!newpage)
846 return -ENOMEM;
847
848 if (page_count(page) == 1) {
849 /* page was freed from under us. So we are done. */
850 goto out;
851 }
852
853 if (unlikely(PageTransHuge(page)))
854 if (unlikely(split_huge_page(page)))
855 goto out;
856
857 rc = __unmap_and_move(page, newpage, force, offlining, mode);
858out:
790 if (rc != -EAGAIN) { 859 if (rc != -EAGAIN) {
791 /* 860 /*
792 * A page that has been migrated has all references 861 * A page that has been migrated has all references
793 * removed and will be freed. A page that has not been 862 * removed and will be freed. A page that has not been
794 * migrated will have kept its references and be 863 * migrated will have kept its references and be
795 * restored. 864 * restored.
796 */ 865 */
797 list_del(&page->lru); 866 list_del(&page->lru);
798 dec_zone_page_state(page, NR_ISOLATED_ANON + 867 dec_zone_page_state(page, NR_ISOLATED_ANON +
799 page_is_file_cache(page)); 868 page_is_file_cache(page));
800 putback_lru_page(page); 869 putback_lru_page(page);
801 } 870 }
802
803 /* 871 /*
804 * Move the new page to the LRU. If migration was not successful 872 * Move the new page to the LRU. If migration was not successful
805 * then this will free the page. 873 * then this will free the page.
806 */ 874 */
807 putback_lru_page(newpage); 875 putback_lru_page(newpage);
808
809 if (result) { 876 if (result) {
810 if (rc) 877 if (rc)
811 *result = rc; 878 *result = rc;
@@ -835,7 +902,8 @@ move_newpage:
835 */ 902 */
836static int unmap_and_move_huge_page(new_page_t get_new_page, 903static int unmap_and_move_huge_page(new_page_t get_new_page,
837 unsigned long private, struct page *hpage, 904 unsigned long private, struct page *hpage,
838 int force, bool offlining, bool sync) 905 int force, bool offlining,
906 enum migrate_mode mode)
839{ 907{
840 int rc = 0; 908 int rc = 0;
841 int *result = NULL; 909 int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
848 rc = -EAGAIN; 916 rc = -EAGAIN;
849 917
850 if (!trylock_page(hpage)) { 918 if (!trylock_page(hpage)) {
851 if (!force || !sync) 919 if (!force || mode != MIGRATE_SYNC)
852 goto out; 920 goto out;
853 lock_page(hpage); 921 lock_page(hpage);
854 } 922 }
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 927 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
860 928
861 if (!page_mapped(hpage)) 929 if (!page_mapped(hpage))
862 rc = move_to_new_page(new_hpage, hpage, 1, sync); 930 rc = move_to_new_page(new_hpage, hpage, 1, mode);
863 931
864 if (rc) 932 if (rc)
865 remove_migration_ptes(hpage, hpage); 933 remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
902 */ 970 */
903int migrate_pages(struct list_head *from, 971int migrate_pages(struct list_head *from,
904 new_page_t get_new_page, unsigned long private, bool offlining, 972 new_page_t get_new_page, unsigned long private, bool offlining,
905 bool sync) 973 enum migrate_mode mode)
906{ 974{
907 int retry = 1; 975 int retry = 1;
908 int nr_failed = 0; 976 int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
923 991
924 rc = unmap_and_move(get_new_page, private, 992 rc = unmap_and_move(get_new_page, private,
925 page, pass > 2, offlining, 993 page, pass > 2, offlining,
926 sync); 994 mode);
927 995
928 switch(rc) { 996 switch(rc) {
929 case -ENOMEM: 997 case -ENOMEM:
@@ -953,7 +1021,7 @@ out:
953 1021
954int migrate_huge_pages(struct list_head *from, 1022int migrate_huge_pages(struct list_head *from,
955 new_page_t get_new_page, unsigned long private, bool offlining, 1023 new_page_t get_new_page, unsigned long private, bool offlining,
956 bool sync) 1024 enum migrate_mode mode)
957{ 1025{
958 int retry = 1; 1026 int retry = 1;
959 int nr_failed = 0; 1027 int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
970 1038
971 rc = unmap_and_move_huge_page(get_new_page, 1039 rc = unmap_and_move_huge_page(get_new_page,
972 private, page, pass > 2, offlining, 1040 private, page, pass > 2, offlining,
973 sync); 1041 mode);
974 1042
975 switch(rc) { 1043 switch(rc) {
976 case -ENOMEM: 1044 case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
1099 err = 0; 1167 err = 0;
1100 if (!list_empty(&pagelist)) { 1168 if (!list_empty(&pagelist)) {
1101 err = migrate_pages(&pagelist, new_page_node, 1169 err = migrate_pages(&pagelist, new_page_node,
1102 (unsigned long)pm, 0, true); 1170 (unsigned long)pm, 0, MIGRATE_SYNC);
1103 if (err) 1171 if (err)
1104 putback_lru_pages(&pagelist); 1172 putback_lru_pages(&pagelist);
1105 } 1173 }
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..117ff549279 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -161,7 +161,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
161 } 161 }
162 /* fall through */ 162 /* fall through */
163 } 163 }
164 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
165 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
166 else 166 else
167 mincore_pte_range(vma, pmd, addr, next, vec); 167 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088..71c78115c45 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f258..e39e3efe4a4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
83 83
84static void __init __free_pages_memory(unsigned long start, unsigned long end) 84static void __init __free_pages_memory(unsigned long start, unsigned long end)
85{ 85{
86 int i; 86 unsigned long i, start_aligned, end_aligned;
87 unsigned long start_aligned, end_aligned;
88 int order = ilog2(BITS_PER_LONG); 87 int order = ilog2(BITS_PER_LONG);
89 88
90 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 89 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
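[Editorial note, not part of the patch] The __free_pages_memory() context above uses the usual power-of-two rounding idiom: (x + (N - 1)) & ~(N - 1) rounds x up to a multiple of N, and x & ~(N - 1) rounds it down; the patch itself only widens the loop counter to unsigned long so large PFNs do not overflow an int. A standalone check of the arithmetic, for illustration:

#include <stdio.h>

static unsigned long align_up(unsigned long x, unsigned long n)
{
	/* n must be a power of two */
	return (x + (n - 1)) & ~(n - 1);
}

static unsigned long align_down(unsigned long x, unsigned long n)
{
	return x & ~(n - 1);
}

int main(void)
{
	unsigned long bits = 64;	/* stands in for BITS_PER_LONG */

	printf("align_up(100, %lu)   = %lu\n", bits, align_up(100, bits));	/* 128 */
	printf("align_down(100, %lu) = %lu\n", bits, align_down(100, bits));	/* 64 */
	return 0;
}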
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a397..5ff9b35883e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -697,9 +697,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
697 if (vma->vm_file) { 697 if (vma->vm_file) {
698 mapping = vma->vm_file->f_mapping; 698 mapping = vma->vm_file->f_mapping;
699 699
700 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 701 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 702 vma_prio_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 703 flush_dcache_mmap_unlock(mapping);
704 mutex_unlock(&mapping->i_mmap_mutex);
703 } 705 }
704 706
705 /* add the VMA to the tree */ 707 /* add the VMA to the tree */
@@ -761,9 +763,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
761 if (vma->vm_file) { 763 if (vma->vm_file) {
762 mapping = vma->vm_file->f_mapping; 764 mapping = vma->vm_file->f_mapping;
763 765
766 mutex_lock(&mapping->i_mmap_mutex);
764 flush_dcache_mmap_lock(mapping); 767 flush_dcache_mmap_lock(mapping);
765 vma_prio_tree_remove(vma, &mapping->i_mmap); 768 vma_prio_tree_remove(vma, &mapping->i_mmap);
766 flush_dcache_mmap_unlock(mapping); 769 flush_dcache_mmap_unlock(mapping);
770 mutex_unlock(&mapping->i_mmap_mutex);
767 } 771 }
768 772
769 /* remove from the MM's tree and list */ 773 /* remove from the MM's tree and list */
@@ -776,8 +780,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
776 780
777 if (vma->vm_next) 781 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev; 782 vma->vm_next->vm_prev = vma->vm_prev;
779
780 vma->vm_mm = NULL;
781} 783}
782 784
783/* 785/*
@@ -2061,6 +2063,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2061 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2063 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2062 2064
2063 down_write(&nommu_region_sem); 2065 down_write(&nommu_region_sem);
2066 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2064 2067
2065 /* search for VMAs that fall within the dead zone */ 2068 /* search for VMAs that fall within the dead zone */
2066 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2069 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
@@ -2068,6 +2071,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2068 /* found one - only interested if it's shared out of the page 2071 /* found one - only interested if it's shared out of the page
2069 * cache */ 2072 * cache */
2070 if (vma->vm_flags & VM_SHARED) { 2073 if (vma->vm_flags & VM_SHARED) {
2074 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2071 up_write(&nommu_region_sem); 2075 up_write(&nommu_region_sem);
2072 return -ETXTBSY; /* not quite true, but near enough */ 2076 return -ETXTBSY; /* not quite true, but near enough */
2073 } 2077 }
@@ -2095,6 +2099,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2095 } 2099 }
2096 } 2100 }
2097 2101
2102 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2098 up_write(&nommu_region_sem); 2103 up_write(&nommu_region_sem);
2099 return 0; 2104 return 0;
2100} 2105}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca35..7c72487ca45 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p,
162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
163 const nodemask_t *nodemask, unsigned long totalpages) 163 const nodemask_t *nodemask, unsigned long totalpages)
164{ 164{
165 int points; 165 long points;
166 166
167 if (oom_unkillable_task(p, mem, nodemask)) 167 if (oom_unkillable_task(p, mem, nodemask))
168 return 0; 168 return 0;
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 303 do_each_thread(g, p) {
304 unsigned int points; 304 unsigned int points;
305 305
306 if (!p->mm) 306 if (p->exit_state)
307 continue; 307 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 309 continue;
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 */ 319 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
321 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
322 if (!p->mm)
323 continue;
322 324
323 if (p->flags & PF_EXITING) { 325 if (p->flags & PF_EXITING) {
324 /* 326 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f69886242..903e46bff32 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,24 +37,22 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 * will look to see if it needs to force writeback or throttling.
42 */ 41 */
43static long ratelimit_pages = 32; 42#define MAX_PAUSE max(HZ/5, 1)
44 43
45/* 44/*
46 * When balance_dirty_pages decides that the caller needs to perform some 45 * Estimate write bandwidth at 200ms intervals.
47 * non-background writeback, this is how many pages it will attempt to write.
48 * It should be somewhat larger than dirtied pages to ensure that reasonably
49 * large amounts of I/O are submitted.
50 */ 46 */
51static inline long sync_writeback_pages(unsigned long dirtied) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
52{
53 if (dirtied < ratelimit_pages)
54 dirtied = ratelimit_pages;
55 48
56 return dirtied + dirtied / 2; 49#define RATELIMIT_CALC_SHIFT 10
57} 50
51/*
52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
53 * will look to see if it needs to force writeback or throttling.
54 */
55static long ratelimit_pages = 32;
58 56
59/* The following parameters are exported via /proc/sys/vm */ 57/* The following parameters are exported via /proc/sys/vm */
60 58
@@ -111,6 +109,7 @@ EXPORT_SYMBOL(laptop_mode);
111 109
112/* End of sysctl-exported parameters */ 110/* End of sysctl-exported parameters */
113 111
112unsigned long global_dirty_limit;
114 113
115/* 114/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -156,6 +155,8 @@ static void update_completion_period(void)
156 int shift = calc_period_shift(); 155 int shift = calc_period_shift();
157 prop_change_shift(&vm_completions, shift); 156 prop_change_shift(&vm_completions, shift);
158 prop_change_shift(&vm_dirties, shift); 157 prop_change_shift(&vm_dirties, shift);
158
159 writeback_set_ratelimit();
159} 160}
160 161
161int dirty_background_ratio_handler(struct ctl_table *table, int write, 162int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -219,6 +220,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 220 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 221static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 222{
223 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 224 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 225 bdi->max_prop_frac);
224} 226}
@@ -244,50 +246,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 246static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 247 long *numerator, long *denominator)
246{ 248{
247 if (bdi_cap_writeback_dirty(bdi)) { 249 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 250 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254}
255
256static inline void task_dirties_fraction(struct task_struct *tsk,
257 long *numerator, long *denominator)
258{
259 prop_fraction_single(&vm_dirties, &tsk->dirties,
260 numerator, denominator);
261}
262
263/*
264 * task_dirty_limit - scale down dirty throttling threshold for one task
265 *
266 * task specific dirty limit:
267 *
268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
276 */
277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
279{
280 long numerator, denominator;
281 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3;
283
284 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator;
286 do_div(inv, denominator);
287
288 dirty -= inv;
289
290 return max(dirty, bdi_dirty/2);
291} 251}
292 252
293/* 253/*
@@ -397,6 +357,17 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 357 return x + 1; /* Ensure that we never return 0 */
398} 358}
399 359
360static unsigned long dirty_freerun_ceiling(unsigned long thresh,
361 unsigned long bg_thresh)
362{
363 return (thresh + bg_thresh) / 2;
364}
365
366static unsigned long hard_dirty_limit(unsigned long thresh)
367{
368 return max(thresh, global_dirty_limit);
369}
370
400/* 371/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 372 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 373 *
@@ -435,12 +406,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 406 }
436 *pbackground = background; 407 *pbackground = background;
437 *pdirty = dirty; 408 *pdirty = dirty;
409 trace_global_dirty_state(background, dirty);
438} 410}
439 411
440/* 412/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 413 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
414 * @bdi: the backing_dev_info to query
415 * @dirty: global dirty limit in pages
416 *
417 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
418 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
419 * And the "limit" in the name is not seriously taken as hard limit in
420 * balance_dirty_pages().
442 * 421 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 422 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 423 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 424 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 425 *
@@ -469,36 +448,588 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
469} 448}
470 449
471/* 450/*
451 * Dirty position control.
452 *
453 * (o) global/bdi setpoints
454 *
455 * We want the dirty pages be balanced around the global/bdi setpoints.
456 * When the number of dirty pages is higher/lower than the setpoint, the
457 * dirty position control ratio (and hence task dirty ratelimit) will be
458 * decreased/increased to bring the dirty pages back to the setpoint.
459 *
460 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
461 *
462 * if (dirty < setpoint) scale up pos_ratio
463 * if (dirty > setpoint) scale down pos_ratio
464 *
465 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
466 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
467 *
468 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
469 *
470 * (o) global control line
471 *
472 * ^ pos_ratio
473 * |
474 * | |<===== global dirty control scope ======>|
475 * 2.0 .............*
476 * | .*
477 * | . *
478 * | . *
479 * | . *
480 * | . *
481 * | . *
482 * 1.0 ................................*
483 * | . . *
484 * | . . *
485 * | . . *
486 * | . . *
487 * | . . *
488 * 0 +------------.------------------.----------------------*------------->
489 * freerun^ setpoint^ limit^ dirty pages
490 *
491 * (o) bdi control line
492 *
493 * ^ pos_ratio
494 * |
495 * | *
496 * | *
497 * | *
498 * | *
499 * | * |<=========== span ============>|
500 * 1.0 .......................*
501 * | . *
502 * | . *
503 * | . *
504 * | . *
505 * | . *
506 * | . *
507 * | . *
508 * | . *
509 * | . *
510 * | . *
511 * | . *
512 * 1/4 ...............................................* * * * * * * * * * * *
513 * | . .
514 * | . .
515 * | . .
516 * 0 +----------------------.-------------------------------.------------->
517 * bdi_setpoint^ x_intercept^
518 *
519 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
520 * be smoothly throttled down to normal if it starts high in situations like
521 * - start writing to a slow SD card and a fast disk at the same time. The SD
522 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
 523 * - the bdi dirty thresh drops quickly due to a change of JBOD workload
524 */
525static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
526 unsigned long thresh,
527 unsigned long bg_thresh,
528 unsigned long dirty,
529 unsigned long bdi_thresh,
530 unsigned long bdi_dirty)
531{
532 unsigned long write_bw = bdi->avg_write_bandwidth;
533 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
534 unsigned long limit = hard_dirty_limit(thresh);
535 unsigned long x_intercept;
536 unsigned long setpoint; /* dirty pages' target balance point */
537 unsigned long bdi_setpoint;
538 unsigned long span;
539 long long pos_ratio; /* for scaling up/down the rate limit */
540 long x;
541
542 if (unlikely(dirty >= limit))
543 return 0;
544
545 /*
546 * global setpoint
547 *
548 * setpoint - dirty 3
549 * f(dirty) := 1.0 + (----------------)
550 * limit - setpoint
551 *
552 * it's a 3rd order polynomial that subjects to
553 *
 554 * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast
555 * (2) f(setpoint) = 1.0 => the balance point
556 * (3) f(limit) = 0 => the hard limit
557 * (4) df/dx <= 0 => negative feedback control
558 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
559 * => fast response on large errors; small oscillation near setpoint
560 */
561 setpoint = (freerun + limit) / 2;
562 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
563 limit - setpoint + 1);
564 pos_ratio = x;
565 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
566 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
567 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
568
569 /*
570 * We have computed basic pos_ratio above based on global situation. If
571 * the bdi is over/under its share of dirty pages, we want to scale
572 * pos_ratio further down/up. That is done by the following mechanism.
573 */
574
575 /*
576 * bdi setpoint
577 *
578 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
579 *
580 * x_intercept - bdi_dirty
581 * := --------------------------
582 * x_intercept - bdi_setpoint
583 *
584 * The main bdi control line is a linear function that subjects to
585 *
586 * (1) f(bdi_setpoint) = 1.0
587 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
588 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
589 *
590 * For single bdi case, the dirty pages are observed to fluctuate
591 * regularly within range
592 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
 593 * for various filesystems, where (2) can yield a reasonable 12.5%
594 * fluctuation range for pos_ratio.
595 *
596 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
597 * own size, so move the slope over accordingly and choose a slope that
598 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
599 */
600 if (unlikely(bdi_thresh > thresh))
601 bdi_thresh = thresh;
602 bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
603 /*
604 * scale global setpoint to bdi's:
605 * bdi_setpoint = setpoint * bdi_thresh / thresh
606 */
607 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
608 bdi_setpoint = setpoint * (u64)x >> 16;
609 /*
610 * Use span=(8*write_bw) in single bdi case as indicated by
 611 * (thresh - bdi_thresh ~= 0) and transition to bdi_thresh in the JBOD case.
612 *
613 * bdi_thresh thresh - bdi_thresh
614 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
615 * thresh thresh
616 */
617 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
618 x_intercept = bdi_setpoint + span;
619
620 if (bdi_dirty < x_intercept - span / 4) {
621 pos_ratio *= x_intercept - bdi_dirty;
622 do_div(pos_ratio, x_intercept - bdi_setpoint + 1);
623 } else
624 pos_ratio /= 4;
625
626 /*
 627 * bdi reserve area, safeguard against dirty pool underrun and disk idle.
628 * It may push the desired control point of global dirty pages higher
629 * than setpoint.
630 */
631 x_intercept = bdi_thresh / 2;
632 if (bdi_dirty < x_intercept) {
633 if (bdi_dirty > x_intercept / 8) {
634 pos_ratio *= x_intercept;
635 do_div(pos_ratio, bdi_dirty);
636 } else
637 pos_ratio *= 8;
638 }
639
640 return pos_ratio;
641}
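
The cubic control line above is easiest to sanity-check with concrete numbers. Below is a throwaway userspace sketch, not kernel code: RATELIMIT_CALC_SHIFT is assumed to be 10 (the kernel's fixed-point shift), the freerun/limit values are invented, and only the global part of bdi_position_ratio() is evaluated, not the bdi control line.

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT    10      /* assumed, as in the kernel headers */

static long long global_pos_ratio(unsigned long freerun, unsigned long limit,
                                  unsigned long dirty)
{
        unsigned long setpoint = (freerun + limit) / 2;
        long long unit = 1LL << RATELIMIT_CALC_SHIFT;
        long long x, pos_ratio;

        if (dirty >= limit)
                return 0;
        /* x = (setpoint - dirty) / (limit - setpoint), in fixed point */
        x = ((long long)setpoint - (long long)dirty) * unit /
            (long long)(limit - setpoint + 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x / unit;       /* x^2 */
        pos_ratio = pos_ratio * x / unit;       /* x^3 */
        return pos_ratio + unit;                /* 1.0 + x^3 */
}

int main(void)
{
        unsigned long freerun = 1000, limit = 3000;     /* pages, invented */
        unsigned long dirty[] = { 1000, 2000, 2500, 2999 };
        int i;

        for (i = 0; i < 4; i++)
                printf("dirty=%lu  pos_ratio=%.3f\n", dirty[i],
                       (double)global_pos_ratio(freerun, limit, dirty[i]) /
                       (1 << RATELIMIT_CALC_SHIFT));
        return 0;
}

Compiled and run, it prints pos_ratio close to 2.0 at the freerun ceiling, exactly 1.0 at the setpoint and near 0 just below the limit, matching constraints (1)-(3) in the comment.
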
642
643static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
644 unsigned long elapsed,
645 unsigned long written)
646{
647 const unsigned long period = roundup_pow_of_two(3 * HZ);
648 unsigned long avg = bdi->avg_write_bandwidth;
649 unsigned long old = bdi->write_bandwidth;
650 u64 bw;
651
652 /*
653 * bw = written * HZ / elapsed
654 *
655 * bw * elapsed + write_bandwidth * (period - elapsed)
656 * write_bandwidth = ---------------------------------------------------
657 * period
658 */
659 bw = written - bdi->written_stamp;
660 bw *= HZ;
661 if (unlikely(elapsed > period)) {
662 do_div(bw, elapsed);
663 avg = bw;
664 goto out;
665 }
666 bw += (u64)bdi->write_bandwidth * (period - elapsed);
667 bw >>= ilog2(period);
668
669 /*
670 * one more level of smoothing, for filtering out sudden spikes
671 */
672 if (avg > old && old >= (unsigned long)bw)
673 avg -= (avg - old) >> 3;
674
675 if (avg < old && old <= (unsigned long)bw)
676 avg += (old - avg) >> 3;
677
678out:
679 bdi->write_bandwidth = bw;
680 bdi->avg_write_bandwidth = avg;
681}
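
To see what the period weighting does, here is a rough userspace sketch of just the first-level estimate (the avg spike filter is left out). HZ is assumed to be 100, the write pattern is invented, and update_bw() is a hypothetical helper that only mirrors the arithmetic in the comment above.

#include <stdio.h>

#define HZ      100     /* assumed */
#define PERIOD  512     /* roundup_pow_of_two(3 * HZ) for HZ == 100 */

/* one estimation step: pages written and jiffies elapsed since the last one */
static unsigned long update_bw(unsigned long old_bw, unsigned long written,
                               unsigned long elapsed)
{
        unsigned long long bw = (unsigned long long)written * HZ;

        if (elapsed > PERIOD)
                return bw / elapsed;    /* sample covers the whole period */

        bw += (unsigned long long)old_bw * (PERIOD - elapsed);
        return bw / PERIOD;             /* the kernel does >> ilog2(period) */
}

int main(void)
{
        unsigned long bw = 0;
        int i;

        /* pretend the disk completes 2560 pages (10MB) every 20 jiffies (200ms) */
        for (i = 0; i < 200; i++)
                bw = update_bw(bw, 2560, 20);
        printf("estimated write bandwidth: %lu pages/s (~%lu MB/s)\n",
               bw, bw * 4 / 1024);
        return 0;
}

With a steady 10MB per 200ms the estimate converges to about 12800 pages/s (roughly 50MB/s with 4KB pages); because each 200ms sample carries only elapsed/period (about 4%) of the weight, short hiccups barely move it.
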
682
683/*
684 * The global dirtyable memory and dirty threshold could be suddenly knocked
 685 * down by a large amount (e.g. on the startup of KVM in a swapless system).
686 * This may throw the system into deep dirty exceeded state and throttle
687 * heavy/light dirtiers alike. To retain good responsiveness, maintain
688 * global_dirty_limit for tracking slowly down to the knocked down dirty
689 * threshold.
690 */
691static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
692{
693 unsigned long limit = global_dirty_limit;
694
695 /*
696 * Follow up in one step.
697 */
698 if (limit < thresh) {
699 limit = thresh;
700 goto update;
701 }
702
703 /*
704 * Follow down slowly. Use the higher one as the target, because thresh
705 * may drop below dirty. This is exactly the reason to introduce
706 * global_dirty_limit which is guaranteed to lie above the dirty pages.
707 */
708 thresh = max(thresh, dirty);
709 if (limit > thresh) {
710 limit -= (limit - thresh) >> 5;
711 goto update;
712 }
713 return;
714update:
715 global_dirty_limit = limit;
716}
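
A quick way to see the asymmetric tracking is to iterate the decay step by hand. This is only an illustration with invented numbers; the 1/32 factor is the one used above, applied once per ~200ms bandwidth update.

#include <stdio.h>

int main(void)
{
        unsigned long limit = 100000;   /* old global_dirty_limit, in pages */
        unsigned long thresh = 20000;   /* the threshold after a sudden drop */
        unsigned long dirty = 15000;    /* current dirty pages */
        unsigned long target = thresh > dirty ? thresh : dirty;
        int step;

        printf("start: global_dirty_limit = %lu\n", limit);
        for (step = 1; step <= 10 && limit > target; step++) {
                limit -= (limit - target) >> 5; /* the same 1/32 decay as above */
                printf("update %2d: global_dirty_limit = %lu\n", step, limit);
        }
        return 0;
}

So after a sudden drop of the threshold, global_dirty_limit loses about 3% of the remaining gap per update and takes tens of seconds to settle, instead of instantly declaring every dirtier over the limit.
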
717
718static void global_update_bandwidth(unsigned long thresh,
719 unsigned long dirty,
720 unsigned long now)
721{
722 static DEFINE_SPINLOCK(dirty_lock);
723 static unsigned long update_time;
724
725 /*
 726 * check locklessly first to optimize away locking most of the time
727 */
728 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
729 return;
730
731 spin_lock(&dirty_lock);
732 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
733 update_dirty_limit(thresh, dirty);
734 update_time = now;
735 }
736 spin_unlock(&dirty_lock);
737}
738
739/*
740 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
741 *
 742 * Normal bdi tasks will be curbed at or below it in the long term.
743 * Obviously it should be around (write_bw / N) when there are N dd tasks.
744 */
745static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
746 unsigned long thresh,
747 unsigned long bg_thresh,
748 unsigned long dirty,
749 unsigned long bdi_thresh,
750 unsigned long bdi_dirty,
751 unsigned long dirtied,
752 unsigned long elapsed)
753{
754 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
755 unsigned long limit = hard_dirty_limit(thresh);
756 unsigned long setpoint = (freerun + limit) / 2;
757 unsigned long write_bw = bdi->avg_write_bandwidth;
758 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
759 unsigned long dirty_rate;
760 unsigned long task_ratelimit;
761 unsigned long balanced_dirty_ratelimit;
762 unsigned long pos_ratio;
763 unsigned long step;
764 unsigned long x;
765
766 /*
 767 * The dirty rate will match the writeout rate in the long term, except
768 * when dirty pages are truncated by userspace or re-dirtied by FS.
769 */
770 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
771
772 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
773 bdi_thresh, bdi_dirty);
774 /*
775 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
776 */
777 task_ratelimit = (u64)dirty_ratelimit *
778 pos_ratio >> RATELIMIT_CALC_SHIFT;
 779 task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
780
781 /*
782 * A linear estimation of the "balanced" throttle rate. The theory is,
783 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
784 * dirty_rate will be measured to be (N * task_ratelimit). So the below
785 * formula will yield the balanced rate limit (write_bw / N).
786 *
787 * Note that the expanded form is not a pure rate feedback:
788 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
789 * but also takes pos_ratio into account:
790 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
791 *
792 * (1) is not realistic because pos_ratio also takes part in balancing
793 * the dirty rate. Consider the state
794 * pos_ratio = 0.5 (3)
795 * rate = 2 * (write_bw / N) (4)
 796 * If (1) is used, it will get stuck in that state, because each dd will
797 * be throttled at
798 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
799 * yielding
800 * dirty_rate = N * task_ratelimit = write_bw (6)
801 * put (6) into (1) we get
802 * rate_(i+1) = rate_(i) (7)
803 *
804 * So we end up using (2) to always keep
805 * rate_(i+1) ~= (write_bw / N) (8)
806 * regardless of the value of pos_ratio. As long as (8) is satisfied,
807 * pos_ratio is able to drive itself to 1.0, which is not only where
 808 * the dirty count meets the setpoint, but also where the slope of
809 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
810 */
811 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
812 dirty_rate | 1);
813
814 /*
815 * We could safely do this and return immediately:
816 *
817 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
818 *
819 * However to get a more stable dirty_ratelimit, the below elaborated
 820 * code makes use of task_ratelimit to filter out singular points and
821 * limit the step size.
822 *
823 * The below code essentially only uses the relative value of
824 *
825 * task_ratelimit - dirty_ratelimit
826 * = (pos_ratio - 1) * dirty_ratelimit
827 *
828 * which reflects the direction and size of dirty position error.
829 */
830
831 /*
832 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
833 * task_ratelimit is on the same side of dirty_ratelimit, too.
834 * For example, when
835 * - dirty_ratelimit > balanced_dirty_ratelimit
836 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
837 * lowering dirty_ratelimit will help meet both the position and rate
838 * control targets. Otherwise, don't update dirty_ratelimit if it will
839 * only help meet the rate target. After all, what the users ultimately
840 * feel and care are stable dirty rate and small position error.
841 *
842 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
 843 * and filter out the singular points of balanced_dirty_ratelimit, which
844 * keeps jumping around randomly and can even leap far away at times
845 * due to the small 200ms estimation period of dirty_rate (we want to
846 * keep that period small to reduce time lags).
847 */
848 step = 0;
849 if (dirty < setpoint) {
850 x = min(bdi->balanced_dirty_ratelimit,
851 min(balanced_dirty_ratelimit, task_ratelimit));
852 if (dirty_ratelimit < x)
853 step = x - dirty_ratelimit;
854 } else {
855 x = max(bdi->balanced_dirty_ratelimit,
856 max(balanced_dirty_ratelimit, task_ratelimit));
857 if (dirty_ratelimit > x)
858 step = dirty_ratelimit - x;
859 }
860
861 /*
862 * Don't pursue 100% rate matching. It's impossible since the balanced
863 * rate itself is constantly fluctuating. So decrease the track speed
864 * when it gets close to the target. Helps eliminate pointless tremors.
865 */
866 step >>= dirty_ratelimit / (2 * step + 1);
867 /*
868 * Limit the tracking speed to avoid overshooting.
869 */
870 step = (step + 7) / 8;
871
872 if (dirty_ratelimit < balanced_dirty_ratelimit)
873 dirty_ratelimit += step;
874 else
875 dirty_ratelimit -= step;
876
877 bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
878 bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
879
880 trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
881}
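
The claim that formula (2) settles at write_bw / N regardless of pos_ratio can be checked with a toy model. The sketch below uses invented numbers, a fixed pos_ratio and none of the step limiting above, so it only illustrates the balanced_dirty_ratelimit estimation, not the full algorithm.

#include <stdio.h>

int main(void)
{
        unsigned long write_bw = 12800; /* pages/s, invented */
        unsigned long rate = 100000;    /* wildly wrong initial dirty_ratelimit */
        double pos_ratio = 0.5;         /* arbitrary position error, held constant */
        int N = 4;                      /* number of dd-style dirtiers */
        int i;

        for (i = 0; i < 5; i++) {
                /* each task is throttled at task_ratelimit = rate * pos_ratio */
                double task_ratelimit = rate * pos_ratio;
                /* so the bdi observes dirty_rate = N * task_ratelimit */
                double dirty_rate = N * task_ratelimit;
                /* formula (2): rate_(i+1) = rate_(i) * pos_ratio * write_bw / dirty_rate */
                rate = (unsigned long)(task_ratelimit * write_bw / dirty_rate);
                printf("iteration %d: balanced rate = %lu (write_bw / N = %lu)\n",
                       i, rate, write_bw / N);
        }
        return 0;
}

After the first iteration the balanced rate is already write_bw / N = 3200 pages/s and stays there, whatever pos_ratio happens to be; the machinery above only decides how quickly and smoothly dirty_ratelimit is allowed to move toward it.
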
882
883void __bdi_update_bandwidth(struct backing_dev_info *bdi,
884 unsigned long thresh,
885 unsigned long bg_thresh,
886 unsigned long dirty,
887 unsigned long bdi_thresh,
888 unsigned long bdi_dirty,
889 unsigned long start_time)
890{
891 unsigned long now = jiffies;
892 unsigned long elapsed = now - bdi->bw_time_stamp;
893 unsigned long dirtied;
894 unsigned long written;
895
896 /*
897 * rate-limit, only update once every 200ms.
898 */
899 if (elapsed < BANDWIDTH_INTERVAL)
900 return;
901
902 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
903 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
904
905 /*
906 * Skip quiet periods when disk bandwidth is under-utilized.
907 * (at least 1s idle time between two flusher runs)
908 */
909 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
910 goto snapshot;
911
912 if (thresh) {
913 global_update_bandwidth(thresh, dirty, now);
914 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
915 bdi_thresh, bdi_dirty,
916 dirtied, elapsed);
917 }
918 bdi_update_write_bandwidth(bdi, elapsed, written);
919
920snapshot:
921 bdi->dirtied_stamp = dirtied;
922 bdi->written_stamp = written;
923 bdi->bw_time_stamp = now;
924}
925
926static void bdi_update_bandwidth(struct backing_dev_info *bdi,
927 unsigned long thresh,
928 unsigned long bg_thresh,
929 unsigned long dirty,
930 unsigned long bdi_thresh,
931 unsigned long bdi_dirty,
932 unsigned long start_time)
933{
934 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
935 return;
936 spin_lock(&bdi->wb.list_lock);
937 __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
938 bdi_thresh, bdi_dirty, start_time);
939 spin_unlock(&bdi->wb.list_lock);
940}
941
942/*
943 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
944 * will look to see if it needs to start dirty throttling.
945 *
946 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
947 * global_page_state() too often. So scale it near-sqrt to the safety margin
948 * (the number of pages we may dirty without exceeding the dirty limits).
949 */
950static unsigned long dirty_poll_interval(unsigned long dirty,
951 unsigned long thresh)
952{
953 if (thresh > dirty)
954 return 1UL << (ilog2(thresh - dirty) >> 1);
955
956 return 1;
957}
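
For illustration, the near-sqrt scaling can be tabulated in userspace; ilog2_ul() and poll_interval() below are hypothetical stand-ins for the kernel helpers, and the threshold is invented.

#include <stdio.h>

static unsigned int ilog2_ul(unsigned long x)   /* floor(log2(x)), x > 0 */
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

static unsigned long poll_interval(unsigned long dirty, unsigned long thresh)
{
        if (thresh > dirty)
                return 1UL << (ilog2_ul(thresh - dirty) >> 1);
        return 1;
}

int main(void)
{
        unsigned long thresh = 100000;  /* invented dirty threshold, in pages */
        unsigned long margins[] = { 100000, 10000, 1000, 100, 10 };
        int i;

        for (i = 0; i < 5; i++)
                printf("margin=%6lu  poll every %3lu dirtied pages\n",
                       margins[i], poll_interval(thresh - margins[i], thresh));
        return 0;
}

The interval tracks roughly the square root of the remaining margin, so a task polls the expensive global counters rarely when far from the limit and every couple of pages when close to it.
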
958
959static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
960 unsigned long bdi_dirty)
961{
962 unsigned long bw = bdi->avg_write_bandwidth;
963 unsigned long hi = ilog2(bw);
964 unsigned long lo = ilog2(bdi->dirty_ratelimit);
965 unsigned long t;
966
967 /* target for 20ms max pause on 1-dd case */
968 t = HZ / 50;
969
970 /*
971 * Scale up pause time for concurrent dirtiers in order to reduce CPU
972 * overheads.
973 *
974 * (N * 20ms) on 2^N concurrent tasks.
975 */
976 if (hi > lo)
977 t += (hi - lo) * (20 * HZ) / 1024;
978
979 /*
 980 * Limit pause time for small memory systems. If sleeping for too
 981 * long, a small pool of dirty/writeback pages may go empty and the
 982 * disk may go idle.
983 *
984 * 8 serves as the safety ratio.
985 */
986 if (bdi_dirty)
987 t = min(t, bdi_dirty * HZ / (8 * bw + 1));
988
989 /*
990 * The pause time will be settled within range (max_pause/4, max_pause).
991 * Apply a minimal value of 4 to get a non-zero max_pause/4.
992 */
993 return clamp_val(t, 4, MAX_PAUSE);
994}
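
To get a feel for the resulting numbers, here is a self-contained sketch. HZ is assumed to be 1000, MAX_PAUSE is assumed to be a 200ms cap, ilog2_ul() and clampv() are hypothetical stand-ins for the kernel helpers, and the bandwidth/ratelimit figures are invented.

#include <stdio.h>

#define HZ              1000            /* assumed */
#define MAX_PAUSE       (HZ / 5)        /* assumed 200ms cap */

static unsigned int ilog2_ul(unsigned long x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long max_pause(unsigned long write_bw, unsigned long ratelimit,
                               unsigned long bdi_dirty)
{
        unsigned long hi = ilog2_ul(write_bw);
        unsigned long lo = ilog2_ul(ratelimit);
        unsigned long t = HZ / 50;              /* 20ms base pause */
        unsigned long cap;

        if (hi > lo)                            /* ~2^(hi-lo) concurrent dirtiers */
                t += (hi - lo) * (20 * HZ) / 1024;
        if (bdi_dirty) {                        /* don't drain a small pool dry */
                cap = bdi_dirty * HZ / (8 * write_bw + 1);
                if (t > cap)
                        t = cap;
        }
        return clampv(t, 4, MAX_PAUSE);
}

int main(void)
{
        /* 12800 pages/s disk; 8 dirtiers share it at ~1600 pages/s each */
        printf("1 dirtier:        %lu jiffies\n", max_pause(12800, 12800, 50000));
        printf("8 dirtiers:       %lu jiffies\n", max_pause(12800, 1600, 50000));
        printf("nearly empty bdi: %lu jiffies\n", max_pause(12800, 1600, 500));
        return 0;
}

A single dirtier gets the 20ms target, eight concurrent dirtiers (hi - lo = 3) stretch it to roughly 80ms, and a nearly empty bdi pool clamps it to the 4-jiffy floor so the disk is not allowed to run dry.
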
995
996/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 997 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 998 * data. It looks at the number of dirty pages in the machine and will force
474 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 999 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
475 * If we're over `background_thresh' then the writeback threads are woken to 1000 * If we're over `background_thresh' then the writeback threads are woken to
476 * perform some writeout. 1001 * perform some writeout.
477 */ 1002 */
478static void balance_dirty_pages(struct address_space *mapping, 1003static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 1004 unsigned long pages_dirtied)
480{ 1005{
481 long nr_reclaimable, bdi_nr_reclaimable; 1006 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
482 long nr_writeback, bdi_nr_writeback; 1007 unsigned long bdi_reclaimable;
1008 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1009 unsigned long bdi_dirty;
1010 unsigned long freerun;
483 unsigned long background_thresh; 1011 unsigned long background_thresh;
484 unsigned long dirty_thresh; 1012 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 1013 unsigned long bdi_thresh;
486 unsigned long pages_written = 0; 1014 long pause = 0;
487 unsigned long pause = 1; 1015 long max_pause;
488 bool dirty_exceeded = false; 1016 bool dirty_exceeded = false;
1017 unsigned long task_ratelimit;
1018 unsigned long dirty_ratelimit;
1019 unsigned long pos_ratio;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 1020 struct backing_dev_info *bdi = mapping->backing_dev_info;
1021 unsigned long start_time = jiffies;
490 1022
491 for (;;) { 1023 for (;;) {
492 struct writeback_control wbc = { 1024 /*
493 .sync_mode = WB_SYNC_NONE, 1025 * Unstable writes are a feature of certain networked
494 .older_than_this = NULL, 1026 * filesystems (i.e. NFS) in which data may have been
495 .nr_to_write = write_chunk, 1027 * written to the server's write cache, but has not yet
496 .range_cyclic = 1, 1028 * been flushed to permanent storage.
497 }; 1029 */
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1030 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 1031 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 1032 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 1033
503 global_dirty_limits(&background_thresh, &dirty_thresh); 1034 global_dirty_limits(&background_thresh, &dirty_thresh);
504 1035
@@ -507,12 +1038,28 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 1038 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 1039 * when the bdi limits are ramping up.
509 */ 1040 */
510 if (nr_reclaimable + nr_writeback <= 1041 freerun = dirty_freerun_ceiling(dirty_thresh,
511 (background_thresh + dirty_thresh) / 2) 1042 background_thresh);
1043 if (nr_dirty <= freerun)
512 break; 1044 break;
513 1045
1046 if (unlikely(!writeback_in_progress(bdi)))
1047 bdi_start_background_writeback(bdi);
1048
1049 /*
 1050 * bdi_thresh is not treated as a hard limiting factor the way
 1051 * dirty_thresh is, for two reasons:
1052 * - in JBOD setup, bdi_thresh can fluctuate a lot
1053 * - in a system with HDD and USB key, the USB key may somehow
1054 * go into state (bdi_dirty >> bdi_thresh) either because
1055 * bdi_dirty starts high, or because bdi_thresh drops low.
1056 * In this case we don't want to hard throttle the USB key
1057 * dirtiers for 100 seconds until bdi_dirty drops under
1058 * bdi_thresh. Instead the auxiliary bdi control line in
1059 * bdi_position_ratio() will let the dirtier task progress
1060 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1061 */
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1062 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 1063
517 /* 1064 /*
518 * In order to avoid the stacked BDI deadlock we need 1065 * In order to avoid the stacked BDI deadlock we need
@@ -524,63 +1071,98 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 1071 * actually dirty; with m+n sitting in the percpu
525 * deltas. 1072 * deltas.
526 */ 1073 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 1074 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1075 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 1076 bdi_dirty = bdi_reclaimable +
1077 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 1078 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1079 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 1080 bdi_dirty = bdi_reclaimable +
1081 bdi_stat(bdi, BDI_WRITEBACK);
533 } 1082 }
534 1083
535 /* 1084 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
536 * The bdi thresh is somehow "soft" limit derived from the 1085 (nr_dirty > dirty_thresh);
537 * global "hard" limit. The former helps to prevent heavy IO 1086 if (dirty_exceeded && !bdi->dirty_exceeded)
538 * bdi or process from holding back light ones; The latter is 1087 bdi->dirty_exceeded = 1;
539 * the last resort safeguard.
540 */
541 dirty_exceeded =
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
543 || (nr_reclaimable + nr_writeback > dirty_thresh);
544 1088
545 if (!dirty_exceeded) 1089 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
546 break; 1090 nr_dirty, bdi_thresh, bdi_dirty,
1091 start_time);
547 1092
548 if (!bdi->dirty_exceeded) 1093 max_pause = bdi_max_pause(bdi, bdi_dirty);
549 bdi->dirty_exceeded = 1;
550 1094
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1095 dirty_ratelimit = bdi->dirty_ratelimit;
552 * Unstable writes are a feature of certain networked 1096 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
553 * filesystems (i.e. NFS) in which data may have been 1097 background_thresh, nr_dirty,
554 * written to the server's write cache, but has not yet 1098 bdi_thresh, bdi_dirty);
555 * been flushed to permanent storage. 1099 if (unlikely(pos_ratio == 0)) {
556 * Only move pages to writeback if this bdi is over its 1100 pause = max_pause;
557 * threshold otherwise wait until the disk writes catch 1101 goto pause;
558 * up. 1102 }
559 */ 1103 task_ratelimit = (u64)dirty_ratelimit *
560 trace_wbc_balance_dirty_start(&wbc, bdi); 1104 pos_ratio >> RATELIMIT_CALC_SHIFT;
561 if (bdi_nr_reclaimable > bdi_thresh) { 1105 pause = (HZ * pages_dirtied) / (task_ratelimit | 1);
562 writeback_inodes_wb(&bdi->wb, &wbc); 1106 if (unlikely(pause <= 0)) {
563 pages_written += write_chunk - wbc.nr_to_write; 1107 trace_balance_dirty_pages(bdi,
564 trace_wbc_balance_dirty_written(&wbc, bdi); 1108 dirty_thresh,
565 if (pages_written >= write_chunk) 1109 background_thresh,
566 break; /* We've done our duty */ 1110 nr_dirty,
1111 bdi_thresh,
1112 bdi_dirty,
1113 dirty_ratelimit,
1114 task_ratelimit,
1115 pages_dirtied,
1116 pause,
1117 start_time);
1118 pause = 1; /* avoid resetting nr_dirtied_pause below */
1119 break;
567 } 1120 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi); 1121 pause = min(pause, max_pause);
1122
1123pause:
1124 trace_balance_dirty_pages(bdi,
1125 dirty_thresh,
1126 background_thresh,
1127 nr_dirty,
1128 bdi_thresh,
1129 bdi_dirty,
1130 dirty_ratelimit,
1131 task_ratelimit,
1132 pages_dirtied,
1133 pause,
1134 start_time);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 1135 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 1136 io_schedule_timeout(pause);
571 1137
572 /* 1138 /*
573 * Increase the delay for each loop, up to our previous 1139 * This is typically equal to (nr_dirty < dirty_thresh) and can
574 * default of taking a 100ms nap. 1140 * also keep "1000+ dd on a slow USB stick" under control.
575 */ 1141 */
576 pause <<= 1; 1142 if (task_ratelimit)
577 if (pause > HZ / 10) 1143 break;
578 pause = HZ / 10;
579 } 1144 }
580 1145
581 if (!dirty_exceeded && bdi->dirty_exceeded) 1146 if (!dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 1147 bdi->dirty_exceeded = 0;
583 1148
1149 current->nr_dirtied = 0;
1150 if (pause == 0) { /* in freerun area */
1151 current->nr_dirtied_pause =
1152 dirty_poll_interval(nr_dirty, dirty_thresh);
1153 } else if (pause <= max_pause / 4 &&
1154 pages_dirtied >= current->nr_dirtied_pause) {
1155 current->nr_dirtied_pause = clamp_val(
1156 dirty_ratelimit * (max_pause / 2) / HZ,
1157 pages_dirtied + pages_dirtied / 8,
1158 pages_dirtied * 4);
1159 } else if (pause >= max_pause) {
1160 current->nr_dirtied_pause = 1 | clamp_val(
1161 dirty_ratelimit * (max_pause / 2) / HZ,
1162 pages_dirtied / 4,
1163 pages_dirtied - pages_dirtied / 8);
1164 }
1165
584 if (writeback_in_progress(bdi)) 1166 if (writeback_in_progress(bdi))
585 return; 1167 return;
586 1168
@@ -592,8 +1174,10 @@ static void balance_dirty_pages(struct address_space *mapping,
592 * In normal mode, we start background writeout at the lower 1174 * In normal mode, we start background writeout at the lower
593 * background_thresh, to keep the amount of dirty memory low. 1175 * background_thresh, to keep the amount of dirty memory low.
594 */ 1176 */
595 if ((laptop_mode && pages_written) || 1177 if (laptop_mode)
596 (!laptop_mode && (nr_reclaimable > background_thresh))) 1178 return;
1179
1180 if (nr_reclaimable > background_thresh)
597 bdi_start_background_writeback(bdi); 1181 bdi_start_background_writeback(bdi);
598} 1182}
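
The heart of the loop above is the pause computation: each task sleeps long enough that its pages_dirtied quota, paid out at task_ratelimit pages per second, spans the pause. A hedged arithmetic sketch follows; HZ, RATELIMIT_CALC_SHIFT and all figures are assumptions, and the max_pause clamp and tracepoints are omitted.

#include <stdio.h>

#define HZ                      1000    /* assumed */
#define RATELIMIT_CALC_SHIFT    10      /* assumed, the kernel's fixed-point shift */

int main(void)
{
        unsigned long pages_dirtied = 512;      /* current nr_dirtied_pause quota */
        unsigned long dirty_ratelimit = 3200;   /* pages/s, from the bdi */
        unsigned long pos_ratios[] = { 512, 1024, 2048 };       /* 0.5x, 1.0x, 2.0x */
        int i;

        for (i = 0; i < 3; i++) {
                unsigned long task_ratelimit =
                        (unsigned long long)dirty_ratelimit * pos_ratios[i]
                                >> RATELIMIT_CALC_SHIFT;
                long pause = HZ * pages_dirtied / (task_ratelimit | 1);

                printf("pos_ratio=%.2f  task_ratelimit=%4lu pages/s  pause=%3ld ms\n",
                       pos_ratios[i] / 1024.0, task_ratelimit, pause * 1000 / HZ);
        }
        return 0;
}

At the setpoint (pos_ratio = 1.0) a 512-page quota against a 3200 pages/s base rate yields a 160ms nap; above the setpoint the nap stretches, below it shrinks, which is exactly the throttling signal pos_ratio was built to provide.
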
599 1183
@@ -607,7 +1191,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
607 } 1191 }
608} 1192}
609 1193
610static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 1194static DEFINE_PER_CPU(int, bdp_ratelimits);
611 1195
612/** 1196/**
613 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1197 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -626,28 +1210,40 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1210void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 1211 unsigned long nr_pages_dirtied)
628{ 1212{
629 unsigned long ratelimit; 1213 struct backing_dev_info *bdi = mapping->backing_dev_info;
630 unsigned long *p; 1214 int ratelimit;
1215 int *p;
1216
1217 if (!bdi_cap_account_dirty(bdi))
1218 return;
631 1219
632 ratelimit = ratelimit_pages; 1220 ratelimit = current->nr_dirtied_pause;
633 if (mapping->backing_dev_info->dirty_exceeded) 1221 if (bdi->dirty_exceeded)
634 ratelimit = 8; 1222 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
635 1223
1224 current->nr_dirtied += nr_pages_dirtied;
1225
1226 preempt_disable();
636 /* 1227 /*
 637 * Check the rate limiting. Also, we do not want to throttle real-time 1228 * This prevents one CPU from accumulating too many dirtied pages without
638 * tasks in balance_dirty_pages(). Period. 1229 * calling into balance_dirty_pages(), which can happen when there are
 1230 * 1000+ tasks, all of them starting to dirty pages at exactly the same
 1231 * time, hence all honouring a too-large initial task->nr_dirtied_pause.
639 */ 1232 */
640 preempt_disable();
641 p = &__get_cpu_var(bdp_ratelimits); 1233 p = &__get_cpu_var(bdp_ratelimits);
642 *p += nr_pages_dirtied; 1234 if (unlikely(current->nr_dirtied >= ratelimit))
643 if (unlikely(*p >= ratelimit)) {
644 ratelimit = sync_writeback_pages(*p);
645 *p = 0; 1235 *p = 0;
646 preempt_enable(); 1236 else {
647 balance_dirty_pages(mapping, ratelimit); 1237 *p += nr_pages_dirtied;
648 return; 1238 if (unlikely(*p >= ratelimit_pages)) {
1239 *p = 0;
1240 ratelimit = 0;
1241 }
649 } 1242 }
650 preempt_enable(); 1243 preempt_enable();
1244
1245 if (unlikely(current->nr_dirtied >= ratelimit))
1246 balance_dirty_pages(mapping, current->nr_dirtied);
651} 1247}
652EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1248EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
653 1249
@@ -703,7 +1299,8 @@ void laptop_mode_timer_fn(unsigned long data)
703 * threshold 1299 * threshold
704 */ 1300 */
705 if (bdi_has_dirty_io(&q->backing_dev_info)) 1301 if (bdi_has_dirty_io(&q->backing_dev_info))
706 bdi_start_writeback(&q->backing_dev_info, nr_pages); 1302 bdi_start_writeback(&q->backing_dev_info, nr_pages,
1303 WB_REASON_LAPTOP_TIMER);
707} 1304}
708 1305
709/* 1306/*
@@ -742,22 +1339,17 @@ void laptop_sync_completion(void)
742 * 1339 *
743 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 1340 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
744 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 1341 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
745 * thresholds before writeback cuts in. 1342 * thresholds.
746 *
747 * But the limit should not be set too high. Because it also controls the
748 * amount of memory which the balance_dirty_pages() caller has to write back.
749 * If this is too large then the caller will block on the IO queue all the
750 * time. So limit it to four megabytes - the balance_dirty_pages() caller
751 * will write six megabyte chunks, max.
752 */ 1343 */
753 1344
754void writeback_set_ratelimit(void) 1345void writeback_set_ratelimit(void)
755{ 1346{
756 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 1347 unsigned long background_thresh;
1348 unsigned long dirty_thresh;
1349 global_dirty_limits(&background_thresh, &dirty_thresh);
1350 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
757 if (ratelimit_pages < 16) 1351 if (ratelimit_pages < 16)
758 ratelimit_pages = 16; 1352 ratelimit_pages = 16;
759 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
760 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
761} 1353}
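
The "3% (1/32)" figure in the comment above falls straight out of the formula. A small sketch with an invented dirty threshold shows that the worst-case slack (every CPU's counter full at once) stays near dirty_thresh / 32 regardless of CPU count:

#include <stdio.h>

int main(void)
{
        unsigned long dirty_thresh = 200000;    /* pages, invented */
        int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 4) {
                unsigned long ratelimit = dirty_thresh / (cpus * 32);

                if (ratelimit < 16)
                        ratelimit = 16;
                printf("%2d CPUs: ratelimit_pages=%5lu  worst-case slack=%6lu pages\n",
                       cpus, ratelimit, ratelimit * cpus);
        }
        return 0;
}

The per-CPU quota shrinks as CPUs are added (down to the 16-page floor), so the aggregate slack stays roughly constant at about 1/32 of the threshold.
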
762 1354
763static int __cpuinit 1355static int __cpuinit
@@ -892,12 +1484,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1484 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1485 cycled = 1; /* ignore range_cyclic tests */
894 } 1486 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1487 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1488 tag = PAGECACHE_TAG_TOWRITE;
897 else 1489 else
898 tag = PAGECACHE_TAG_DIRTY; 1490 tag = PAGECACHE_TAG_DIRTY;
899retry: 1491retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1492 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1493 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1494 done_index = index;
903 while (!done && (index <= end)) { 1495 while (!done && (index <= end)) {
@@ -1127,6 +1719,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1127 __inc_zone_page_state(page, NR_FILE_DIRTY); 1719 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED); 1720 __inc_zone_page_state(page, NR_DIRTIED);
1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1721 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1722 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1130 task_dirty_inc(current); 1723 task_dirty_inc(current);
1131 task_io_account_write(PAGE_CACHE_SIZE); 1724 task_io_account_write(PAGE_CACHE_SIZE);
1132 } 1725 }
@@ -1141,7 +1734,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1734void account_page_writeback(struct page *page)
1142{ 1735{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1736 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1737}
1146EXPORT_SYMBOL(account_page_writeback); 1738EXPORT_SYMBOL(account_page_writeback);
1147 1739
@@ -1358,8 +1950,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1950 } else {
1359 ret = TestClearPageWriteback(page); 1951 ret = TestClearPageWriteback(page);
1360 } 1952 }
1361 if (ret) 1953 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1954 dec_zone_page_state(page, NR_WRITEBACK);
1955 inc_zone_page_state(page, NR_WRITTEN);
1956 }
1363 return ret; 1957 return ret;
1364} 1958}
1365 1959
@@ -1405,10 +1999,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1999 */
1406int mapping_tagged(struct address_space *mapping, int tag) 2000int mapping_tagged(struct address_space *mapping, int tag)
1407{ 2001{
1408 int ret; 2002 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 2003}
1414EXPORT_SYMBOL(mapping_tagged); 2004EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab..87b0a3f074e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 128 gfp_allowed_mask &= ~GFP_IOFS;
129} 129}
130
131static bool pm_suspending(void)
132{
133 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
134 return false;
135 return true;
136}
137
138#else
139
140static bool pm_suspending(void)
141{
142 return false;
143}
130#endif /* CONFIG_PM_SLEEP */ 144#endif /* CONFIG_PM_SLEEP */
131 145
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 146#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
176}; 190};
177 191
178int min_free_kbytes = 1024; 192int min_free_kbytes = 1024;
193int min_free_order_shift = 1;
179 194
180static unsigned long __meminitdata nr_kernel_pages; 195static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 196static unsigned long __meminitdata nr_all_pages;
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order)
355 __SetPageHead(page); 370 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 372 struct page *p = page + i;
358
359 __SetPageTail(p); 373 __SetPageTail(p);
374 set_page_count(p, 0);
360 p->first_page = page; 375 p->first_page = page;
361 } 376 }
362} 377}
@@ -1487,7 +1502,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1487 free_pages -= z->free_area[o].nr_free << o; 1502 free_pages -= z->free_area[o].nr_free << o;
1488 1503
1489 /* Require fewer higher order pages to be free */ 1504 /* Require fewer higher order pages to be free */
1490 min >>= 1; 1505 min >>= min_free_order_shift;
1491 1506
1492 if (free_pages <= min) 1507 if (free_pages <= min)
1493 return false; 1508 return false;
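
For context, the check walks the free lists order by order: pages of orders below the request are discounted, and the required reserve is shrunk at each step up. A simplified standalone version (invented free_area contents, none of the alloc_flags handling, and only assumed to match the surrounding mainline code) shows what the new min_free_order_shift knob changes:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER       11

/* simplified: no alloc_flags/classzone handling, invented free_area contents */
static bool watermark_ok(unsigned long free_pages, unsigned long min, int order,
                         const unsigned long *nr_free, int min_free_order_shift)
{
        int o;

        if (free_pages <= min)
                return false;
        for (o = 0; o < order; o++) {
                /* pages of this order are too small for the request */
                free_pages -= nr_free[o] << o;
                /* require fewer higher order pages to be free */
                min >>= min_free_order_shift;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* a fragmented zone: lots of order-0 pages, few big blocks */
        unsigned long nr_free[MAX_ORDER] = { 3000, 40, 10, 2 };
        unsigned long free = 0, min = 1024;
        int o;

        for (o = 0; o < MAX_ORDER; o++)
                free += nr_free[o] << o;
        printf("order-3 check, shift=1: %s\n",
               watermark_ok(free, min, 3, nr_free, 1) ? "ok" : "fail");
        printf("order-3 check, shift=3: %s\n",
               watermark_ok(free, min, 3, nr_free, 3) ? "ok" : "fail");
        return 0;
}

With the stock shift of 1 the fragmented example zone fails an order-3 check even though plenty of memory is free in small pieces; a larger shift lets the requirement decay faster, which is presumably why the value was made tunable.
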
@@ -1616,6 +1631,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1631 set_bit(i, zlc->fullzones);
1617} 1632}
1618 1633
1634/*
1635 * clear all zones full, called after direct reclaim makes progress so that
1636 * a zone that was recently full is not skipped over for up to a second
1637 */
1638static void zlc_clear_zones_full(struct zonelist *zonelist)
1639{
1640 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1641
1642 zlc = zonelist->zlcache_ptr;
1643 if (!zlc)
1644 return;
1645
1646 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1647}
1648
1619#else /* CONFIG_NUMA */ 1649#else /* CONFIG_NUMA */
1620 1650
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1651static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1662,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1662static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1663{
1634} 1664}
1665
1666static void zlc_clear_zones_full(struct zonelist *zonelist)
1667{
1668}
1635#endif /* CONFIG_NUMA */ 1669#endif /* CONFIG_NUMA */
1636 1670
1637/* 1671/*
@@ -1664,7 +1698,7 @@ zonelist_scan:
1664 continue; 1698 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1699 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1700 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1701 continue;
1668 1702
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1703 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1704 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1710,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1710 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1711 goto try_this_zone;
1678 1712
1713 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1714 /*
1715 * we do zlc_setup if there are multiple nodes
1716 * and before considering the first zone allowed
1717 * by the cpuset.
1718 */
1719 allowednodes = zlc_setup(zonelist, alloc_flags);
1720 zlc_active = 1;
1721 did_zlc_setup = 1;
1722 }
1723
1679 if (zone_reclaim_mode == 0) 1724 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1725 goto this_zone_full;
1681 1726
1727 /*
1728 * As we may have just activated ZLC, check if the first
1729 * eligible zone has failed zone_reclaim recently.
1730 */
1731 if (NUMA_BUILD && zlc_active &&
1732 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1733 continue;
1734
1682 ret = zone_reclaim(zone, gfp_mask, order); 1735 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1736 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1737 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1738 /* did not scan */
1686 goto try_next_zone; 1739 continue;
1687 case ZONE_RECLAIM_FULL: 1740 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1741 /* scanned but unreclaimable */
1689 goto this_zone_full; 1742 continue;
1690 default: 1743 default:
1691 /* did we reclaim enough */ 1744 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1745 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1756,6 @@ try_this_zone:
1703this_zone_full: 1756this_zone_full:
1704 if (NUMA_BUILD) 1757 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1758 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1759 }
1717 1760
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1761 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1869,14 +1912,20 @@ static struct page *
1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1912__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1870 struct zonelist *zonelist, enum zone_type high_zoneidx, 1913 struct zonelist *zonelist, enum zone_type high_zoneidx,
1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1914 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1872 int migratetype, unsigned long *did_some_progress, 1915 int migratetype, bool sync_migration,
1873 bool sync_migration) 1916 bool *deferred_compaction,
1917 unsigned long *did_some_progress)
1874{ 1918{
1875 struct page *page; 1919 struct page *page;
1876 1920
1877 if (!order || compaction_deferred(preferred_zone)) 1921 if (!order)
1878 return NULL; 1922 return NULL;
1879 1923
1924 if (compaction_deferred(preferred_zone)) {
1925 *deferred_compaction = true;
1926 return NULL;
1927 }
1928
1880 current->flags |= PF_MEMALLOC; 1929 current->flags |= PF_MEMALLOC;
1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1930 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1882 nodemask, sync_migration); 1931 nodemask, sync_migration);
@@ -1904,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1904 * but not enough to satisfy watermarks. 1953 * but not enough to satisfy watermarks.
1905 */ 1954 */
1906 count_vm_event(COMPACTFAIL); 1955 count_vm_event(COMPACTFAIL);
1907 defer_compaction(preferred_zone); 1956
1957 /*
1958 * As async compaction considers a subset of pageblocks, only
1959 * defer if the failure was a sync compaction failure.
1960 */
1961 if (sync_migration)
1962 defer_compaction(preferred_zone);
1908 1963
1909 cond_resched(); 1964 cond_resched();
1910 } 1965 }
@@ -1916,8 +1971,9 @@ static inline struct page *
1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1971__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1917 struct zonelist *zonelist, enum zone_type high_zoneidx, 1972 struct zonelist *zonelist, enum zone_type high_zoneidx,
1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1973 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1919 int migratetype, unsigned long *did_some_progress, 1974 int migratetype, bool sync_migration,
1920 bool sync_migration) 1975 bool *deferred_compaction,
1976 unsigned long *did_some_progress)
1921{ 1977{
1922 return NULL; 1978 return NULL;
1923} 1979}
@@ -1954,6 +2010,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 2010 if (unlikely(!(*did_some_progress)))
1955 return NULL; 2011 return NULL;
1956 2012
2013 /* After successful reclaim, reconsider all zones for allocation */
2014 if (NUMA_BUILD)
2015 zlc_clear_zones_full(zonelist);
2016
1957retry: 2017retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 2018 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 2019 zonelist, high_zoneidx,
@@ -2063,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2063 unsigned long pages_reclaimed = 0; 2123 unsigned long pages_reclaimed = 0;
2064 unsigned long did_some_progress; 2124 unsigned long did_some_progress;
2065 bool sync_migration = false; 2125 bool sync_migration = false;
2126 bool deferred_compaction = false;
2066 2127
2067 /* 2128 /*
2068 * In the slowpath, we sanity check order to avoid ever trying to 2129 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2143,12 +2204,22 @@ rebalance:
2143 zonelist, high_zoneidx, 2204 zonelist, high_zoneidx,
2144 nodemask, 2205 nodemask,
2145 alloc_flags, preferred_zone, 2206 alloc_flags, preferred_zone,
2146 migratetype, &did_some_progress, 2207 migratetype, sync_migration,
2147 sync_migration); 2208 &deferred_compaction,
2209 &did_some_progress);
2148 if (page) 2210 if (page)
2149 goto got_pg; 2211 goto got_pg;
2150 sync_migration = true; 2212 sync_migration = true;
2151 2213
2214 /*
2215 * If compaction is deferred for high-order allocations, it is because
2216 * sync compaction recently failed. In this is the case and the caller
2217 * has requested the system not be heavily disrupted, fail the
2218 * allocation now instead of entering direct reclaim
2219 */
2220 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2221 goto nopage;
2222
2152 /* Try direct reclaim and then allocating */ 2223 /* Try direct reclaim and then allocating */
2153 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2224 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2154 zonelist, high_zoneidx, 2225 zonelist, high_zoneidx,
@@ -2193,6 +2264,14 @@ rebalance:
2193 2264
2194 goto restart; 2265 goto restart;
2195 } 2266 }
2267
2268 /*
2269 * Suspend converts GFP_KERNEL to __GFP_WAIT which can
2270 * prevent reclaim making forward progress without
2271 * invoking OOM. Bail if we are suspending
2272 */
2273 if (pm_suspending())
2274 goto nopage;
2196 } 2275 }
2197 2276
2198 /* Check if we should retry the allocation */ 2277 /* Check if we should retry the allocation */
@@ -2211,8 +2290,9 @@ rebalance:
2211 zonelist, high_zoneidx, 2290 zonelist, high_zoneidx,
2212 nodemask, 2291 nodemask,
2213 alloc_flags, preferred_zone, 2292 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress, 2293 migratetype, sync_migration,
2215 sync_migration); 2294 &deferred_compaction,
2295 &did_some_progress);
2216 if (page) 2296 if (page)
2217 goto got_pg; 2297 goto got_pg;
2218 } 2298 }
@@ -2236,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2236{ 2316{
2237 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2317 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2238 struct zone *preferred_zone; 2318 struct zone *preferred_zone;
2239 struct page *page; 2319 struct page *page = NULL;
2240 int migratetype = allocflags_to_migratetype(gfp_mask); 2320 int migratetype = allocflags_to_migratetype(gfp_mask);
2321 unsigned int cpuset_mems_cookie;
2241 2322
2242 gfp_mask &= gfp_allowed_mask; 2323 gfp_mask &= gfp_allowed_mask;
2243 2324
@@ -2256,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2256 if (unlikely(!zonelist->_zonerefs->zone)) 2337 if (unlikely(!zonelist->_zonerefs->zone))
2257 return NULL; 2338 return NULL;
2258 2339
2259 get_mems_allowed(); 2340retry_cpuset:
2341 cpuset_mems_cookie = get_mems_allowed();
2342
2260 /* The preferred zone is used for statistics later */ 2343 /* The preferred zone is used for statistics later */
2261 first_zones_zonelist(zonelist, high_zoneidx, 2344 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed, 2345 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone); 2346 &preferred_zone);
2264 if (!preferred_zone) { 2347 if (!preferred_zone)
2265 put_mems_allowed(); 2348 goto out;
2266 return NULL;
2267 }
2268 2349
2269 /* First allocation attempt */ 2350 /* First allocation attempt */
2270 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2351 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2274,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2274 page = __alloc_pages_slowpath(gfp_mask, order, 2355 page = __alloc_pages_slowpath(gfp_mask, order,
2275 zonelist, high_zoneidx, nodemask, 2356 zonelist, high_zoneidx, nodemask,
2276 preferred_zone, migratetype); 2357 preferred_zone, migratetype);
2277 put_mems_allowed();
2278 2358
2279 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2359 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2360
2361out:
2362 /*
2363 * When updating a task's mems_allowed, it is possible to race with
2364 * parallel threads in such a way that an allocation can fail while
2365 * the mask is being updated. If a page allocation is about to fail,
2366 * check if the cpuset changed during allocation and if so, retry.
2367 */
2368 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2369 goto retry_cpuset;
2370
2280 return page; 2371 return page;
2281} 2372}
2282EXPORT_SYMBOL(__alloc_pages_nodemask); 2373EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2500,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2500bool skip_free_areas_node(unsigned int flags, int nid) 2591bool skip_free_areas_node(unsigned int flags, int nid)
2501{ 2592{
2502 bool ret = false; 2593 bool ret = false;
2594 unsigned int cpuset_mems_cookie;
2503 2595
2504 if (!(flags & SHOW_MEM_FILTER_NODES)) 2596 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out; 2597 goto out;
2506 2598
2507 get_mems_allowed(); 2599 do {
2508 ret = !node_isset(nid, cpuset_current_mems_allowed); 2600 cpuset_mems_cookie = get_mems_allowed();
2509 put_mems_allowed(); 2601 ret = !node_isset(nid, cpuset_current_mems_allowed);
2602 } while (!put_mems_allowed(cpuset_mems_cookie));
2510out: 2603out:
2511 return ret; 2604 return ret;
2512} 2605}
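
The get/put_mems_allowed() pairing above behaves like a read-side retry loop: grab a cookie, do the work, and redo it if the cpuset's mems_allowed changed underneath. Below is a userspace analogy with a toy sequence counter (single-threaded, no memory barriers, purely illustrative of the contract rather than the kernel's implementation):

#include <stdio.h>

/* toy sequence counter: even = stable, odd = writer in progress */
static unsigned int seq;
static int mems_allowed = 1;            /* stand-in for the nodemask */

static unsigned int read_begin(void)    { return seq; }
static int read_retry(unsigned int c)   { return (c & 1) || c != seq; }

static void writer_update(int newval)
{
        seq++;                          /* writer enters (seq goes odd) */
        mems_allowed = newval;
        seq++;                          /* writer leaves (seq even again) */
}

int main(void)
{
        unsigned int cookie;
        int allowed, tries = 0;

        do {
                cookie = read_begin();          /* get_mems_allowed() */
                allowed = mems_allowed;         /* ... use the mask ... */
                if (tries++ == 0)
                        writer_update(0);       /* simulate a racing update */
        } while (read_retry(cookie));           /* !put_mems_allowed(cookie) */

        printf("consistent value after %d tries: %d\n", tries, allowed);
        return 0;
}

The same shape appears in __alloc_pages_nodemask() above: the allocation only retries when it both failed and raced with a mems_allowed update, so the common path pays nothing extra.
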
@@ -3356,9 +3449,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3356 unsigned long block_migratetype; 3449 unsigned long block_migratetype;
3357 int reserve; 3450 int reserve;
3358 3451
3359 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3452 /*
 3453 * Get the start pfn, end pfn and the number of blocks to reserve.
3454 * We have to be careful to be aligned to pageblock_nr_pages to
3455 * make sure that we always check pfn_valid for the first page in
3456 * the block.
3457 */
3360 start_pfn = zone->zone_start_pfn; 3458 start_pfn = zone->zone_start_pfn;
3361 end_pfn = start_pfn + zone->spanned_pages; 3459 end_pfn = start_pfn + zone->spanned_pages;
3460 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3362 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3461 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3363 pageblock_order; 3462 pageblock_order;
3364 3463
@@ -3380,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3380 if (page_to_nid(page) != zone_to_nid(zone)) 3479 if (page_to_nid(page) != zone_to_nid(zone))
3381 continue; 3480 continue;
3382 3481
3383 /* Blocks with reserved pages will never free, skip them. */
3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3386 continue;
3387
3388 block_migratetype = get_pageblock_migratetype(page); 3482 block_migratetype = get_pageblock_migratetype(page);
3389 3483
3390 /* If this block is reserved, account for it */ 3484 /* Only test what is necessary when the reserves are not met */
3391 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3485 if (reserve > 0) {
3392 reserve--; 3486 /*
3393 continue; 3487 * Blocks with reserved pages will never free, skip
3394 } 3488 * them.
3489 */
3490 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3491 if (pageblock_is_reserved(pfn, block_end_pfn))
3492 continue;
3395 3493
3396 /* Suitable for reserving if this block is movable */ 3494 /* If this block is reserved, account for it */
3397 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3495 if (block_migratetype == MIGRATE_RESERVE) {
3398 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3496 reserve--;
3399 move_freepages_block(zone, page, MIGRATE_RESERVE); 3497 continue;
3400 reserve--; 3498 }
3401 continue; 3499
3500 /* Suitable for reserving if this block is movable */
3501 if (block_migratetype == MIGRATE_MOVABLE) {
3502 set_pageblock_migratetype(page,
3503 MIGRATE_RESERVE);
3504 move_freepages_block(zone, page,
3505 MIGRATE_RESERVE);
3506 reserve--;
3507 continue;
3508 }
3402 } 3509 }
3403 3510
3404 /* 3511 /*
@@ -5527,6 +5634,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5527bool is_pageblock_removable_nolock(struct page *page) 5634bool is_pageblock_removable_nolock(struct page *page)
5528{ 5635{
5529 struct zone *zone = page_zone(page); 5636 struct zone *zone = page_zone(page);
5637 unsigned long pfn = page_to_pfn(page);
5638
5639 /*
5640 * We have to be careful here because we are iterating over memory
5641 * sections which are not zone aware so we might end up outside of
5642 * the zone but still within the section.
5643 */
5644 if (!zone || zone->zone_start_pfn > pfn ||
5645 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5646 return false;
5647
5530 return __count_immobile_pages(zone, page, 0); 5648 return __count_immobile_pages(zone, page, 0);
5531} 5649}
5532 5650
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d53361..87eac0ea2bf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04..bfad7246665 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end) 143 int page_start, int page_end)
144{ 144{
145 flush_cache_vunmap( 145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 146 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 147 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
148} 148}
149 149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end) 206 int page_start, int page_end)
207{ 207{
208 flush_tlb_kernel_range( 208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 209 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 210 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
211} 211}
212 212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages, 213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end) 284 int page_start, int page_end)
285{ 285{
286 flush_cache_vmap( 286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 287 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 288 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
289} 289}
290 290
291/** 291/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..af0cc7a58f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
116static int pcpu_nr_slots __read_mostly; 116static int pcpu_nr_slots __read_mostly;
117static size_t pcpu_chunk_struct_size __read_mostly; 117static size_t pcpu_chunk_struct_size __read_mostly;
118 118
119/* cpus with the lowest and highest unit numbers */ 119/* cpus with the lowest and highest unit addresses */
120static unsigned int pcpu_first_unit_cpu __read_mostly; 120static unsigned int pcpu_low_unit_cpu __read_mostly;
121static unsigned int pcpu_last_unit_cpu __read_mostly; 121static unsigned int pcpu_high_unit_cpu __read_mostly;
122 122
123/* the address of the first chunk which starts with the kernel static area */ 123/* the address of the first chunk which starts with the kernel static area */
124void *pcpu_base_addr __read_mostly; 124void *pcpu_base_addr __read_mostly;
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
984{ 984{
985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
986 bool in_first_chunk = false; 986 bool in_first_chunk = false;
987 unsigned long first_start, first_end; 987 unsigned long first_low, first_high;
988 unsigned int cpu; 988 unsigned int cpu;
989 989
990 /* 990 /*
991 * The following test on first_start/end isn't strictly 991 * The following test on unit_low/high isn't strictly
992 * necessary but will speed up lookups of addresses which 992 * necessary but will speed up lookups of addresses which
993 * aren't in the first chunk. 993 * aren't in the first chunk.
994 */ 994 */
995 first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); 995 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
996 first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, 996 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
997 pcpu_unit_pages); 997 pcpu_unit_pages);
998 if ((unsigned long)addr >= first_start && 998 if ((unsigned long)addr >= first_low &&
999 (unsigned long)addr < first_end) { 999 (unsigned long)addr < first_high) {
1000 for_each_possible_cpu(cpu) { 1000 for_each_possible_cpu(cpu) {
1001 void *start = per_cpu_ptr(base, cpu); 1001 void *start = per_cpu_ptr(base, cpu);
1002 1002
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1011 if (!is_vmalloc_addr(addr)) 1011 if (!is_vmalloc_addr(addr))
1012 return __pa(addr); 1012 return __pa(addr);
1013 else 1013 else
1014 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr)) +
1015 offset_in_page(addr);
1015 } else 1016 } else
1016 return page_to_phys(pcpu_addr_to_page(addr)); 1017 return page_to_phys(pcpu_addr_to_page(addr)) +
1018 offset_in_page(addr);
1017} 1019}
1018 1020
1019/** 1021/**
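A minimal sketch of the translation the per_cpu_ptr_to_phys() hunk above corrects: page_to_phys() resolves only the backing page frame, so the offset of the pointer inside its page has to be added back by hand. The helper below is illustrative only and not part of the patch.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Illustrative only: mirrors the fixed return paths of per_cpu_ptr_to_phys(). */
static phys_addr_t pcpu_vaddr_to_phys_sketch(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return __pa(addr);
	/* page_to_phys() gives the frame's base; add the intra-page offset back */
	return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
}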
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1233 1235
1234 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1236 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1235 unit_map[cpu] = UINT_MAX; 1237 unit_map[cpu] = UINT_MAX;
1236 pcpu_first_unit_cpu = NR_CPUS; 1238
1239 pcpu_low_unit_cpu = NR_CPUS;
1240 pcpu_high_unit_cpu = NR_CPUS;
1237 1241
1238 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1242 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1239 const struct pcpu_group_info *gi = &ai->groups[group]; 1243 const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1253 unit_map[cpu] = unit + i; 1257 unit_map[cpu] = unit + i;
1254 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1258 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1255 1259
1256 if (pcpu_first_unit_cpu == NR_CPUS) 1260 /* determine low/high unit_cpu */
1257 pcpu_first_unit_cpu = cpu; 1261 if (pcpu_low_unit_cpu == NR_CPUS ||
1258 pcpu_last_unit_cpu = cpu; 1262 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1263 pcpu_low_unit_cpu = cpu;
1264 if (pcpu_high_unit_cpu == NR_CPUS ||
1265 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1266 pcpu_high_unit_cpu = cpu;
1259 } 1267 }
1260 } 1268 }
1261 pcpu_nr_units = unit; 1269 pcpu_nr_units = unit;
@@ -1622,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1622 areas[group] = ptr; 1630 areas[group] = ptr;
1623 1631
1624 base = min(ptr, base); 1632 base = min(ptr, base);
1633 }
1634
1635 /*
1636 * Copy data and free unused parts. This should happen after all
1637 * allocations are complete; otherwise, we may end up with
1638 * overlapping groups.
1639 */
1640 for (group = 0; group < ai->nr_groups; group++) {
1641 struct pcpu_group_info *gi = &ai->groups[group];
1642 void *ptr = areas[group];
1625 1643
1626 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1644 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1627 if (gi->cpu_map[i] == NR_CPUS) { 1645 if (gi->cpu_map[i] == NR_CPUS) {
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb..b5a1b89b2d6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/splice.h>
54#include <linux/security.h> 55#include <linux/security.h>
55#include <linux/swapops.h> 56#include <linux/swapops.h>
56#include <linux/mempolicy.h> 57#include <linux/mempolicy.h>
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void)
126} 127}
127#endif 128#endif
128 129
129static int shmem_getpage(struct inode *inode, unsigned long idx, 130static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 131 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
132
133static inline int shmem_getpage(struct inode *inode, pgoff_t index,
134 struct page **pagep, enum sgp_type sgp, int *fault_type)
135{
136 return shmem_getpage_gfp(inode, index, pagep, sgp,
137 mapping_gfp_mask(inode->i_mapping), fault_type);
138}
131 139
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) 140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{ 141{
@@ -405,10 +413,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
405 * @info: info structure for the inode 413 * @info: info structure for the inode
406 * @index: index of the page to find 414 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation? 415 * @sgp: check and recheck i_size? skip allocation?
416 * @gfp: gfp mask to use for any page allocation
408 * 417 *
409 * If the entry does not exist, allocate it. 418 * If the entry does not exist, allocate it.
410 */ 419 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 420static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
421 unsigned long index, enum sgp_type sgp, gfp_t gfp)
412{ 422{
413 struct inode *inode = &info->vfs_inode; 423 struct inode *inode = &info->vfs_inode;
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 424 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -438,7 +448,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
438 } 448 }
439 449
440 spin_unlock(&info->lock); 450 spin_unlock(&info->lock);
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 451 page = shmem_dir_alloc(gfp);
442 spin_lock(&info->lock); 452 spin_lock(&info->lock);
443 453
444 if (!page) { 454 if (!page) {
@@ -1228,92 +1238,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 1238#endif
1229 1239
1230/* 1240/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 1241 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 1242 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 1243 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 1244 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 1245 * entry since a page cannot live in both the swap and page cache
1236 */ 1246 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 1247static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
1238 struct page **pagep, enum sgp_type sgp, int *type) 1248 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 1249{
1240 struct address_space *mapping = inode->i_mapping; 1250 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 1251 struct shmem_inode_info *info = SHMEM_I(inode);
1242 struct shmem_sb_info *sbinfo; 1252 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 1253 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL; 1254 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry; 1255 swp_entry_t *entry;
1247 swp_entry_t swap; 1256 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 1257 int error;
1258 int ret;
1250 1259
1251 if (idx >= SHMEM_MAX_INDEX) 1260 if (idx >= SHMEM_MAX_INDEX)
1252 return -EFBIG; 1261 return -EFBIG;
1253
1254 if (type)
1255 *type = 0;
1256
1257 /*
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat: 1262repeat:
1266 if (!filepage) 1263 page = find_lock_page(mapping, idx);
1267 filepage = find_lock_page(mapping, idx); 1264 if (page) {
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 1265 /*
1273 * Try to preload while we can wait, to not make a habit of 1266 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 1267 * if there were an error in reading back from swap,
1268 * the page would not be inserted into the filecache.
1275 */ 1269 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 1270 BUG_ON(!PageUptodate(page));
1277 if (error) 1271 goto done;
1278 goto failed; 1272 }
1279 radix_tree_preload_end(); 1273
1280 if (sgp != SGP_READ && !prealloc_page) { 1274 /*
1281 /* We don't care if this fails */ 1275 * Try to preload while we can wait, to not make a habit of
1282 prealloc_page = shmem_alloc_page(gfp, info, idx); 1276 * draining atomic reserves; but don't latch on to this cpu.
1283 if (prealloc_page) { 1277 */
1284 if (mem_cgroup_cache_charge(prealloc_page, 1278 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1285 current->mm, GFP_KERNEL)) { 1279 if (error)
1286 page_cache_release(prealloc_page); 1280 goto out;
1287 prealloc_page = NULL; 1281 radix_tree_preload_end();
1288 } 1282
1283 if (sgp != SGP_READ && !prealloc_page) {
1284 prealloc_page = shmem_alloc_page(gfp, info, idx);
1285 if (prealloc_page) {
1286 SetPageSwapBacked(prealloc_page);
1287 if (mem_cgroup_cache_charge(prealloc_page,
1288 current->mm, GFP_KERNEL)) {
1289 page_cache_release(prealloc_page);
1290 prealloc_page = NULL;
1289 } 1291 }
1290 } 1292 }
1291 } 1293 }
1292 error = 0;
1293 1294
1294 spin_lock(&info->lock); 1295 spin_lock(&info->lock);
1295 shmem_recalc_inode(inode); 1296 shmem_recalc_inode(inode);
1296 entry = shmem_swp_alloc(info, idx, sgp); 1297 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1297 if (IS_ERR(entry)) { 1298 if (IS_ERR(entry)) {
1298 spin_unlock(&info->lock); 1299 spin_unlock(&info->lock);
1299 error = PTR_ERR(entry); 1300 error = PTR_ERR(entry);
1300 goto failed; 1301 goto out;
1301 } 1302 }
1302 swap = *entry; 1303 swap = *entry;
1303 1304
1304 if (swap.val) { 1305 if (swap.val) {
1305 /* Look it up and read it in.. */ 1306 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 1307 page = lookup_swap_cache(swap);
1307 if (!swappage) { 1308 if (!page) {
1308 shmem_swp_unmap(entry); 1309 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock); 1310 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 1311 /* here we actually do the io */
1311 if (type) 1312 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 1313 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 1314 page = shmem_swapin(swap, gfp, info, idx);
1314 if (!swappage) { 1315 if (!page) {
1315 spin_lock(&info->lock); 1316 spin_lock(&info->lock);
1316 entry = shmem_swp_alloc(info, idx, sgp); 1317 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1317 if (IS_ERR(entry)) 1318 if (IS_ERR(entry))
1318 error = PTR_ERR(entry); 1319 error = PTR_ERR(entry);
1319 else { 1320 else {
@@ -1323,62 +1324,42 @@ repeat:
1323 } 1324 }
1324 spin_unlock(&info->lock); 1325 spin_unlock(&info->lock);
1325 if (error) 1326 if (error)
1326 goto failed; 1327 goto out;
1327 goto repeat; 1328 goto repeat;
1328 } 1329 }
1329 wait_on_page_locked(swappage); 1330 wait_on_page_locked(page);
1330 page_cache_release(swappage); 1331 page_cache_release(page);
1331 goto repeat; 1332 goto repeat;
1332 } 1333 }
1333 1334
1334 /* We have to do this with page locked to prevent races */ 1335 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 1336 if (!trylock_page(page)) {
1336 shmem_swp_unmap(entry); 1337 shmem_swp_unmap(entry);
1337 spin_unlock(&info->lock); 1338 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage); 1339 wait_on_page_locked(page);
1339 page_cache_release(swappage); 1340 page_cache_release(page);
1340 goto repeat; 1341 goto repeat;
1341 } 1342 }
1342 if (PageWriteback(swappage)) { 1343 if (PageWriteback(page)) {
1343 shmem_swp_unmap(entry); 1344 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock); 1345 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage); 1346 wait_on_page_writeback(page);
1346 unlock_page(swappage); 1347 unlock_page(page);
1347 page_cache_release(swappage); 1348 page_cache_release(page);
1348 goto repeat; 1349 goto repeat;
1349 } 1350 }
1350 if (!PageUptodate(swappage)) { 1351 if (!PageUptodate(page)) {
1351 shmem_swp_unmap(entry); 1352 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock); 1353 spin_unlock(&info->lock);
1353 unlock_page(swappage); 1354 unlock_page(page);
1354 page_cache_release(swappage); 1355 page_cache_release(page);
1355 error = -EIO; 1356 error = -EIO;
1356 goto failed; 1357 goto out;
1357 } 1358 }
1358 1359
1359 if (filepage) { 1360 error = add_to_page_cache_locked(page, mapping,
1360 shmem_swp_set(info, entry, 0); 1361 idx, GFP_NOWAIT);
1361 shmem_swp_unmap(entry); 1362 if (error) {
1362 delete_from_swap_cache(swappage);
1363 spin_unlock(&info->lock);
1364 copy_highpage(filepage, swappage);
1365 unlock_page(swappage);
1366 page_cache_release(swappage);
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry); 1363 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) { 1365 if (error == -ENOMEM) {
@@ -1387,32 +1368,38 @@ repeat:
1387 * call memcg's OOM if needed. 1368 * call memcg's OOM if needed.
1388 */ 1369 */
1389 error = mem_cgroup_shmem_charge_fallback( 1370 error = mem_cgroup_shmem_charge_fallback(
1390 swappage, 1371 page, current->mm, gfp);
1391 current->mm,
1392 gfp);
1393 if (error) { 1372 if (error) {
1394 unlock_page(swappage); 1373 unlock_page(page);
1395 page_cache_release(swappage); 1374 page_cache_release(page);
1396 goto failed; 1375 goto out;
1397 } 1376 }
1398 } 1377 }
1399 unlock_page(swappage); 1378 unlock_page(page);
1400 page_cache_release(swappage); 1379 page_cache_release(page);
1401 goto repeat; 1380 goto repeat;
1402 } 1381 }
1403 } else if (sgp == SGP_READ && !filepage) { 1382
1383 info->flags |= SHMEM_PAGEIN;
1384 shmem_swp_set(info, entry, 0);
1404 shmem_swp_unmap(entry); 1385 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx); 1386 delete_from_swap_cache(page);
1406 if (filepage && 1387 spin_unlock(&info->lock);
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) { 1388 set_page_dirty(page);
1389 swap_free(swap);
1390
1391 } else if (sgp == SGP_READ) {
1392 shmem_swp_unmap(entry);
1393 page = find_get_page(mapping, idx);
1394 if (page && !trylock_page(page)) {
1408 spin_unlock(&info->lock); 1395 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage); 1396 wait_on_page_locked(page);
1410 page_cache_release(filepage); 1397 page_cache_release(page);
1411 filepage = NULL;
1412 goto repeat; 1398 goto repeat;
1413 } 1399 }
1414 spin_unlock(&info->lock); 1400 spin_unlock(&info->lock);
1415 } else { 1401
1402 } else if (prealloc_page) {
1416 shmem_swp_unmap(entry); 1403 shmem_swp_unmap(entry);
1417 sbinfo = SHMEM_SB(inode->i_sb); 1404 sbinfo = SHMEM_SB(inode->i_sb);
1418 if (sbinfo->max_blocks) { 1405 if (sbinfo->max_blocks) {
@@ -1426,121 +1413,82 @@ repeat:
1426 spin_unlock(&inode->i_lock); 1413 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags)) 1414 } else if (shmem_acct_block(info->flags))
1428 goto nospace; 1415 goto nospace;
1429 1416
1430 if (!filepage) { 1417 page = prealloc_page;
1431 int ret; 1418 prealloc_page = NULL;
1432 1419
1433 if (!prealloc_page) { 1420 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 }
1464
1465 entry = shmem_swp_alloc(info, idx, sgp);
1466 if (IS_ERR(entry)) 1421 if (IS_ERR(entry))
1467 error = PTR_ERR(entry); 1422 error = PTR_ERR(entry);
1468 else { 1423 else {
1469 swap = *entry; 1424 swap = *entry;
1470 shmem_swp_unmap(entry); 1425 shmem_swp_unmap(entry);
1471 } 1426 }
1472 ret = error || swap.val; 1427 ret = error || swap.val;
1473 	if (ret) 	1428 	if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage); 1429 mem_cgroup_uncharge_cache_page(page);
1475 else 1430 else
1476 ret = add_to_page_cache_lru(filepage, mapping, 1431 ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT);
1477 idx, GFP_NOWAIT); 1432 /*
1478 /* 1433 * At add_to_page_cache_lru() failure,
1479 * At add_to_page_cache_lru() failure, uncharge will 1434 * uncharge will be done automatically.
1480 * be done automatically. 1435 */
1481 */ 1436 if (ret) {
1482 if (ret) { 1437 shmem_unacct_blocks(info->flags, 1);
1483 spin_unlock(&info->lock); 1438 shmem_free_blocks(inode, 1);
1484 page_cache_release(filepage); 1439 spin_unlock(&info->lock);
1485 shmem_unacct_blocks(info->flags, 1); 1440 page_cache_release(page);
1486 shmem_free_blocks(inode, 1); 1441 if (error)
1487 filepage = NULL; 1442 goto out;
1488 if (error) 1443 goto repeat;
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 1444 }
1494 1445
1446 info->flags |= SHMEM_PAGEIN;
1495 info->alloced++; 1447 info->alloced++;
1496 spin_unlock(&info->lock); 1448 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 1449 clear_highpage(page);
1498 flush_dcache_page(filepage); 1450 flush_dcache_page(page);
1499 SetPageUptodate(filepage); 1451 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 1452 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 1453 set_page_dirty(page);
1454
1455 } else {
1456 spin_unlock(&info->lock);
1457 error = -ENOMEM;
1458 goto out;
1502 } 1459 }
1503done: 1460done:
1504 *pagep = filepage; 1461 *pagep = page;
1505 error = 0; 1462 error = 0;
1506 goto out; 1463out:
1464 if (prealloc_page) {
1465 mem_cgroup_uncharge_cache_page(prealloc_page);
1466 page_cache_release(prealloc_page);
1467 }
1468 return error;
1507 1469
1508nospace: 1470nospace:
1509 /* 1471 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 1472 * Perhaps the page was brought in from swap between find_lock_page
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru, 1473 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a 1474 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it 1475 * full tmpfs.
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 1476 */
1516 if (!filepage) { 1477 page = find_get_page(mapping, idx);
1517 struct page *page = find_get_page(mapping, idx);
1518 if (page) {
1519 spin_unlock(&info->lock);
1520 page_cache_release(page);
1521 goto repeat;
1522 }
1523 }
1524 spin_unlock(&info->lock); 1478 spin_unlock(&info->lock);
1525 error = -ENOSPC; 1479 if (page) {
1526failed: 1480 page_cache_release(page);
1527 if (*pagep != filepage) { 1481 goto repeat;
1528 unlock_page(filepage);
1529 page_cache_release(filepage);
1530 }
1531out:
1532 if (prealloc_page) {
1533 mem_cgroup_uncharge_cache_page(prealloc_page);
1534 page_cache_release(prealloc_page);
1535 } 1482 }
1536 return error; 1483 error = -ENOSPC;
1484 goto out;
1537} 1485}
1538 1486
1539static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1487static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1488{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1489 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1490 int error;
1543 int ret; 1491 int ret = VM_FAULT_LOCKED;
1544 1492
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1493 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS; 1494 return VM_FAULT_SIGBUS;
@@ -1548,11 +1496,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1496 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1497 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1498 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1499
1551 if (ret & VM_FAULT_MAJOR) { 1500 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1501 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1502 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1503 }
1555 return ret | VM_FAULT_LOCKED; 1504 return ret;
1556} 1505}
1557 1506
1558#ifdef CONFIG_NUMA 1507#ifdef CONFIG_NUMA
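A hedged caller-side sketch of the contract the rewrite above settles on: shmem_getpage() now always looks the page up itself (callers no longer pass one in), returns it locked in *pagep on success, and ORs VM_FAULT_MAJOR into *fault_type when swap I/O was needed. The wrapper below is hypothetical and not part of the patch.

static struct page *tmpfs_grab_page_sketch(struct inode *inode, pgoff_t index)
{
	struct page *page;
	int fault_type = 0;
	int error;

	error = shmem_getpage(inode, index, &page, SGP_CACHE, &fault_type);
	if (error)
		return ERR_PTR(error);
	/* shmem_getpage() hands the page back locked with a reference held */
	unlock_page(page);
	return page;
}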
@@ -1669,19 +1618,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1669static const struct inode_operations shmem_symlink_inode_operations; 1618static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1619static const struct inode_operations shmem_symlink_inline_operations;
1671 1620
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684
1685static int 1621static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1622shmem_write_begin(struct file *file, struct address_space *mapping,
1687 loff_t pos, unsigned len, unsigned flags, 1623 loff_t pos, unsigned len, unsigned flags,
@@ -1689,7 +1625,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1625{
1690 struct inode *inode = mapping->host; 1626 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1627 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1628 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1629}
1695 1630
@@ -1846,6 +1781,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1781 return retval;
1847} 1782}
1848 1783
1784static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1785 struct pipe_inode_info *pipe, size_t len,
1786 unsigned int flags)
1787{
1788 struct address_space *mapping = in->f_mapping;
1789 struct inode *inode = mapping->host;
1790 unsigned int loff, nr_pages, req_pages;
1791 struct page *pages[PIPE_DEF_BUFFERS];
1792 struct partial_page partial[PIPE_DEF_BUFFERS];
1793 struct page *page;
1794 pgoff_t index, end_index;
1795 loff_t isize, left;
1796 int error, page_nr;
1797 struct splice_pipe_desc spd = {
1798 .pages = pages,
1799 .partial = partial,
1800 .flags = flags,
1801 .ops = &page_cache_pipe_buf_ops,
1802 .spd_release = spd_release_page,
1803 };
1804
1805 isize = i_size_read(inode);
1806 if (unlikely(*ppos >= isize))
1807 return 0;
1808
1809 left = isize - *ppos;
1810 if (unlikely(left < len))
1811 len = left;
1812
1813 if (splice_grow_spd(pipe, &spd))
1814 return -ENOMEM;
1815
1816 index = *ppos >> PAGE_CACHE_SHIFT;
1817 loff = *ppos & ~PAGE_CACHE_MASK;
1818 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1819 nr_pages = min(req_pages, pipe->buffers);
1820
1821 spd.nr_pages = find_get_pages_contig(mapping, index,
1822 nr_pages, spd.pages);
1823 index += spd.nr_pages;
1824 error = 0;
1825
1826 while (spd.nr_pages < nr_pages) {
1827 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1828 if (error)
1829 break;
1830 unlock_page(page);
1831 spd.pages[spd.nr_pages++] = page;
1832 index++;
1833 }
1834
1835 index = *ppos >> PAGE_CACHE_SHIFT;
1836 nr_pages = spd.nr_pages;
1837 spd.nr_pages = 0;
1838
1839 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1840 unsigned int this_len;
1841
1842 if (!len)
1843 break;
1844
1845 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1846 page = spd.pages[page_nr];
1847
1848 if (!PageUptodate(page) || page->mapping != mapping) {
1849 error = shmem_getpage(inode, index, &page,
1850 SGP_CACHE, NULL);
1851 if (error)
1852 break;
1853 unlock_page(page);
1854 page_cache_release(spd.pages[page_nr]);
1855 spd.pages[page_nr] = page;
1856 }
1857
1858 isize = i_size_read(inode);
1859 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1860 if (unlikely(!isize || index > end_index))
1861 break;
1862
1863 if (end_index == index) {
1864 unsigned int plen;
1865
1866 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1867 if (plen <= loff)
1868 break;
1869
1870 this_len = min(this_len, plen - loff);
1871 len = this_len;
1872 }
1873
1874 spd.partial[page_nr].offset = loff;
1875 spd.partial[page_nr].len = this_len;
1876 len -= this_len;
1877 loff = 0;
1878 spd.nr_pages++;
1879 index++;
1880 }
1881
1882 while (page_nr < nr_pages)
1883 page_cache_release(spd.pages[page_nr++]);
1884
1885 if (spd.nr_pages)
1886 error = splice_to_pipe(pipe, &spd);
1887
1888 splice_shrink_spd(pipe, &spd);
1889
1890 if (error > 0) {
1891 *ppos += error;
1892 file_accessed(in);
1893 }
1894 return error;
1895}
1896
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1897static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1898{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1899 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
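With ->splice_read now served by shmem_file_splice_read() (see the file_operations hunk further down), a tmpfs file can be spliced straight into a pipe without copying through a user buffer. A small userspace illustration; the path /dev/shm/example is an assumption and the file must already exist.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/example", O_RDONLY);
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}
	/* page references are moved into the pipe, not copied */
	n = splice(fd, NULL, pipefd[1], NULL, 65536, 0);
	if (n < 0) {
		perror("splice");
		return 1;
	}
	printf("spliced %zd bytes from tmpfs into the pipe\n", n);
	return 0;
}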
@@ -2006,7 +2054,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 2054 int error;
2007 int len; 2055 int len;
2008 struct inode *inode; 2056 struct inode *inode;
2009 struct page *page = NULL; 2057 struct page *page;
2010 char *kaddr; 2058 char *kaddr;
2011 struct shmem_inode_info *info; 2059 struct shmem_inode_info *info;
2012 2060
@@ -2684,7 +2732,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2732 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2733 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2734#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2735 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2736 .write_end = shmem_write_end,
2690#endif 2737#endif
@@ -2701,7 +2748,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2748 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2749 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2750 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2751 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2752 .splice_write = generic_file_splice_write,
2706#endif 2753#endif
2707}; 2754};
@@ -3015,6 +3062,15 @@ put_memory:
3015} 3062}
3016EXPORT_SYMBOL_GPL(shmem_file_setup); 3063EXPORT_SYMBOL_GPL(shmem_file_setup);
3017 3064
3065void shmem_set_file(struct vm_area_struct *vma, struct file *file)
3066{
3067 if (vma->vm_file)
3068 fput(vma->vm_file);
3069 vma->vm_file = file;
3070 vma->vm_ops = &shmem_vm_ops;
3071 vma->vm_flags |= VM_CAN_NONLINEAR;
3072}
3073
3018/** 3074/**
3019 * shmem_zero_setup - setup a shared anonymous mapping 3075 * shmem_zero_setup - setup a shared anonymous mapping
3020 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3076 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3028,11 +3084,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 if (IS_ERR(file)) 3084 if (IS_ERR(file))
3029 return PTR_ERR(file); 3085 return PTR_ERR(file);
3030 3086
3031 if (vma->vm_file) 3087 shmem_set_file(vma, file);
3032 fput(vma->vm_file);
3033 vma->vm_file = file;
3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3036 return 0; 3088 return 0;
3037} 3089}
3038 3090
@@ -3048,13 +3100,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3100 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3101 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 3102 *
3051 * Provide a stub for those callers to start using now, then later 3103 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 3104 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 3105 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3106struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 3107 pgoff_t index, gfp_t gfp)
3057{ 3108{
3109#ifdef CONFIG_SHMEM
3110 struct inode *inode = mapping->host;
3111 struct page *page;
3112 int error;
3113
3114 BUG_ON(mapping->a_ops != &shmem_aops);
3115 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3116 if (error)
3117 page = ERR_PTR(error);
3118 else
3119 unlock_page(page);
3120 return page;
3121#else
3122 /*
3123 * The tiny !SHMEM case uses ramfs without swap
3124 */
3058 return read_cache_page_gfp(mapping, index, gfp); 3125 return read_cache_page_gfp(mapping, index, gfp);
3126#endif
3059} 3127}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3128EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
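The comment above describes how a caller such as i915 relaxes the gfp mask; a hedged sketch of that driver-side pattern follows. The function name and surrounding context are illustrative, not taken from any driver.

#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *grab_backing_page_sketch(struct address_space *mapping,
					     pgoff_t index)
{
	/* fail quietly and let the caller fall back, rather than OOMing */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* returns the uptodate page, or an ERR_PTR on failure */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}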
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..a67f8121ce5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3218 if (in_interrupt() || (flags & __GFP_THISNODE)) 3218 if (in_interrupt() || (flags & __GFP_THISNODE))
3219 return NULL; 3219 return NULL;
3220 nid_alloc = nid_here = numa_mem_id(); 3220 nid_alloc = nid_here = numa_mem_id();
3221 get_mems_allowed();
3222 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3223 nid_alloc = cpuset_slab_spread_node(); 3222 nid_alloc = cpuset_slab_spread_node();
3224 else if (current->mempolicy) 3223 else if (current->mempolicy)
3225 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3226 put_mems_allowed();
3227 if (nid_alloc != nid_here) 3225 if (nid_alloc != nid_here)
3228 return ____cache_alloc_node(cachep, flags, nid_alloc); 3226 return ____cache_alloc_node(cachep, flags, nid_alloc);
3229 return NULL; 3227 return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3246 enum zone_type high_zoneidx = gfp_zone(flags); 3244 enum zone_type high_zoneidx = gfp_zone(flags);
3247 void *obj = NULL; 3245 void *obj = NULL;
3248 int nid; 3246 int nid;
3247 unsigned int cpuset_mems_cookie;
3249 3248
3250 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3251 return NULL; 3250 return NULL;
3252 3251
3253 get_mems_allowed();
3254 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3255 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3252 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3256 3253
3254retry_cpuset:
3255 cpuset_mems_cookie = get_mems_allowed();
3256 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3257
3257retry: 3258retry:
3258 /* 3259 /*
3259 * Look through allowed nodes for objects available 3260 * Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
3306 } 3307 }
3307 } 3308 }
3308 } 3309 }
3309 put_mems_allowed(); 3310
3311 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3312 goto retry_cpuset;
3310 return obj; 3313 return obj;
3311} 3314}
3312 3315
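The slab.c conversion above follows the cpuset "mems cookie" idiom: snapshot the allowed node mask, attempt the allocation, and retry only when put_mems_allowed() reports that the mask changed underneath us and nothing was allocated. Sketched generically below; the allocation step is a hypothetical stand-in for the zonelist walk.

static void *alloc_with_cpuset_retry_sketch(gfp_t flags)
{
	unsigned int cpuset_mems_cookie;
	void *obj;

retry_cpuset:
	cpuset_mems_cookie = get_mems_allowed();	/* snapshot allowed nodes */
	obj = try_nodes_in_zonelist(flags);		/* hypothetical allocation step */

	/* the mask changed while we looked and we came back empty-handed: retry */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
		goto retry_cpuset;

	return obj;
}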
diff --git a/mm/slqb.c b/mm/slqb.c
new file mode 100644
index 00000000000..fbd2ebde3c3
--- /dev/null
+++ b/mm/slqb.c
@@ -0,0 +1,3816 @@
1/*
2 * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
  3 * with order-0 allocations. Fastpath emphasis is placed on local allocation
4 * and freeing, but with a secondary goal of good remote freeing (freeing on
  5 * a CPU other than the one which allocated).
6 *
7 * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
8 */
9
10#include <linux/mm.h>
11#include <linux/swap.h> /* struct reclaim_state */
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/slab.h>
15#include <linux/seq_file.h>
16#include <linux/cpu.h>
17#include <linux/cpuset.h>
18#include <linux/mempolicy.h>
19#include <linux/ctype.h>
20#include <linux/kallsyms.h>
21#include <linux/memory.h>
22#include <linux/fault-inject.h>
23
24/*
25 * TODO
26 * - fix up releasing of offlined data structures. Not a big deal because
27 * they don't get cumulatively leaked with successive online/offline cycles
28 * - allow OOM conditions to flush back per-CPU pages to common lists to be
29 * reused by other CPUs.
 30 * - investigate performance with memoryless nodes. Perhaps CPUs can be given
31 * a default closest home node via which it can use fastpath functions.
32 * Perhaps it is not a big problem.
33 */
34
35/*
 36 * slqb_page overloads struct page, and is used to manage some slab allocation
 37 * aspects; however, to avoid the horrible mess in include/linux/mm_types.h,
38 * we'll just define our own struct slqb_page type variant here.
39 */
40struct slqb_page {
41 union {
42 struct {
43 unsigned long flags; /* mandatory */
44 atomic_t _count; /* mandatory */
45 unsigned int inuse; /* Nr of objects */
46 struct kmem_cache_list *list; /* Pointer to list */
47 void **freelist; /* LIFO freelist */
48 union {
49 struct list_head lru; /* misc. list */
50 struct rcu_head rcu_head; /* for rcu freeing */
51 };
52 };
53 struct page page;
54 };
55};
56static inline void struct_slqb_page_wrong_size(void)
57{ BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
58
59#define PG_SLQB_BIT (1 << PG_slab)
60
61/*
62 * slqb_min_order: minimum allocation order for slabs
63 */
64static int slqb_min_order;
65
66/*
67 * slqb_min_objects: minimum number of objects per slab. Increasing this
68 * will increase the allocation order for slabs with larger objects
69 */
70static int slqb_min_objects = 1;
71
72#ifdef CONFIG_NUMA
73static inline int slab_numa(struct kmem_cache *s)
74{
75 return s->flags & SLAB_NUMA;
76}
77#else
78static inline int slab_numa(struct kmem_cache *s)
79{
80 return 0;
81}
82#endif
83
84static inline int slab_hiwater(struct kmem_cache *s)
85{
86 return s->hiwater;
87}
88
89static inline int slab_freebatch(struct kmem_cache *s)
90{
91 return s->freebatch;
92}
93
94/*
95 * Lock order:
96 * kmem_cache_node->list_lock
97 * kmem_cache_remote_free->lock
98 *
99 * Data structures:
100 * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
101 *
102 * - A LIFO list of node-local objects. Allocation and freeing of node local
103 * objects goes first to this list.
104 *
 105 * - Two lists of slab pages, free and partial pages. If an allocation misses
106 * the object list, it tries from the partial list, then the free list.
107 * After freeing an object to the object list, if it is over a watermark,
108 * some objects are freed back to pages. If an allocation misses these lists,
109 * a new slab page is allocated from the page allocator. If the free list
110 * reaches a watermark, some of its pages are returned to the page allocator.
111 *
112 * - A remote free queue, where objects freed that did not come from the local
113 * node are queued to. When this reaches a watermark, the objects are
114 * flushed.
115 *
116 * - A remotely freed queue, where objects allocated from this CPU are flushed
117 * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
118 * used to protect access to this queue.
119 *
120 * When the remotely freed queue reaches a watermark, a flag is set to tell
121 * the owner CPU to check it. The owner CPU will then check the queue on the
122 * next allocation that misses the object list. It will move all objects from
123 * this list onto the object list and then allocate one.
124 *
125 * This system of remote queueing is intended to reduce lock and remote
126 * cacheline acquisitions, and give a cooling off period for remotely freed
127 * objects before they are re-allocated.
128 *
 129 * Node-specific allocations from somewhere other than the local node are
130 * handled by a per-node list which is the same as the above per-CPU data
131 * structures except for the following differences:
132 *
133 * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
134 * allocate from a given node.
135 *
136 * - There is no remote free queue. Nodes don't free objects, CPUs do.
137 */
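A hedged illustration of the remote-freeing scheme described in the comment above: the owner CPU drains the queue that other CPUs filled only when an allocation misses its local object list and the watermark flag was set. Every structure and field name below is invented for the sketch and does not appear in SLQB itself.

#include <linux/spinlock.h>

struct sketch_object {
	struct sketch_object *next;
};

struct sketch_cpu_cache {
	struct sketch_object *local_free;	/* LIFO of node-local objects */
	spinlock_t            remote_lock;	/* protects the queue other CPUs fill */
	struct sketch_object *remotely_freed;	/* objects other CPUs handed back */
	bool                  check_remote;	/* set by a remote CPU at its watermark */
};

static void *sketch_alloc(struct sketch_cpu_cache *c)
{
	struct sketch_object *obj;

	if (!c->local_free && c->check_remote) {
		/* allocation missed the object list: drain the remotely freed queue */
		spin_lock(&c->remote_lock);
		c->local_free = c->remotely_freed;
		c->remotely_freed = NULL;
		c->check_remote = false;
		spin_unlock(&c->remote_lock);
	}

	obj = c->local_free;		/* may still be NULL: fall back to slab pages */
	if (obj)
		c->local_free = obj->next;
	return obj;
}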
138
139static inline void slqb_stat_inc(struct kmem_cache_list *list,
140 enum stat_item si)
141{
142#ifdef CONFIG_SLQB_STATS
143 list->stats[si]++;
144#endif
145}
146
147static inline void slqb_stat_add(struct kmem_cache_list *list,
148 enum stat_item si, unsigned long nr)
149{
150#ifdef CONFIG_SLQB_STATS
151 list->stats[si] += nr;
152#endif
153}
154
155static inline int slqb_page_to_nid(struct slqb_page *page)
156{
157 return page_to_nid(&page->page);
158}
159
160static inline void *slqb_page_address(struct slqb_page *page)
161{
162 return page_address(&page->page);
163}
164
165static inline struct zone *slqb_page_zone(struct slqb_page *page)
166{
167 return page_zone(&page->page);
168}
169
170static inline int virt_to_nid(const void *addr)
171{
172 return page_to_nid(virt_to_page(addr));
173}
174
175static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
176{
177 struct page *p;
178
179 p = virt_to_head_page(addr);
180 return (struct slqb_page *)p;
181}
182
183static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
184 int pages)
185{
186 struct page *p = &page->page;
187
188 reset_page_mapcount(p);
189 p->mapping = NULL;
190 VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
191 p->flags &= ~PG_SLQB_BIT;
192
193 if (current->reclaim_state)
194 current->reclaim_state->reclaimed_slab += pages;
195 __free_pages(p, order);
196}
197
198#ifdef CONFIG_SLQB_DEBUG
199static inline int slab_debug(struct kmem_cache *s)
200{
201 return s->flags &
202 (SLAB_DEBUG_FREE |
203 SLAB_RED_ZONE |
204 SLAB_POISON |
205 SLAB_STORE_USER |
206 SLAB_TRACE);
207}
208static inline int slab_poison(struct kmem_cache *s)
209{
210 return s->flags & SLAB_POISON;
211}
212#else
213static inline int slab_debug(struct kmem_cache *s)
214{
215 return 0;
216}
217static inline int slab_poison(struct kmem_cache *s)
218{
219 return 0;
220}
221#endif
222
223#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
224 SLAB_POISON | SLAB_STORE_USER)
225
226/* Internal SLQB flags */
227#define __OBJECT_POISON 0x80000000 /* Poison object */
228
229/* Not all arches define cache_line_size */
230#ifndef cache_line_size
231#define cache_line_size() L1_CACHE_BYTES
232#endif
233
234#ifdef CONFIG_SMP
235static struct notifier_block slab_notifier;
236#endif
237
238/*
239 * slqb_lock protects slab_caches list and serialises hotplug operations.
 240 * Hotplug operations take the lock for write; other operations can hold off
 241 * hotplug by taking it for read (or write).
242 */
243static DECLARE_RWSEM(slqb_lock);
244
245/*
246 * A list of all slab caches on the system
247 */
248static LIST_HEAD(slab_caches);
249
250/*
251 * Tracking user of a slab.
252 */
253struct track {
254 unsigned long addr; /* Called from address */
255 int cpu; /* Was running on cpu */
256 int pid; /* Pid context */
257 unsigned long when; /* When did the operation occur */
258};
259
260enum track_item { TRACK_ALLOC, TRACK_FREE };
261
262static struct kmem_cache kmem_cache_cache;
263
264#ifdef CONFIG_SLQB_SYSFS
265static int sysfs_slab_add(struct kmem_cache *s);
266static void sysfs_slab_remove(struct kmem_cache *s);
267#else
268static inline int sysfs_slab_add(struct kmem_cache *s)
269{
270 return 0;
271}
272static inline void sysfs_slab_remove(struct kmem_cache *s)
273{
274 kmem_cache_free(&kmem_cache_cache, s);
275}
276#endif
277
278/********************************************************************
279 * Core slab cache functions
280 *******************************************************************/
281
282static int __slab_is_available __read_mostly;
283int slab_is_available(void)
284{
285 return __slab_is_available;
286}
287
288static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
289{
290#ifdef CONFIG_SMP
291 VM_BUG_ON(!s->cpu_slab[cpu]);
292 return s->cpu_slab[cpu];
293#else
294 return &s->cpu_slab;
295#endif
296}
297
298static inline int check_valid_pointer(struct kmem_cache *s,
299 struct slqb_page *page, const void *object)
300{
301 void *base;
302
303 base = slqb_page_address(page);
304 if (object < base || object >= base + s->objects * s->size ||
305 (object - base) % s->size) {
306 return 0;
307 }
308
309 return 1;
310}
311
312static inline void *get_freepointer(struct kmem_cache *s, void *object)
313{
314 return *(void **)(object + s->offset);
315}
316
317static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
318{
319 *(void **)(object + s->offset) = fp;
320}
321
322/* Loop over all objects in a slab */
323#define for_each_object(__p, __s, __addr) \
324 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
325 __p += (__s)->size)
326
327/* Scan freelist */
328#define for_each_free_object(__p, __s, __free) \
329 for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
330 __p))
331
332#ifdef CONFIG_SLQB_DEBUG
333/*
334 * Debug settings:
335 */
336#ifdef CONFIG_SLQB_DEBUG_ON
337static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
338#else
339static int slqb_debug __read_mostly;
340#endif
341
342static char *slqb_debug_slabs;
343
344/*
345 * Object debugging
346 */
347static void print_section(char *text, u8 *addr, unsigned int length)
348{
349 int i, offset;
350 int newline = 1;
351 char ascii[17];
352
353 ascii[16] = 0;
354
355 for (i = 0; i < length; i++) {
356 if (newline) {
357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358 newline = 0;
359 }
360 printk(KERN_CONT " %02x", addr[i]);
361 offset = i % 16;
362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363 if (offset == 15) {
364 printk(KERN_CONT " %s\n", ascii);
365 newline = 1;
366 }
367 }
368 if (!newline) {
369 i %= 16;
370 while (i < 16) {
371 printk(KERN_CONT " ");
372 ascii[i] = ' ';
373 i++;
374 }
375 printk(KERN_CONT " %s\n", ascii);
376 }
377}
378
379static struct track *get_track(struct kmem_cache *s, void *object,
380 enum track_item alloc)
381{
382 struct track *p;
383
384 if (s->offset)
385 p = object + s->offset + sizeof(void *);
386 else
387 p = object + s->inuse;
388
389 return p + alloc;
390}
391
392static void set_track(struct kmem_cache *s, void *object,
393 enum track_item alloc, unsigned long addr)
394{
395 struct track *p;
396
397 if (s->offset)
398 p = object + s->offset + sizeof(void *);
399 else
400 p = object + s->inuse;
401
402 p += alloc;
403 if (addr) {
404 p->addr = addr;
405 p->cpu = raw_smp_processor_id();
406 p->pid = current ? current->pid : -1;
407 p->when = jiffies;
408 } else
409 memset(p, 0, sizeof(struct track));
410}
411
412static void init_tracking(struct kmem_cache *s, void *object)
413{
414 if (!(s->flags & SLAB_STORE_USER))
415 return;
416
417 set_track(s, object, TRACK_FREE, 0UL);
418 set_track(s, object, TRACK_ALLOC, 0UL);
419}
420
421static void print_track(const char *s, struct track *t)
422{
423 if (!t->addr)
424 return;
425
426 printk(KERN_ERR "INFO: %s in ", s);
427 __print_symbol("%s", (unsigned long)t->addr);
428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
429}
430
431static void print_tracking(struct kmem_cache *s, void *object)
432{
433 if (!(s->flags & SLAB_STORE_USER))
434 return;
435
436 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437 print_track("Freed", get_track(s, object, TRACK_FREE));
438}
439
440static void print_page_info(struct slqb_page *page)
441{
442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443 page, page->inuse, page->freelist, page->flags);
444
445}
446
447#define MAX_ERR_STR 100
448static void slab_bug(struct kmem_cache *s, char *fmt, ...)
449{
450 va_list args;
451 char buf[MAX_ERR_STR];
452
453 va_start(args, fmt);
454 vsnprintf(buf, sizeof(buf), fmt, args);
455 va_end(args);
456 printk(KERN_ERR "========================================"
457 "=====================================\n");
458 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
459 printk(KERN_ERR "----------------------------------------"
460 "-------------------------------------\n\n");
461}
462
463static void slab_fix(struct kmem_cache *s, char *fmt, ...)
464{
465 va_list args;
466 char buf[100];
467
468 va_start(args, fmt);
469 vsnprintf(buf, sizeof(buf), fmt, args);
470 va_end(args);
471 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
472}
473
474static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
475{
476 unsigned int off; /* Offset of last byte */
477 u8 *addr = slqb_page_address(page);
478
479 print_tracking(s, p);
480
481 print_page_info(page);
482
483 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
484 p, p - addr, get_freepointer(s, p));
485
486 if (p > addr + 16)
487 print_section("Bytes b4", p - 16, 16);
488
489 print_section("Object", p, min(s->objsize, 128));
490
491 if (s->flags & SLAB_RED_ZONE)
492 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
493
494 if (s->offset)
495 off = s->offset + sizeof(void *);
496 else
497 off = s->inuse;
498
499 if (s->flags & SLAB_STORE_USER)
500 off += 2 * sizeof(struct track);
501
502 if (off != s->size) {
503 /* Beginning of the filler is the free pointer */
504 print_section("Padding", p + off, s->size - off);
505 }
506
507 dump_stack();
508}
509
510static void object_err(struct kmem_cache *s, struct slqb_page *page,
511 u8 *object, char *reason)
512{
513 slab_bug(s, reason);
514 print_trailer(s, page, object);
515}
516
517static void slab_err(struct kmem_cache *s, struct slqb_page *page,
518 char *fmt, ...)
519{
520 slab_bug(s, fmt);
521 print_page_info(page);
522 dump_stack();
523}
524
525static void init_object(struct kmem_cache *s, void *object, int active)
526{
527 u8 *p = object;
528
529 if (s->flags & __OBJECT_POISON) {
530 memset(p, POISON_FREE, s->objsize - 1);
531 p[s->objsize - 1] = POISON_END;
532 }
533
534 if (s->flags & SLAB_RED_ZONE) {
535 memset(p + s->objsize,
536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
537 s->inuse - s->objsize);
538 }
539}
540
541static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
542{
543 while (bytes) {
544 if (*start != (u8)value)
545 return start;
546 start++;
547 bytes--;
548 }
549 return NULL;
550}
551
552static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
553 void *from, void *to)
554{
555 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
556 memset(from, data, to - from);
557}
558
559static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
560 u8 *object, char *what,
561 u8 *start, unsigned int value, unsigned int bytes)
562{
563 u8 *fault;
564 u8 *end;
565
566 fault = check_bytes(start, value, bytes);
567 if (!fault)
568 return 1;
569
570 end = start + bytes;
571 while (end > fault && end[-1] == value)
572 end--;
573
574 slab_bug(s, "%s overwritten", what);
575 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
576 fault, end - 1, fault[0], value);
577 print_trailer(s, page, object);
578
579 restore_bytes(s, what, value, fault, end);
580 return 0;
581}
582
583/*
584 * Object layout:
585 *
586 * object address
587 * Bytes of the object to be managed.
588 * If the freepointer may overlay the object then the free
589 * pointer is the first word of the object.
590 *
591 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
592 * 0xa5 (POISON_END)
593 *
594 * object + s->objsize
595 * Padding to reach word boundary. This is also used for Redzoning.
596 * Padding is extended by another word if Redzoning is enabled and
597 * objsize == inuse.
598 *
599 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
600 * 0xcc (RED_ACTIVE) for objects in use.
601 *
602 * object + s->inuse
603 * Meta data starts here.
604 *
605 * A. Free pointer (if we cannot overwrite object on free)
606 * B. Tracking data for SLAB_STORE_USER
 607 * C. Padding to reach required alignment boundary or at minimum
 608 * 	one word if debugging is on to be able to detect writes
609 * before the word boundary.
610 *
611 * Padding is done using 0x5a (POISON_INUSE)
612 *
613 * object + s->size
614 * Nothing is used beyond s->size.
615 */
616
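The arithmetic implied by the layout above is what print_trailer() earlier and check_pad_bytes() below both perform; restated as a hedged stand-alone helper for illustration only.

/* Illustrative only: where the POISON_INUSE padding begins for a cache. */
static unsigned long metadata_end_sketch(struct kmem_cache *s)
{
	unsigned long off = s->offset ? s->offset + sizeof(void *) : s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);	/* TRACK_ALLOC + TRACK_FREE */

	return off;	/* bytes [off, s->size) are POISON_INUSE filler */
}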
617static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
618{
619 unsigned long off = s->inuse; /* The end of info */
620
621 if (s->offset) {
622 /* Freepointer is placed after the object. */
623 off += sizeof(void *);
624 }
625
626 if (s->flags & SLAB_STORE_USER) {
627 /* We also have user information there */
628 off += 2 * sizeof(struct track);
629 }
630
631 if (s->size == off)
632 return 1;
633
634 return check_bytes_and_report(s, page, p, "Object padding",
635 p + off, POISON_INUSE, s->size - off);
636}
637
638static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
639{
640 u8 *start;
641 u8 *fault;
642 u8 *end;
643 int length;
644 int remainder;
645
646 if (!(s->flags & SLAB_POISON))
647 return 1;
648
649 start = slqb_page_address(page);
650 end = start + (PAGE_SIZE << s->order);
651 length = s->objects * s->size;
652 remainder = end - (start + length);
653 if (!remainder)
654 return 1;
655
656 fault = check_bytes(start + length, POISON_INUSE, remainder);
657 if (!fault)
658 return 1;
659
660 while (end > fault && end[-1] == POISON_INUSE)
661 end--;
662
663 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
664 print_section("Padding", start, length);
665
666 restore_bytes(s, "slab padding", POISON_INUSE, start, end);
667 return 0;
668}
669
670static int check_object(struct kmem_cache *s, struct slqb_page *page,
671 void *object, int active)
672{
673 u8 *p = object;
674 u8 *endobject = object + s->objsize;
675
676 if (s->flags & SLAB_RED_ZONE) {
677 unsigned int red =
678 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
679
680 if (!check_bytes_and_report(s, page, object, "Redzone",
681 endobject, red, s->inuse - s->objsize))
682 return 0;
683 } else {
684 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
685 check_bytes_and_report(s, page, p, "Alignment padding",
686 endobject, POISON_INUSE, s->inuse - s->objsize);
687 }
688 }
689
690 if (s->flags & SLAB_POISON) {
691 if (!active && (s->flags & __OBJECT_POISON)) {
692 if (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1))
694 return 0;
695
696 if (!check_bytes_and_report(s, page, p, "Poison",
697 p + s->objsize - 1, POISON_END, 1))
698 return 0;
699 }
700
701 /*
702 * check_pad_bytes cleans up on its own.
703 */
704 check_pad_bytes(s, page, p);
705 }
706
707 return 1;
708}
709
710static int check_slab(struct kmem_cache *s, struct slqb_page *page)
711{
712 if (!(page->flags & PG_SLQB_BIT)) {
713 slab_err(s, page, "Not a valid slab page");
714 return 0;
715 }
716 if (page->inuse == 0) {
 717 		slab_err(s, page, "inuse before free / after alloc");
718 return 0;
719 }
720 if (page->inuse > s->objects) {
 721 		slab_err(s, page, "inuse %u > max %u",
 722 			page->inuse, s->objects);
723 return 0;
724 }
725 /* Slab_pad_check fixes things up after itself */
726 slab_pad_check(s, page);
727 return 1;
728}
729
730static void trace(struct kmem_cache *s, struct slqb_page *page,
731 void *object, int alloc)
732{
733 if (s->flags & SLAB_TRACE) {
734 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
735 s->name,
736 alloc ? "alloc" : "free",
737 object, page->inuse,
738 page->freelist);
739
740 if (!alloc)
741 print_section("Object", (void *)object, s->objsize);
742
743 dump_stack();
744 }
745}
746
747static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
748 void *object)
749{
750 if (!slab_debug(s))
751 return;
752
753 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
754 return;
755
756 init_object(s, object, 0);
757 init_tracking(s, object);
758}
759
760static int alloc_debug_processing(struct kmem_cache *s,
761 void *object, unsigned long addr)
762{
763 struct slqb_page *page;
764 page = virt_to_head_slqb_page(object);
765
766 if (!check_slab(s, page))
767 goto bad;
768
769 if (!check_valid_pointer(s, page, object)) {
770 object_err(s, page, object, "Freelist Pointer check fails");
771 goto bad;
772 }
773
774 if (object && !check_object(s, page, object, 0))
775 goto bad;
776
 777 	/* Success: perform special debug activities for allocs */
778 if (s->flags & SLAB_STORE_USER)
779 set_track(s, object, TRACK_ALLOC, addr);
780 trace(s, page, object, 1);
781 init_object(s, object, 1);
782 return 1;
783
784bad:
785 return 0;
786}
787
788static int free_debug_processing(struct kmem_cache *s,
789 void *object, unsigned long addr)
790{
791 struct slqb_page *page;
792 page = virt_to_head_slqb_page(object);
793
794 if (!check_slab(s, page))
795 goto fail;
796
797 if (!check_valid_pointer(s, page, object)) {
798 slab_err(s, page, "Invalid object pointer 0x%p", object);
799 goto fail;
800 }
801
802 if (!check_object(s, page, object, 1))
803 return 0;
804
805 /* Special debug activities for freeing objects */
806 if (s->flags & SLAB_STORE_USER)
807 set_track(s, object, TRACK_FREE, addr);
808 trace(s, page, object, 0);
809 init_object(s, object, 0);
810 return 1;
811
812fail:
813 slab_fix(s, "Object at 0x%p not freed", object);
814 return 0;
815}
816
817static int __init setup_slqb_debug(char *str)
818{
819 slqb_debug = DEBUG_DEFAULT_FLAGS;
820 if (*str++ != '=' || !*str) {
821 /*
822 * No options specified. Switch on full debugging.
823 */
824 goto out;
825 }
826
827 if (*str == ',') {
828 /*
829 * No options but restriction on slabs. This means full
830 * debugging for slabs matching a pattern.
831 */
832 goto check_slabs;
833 }
834
835 slqb_debug = 0;
836 if (*str == '-') {
837 /*
838 * Switch off all debugging measures.
839 */
840 goto out;
841 }
842
843 /*
844 * Determine which debug features should be switched on
845 */
846 for (; *str && *str != ','; str++) {
847 switch (tolower(*str)) {
848 case 'f':
849 slqb_debug |= SLAB_DEBUG_FREE;
850 break;
851 case 'z':
852 slqb_debug |= SLAB_RED_ZONE;
853 break;
854 case 'p':
855 slqb_debug |= SLAB_POISON;
856 break;
857 case 'u':
858 slqb_debug |= SLAB_STORE_USER;
859 break;
860 case 't':
861 slqb_debug |= SLAB_TRACE;
862 break;
863 case 'a':
864 slqb_debug |= SLAB_FAILSLAB;
865 break;
866 default:
867 printk(KERN_ERR "slqb_debug option '%c' "
868 "unknown. skipped\n", *str);
869 }
870 }
871
872check_slabs:
873 if (*str == ',')
874 slqb_debug_slabs = str + 1;
875out:
876 return 1;
877}
878__setup("slqb_debug", setup_slqb_debug);
879
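The parser above takes an option string followed by an optional cache-name prefix; the flag letters may be given in either case (they are folded with tolower()), and the name pattern is a prefix match via strncmp() in kmem_cache_flags(). Illustrative boot-command-line values:

    slqb_debug              enable the default set (sanity checks, red zoning,
                            poisoning, user tracking) for all caches
    slqb_debug=P,kmalloc-   poisoning only, restricted to caches whose names
                            start with "kmalloc-"
    slqb_debug=-            switch all debug features off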
880static int __init setup_slqb_min_order(char *str)
881{
882 get_option(&str, &slqb_min_order);
883 slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
884
885 return 1;
886}
887__setup("slqb_min_order=", setup_slqb_min_order);
888
889static int __init setup_slqb_min_objects(char *str)
890{
891 get_option(&str, &slqb_min_objects);
892
893 return 1;
894}
895
896__setup("slqb_min_objects=", setup_slqb_min_objects);
897
898static unsigned long kmem_cache_flags(unsigned long objsize,
899 unsigned long flags, const char *name,
900 void (*ctor)(void *))
901{
902 /*
903 * Enable debugging if selected on the kernel commandline.
904 */
905 if (slqb_debug && (!slqb_debug_slabs ||
906 strncmp(slqb_debug_slabs, name,
907 strlen(slqb_debug_slabs)) == 0))
908 flags |= slqb_debug;
909
910 if (num_possible_nodes() > 1)
911 flags |= SLAB_NUMA;
912
913 return flags;
914}
915#else
916static inline void setup_object_debug(struct kmem_cache *s,
917 struct slqb_page *page, void *object)
918{
919}
920
921static inline int alloc_debug_processing(struct kmem_cache *s,
922 void *object, unsigned long addr)
923{
924 return 0;
925}
926
927static inline int free_debug_processing(struct kmem_cache *s,
928 void *object, unsigned long addr)
929{
930 return 0;
931}
932
933static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
934{
935 return 1;
936}
937
938static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
939 void *object, int active)
940{
941 return 1;
942}
943
944static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
945{
946}
947
948static inline unsigned long kmem_cache_flags(unsigned long objsize,
949 unsigned long flags, const char *name, void (*ctor)(void *))
950{
951 if (num_possible_nodes() > 1)
952 flags |= SLAB_NUMA;
953 return flags;
954}
955
956static const int slqb_debug;
957#endif
958
959/*
960 * allocate a new slab (return its corresponding struct slqb_page)
961 */
962static struct slqb_page *allocate_slab(struct kmem_cache *s,
963 gfp_t flags, int node)
964{
965 struct slqb_page *page;
966 int pages = 1 << s->order;
967
968 flags |= s->allocflags;
969
970 page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
971 if (!page)
972 return NULL;
973
974 mod_zone_page_state(slqb_page_zone(page),
975 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
976 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
977 pages);
978
979 return page;
980}
981
982/*
983 * Called once for each object on a new slab page
984 */
985static void setup_object(struct kmem_cache *s,
986 struct slqb_page *page, void *object)
987{
988 setup_object_debug(s, page, object);
989 if (unlikely(s->ctor))
990 s->ctor(object);
991}
992
993/*
994 * Allocate a new slab, set up its object list.
995 */
996static struct slqb_page *new_slab_page(struct kmem_cache *s,
997 gfp_t flags, int node, unsigned int colour)
998{
999 struct slqb_page *page;
1000 void *start;
1001 void *last;
1002 void *p;
1003
1004 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1005
1006 page = allocate_slab(s,
1007 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1008 if (!page)
1009 goto out;
1010
1011 page->flags |= PG_SLQB_BIT;
1012
1013 start = page_address(&page->page);
1014
1015 if (unlikely(slab_poison(s)))
1016 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1017
1018 start += colour;
1019
1020 last = start;
1021 for_each_object(p, s, start) {
1022 setup_object(s, page, p);
1023 set_freepointer(s, last, p);
1024 last = p;
1025 }
1026 set_freepointer(s, last, NULL);
1027
1028 page->freelist = start;
1029 page->inuse = 0;
1030out:
1031 return page;
1032}
1033
1034/*
1035 * Free a slab page back to the page allocator
1036 */
1037static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1038{
1039 int pages = 1 << s->order;
1040
1041 if (unlikely(slab_debug(s))) {
1042 void *p;
1043
1044 slab_pad_check(s, page);
1045 for_each_free_object(p, s, page->freelist)
1046 check_object(s, page, p, 0);
1047 }
1048
1049 mod_zone_page_state(slqb_page_zone(page),
1050 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1051 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1052 -pages);
1053
1054 __free_slqb_pages(page, s->order, pages);
1055}
1056
1057static void rcu_free_slab(struct rcu_head *h)
1058{
1059 struct slqb_page *page;
1060
1061 page = container_of(h, struct slqb_page, rcu_head);
1062 __free_slab(page->list->cache, page);
1063}
1064
1065static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1066{
1067 VM_BUG_ON(page->inuse);
1068 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1069 call_rcu(&page->rcu_head, rcu_free_slab);
1070 else
1071 __free_slab(s, page);
1072}
1073
1074/*
1075 * Return an object to its slab.
1076 *
1077 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1078 * list_lock in the case of per-node list.
1079 */
1080static int free_object_to_page(struct kmem_cache *s,
1081 struct kmem_cache_list *l, struct slqb_page *page,
1082 void *object)
1083{
1084 VM_BUG_ON(page->list != l);
1085
1086 set_freepointer(s, object, page->freelist);
1087 page->freelist = object;
1088 page->inuse--;
1089
1090 if (!page->inuse) {
1091 if (likely(s->objects > 1)) {
1092 l->nr_partial--;
1093 list_del(&page->lru);
1094 }
1095 l->nr_slabs--;
1096 free_slab(s, page);
1097 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1098 return 1;
1099
1100 } else if (page->inuse + 1 == s->objects) {
1101 l->nr_partial++;
1102 list_add(&page->lru, &l->partial);
1103 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1104 return 0;
1105 }
1106 return 0;
1107}
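/*
 * Illustrative note (not part of the original source): a page moves through
 * three states above. Freeing the last in-use object releases the whole slab
 * back to the page allocator; freeing an object from a previously-full page
 * puts that page back on the partial list; anything in between leaves the
 * page where it already is.
 */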
1108
1109#ifdef CONFIG_SMP
1110static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1111 void *object, struct kmem_cache_cpu *c);
1112#endif
1113
1114/*
1115 * Flush the LIFO freelist of objects on a list. Objects are sent back to their
1116 * pages if those pages belong to this list, or to our CPU's remote-free list
1117 * if they do not.
1118 *
1119 * Doesn't flush the entire list. flush_free_list_all does.
1120 *
1121 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1122 * list_lock in the case of per-node list.
1123 */
1124static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1125{
1126 void **head;
1127 int nr;
1128 int locked = 0;
1129
1130 nr = l->freelist.nr;
1131 if (unlikely(!nr))
1132 return;
1133
1134 nr = min(slab_freebatch(s), nr);
1135
1136 slqb_stat_inc(l, FLUSH_FREE_LIST);
1137 slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1138
1139 l->freelist.nr -= nr;
1140 head = l->freelist.head;
1141
1142 do {
1143 struct slqb_page *page;
1144 void **object;
1145
1146 object = head;
1147 VM_BUG_ON(!object);
1148 head = get_freepointer(s, object);
1149 page = virt_to_head_slqb_page(object);
1150
1151#ifdef CONFIG_SMP
1152 if (page->list != l) {
1153 struct kmem_cache_cpu *c;
1154
1155 if (locked) {
1156 spin_unlock(&l->page_lock);
1157 locked = 0;
1158 }
1159
1160 c = get_cpu_slab(s, smp_processor_id());
1161
1162 slab_free_to_remote(s, page, object, c);
1163 slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1164 } else
1165#endif
1166 {
1167 if (!locked) {
1168 spin_lock(&l->page_lock);
1169 locked = 1;
1170 }
1171 free_object_to_page(s, l, page, object);
1172 }
1173
1174 nr--;
1175 } while (nr);
1176
1177 if (locked)
1178 spin_unlock(&l->page_lock);
1179
1180 l->freelist.head = head;
1181 if (!l->freelist.nr)
1182 l->freelist.tail = NULL;
1183}
1184
1185static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1186{
1187 while (l->freelist.nr)
1188 flush_free_list(s, l);
1189}
1190
1191#ifdef CONFIG_SMP
1192/*
1193 * If enough objects have been remotely freed back to this list,
1194 * remote_free_check will be set, in which case we'll eventually come here
1195 * to take those objects off our remote_free list and onto our LIFO freelist.
1196 *
1197 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1198 * list_lock in the case of per-node list.
1199 */
1200static void claim_remote_free_list(struct kmem_cache *s,
1201 struct kmem_cache_list *l)
1202{
1203 void **head, **tail;
1204 int nr;
1205
1206 if (!l->remote_free.list.nr)
1207 return;
1208
1209 spin_lock(&l->remote_free.lock);
1210
1211 l->remote_free_check = 0;
1212 head = l->remote_free.list.head;
1213 l->remote_free.list.head = NULL;
1214 tail = l->remote_free.list.tail;
1215 l->remote_free.list.tail = NULL;
1216 nr = l->remote_free.list.nr;
1217 l->remote_free.list.nr = 0;
1218
1219 spin_unlock(&l->remote_free.lock);
1220
1221 VM_BUG_ON(!nr);
1222
1223 if (!l->freelist.nr) {
1224 /* Get head hot for likely subsequent allocation or flush */
1225 prefetchw(head);
1226 l->freelist.head = head;
1227 } else
1228 set_freepointer(s, l->freelist.tail, head);
1229 l->freelist.tail = tail;
1230
1231 l->freelist.nr += nr;
1232
1233 slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1234 slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1235}
1236#else
1237static inline void claim_remote_free_list(struct kmem_cache *s,
1238 struct kmem_cache_list *l)
1239{
1240}
1241#endif
1242
1243/*
1244 * Allocation fastpath. Get an object from the list's LIFO freelist, or
1245 * return NULL if it is empty.
1246 *
1247 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1248 * list_lock in the case of per-node list.
1249 */
1250static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1251 struct kmem_cache_list *l)
1252{
1253 void *object;
1254
1255 object = l->freelist.head;
1256 if (likely(object)) {
1257 void *next = get_freepointer(s, object);
1258
1259 VM_BUG_ON(!l->freelist.nr);
1260 l->freelist.nr--;
1261 l->freelist.head = next;
1262
1263 return object;
1264 }
1265 VM_BUG_ON(l->freelist.nr);
1266
1267#ifdef CONFIG_SMP
1268 if (unlikely(l->remote_free_check)) {
1269 claim_remote_free_list(s, l);
1270
1271 if (l->freelist.nr > slab_hiwater(s))
1272 flush_free_list(s, l);
1273
1274 /* repetition here helps gcc :( */
1275 object = l->freelist.head;
1276 if (likely(object)) {
1277 void *next = get_freepointer(s, object);
1278
1279 VM_BUG_ON(!l->freelist.nr);
1280 l->freelist.nr--;
1281 l->freelist.head = next;
1282
1283 return object;
1284 }
1285 VM_BUG_ON(l->freelist.nr);
1286 }
1287#endif
1288
1289 return NULL;
1290}
1291
1292/*
1293 * Slow(er) path. Get a page from this list's existing pages. Will be a
1294 * new empty page in the case that __slab_alloc_page has just been called
1295 * (empty pages otherwise never get queued up on the lists), or a partial page
1296 * already on the list.
1297 *
1298 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1299 * list_lock in the case of per-node list.
1300 */
1301static noinline void *__cache_list_get_page(struct kmem_cache *s,
1302 struct kmem_cache_list *l)
1303{
1304 struct slqb_page *page;
1305 void *object;
1306
1307 if (unlikely(!l->nr_partial))
1308 return NULL;
1309
1310 page = list_first_entry(&l->partial, struct slqb_page, lru);
1311 VM_BUG_ON(page->inuse == s->objects);
1312 if (page->inuse + 1 == s->objects) {
1313 l->nr_partial--;
1314 list_del(&page->lru);
1315 }
1316
1317 VM_BUG_ON(!page->freelist);
1318
1319 page->inuse++;
1320
1321 object = page->freelist;
1322 page->freelist = get_freepointer(s, object);
1323 if (page->freelist)
1324 prefetchw(page->freelist);
1325 VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1326 slqb_stat_inc(l, ALLOC_SLAB_FILL);
1327
1328 return object;
1329}
1330
1331static void *cache_list_get_page(struct kmem_cache *s,
1332 struct kmem_cache_list *l)
1333{
1334 void *object;
1335
1336 if (unlikely(!l->nr_partial))
1337 return NULL;
1338
1339 spin_lock(&l->page_lock);
1340 object = __cache_list_get_page(s, l);
1341 spin_unlock(&l->page_lock);
1342
1343 return object;
1344}
1345
1346/*
1347 * Allocation slowpath. Allocate a new slab page from the page allocator, and
1348 * put it on the list's partial list. Must be followed by an allocation so
1349 * that we don't have dangling empty pages on the partial list.
1350 *
1351 * Returns 0 on allocation failure.
1352 *
1353 * Must be called with interrupts disabled.
1354 */
1355static noinline void *__slab_alloc_page(struct kmem_cache *s,
1356 gfp_t gfpflags, int node)
1357{
1358 struct slqb_page *page;
1359 struct kmem_cache_list *l;
1360 struct kmem_cache_cpu *c;
1361 unsigned int colour;
1362 void *object;
1363
1364 c = get_cpu_slab(s, smp_processor_id());
1365 colour = c->colour_next;
1366 c->colour_next += s->colour_off;
1367 if (c->colour_next >= s->colour_range)
1368 c->colour_next = 0;
1369
1370 /* Caller handles __GFP_ZERO */
1371 gfpflags &= ~__GFP_ZERO;
1372
1373 if (gfpflags & __GFP_WAIT)
1374 local_irq_enable();
1375 page = new_slab_page(s, gfpflags, node, colour);
1376 if (gfpflags & __GFP_WAIT)
1377 local_irq_disable();
1378 if (unlikely(!page))
1379 return page;
1380
1381 if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1382 struct kmem_cache_cpu *c;
1383 int cpu = smp_processor_id();
1384
1385 c = get_cpu_slab(s, cpu);
1386 l = &c->list;
1387 page->list = l;
1388
1389 spin_lock(&l->page_lock);
1390 l->nr_slabs++;
1391 l->nr_partial++;
1392 list_add(&page->lru, &l->partial);
1393 slqb_stat_inc(l, ALLOC);
1394 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1395 object = __cache_list_get_page(s, l);
1396 spin_unlock(&l->page_lock);
1397 } else {
1398#ifdef CONFIG_NUMA
1399 struct kmem_cache_node *n;
1400
1401 n = s->node_slab[slqb_page_to_nid(page)];
1402 l = &n->list;
1403 page->list = l;
1404
1405 spin_lock(&n->list_lock);
1406 spin_lock(&l->page_lock);
1407 l->nr_slabs++;
1408 l->nr_partial++;
1409 list_add(&page->lru, &l->partial);
1410 slqb_stat_inc(l, ALLOC);
1411 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1412 object = __cache_list_get_page(s, l);
1413 spin_unlock(&l->page_lock);
1414 spin_unlock(&n->list_lock);
1415#endif
1416 }
1417 VM_BUG_ON(!object);
1418 return object;
1419}
1420
1421#ifdef CONFIG_NUMA
1422static noinline int alternate_nid(struct kmem_cache *s,
1423 gfp_t gfpflags, int node)
1424{
1425 if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1426 return node;
1427 if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1428 return cpuset_mem_spread_node();
1429 else if (current->mempolicy)
1430 return slab_node(current->mempolicy);
1431 return node;
1432}
1433
1434/*
1435 * Allocate an object from a remote node. Return NULL if none could be found
1436 * (in which case, caller should allocate a new slab)
1437 *
1438 * Must be called with interrupts disabled.
1439 */
1440static void *__remote_slab_alloc_node(struct kmem_cache *s,
1441 gfp_t gfpflags, int node)
1442{
1443 struct kmem_cache_node *n;
1444 struct kmem_cache_list *l;
1445 void *object;
1446
1447 n = s->node_slab[node];
1448 if (unlikely(!n)) /* node has no memory */
1449 return NULL;
1450 l = &n->list;
1451
1452 spin_lock(&n->list_lock);
1453
1454 object = __cache_list_get_object(s, l);
1455 if (unlikely(!object)) {
1456 object = cache_list_get_page(s, l);
1457 if (unlikely(!object)) {
1458 spin_unlock(&n->list_lock);
1459 return __slab_alloc_page(s, gfpflags, node);
1460 }
1461 }
1462 if (likely(object))
1463 slqb_stat_inc(l, ALLOC);
1464 spin_unlock(&n->list_lock);
1465 return object;
1466}
1467
1468static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1469 gfp_t gfpflags, int node)
1470{
1471 void *object;
1472 struct zonelist *zonelist;
1473 struct zoneref *z;
1474 struct zone *zone;
1475 enum zone_type high_zoneidx = gfp_zone(gfpflags);
1476
1477 object = __remote_slab_alloc_node(s, gfpflags, node);
1478 if (likely(object || (gfpflags & __GFP_THISNODE)))
1479 return object;
1480
1481 zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1482 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1483 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1484 continue;
1485
1486 node = zone_to_nid(zone);
1487 object = __remote_slab_alloc_node(s, gfpflags, node);
1488 if (likely(object))
1489 return object;
1490 }
1491 return NULL;
1492}
1493#endif
1494
1495/*
1496 * Main allocation path. Return an object, or NULL on allocation failure.
1497 *
1498 * Must be called with interrupts disabled.
1499 */
1500static __always_inline void *__slab_alloc(struct kmem_cache *s,
1501 gfp_t gfpflags, int node)
1502{
1503 void *object;
1504 struct kmem_cache_cpu *c;
1505 struct kmem_cache_list *l;
1506
1507#ifdef CONFIG_NUMA
1508 if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1509try_remote:
1510 return __remote_slab_alloc(s, gfpflags, node);
1511 }
1512#endif
1513
1514 c = get_cpu_slab(s, smp_processor_id());
1515 VM_BUG_ON(!c);
1516 l = &c->list;
1517 object = __cache_list_get_object(s, l);
1518 if (unlikely(!object)) {
1519#ifdef CONFIG_NUMA
1520 int thisnode = numa_node_id();
1521
1522 /*
1523		 * If the local node is memoryless, try remote allocation before
1524		 * trying the page allocator. Otherwise, what happens is that
1525		 * objects are always freed to remote lists but the allocation
1526		 * side always allocates a new page with only one object
1527		 * used in each page.
1528 */
1529 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1530 object = __remote_slab_alloc(s, gfpflags, thisnode);
1531#endif
1532
1533 if (!object) {
1534 object = cache_list_get_page(s, l);
1535 if (unlikely(!object)) {
1536 object = __slab_alloc_page(s, gfpflags, node);
1537#ifdef CONFIG_NUMA
1538 if (unlikely(!object)) {
1539 node = numa_node_id();
1540 goto try_remote;
1541 }
1542#endif
1543 return object;
1544 }
1545 }
1546 }
1547 if (likely(object))
1548 slqb_stat_inc(l, ALLOC);
1549 return object;
1550}
1551
1552/*
1553 * Perform some interrupts-on processing around the main allocation path
1554 * (debug checking and memset()ing).
1555 */
1556static __always_inline void *slab_alloc(struct kmem_cache *s,
1557 gfp_t gfpflags, int node, unsigned long addr)
1558{
1559 void *object;
1560 unsigned long flags;
1561
1562 gfpflags &= gfp_allowed_mask;
1563
1564 lockdep_trace_alloc(gfpflags);
1565 might_sleep_if(gfpflags & __GFP_WAIT);
1566
1567 if (should_failslab(s->objsize, gfpflags, s->flags))
1568 return NULL;
1569
1570again:
1571 local_irq_save(flags);
1572 object = __slab_alloc(s, gfpflags, node);
1573 local_irq_restore(flags);
1574
1575 if (unlikely(slab_debug(s)) && likely(object)) {
1576 if (unlikely(!alloc_debug_processing(s, object, addr)))
1577 goto again;
1578 }
1579
1580 if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1581 memset(object, 0, s->objsize);
1582
1583 return object;
1584}
1585
1586static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1587 gfp_t gfpflags, unsigned long caller)
1588{
1589 int node = -1;
1590
1591#ifdef CONFIG_NUMA
1592 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1593 node = alternate_nid(s, gfpflags, node);
1594#endif
1595 return slab_alloc(s, gfpflags, node, caller);
1596}
1597
1598void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1599{
1600 return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1601}
1602EXPORT_SYMBOL(kmem_cache_alloc);
1603
1604#ifdef CONFIG_NUMA
1605void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1606{
1607 return slab_alloc(s, gfpflags, node, _RET_IP_);
1608}
1609EXPORT_SYMBOL(kmem_cache_alloc_node);
1610#endif
1611
1612#ifdef CONFIG_SMP
1613/*
1614 * Flush this CPU's remote free list of objects back to the list from where
1615 * they originate. They end up on that list's remotely freed list, and
1616 * eventually we set its remote_free_check if there are enough objects on it.
1617 *
1618 * This seems convoluted, but it keeps us from stomping on the target CPU's
1619 * fastpath cachelines.
1620 *
1621 * Must be called with interrupts disabled.
1622 */
1623static void flush_remote_free_cache(struct kmem_cache *s,
1624 struct kmem_cache_cpu *c)
1625{
1626 struct kmlist *src;
1627 struct kmem_cache_list *dst;
1628 unsigned int nr;
1629 int set;
1630
1631 src = &c->rlist;
1632 nr = src->nr;
1633 if (unlikely(!nr))
1634 return;
1635
1636#ifdef CONFIG_SLQB_STATS
1637 {
1638 struct kmem_cache_list *l = &c->list;
1639
1640 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1641 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1642 }
1643#endif
1644
1645 dst = c->remote_cache_list;
1646
1647 /*
1648	 * Less common case: dst is filling up, so free synchronously.
1649	 * No point in having the remote CPU free these as it will just
1650 * free them back to the page list anyway.
1651 */
1652 if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1653 void **head;
1654
1655 head = src->head;
1656 spin_lock(&dst->page_lock);
1657 do {
1658 struct slqb_page *page;
1659 void **object;
1660
1661 object = head;
1662 VM_BUG_ON(!object);
1663 head = get_freepointer(s, object);
1664 page = virt_to_head_slqb_page(object);
1665
1666 free_object_to_page(s, dst, page, object);
1667 nr--;
1668 } while (nr);
1669 spin_unlock(&dst->page_lock);
1670
1671 src->head = NULL;
1672 src->tail = NULL;
1673 src->nr = 0;
1674
1675 return;
1676 }
1677
1678 spin_lock(&dst->remote_free.lock);
1679
1680 if (!dst->remote_free.list.head)
1681 dst->remote_free.list.head = src->head;
1682 else
1683 set_freepointer(s, dst->remote_free.list.tail, src->head);
1684 dst->remote_free.list.tail = src->tail;
1685
1686 src->head = NULL;
1687 src->tail = NULL;
1688 src->nr = 0;
1689
1690 if (dst->remote_free.list.nr < slab_freebatch(s))
1691 set = 1;
1692 else
1693 set = 0;
1694
1695 dst->remote_free.list.nr += nr;
1696
1697 if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1698 dst->remote_free_check = 1;
1699
1700 spin_unlock(&dst->remote_free.lock);
1701}
1702
1703/*
1704 * Free an object to this CPU's remote free list.
1705 *
1706 * Must be called with interrupts disabled.
1707 */
1708static noinline void slab_free_to_remote(struct kmem_cache *s,
1709 struct slqb_page *page, void *object,
1710 struct kmem_cache_cpu *c)
1711{
1712 struct kmlist *r;
1713
1714 /*
1715 * Our remote free list corresponds to a different list. Must
1716 * flush it and switch.
1717 */
1718 if (page->list != c->remote_cache_list) {
1719 flush_remote_free_cache(s, c);
1720 c->remote_cache_list = page->list;
1721 }
1722
1723 r = &c->rlist;
1724 if (!r->head)
1725 r->head = object;
1726 else
1727 set_freepointer(s, r->tail, object);
1728 set_freepointer(s, object, NULL);
1729 r->tail = object;
1730 r->nr++;
1731
1732 if (unlikely(r->nr >= slab_freebatch(s)))
1733 flush_remote_free_cache(s, c);
1734}
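/*
 * Illustrative walk-through (not part of the original source): say an object
 * belonging to CPU 0's list is freed on CPU 1. CPU 1 queues it on its own
 * rlist above; once the rlist reaches slab_freebatch(s) objects, or the next
 * free targets a different list, flush_remote_free_cache() splices the whole
 * batch onto the owning list's remote_free queue and may set
 * remote_free_check. CPU 0 later pulls the objects back onto its LIFO
 * freelist via claim_remote_free_list().
 */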
1735#endif
1736
1737/*
1738 * Main freeing path. Free an object back to its slab list.
1739 *
1740 * Must be called with interrupts disabled.
1741 */
1742static __always_inline void __slab_free(struct kmem_cache *s,
1743 struct slqb_page *page, void *object)
1744{
1745 struct kmem_cache_cpu *c;
1746 struct kmem_cache_list *l;
1747 int thiscpu = smp_processor_id();
1748
1749 c = get_cpu_slab(s, thiscpu);
1750 l = &c->list;
1751
1752 slqb_stat_inc(l, FREE);
1753
1754 if (!NUMA_BUILD || !slab_numa(s) ||
1755 likely(slqb_page_to_nid(page) == numa_node_id())) {
1756 /*
1757 * Freeing fastpath. Collects all local-node objects, not
1758 * just those allocated from our per-CPU list. This allows
1759 * fast transfer of objects from one CPU to another within
1760 * a given node.
1761 */
1762 set_freepointer(s, object, l->freelist.head);
1763 l->freelist.head = object;
1764 if (!l->freelist.nr)
1765 l->freelist.tail = object;
1766 l->freelist.nr++;
1767
1768 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1769 flush_free_list(s, l);
1770
1771 } else {
1772#ifdef CONFIG_SMP
1773 /*
1774 * Freeing an object that was allocated on a remote node.
1775 */
1776 slab_free_to_remote(s, page, object, c);
1777 slqb_stat_inc(l, FREE_REMOTE);
1778#endif
1779 }
1780}
1781
1782/*
1783 * Perform some interrupts-on processing around the main freeing path
1784 * (debug checking).
1785 */
1786static __always_inline void slab_free(struct kmem_cache *s,
1787 struct slqb_page *page, void *object)
1788{
1789 unsigned long flags;
1790
1791 prefetchw(object);
1792
1793 debug_check_no_locks_freed(object, s->objsize);
1794 if (likely(object) && unlikely(slab_debug(s))) {
1795 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1796 return;
1797 }
1798
1799 local_irq_save(flags);
1800 __slab_free(s, page, object);
1801 local_irq_restore(flags);
1802}
1803
1804void kmem_cache_free(struct kmem_cache *s, void *object)
1805{
1806 struct slqb_page *page = NULL;
1807
1808 if (slab_numa(s))
1809 page = virt_to_head_slqb_page(object);
1810 slab_free(s, page, object);
1811}
1812EXPORT_SYMBOL(kmem_cache_free);
1813
1814/*
1815 * Calculate the order of allocation given a slab object size.
1816 *
1817 * Order 0 allocations are preferred since order 0 does not cause fragmentation
1818 * in the page allocator, and they have fastpaths in the page allocator. But
1819 * we also want to minimise external fragmentation with large objects.
1820 */
1821static int slab_order(int size, int max_order, int frac)
1822{
1823 int order;
1824
1825 if (fls(size - 1) <= PAGE_SHIFT)
1826 order = 0;
1827 else
1828 order = fls(size - 1) - PAGE_SHIFT;
1829 if (order < slqb_min_order)
1830 order = slqb_min_order;
1831
1832 while (order <= max_order) {
1833 unsigned long slab_size = PAGE_SIZE << order;
1834 unsigned long objects;
1835 unsigned long waste;
1836
1837 objects = slab_size / size;
1838 if (!objects)
1839 goto next;
1840
1841 if (order < MAX_ORDER && objects < slqb_min_objects) {
1842 /*
1843			 * If we don't have enough objects for slqb_min_objects,
1844			 * try the next size up, unless we have reached
1845			 * our maximum possible page size.
1846			 */
1847 goto next;
1848 }
1849
1850 waste = slab_size - (objects * size);
1851
1852 if (waste * frac <= slab_size)
1853 break;
1854
1855next:
1856 order++;
1857 }
1858
1859 return order;
1860}
1861
1862static int calculate_order(int size)
1863{
1864 int order;
1865
1866 /*
1867 * Attempt to find best configuration for a slab. This
1868 * works by first attempting to generate a layout with
1869 * the best configuration and backing off gradually.
1870 */
1871 order = slab_order(size, 1, 4);
1872 if (order <= 1)
1873 return order;
1874
1875 /*
1876 * This size cannot fit in order-1. Allow bigger orders, but
1877 * forget about trying to save space.
1878 */
1879 order = slab_order(size, MAX_ORDER - 1, 0);
1880 if (order < MAX_ORDER)
1881 return order;
1882
1883 return -ENOSYS;
1884}
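/*
 * Worked example (not part of the original source), assuming 4KiB pages,
 * slqb_min_order == 0 and slqb_min_objects small enough not to matter:
 * for a 700-byte object, calculate_order() first tries slab_order(700, 1, 4).
 * An order-0 slab holds 4096 / 700 = 5 objects and wastes 596 bytes;
 * 596 * 4 <= 4096, so order 0 is accepted and no larger order is tried.
 */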
1885
1886/*
1887 * Figure out what the alignment of the objects will be.
1888 */
1889static unsigned long calculate_alignment(unsigned long flags,
1890 unsigned long align, unsigned long size)
1891{
1892 /*
1893 * If the user wants hardware cache aligned objects then follow that
1894 * suggestion if the object is sufficiently large.
1895 *
1896 * The hardware cache alignment cannot override the specified
1897	 * alignment though. If that is greater, then use it.
1898 */
1899 if (flags & SLAB_HWCACHE_ALIGN) {
1900 unsigned long ralign = cache_line_size();
1901
1902 while (size <= ralign / 2)
1903 ralign /= 2;
1904 align = max(align, ralign);
1905 }
1906
1907 if (align < ARCH_SLAB_MINALIGN)
1908 align = ARCH_SLAB_MINALIGN;
1909
1910 return ALIGN(align, sizeof(void *));
1911}
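/*
 * Worked example (not part of the original source): with SLAB_HWCACHE_ALIGN,
 * 64-byte cache lines, a 24-byte object and no explicit alignment, ralign
 * starts at 64 and is halved while the object fits in half of it (24 <= 32,
 * but not 24 <= 16), ending at 32. Assuming ARCH_SLAB_MINALIGN <= 32, the
 * result is ALIGN(max(0, 32), sizeof(void *)) == 32.
 */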
1912
1913static void init_kmem_cache_list(struct kmem_cache *s,
1914 struct kmem_cache_list *l)
1915{
1916 l->cache = s;
1917 l->freelist.nr = 0;
1918 l->freelist.head = NULL;
1919 l->freelist.tail = NULL;
1920 l->nr_partial = 0;
1921 l->nr_slabs = 0;
1922 INIT_LIST_HEAD(&l->partial);
1923 spin_lock_init(&l->page_lock);
1924
1925#ifdef CONFIG_SMP
1926 l->remote_free_check = 0;
1927 spin_lock_init(&l->remote_free.lock);
1928 l->remote_free.list.nr = 0;
1929 l->remote_free.list.head = NULL;
1930 l->remote_free.list.tail = NULL;
1931#endif
1932
1933#ifdef CONFIG_SLQB_STATS
1934 memset(l->stats, 0, sizeof(l->stats));
1935#endif
1936}
1937
1938static void init_kmem_cache_cpu(struct kmem_cache *s,
1939 struct kmem_cache_cpu *c)
1940{
1941 init_kmem_cache_list(s, &c->list);
1942
1943 c->colour_next = 0;
1944#ifdef CONFIG_SMP
1945 c->rlist.nr = 0;
1946 c->rlist.head = NULL;
1947 c->rlist.tail = NULL;
1948 c->remote_cache_list = NULL;
1949#endif
1950}
1951
1952#ifdef CONFIG_NUMA
1953static void init_kmem_cache_node(struct kmem_cache *s,
1954 struct kmem_cache_node *n)
1955{
1956 spin_lock_init(&n->list_lock);
1957 init_kmem_cache_list(s, &n->list);
1958}
1959#endif
1960
1961/* Initial slabs. */
1962#ifdef CONFIG_SMP
1963static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1964#endif
1965#ifdef CONFIG_NUMA
1966/* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1967 * array is wasteful */
1968static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1969#endif
1970
1971#ifdef CONFIG_SMP
1972static struct kmem_cache kmem_cpu_cache;
1973static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1974#ifdef CONFIG_NUMA
1975static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1976#endif
1977#endif
1978
1979#ifdef CONFIG_NUMA
1980static struct kmem_cache kmem_node_cache;
1981#ifdef CONFIG_SMP
1982static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1983#endif
1984static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1985#endif
1986
1987#ifdef CONFIG_SMP
1988static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1989 int cpu)
1990{
1991 struct kmem_cache_cpu *c;
1992 int node;
1993
1994 node = cpu_to_node(cpu);
1995
1996 c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1997 if (!c)
1998 return NULL;
1999
2000 init_kmem_cache_cpu(s, c);
2001 return c;
2002}
2003
2004static void free_kmem_cache_cpus(struct kmem_cache *s)
2005{
2006 int cpu;
2007
2008 for_each_online_cpu(cpu) {
2009 struct kmem_cache_cpu *c;
2010
2011 c = s->cpu_slab[cpu];
2012 if (c) {
2013 kmem_cache_free(&kmem_cpu_cache, c);
2014 s->cpu_slab[cpu] = NULL;
2015 }
2016 }
2017}
2018
2019static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2020{
2021 int cpu;
2022
2023 for_each_online_cpu(cpu) {
2024 struct kmem_cache_cpu *c;
2025
2026 c = s->cpu_slab[cpu];
2027 if (c)
2028 continue;
2029
2030 c = alloc_kmem_cache_cpu(s, cpu);
2031 if (!c) {
2032 free_kmem_cache_cpus(s);
2033 return 0;
2034 }
2035 s->cpu_slab[cpu] = c;
2036 }
2037 return 1;
2038}
2039
2040#else
2041static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2042{
2043}
2044
2045static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2046{
2047 init_kmem_cache_cpu(s, &s->cpu_slab);
2048 return 1;
2049}
2050#endif
2051
2052#ifdef CONFIG_NUMA
2053static void free_kmem_cache_nodes(struct kmem_cache *s)
2054{
2055 int node;
2056
2057 for_each_node_state(node, N_NORMAL_MEMORY) {
2058 struct kmem_cache_node *n;
2059
2060 n = s->node_slab[node];
2061 if (n) {
2062 kmem_cache_free(&kmem_node_cache, n);
2063 s->node_slab[node] = NULL;
2064 }
2065 }
2066}
2067
2068static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2069{
2070 int node;
2071
2072 for_each_node_state(node, N_NORMAL_MEMORY) {
2073 struct kmem_cache_node *n;
2074
2075 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2076 if (!n) {
2077 free_kmem_cache_nodes(s);
2078 return 0;
2079 }
2080 init_kmem_cache_node(s, n);
2081 s->node_slab[node] = n;
2082 }
2083 return 1;
2084}
2085#else
2086static void free_kmem_cache_nodes(struct kmem_cache *s)
2087{
2088}
2089
2090static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2091{
2092 return 1;
2093}
2094#endif
2095
2096/*
2097 * calculate_sizes() determines the order and the distribution of data within
2098 * a slab object.
2099 */
2100static int calculate_sizes(struct kmem_cache *s)
2101{
2102 unsigned long flags = s->flags;
2103 unsigned long size = s->objsize;
2104 unsigned long align = s->align;
2105
2106 /*
2107 * Determine if we can poison the object itself. If the user of
2108 * the slab may touch the object after free or before allocation
2109 * then we should never poison the object itself.
2110 */
2111 if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2112 s->flags |= __OBJECT_POISON;
2113 else
2114 s->flags &= ~__OBJECT_POISON;
2115
2116 /*
2117 * Round up object size to the next word boundary. We can only
2118 * place the free pointer at word boundaries and this determines
2119 * the possible location of the free pointer.
2120 */
2121 size = ALIGN(size, sizeof(void *));
2122
2123#ifdef CONFIG_SLQB_DEBUG
2124 /*
2125 * If we are Redzoning then check if there is some space between the
2126 * end of the object and the free pointer. If not then add an
2127 * additional word to have some bytes to store Redzone information.
2128 */
2129 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2130 size += sizeof(void *);
2131#endif
2132
2133 /*
2134 * With that we have determined the number of bytes in actual use
2135 * by the object. This is the potential offset to the free pointer.
2136 */
2137 s->inuse = size;
2138
2139 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2140 /*
2141 * Relocate free pointer after the object if it is not
2142 * permitted to overwrite the first word of the object on
2143 * kmem_cache_free.
2144 *
2145 * This is the case if we do RCU, have a constructor or
2146 * destructor or are poisoning the objects.
2147 */
2148 s->offset = size;
2149 size += sizeof(void *);
2150 }
2151
2152#ifdef CONFIG_SLQB_DEBUG
2153 if (flags & SLAB_STORE_USER) {
2154 /*
2155 * Need to store information about allocs and frees after
2156 * the object.
2157 */
2158 size += 2 * sizeof(struct track);
2159 }
2160
2161 if (flags & SLAB_RED_ZONE) {
2162 /*
2163 * Add some empty padding so that we can catch
2164 * overwrites from earlier objects rather than let
2165 * tracking information or the free pointer be
2166		 * corrupted if a user writes before the start
2167 * of the object.
2168 */
2169 size += sizeof(void *);
2170 }
2171#endif
2172
2173 /*
2174 * Determine the alignment based on various parameters that the
2175 * user specified and the dynamic determination of cache line size
2176 * on bootup.
2177 */
2178 align = calculate_alignment(flags, align, s->objsize);
2179
2180 /*
2181 * SLQB stores one object immediately after another beginning from
2182 * offset 0. In order to align the objects we have to simply size
2183 * each object to conform to the alignment.
2184 */
2185 size = ALIGN(size, align);
2186 s->size = size;
2187 s->order = calculate_order(size);
2188
2189 if (s->order < 0)
2190 return 0;
2191
2192 s->allocflags = 0;
2193 if (s->order)
2194 s->allocflags |= __GFP_COMP;
2195
2196 if (s->flags & SLAB_CACHE_DMA)
2197 s->allocflags |= SLQB_DMA;
2198
2199 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2200 s->allocflags |= __GFP_RECLAIMABLE;
2201
2202 /*
2203 * Determine the number of objects per slab
2204 */
2205 s->objects = (PAGE_SIZE << s->order) / size;
2206
2207 s->freebatch = max(4UL*PAGE_SIZE / size,
2208 min(256UL, 64*PAGE_SIZE / size));
2209 if (!s->freebatch)
2210 s->freebatch = 1;
2211 s->hiwater = s->freebatch << 2;
2212
2213 return !!s->objects;
2214
2215}
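/*
 * Worked example (not part of the original source), assuming 4KiB pages and
 * a final object size of 256 bytes on an order-0 slab:
 *	objects   = 4096 / 256 = 16
 *	freebatch = max(4 * 4096 / 256, min(256, 64 * 4096 / 256))
 *	          = max(64, min(256, 1024)) = 256
 *	hiwater   = 256 << 2 = 1024 objects
 */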
2216
2217#ifdef CONFIG_SMP
2218/*
2219 * The per-cpu allocator can't be used because it always uses the slab
2220 * allocator, and it can't do per-node allocations.
2221 */
2222static void *kmem_cache_dyn_array_alloc(int ids)
2223{
2224 size_t size = sizeof(void *) * ids;
2225
2226 BUG_ON(!size);
2227
2228 if (unlikely(!slab_is_available())) {
2229 static void *nextmem;
2230 static size_t nextleft;
2231 void *ret;
2232
2233 /*
2234 * Special case for setting up initial caches. These will
2235 * never get freed by definition so we can do it rather
2236 * simply.
2237 */
2238 if (size > nextleft) {
2239 nextmem = alloc_pages_exact(size, GFP_KERNEL);
2240 if (!nextmem)
2241 return NULL;
2242 nextleft = roundup(size, PAGE_SIZE);
2243 }
2244
2245 ret = nextmem;
2246 nextleft -= size;
2247 nextmem += size;
2248 memset(ret, 0, size);
2249 return ret;
2250 } else {
2251 return kzalloc(size, GFP_KERNEL);
2252 }
2253}
2254
2255static void kmem_cache_dyn_array_free(void *array)
2256{
2257 if (unlikely(!slab_is_available()))
2258 return; /* error case without crashing here (will panic soon) */
2259 kfree(array);
2260}
2261#endif
2262
2263/*
2264 * Except in early boot, this should be called with slqb_lock held for write
2265 * to lock out hotplug, and protect list modifications.
2266 */
2267static int kmem_cache_open(struct kmem_cache *s,
2268 const char *name, size_t size, size_t align,
2269 unsigned long flags, void (*ctor)(void *), int alloc)
2270{
2271 unsigned int left_over;
2272
2273 memset(s, 0, sizeof(struct kmem_cache));
2274 s->name = name;
2275 s->ctor = ctor;
2276 s->objsize = size;
2277 s->align = align;
2278 s->flags = kmem_cache_flags(size, flags, name, ctor);
2279
2280 if (!calculate_sizes(s))
2281 goto error;
2282
2283 if (!slab_debug(s)) {
2284 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2285 s->colour_off = max(cache_line_size(), s->align);
2286 s->colour_range = left_over;
2287 } else {
2288 s->colour_off = 0;
2289 s->colour_range = 0;
2290 }
2291
2292#ifdef CONFIG_SMP
2293 s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2294 if (!s->cpu_slab)
2295 goto error;
2296# ifdef CONFIG_NUMA
2297 s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2298 if (!s->node_slab)
2299 goto error_cpu_array;
2300# endif
2301#endif
2302
2303 if (likely(alloc)) {
2304 if (!alloc_kmem_cache_nodes(s))
2305 goto error_node_array;
2306
2307 if (!alloc_kmem_cache_cpus(s))
2308 goto error_nodes;
2309 }
2310
2311 sysfs_slab_add(s);
2312 list_add(&s->list, &slab_caches);
2313
2314 return 1;
2315
2316error_nodes:
2317 free_kmem_cache_nodes(s);
2318error_node_array:
2319#if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2320 kmem_cache_dyn_array_free(s->node_slab);
2321error_cpu_array:
2322#endif
2323#ifdef CONFIG_SMP
2324 kmem_cache_dyn_array_free(s->cpu_slab);
2325#endif
2326error:
2327 if (flags & SLAB_PANIC)
2328 panic("%s: failed to create slab `%s'\n", __func__, name);
2329 return 0;
2330}
2331
2332/**
2333 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2334 * @s: the cache we're checking against
2335 * @ptr: pointer to validate
2336 *
2337 * This verifies that the untrusted pointer looks sane;
2338 * it is _not_ a guarantee that the pointer is actually
2339 * part of the slab cache in question, but it at least
2340 * validates that the pointer can be dereferenced and
2341 * looks half-way sane.
2342 *
2343 * Currently only used for dentry validation.
2344 */
2345int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2346{
2347 unsigned long addr = (unsigned long)ptr;
2348 struct slqb_page *page;
2349
2350 if (unlikely(addr < PAGE_OFFSET))
2351 goto out;
2352 if (unlikely(addr > (unsigned long)high_memory - s->size))
2353 goto out;
2354 if (unlikely(!IS_ALIGNED(addr, s->align)))
2355 goto out;
2356 if (unlikely(!kern_addr_valid(addr)))
2357 goto out;
2358 if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2359 goto out;
2360 if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2361 goto out;
2362 page = virt_to_head_slqb_page(ptr);
2363 if (unlikely(!(page->flags & PG_SLQB_BIT)))
2364 goto out;
2365 if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2366 goto out;
2367 return 1;
2368out:
2369 return 0;
2370}
2371EXPORT_SYMBOL(kmem_ptr_validate);
2372
2373/*
2374 * Determine the size of a slab object
2375 */
2376unsigned int kmem_cache_size(struct kmem_cache *s)
2377{
2378 return s->objsize;
2379}
2380EXPORT_SYMBOL(kmem_cache_size);
2381
2382const char *kmem_cache_name(struct kmem_cache *s)
2383{
2384 return s->name;
2385}
2386EXPORT_SYMBOL(kmem_cache_name);
2387
2388/*
2389 * Release all resources used by a slab cache. No more concurrency on the
2390 * slab, so we can touch remote kmem_cache_cpu structures.
2391 */
2392void kmem_cache_destroy(struct kmem_cache *s)
2393{
2394#ifdef CONFIG_NUMA
2395 int node;
2396#endif
2397 int cpu;
2398
2399 down_write(&slqb_lock);
2400 list_del(&s->list);
2401
2402 local_irq_disable();
2403#ifdef CONFIG_SMP
2404 for_each_online_cpu(cpu) {
2405 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2406 struct kmem_cache_list *l = &c->list;
2407
2408 flush_free_list_all(s, l);
2409 flush_remote_free_cache(s, c);
2410 }
2411#endif
2412
2413 for_each_online_cpu(cpu) {
2414 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2415 struct kmem_cache_list *l = &c->list;
2416
2417 claim_remote_free_list(s, l);
2418 flush_free_list_all(s, l);
2419
2420 WARN_ON(l->freelist.nr);
2421 WARN_ON(l->nr_slabs);
2422 WARN_ON(l->nr_partial);
2423 }
2424
2425 free_kmem_cache_cpus(s);
2426
2427#ifdef CONFIG_NUMA
2428 for_each_node_state(node, N_NORMAL_MEMORY) {
2429 struct kmem_cache_node *n;
2430 struct kmem_cache_list *l;
2431
2432 n = s->node_slab[node];
2433 if (!n)
2434 continue;
2435 l = &n->list;
2436
2437 claim_remote_free_list(s, l);
2438 flush_free_list_all(s, l);
2439
2440 WARN_ON(l->freelist.nr);
2441 WARN_ON(l->nr_slabs);
2442 WARN_ON(l->nr_partial);
2443 }
2444
2445 free_kmem_cache_nodes(s);
2446#endif
2447 local_irq_enable();
2448
2449 sysfs_slab_remove(s);
2450 up_write(&slqb_lock);
2451}
2452EXPORT_SYMBOL(kmem_cache_destroy);
2453
2454/********************************************************************
2455 * Kmalloc subsystem
2456 *******************************************************************/
2457
2458struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2459EXPORT_SYMBOL(kmalloc_caches);
2460
2461#ifdef CONFIG_ZONE_DMA
2462struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2463EXPORT_SYMBOL(kmalloc_caches_dma);
2464#endif
2465
2466#ifndef ARCH_KMALLOC_FLAGS
2467#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2468#endif
2469
2470static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2471 const char *name, int size, gfp_t gfp_flags)
2472{
2473 unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2474
2475 if (gfp_flags & SLQB_DMA)
2476 flags |= SLAB_CACHE_DMA;
2477
2478 kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2479
2480 return s;
2481}
2482
2483/*
2484 * Conversion table for small slabs sizes / 8 to the index in the
2485 * kmalloc array. This is necessary for slabs < 192 since we have non power
2486 * of two cache sizes there. The size of larger slabs can be determined using
2487 * fls.
2488 */
2489static s8 size_index[24] __cacheline_aligned = {
2490 3, /* 8 */
2491 4, /* 16 */
2492 5, /* 24 */
2493 5, /* 32 */
2494 6, /* 40 */
2495 6, /* 48 */
2496 6, /* 56 */
2497 6, /* 64 */
2498#if L1_CACHE_BYTES < 64
2499 1, /* 72 */
2500 1, /* 80 */
2501 1, /* 88 */
2502 1, /* 96 */
2503#else
2504 7,
2505 7,
2506 7,
2507 7,
2508#endif
2509 7, /* 104 */
2510 7, /* 112 */
2511 7, /* 120 */
2512 7, /* 128 */
2513#if L1_CACHE_BYTES < 128
2514 2, /* 136 */
2515 2, /* 144 */
2516 2, /* 152 */
2517 2, /* 160 */
2518 2, /* 168 */
2519 2, /* 176 */
2520 2, /* 184 */
2521 2 /* 192 */
2522#else
2523 -1,
2524 -1,
2525 -1,
2526 -1,
2527 -1,
2528 -1,
2529 -1,
2530 -1
2531#endif
2532};
2533
2534static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2535{
2536 int index;
2537
2538 if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2539 if (unlikely(!size))
2540 return ZERO_SIZE_PTR;
2541
2542 index = KMALLOC_SHIFT_LOW;
2543 goto got_index;
2544 }
2545
2546#if L1_CACHE_BYTES >= 128
2547 if (size <= 128) {
2548#else
2549 if (size <= 192) {
2550#endif
2551 index = size_index[(size - 1) / 8];
2552 } else {
2553 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2554 return NULL;
2555
2556 index = fls(size - 1);
2557 }
2558
2559got_index:
2560 if (unlikely((flags & SLQB_DMA)))
2561 return &kmalloc_caches_dma[index];
2562 else
2563 return &kmalloc_caches[index];
2564}
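/*
 * Illustrative lookups (not part of the original source), assuming 64-byte
 * cache lines: kmalloc(100) takes the table path, size_index[(100 - 1) / 8]
 * == size_index[12] == 7, i.e. the 128-byte cache; kmalloc(1000) takes the
 * fls() path, fls(999) == 10, i.e. the 1024-byte cache. Requests with
 * SLQB_DMA set are served from kmalloc_caches_dma[] instead.
 */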
2565
2566void *__kmalloc(size_t size, gfp_t flags)
2567{
2568 struct kmem_cache *s;
2569
2570 s = get_slab(size, flags);
2571 if (unlikely(ZERO_OR_NULL_PTR(s)))
2572 return s;
2573
2574 return __kmem_cache_alloc(s, flags, _RET_IP_);
2575}
2576EXPORT_SYMBOL(__kmalloc);
2577
2578#ifdef CONFIG_NUMA
2579void *__kmalloc_node(size_t size, gfp_t flags, int node)
2580{
2581 struct kmem_cache *s;
2582
2583 s = get_slab(size, flags);
2584 if (unlikely(ZERO_OR_NULL_PTR(s)))
2585 return s;
2586
2587 return kmem_cache_alloc_node(s, flags, node);
2588}
2589EXPORT_SYMBOL(__kmalloc_node);
2590#endif
2591
2592size_t ksize(const void *object)
2593{
2594 struct slqb_page *page;
2595 struct kmem_cache *s;
2596
2597 BUG_ON(!object);
2598 if (unlikely(object == ZERO_SIZE_PTR))
2599 return 0;
2600
2601 page = virt_to_head_slqb_page(object);
2602 BUG_ON(!(page->flags & PG_SLQB_BIT));
2603
2604 s = page->list->cache;
2605
2606 /*
2607 * Debugging requires use of the padding between object
2608 * and whatever may come after it.
2609 */
2610 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2611 return s->objsize;
2612
2613 /*
2614 * If we have the need to store the freelist pointer
2615 * back there or track user information then we can
2616 * only use the space before that information.
2617 */
2618 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2619 return s->inuse;
2620
2621 /*
2622 * Else we can use all the padding etc for the allocation
2623 */
2624 return s->size;
2625}
2626EXPORT_SYMBOL(ksize);
2627
2628void kfree(const void *object)
2629{
2630 struct kmem_cache *s;
2631 struct slqb_page *page;
2632
2633 if (unlikely(ZERO_OR_NULL_PTR(object)))
2634 return;
2635
2636 page = virt_to_head_slqb_page(object);
2637 s = page->list->cache;
2638
2639 slab_free(s, page, (void *)object);
2640}
2641EXPORT_SYMBOL(kfree);
2642
2643static void kmem_cache_trim_percpu(void *arg)
2644{
2645 int cpu = smp_processor_id();
2646 struct kmem_cache *s = arg;
2647 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2648 struct kmem_cache_list *l = &c->list;
2649
2650 claim_remote_free_list(s, l);
2651 flush_free_list(s, l);
2652#ifdef CONFIG_SMP
2653 flush_remote_free_cache(s, c);
2654#endif
2655}
2656
2657int kmem_cache_shrink(struct kmem_cache *s)
2658{
2659#ifdef CONFIG_NUMA
2660 int node;
2661#endif
2662
2663 on_each_cpu(kmem_cache_trim_percpu, s, 1);
2664
2665#ifdef CONFIG_NUMA
2666 for_each_node_state(node, N_NORMAL_MEMORY) {
2667 struct kmem_cache_node *n;
2668 struct kmem_cache_list *l;
2669
2670 n = s->node_slab[node];
2671 if (!n)
2672 continue;
2673 l = &n->list;
2674
2675 spin_lock_irq(&n->list_lock);
2676 claim_remote_free_list(s, l);
2677 flush_free_list(s, l);
2678 spin_unlock_irq(&n->list_lock);
2679 }
2680#endif
2681
2682 return 0;
2683}
2684EXPORT_SYMBOL(kmem_cache_shrink);
2685
2686#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2687static void kmem_cache_reap_percpu(void *arg)
2688{
2689 int cpu = smp_processor_id();
2690 struct kmem_cache *s;
2691 long phase = (long)arg;
2692
2693 list_for_each_entry(s, &slab_caches, list) {
2694 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2695 struct kmem_cache_list *l = &c->list;
2696
2697 if (phase == 0) {
2698 flush_free_list_all(s, l);
2699 flush_remote_free_cache(s, c);
2700 }
2701
2702 if (phase == 1) {
2703 claim_remote_free_list(s, l);
2704 flush_free_list_all(s, l);
2705 }
2706 }
2707}
2708
2709static void kmem_cache_reap(void)
2710{
2711 struct kmem_cache *s;
2712 int node;
2713
2714 down_read(&slqb_lock);
2715 on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2716 on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2717
2718 list_for_each_entry(s, &slab_caches, list) {
2719 for_each_node_state(node, N_NORMAL_MEMORY) {
2720 struct kmem_cache_node *n;
2721 struct kmem_cache_list *l;
2722
2723 n = s->node_slab[node];
2724 if (!n)
2725 continue;
2726 l = &n->list;
2727
2728 spin_lock_irq(&n->list_lock);
2729 claim_remote_free_list(s, l);
2730 flush_free_list_all(s, l);
2731 spin_unlock_irq(&n->list_lock);
2732 }
2733 }
2734 up_read(&slqb_lock);
2735}
2736#endif
2737
2738static void cache_trim_worker(struct work_struct *w)
2739{
2740 struct delayed_work *work =
2741 container_of(w, struct delayed_work, work);
2742 struct kmem_cache *s;
2743
2744 if (!down_read_trylock(&slqb_lock))
2745 goto out;
2746
2747 list_for_each_entry(s, &slab_caches, list) {
2748#ifdef CONFIG_NUMA
2749 int node = numa_node_id();
2750 struct kmem_cache_node *n = s->node_slab[node];
2751
2752 if (n) {
2753 struct kmem_cache_list *l = &n->list;
2754
2755 spin_lock_irq(&n->list_lock);
2756 claim_remote_free_list(s, l);
2757 flush_free_list(s, l);
2758 spin_unlock_irq(&n->list_lock);
2759 }
2760#endif
2761
2762 local_irq_disable();
2763 kmem_cache_trim_percpu(s);
2764 local_irq_enable();
2765 }
2766
2767 up_read(&slqb_lock);
2768out:
2769 schedule_delayed_work(work, round_jiffies_relative(3*HZ));
2770}
2771
2772static DEFINE_PER_CPU(struct delayed_work, slqb_cache_trim_work);
2773
2774static void __cpuinit start_cpu_timer(int cpu)
2775{
2776 struct delayed_work *cache_trim_work = &per_cpu(slqb_cache_trim_work,
2777 cpu);
2778
2779 /*
2780 * When this gets called from do_initcalls via cpucache_init(),
2781 * init_workqueues() has already run, so keventd will be setup
2782 * at that time.
2783 */
2784 if (keventd_up() && cache_trim_work->work.func == NULL) {
2785 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2786 schedule_delayed_work_on(cpu, cache_trim_work,
2787 __round_jiffies_relative(HZ, cpu));
2788 }
2789}
2790
2791static int __init cpucache_init(void)
2792{
2793 int cpu;
2794
2795 for_each_online_cpu(cpu)
2796 start_cpu_timer(cpu);
2797
2798 return 0;
2799}
2800device_initcall(cpucache_init);
2801
2802#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2803static void slab_mem_going_offline_callback(void *arg)
2804{
2805 kmem_cache_reap();
2806}
2807
2808static void slab_mem_offline_callback(void *arg)
2809{
2810 /* XXX: should release structures, see CPU offline comment */
2811}
2812
2813static int slab_mem_going_online_callback(void *arg)
2814{
2815 struct kmem_cache *s;
2816 struct kmem_cache_node *n;
2817 struct memory_notify *marg = arg;
2818 int nid = marg->status_change_nid;
2819 int ret = 0;
2820
2821 /*
2822 * If the node's memory is already available, then kmem_cache_node is
2823 * already created. Nothing to do.
2824 */
2825 if (nid < 0)
2826 return 0;
2827
2828 /*
2829	 * We are bringing a node online. No memory is available yet. We must
2830 * allocate a kmem_cache_node structure in order to bring the node
2831 * online.
2832 */
2833 down_write(&slqb_lock);
2834 list_for_each_entry(s, &slab_caches, list) {
2835 /*
2836 * XXX: kmem_cache_alloc_node will fallback to other nodes
2837 * since memory is not yet available from the node that
2838 * is brought up.
2839 */
2840		if (s->node_slab[nid]) /* could be leftover from last online */
2841 continue;
2842 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2843 if (!n) {
2844 ret = -ENOMEM;
2845 goto out;
2846 }
2847 init_kmem_cache_node(s, n);
2848 s->node_slab[nid] = n;
2849 }
2850out:
2851 up_write(&slqb_lock);
2852 return ret;
2853}
2854
2855static int slab_memory_callback(struct notifier_block *self,
2856 unsigned long action, void *arg)
2857{
2858 int ret = 0;
2859
2860 switch (action) {
2861 case MEM_GOING_ONLINE:
2862 ret = slab_mem_going_online_callback(arg);
2863 break;
2864 case MEM_GOING_OFFLINE:
2865 slab_mem_going_offline_callback(arg);
2866 break;
2867 case MEM_OFFLINE:
2868 case MEM_CANCEL_ONLINE:
2869 slab_mem_offline_callback(arg);
2870 break;
2871 case MEM_ONLINE:
2872 case MEM_CANCEL_OFFLINE:
2873 break;
2874 }
2875
2876 if (ret)
2877 ret = notifier_from_errno(ret);
2878 else
2879 ret = NOTIFY_OK;
2880 return ret;
2881}
2882
2883#endif /* CONFIG_MEMORY_HOTPLUG */
2884
2885/********************************************************************
2886 * Basic setup of slabs
2887 *******************************************************************/
2888
2889void __init kmem_cache_init(void)
2890{
2891 int i;
2892 unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2893
2894 /*
2895 * All the ifdefs are rather ugly here, but it's just the setup code,
2896 * so it doesn't have to be too readable :)
2897 */
2898
2899 /*
2900 * No need to take slqb_lock here: there should be no concurrency
2901 * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2902 * too early.
2903 */
2904 kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2905 sizeof(struct kmem_cache), 0, flags, NULL, 0);
2906#ifdef CONFIG_SMP
2907 kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2908 sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2909#endif
2910#ifdef CONFIG_NUMA
2911 kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2912 sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2913#endif
2914
2915#ifdef CONFIG_SMP
2916 for_each_possible_cpu(i) {
2917 struct kmem_cache_cpu *c;
2918
2919 c = &per_cpu(kmem_cache_cpus, i);
2920 init_kmem_cache_cpu(&kmem_cache_cache, c);
2921 kmem_cache_cache.cpu_slab[i] = c;
2922
2923 c = &per_cpu(kmem_cpu_cpus, i);
2924 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2925 kmem_cpu_cache.cpu_slab[i] = c;
2926
2927#ifdef CONFIG_NUMA
2928 c = &per_cpu(kmem_node_cpus, i);
2929 init_kmem_cache_cpu(&kmem_node_cache, c);
2930 kmem_node_cache.cpu_slab[i] = c;
2931#endif
2932 }
2933#else
2934 init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2935#endif
2936
2937#ifdef CONFIG_NUMA
2938 for_each_node_state(i, N_NORMAL_MEMORY) {
2939 struct kmem_cache_node *n;
2940
2941 n = &kmem_cache_nodes[i];
2942 init_kmem_cache_node(&kmem_cache_cache, n);
2943 kmem_cache_cache.node_slab[i] = n;
2944#ifdef CONFIG_SMP
2945 n = &kmem_cpu_nodes[i];
2946 init_kmem_cache_node(&kmem_cpu_cache, n);
2947 kmem_cpu_cache.node_slab[i] = n;
2948#endif
2949 n = &kmem_node_nodes[i];
2950 init_kmem_cache_node(&kmem_node_cache, n);
2951 kmem_node_cache.node_slab[i] = n;
2952 }
2953#endif
2954
2955	/* Caches that are not of power-of-two size */
2956 if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2957 open_kmalloc_cache(&kmalloc_caches[1],
2958 "kmalloc-96", 96, GFP_KERNEL);
2959#ifdef CONFIG_ZONE_DMA
2960 open_kmalloc_cache(&kmalloc_caches_dma[1],
2961 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2962#endif
2963 }
2964 if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2965 open_kmalloc_cache(&kmalloc_caches[2],
2966 "kmalloc-192", 192, GFP_KERNEL);
2967#ifdef CONFIG_ZONE_DMA
2968 open_kmalloc_cache(&kmalloc_caches_dma[2],
2969 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2970#endif
2971 }
2972
2973 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2974 open_kmalloc_cache(&kmalloc_caches[i],
2975 "kmalloc", 1 << i, GFP_KERNEL);
2976#ifdef CONFIG_ZONE_DMA
2977 open_kmalloc_cache(&kmalloc_caches_dma[i],
2978 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2979#endif
2980 }
2981
2982 /*
2983 * Patch up the size_index table if we have strange large alignment
2984 * requirements for the kmalloc array. This is only the case for
2985 * mips it seems. The standard arches will not generate any code here.
2986 *
2987 * Largest permitted alignment is 256 bytes due to the way we
2988 * handle the index determination for the smaller caches.
2989 *
2990 * Make sure that nothing crazy happens if someone starts tinkering
2991 * around with ARCH_KMALLOC_MINALIGN
2992 */
2993 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2994 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2995
2996 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2997 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2998
2999 /* Provide the correct kmalloc names now that the caches are up */
3000 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
3001 kmalloc_caches[i].name =
3002 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3003#ifdef CONFIG_ZONE_DMA
3004 kmalloc_caches_dma[i].name =
3005 kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3006#endif
3007 }
3008
3009#ifdef CONFIG_SMP
3010 register_cpu_notifier(&slab_notifier);
3011#endif
3012#ifdef CONFIG_NUMA
3013 hotplug_memory_notifier(slab_memory_callback, 1);
3014#endif
3015 /*
3016 * smp_init() has not yet been called, so no worries about memory
3017 * ordering with __slab_is_available.
3018 */
3019 __slab_is_available = 1;
3020}
3021
3022void __init kmem_cache_init_late(void)
3023{
3024}
3025
3026/*
3027 * Some basic slab creation sanity checks
3028 */
3029static int kmem_cache_create_ok(const char *name, size_t size,
3030 size_t align, unsigned long flags)
3031{
3032 struct kmem_cache *tmp;
3033
3034 /*
3035 * Sanity checks... these are all serious usage bugs.
3036 */
3037 if (!name || in_interrupt() || (size < sizeof(void *))) {
3038 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3039 name);
3040 dump_stack();
3041
3042 return 0;
3043 }
3044
3045 list_for_each_entry(tmp, &slab_caches, list) {
3046 char x;
3047 int res;
3048
3049 /*
3050 * This happens when the module gets unloaded and doesn't
3051 * destroy its slab cache and no-one else reuses the vmalloc
3052 * area of the module. Print a warning.
3053 */
3054 res = probe_kernel_address(tmp->name, x);
3055 if (res) {
3056 printk(KERN_ERR
3057 "SLAB: cache with size %d has lost its name\n",
3058 tmp->size);
3059 continue;
3060 }
3061
3062 if (!strcmp(tmp->name, name)) {
3063 printk(KERN_ERR
3064 "SLAB: duplicate cache %s\n", name);
3065 dump_stack();
3066
3067 return 0;
3068 }
3069 }
3070
3071 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
3072 if (flags & SLAB_DESTROY_BY_RCU)
3073 WARN_ON(flags & SLAB_POISON);
3074
3075 return 1;
3076}
3077
3078struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3079 size_t align, unsigned long flags, void (*ctor)(void *))
3080{
3081 struct kmem_cache *s;
3082
3083 down_write(&slqb_lock);
3084 if (!kmem_cache_create_ok(name, size, align, flags))
3085 goto err;
3086
3087 s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3088 if (!s)
3089 goto err;
3090
3091 if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3092 up_write(&slqb_lock);
3093 return s;
3094 }
3095
3096 kmem_cache_free(&kmem_cache_cache, s);
3097
3098err:
3099 up_write(&slqb_lock);
3100 if (flags & SLAB_PANIC)
3101 panic("%s: failed to create slab `%s'\n", __func__, name);
3102
3103 return NULL;
3104}
3105EXPORT_SYMBOL(kmem_cache_create);
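For illustration, a minimal sketch of how a module that includes <linux/slab.h> would use the interface exported above; the cache name, struct foo, and the SLAB_HWCACHE_ALIGN flag are illustrative choices, not taken from this file.

    /* Illustrative caller of the kmem_cache API exported above. */
    struct foo {
    	int id;
    	struct list_head list;
    };

    static struct kmem_cache *foo_cachep;

    static int __init foo_cache_init(void)
    {
    	foo_cachep = kmem_cache_create("foo", sizeof(struct foo),
    				       0, SLAB_HWCACHE_ALIGN, NULL);
    	return foo_cachep ? 0 : -ENOMEM;
    }

    static void foo_cache_use(void)
    {
    	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

    	if (f)
    		kmem_cache_free(foo_cachep, f);
    }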
3106
3107#ifdef CONFIG_SMP
3108/*
3109 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3110 * necessary.
3111 */
3112static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3113 unsigned long action, void *hcpu)
3114{
3115 long cpu = (long)hcpu;
3116 struct kmem_cache *s;
3117
3118 switch (action) {
3119 case CPU_UP_PREPARE:
3120 case CPU_UP_PREPARE_FROZEN:
3121 down_write(&slqb_lock);
3122 list_for_each_entry(s, &slab_caches, list) {
3123 if (s->cpu_slab[cpu]) /* could be left over from last online */
3124 continue;
3125 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3126 if (!s->cpu_slab[cpu]) {
3127 up_write(&slqb_lock);
3128 return NOTIFY_BAD;
3129 }
3130 }
3131 up_write(&slqb_lock);
3132 break;
3133
3134 case CPU_ONLINE:
3135 case CPU_ONLINE_FROZEN:
3136 case CPU_DOWN_FAILED:
3137 case CPU_DOWN_FAILED_FROZEN:
3138 start_cpu_timer(cpu);
3139 break;
3140
3141 case CPU_DOWN_PREPARE:
3142 case CPU_DOWN_PREPARE_FROZEN:
3143 cancel_delayed_work_sync(&per_cpu(slqb_cache_trim_work,
3144 cpu));
3145 per_cpu(slqb_cache_trim_work, cpu).work.func = NULL;
3146 break;
3147
3148 case CPU_UP_CANCELED:
3149 case CPU_UP_CANCELED_FROZEN:
3150 case CPU_DEAD:
3151 case CPU_DEAD_FROZEN:
3152 /*
3153 * XXX: Freeing here doesn't work because objects can still be
3154 * on this CPU's list. The periodic timer needs to check if a CPU
3155 * is offline and then try to clean up from there. Same for node
3156 * offline.
3157 */
3158 default:
3159 break;
3160 }
3161 return NOTIFY_OK;
3162}
3163
3164static struct notifier_block __cpuinitdata slab_notifier = {
3165 .notifier_call = slab_cpuup_callback
3166};
3167
3168#endif
3169
3170#ifdef CONFIG_SLQB_DEBUG
3171void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3172{
3173 struct kmem_cache *s;
3174 int node = -1;
3175
3176 s = get_slab(size, flags);
3177 if (unlikely(ZERO_OR_NULL_PTR(s)))
3178 return s;
3179
3180#ifdef CONFIG_NUMA
3181 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3182 node = alternate_nid(s, flags, node);
3183#endif
3184 return slab_alloc(s, flags, node, caller);
3185}
3186
3187void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3188 unsigned long caller)
3189{
3190 struct kmem_cache *s;
3191
3192 s = get_slab(size, flags);
3193 if (unlikely(ZERO_OR_NULL_PTR(s)))
3194 return s;
3195
3196 return slab_alloc(s, flags, node, caller);
3197}
3198#endif
3199
3200#if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3201struct stats_gather {
3202 struct kmem_cache *s;
3203 spinlock_t lock;
3204 unsigned long nr_slabs;
3205 unsigned long nr_partial;
3206 unsigned long nr_inuse;
3207 unsigned long nr_objects;
3208
3209#ifdef CONFIG_SLQB_STATS
3210 unsigned long stats[NR_SLQB_STAT_ITEMS];
3211#endif
3212};
3213
3214static void __gather_stats(void *arg)
3215{
3216 unsigned long nr_slabs;
3217 unsigned long nr_partial;
3218 unsigned long nr_inuse;
3219 struct stats_gather *gather = arg;
3220 int cpu = smp_processor_id();
3221 struct kmem_cache *s = gather->s;
3222 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3223 struct kmem_cache_list *l = &c->list;
3224 struct slqb_page *page;
3225#ifdef CONFIG_SLQB_STATS
3226 int i;
3227#endif
3228
3229 spin_lock(&l->page_lock);
3230 nr_slabs = l->nr_slabs;
3231 nr_partial = l->nr_partial;
3232 nr_inuse = (nr_slabs - nr_partial) * s->objects;
3233
3234 list_for_each_entry(page, &l->partial, lru) {
3235 nr_inuse += page->inuse;
3236 }
3237 spin_unlock(&l->page_lock);
3238
3239 spin_lock(&gather->lock);
3240 gather->nr_slabs += nr_slabs;
3241 gather->nr_partial += nr_partial;
3242 gather->nr_inuse += nr_inuse;
3243#ifdef CONFIG_SLQB_STATS
3244 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3245 gather->stats[i] += l->stats[i];
3246#endif
3247 spin_unlock(&gather->lock);
3248}
3249
3250/* must be called with slqb_lock held */
3251static void gather_stats_locked(struct kmem_cache *s,
3252 struct stats_gather *stats)
3253{
3254#ifdef CONFIG_NUMA
3255 int node;
3256#endif
3257
3258 memset(stats, 0, sizeof(struct stats_gather));
3259 stats->s = s;
3260 spin_lock_init(&stats->lock);
3261
3262 on_each_cpu(__gather_stats, stats, 1);
3263
3264#ifdef CONFIG_NUMA
3265 for_each_online_node(node) {
3266 struct kmem_cache_node *n = s->node_slab[node];
3267 struct kmem_cache_list *l = &n->list;
3268 struct slqb_page *page;
3269 unsigned long flags;
3270#ifdef CONFIG_SLQB_STATS
3271 int i;
3272#endif
3273
3274 spin_lock_irqsave(&n->list_lock, flags);
3275#ifdef CONFIG_SLQB_STATS
3276 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3277 stats->stats[i] += l->stats[i];
3278#endif
3279 stats->nr_slabs += l->nr_slabs;
3280 stats->nr_partial += l->nr_partial;
3281 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3282
3283 list_for_each_entry(page, &l->partial, lru) {
3284 stats->nr_inuse += page->inuse;
3285 }
3286 spin_unlock_irqrestore(&n->list_lock, flags);
3287 }
3288#endif
3289
3290 stats->nr_objects = stats->nr_slabs * s->objects;
3291}
3292
3293#ifdef CONFIG_SLQB_SYSFS
3294static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3295{
3296 down_read(&slqb_lock); /* hold off hotplug */
3297 gather_stats_locked(s, stats);
3298 up_read(&slqb_lock);
3299}
3300#endif
3301#endif
3302
3303/*
3304 * The /proc/slabinfo ABI
3305 */
3306#ifdef CONFIG_SLABINFO
3307#include <linux/proc_fs.h>
3308ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3309 size_t count, loff_t *ppos)
3310{
3311 return -EINVAL;
3312}
3313
3314static void print_slabinfo_header(struct seq_file *m)
3315{
3316 seq_puts(m, "slabinfo - version: 2.1\n");
3317 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3318 "<objperslab> <pagesperslab>");
3319 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3320 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3321 seq_putc(m, '\n');
3322}
3323
3324static void *s_start(struct seq_file *m, loff_t *pos)
3325{
3326 loff_t n = *pos;
3327
3328 down_read(&slqb_lock);
3329 if (!n)
3330 print_slabinfo_header(m);
3331
3332 return seq_list_start(&slab_caches, *pos);
3333}
3334
3335static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3336{
3337 return seq_list_next(p, &slab_caches, pos);
3338}
3339
3340static void s_stop(struct seq_file *m, void *p)
3341{
3342 up_read(&slqb_lock);
3343}
3344
3345static int s_show(struct seq_file *m, void *p)
3346{
3347 struct stats_gather stats;
3348 struct kmem_cache *s;
3349
3350 s = list_entry(p, struct kmem_cache, list);
3351
3352 gather_stats_locked(s, &stats);
3353
3354 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3355 stats.nr_objects, s->size, s->objects, (1 << s->order));
3356 seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3357 slab_freebatch(s), 0);
3358 seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3359 stats.nr_slabs, 0UL);
3360 seq_putc(m, '\n');
3361 return 0;
3362}
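For reference, a data line emitted by s_show() above, under the 2.1 header from print_slabinfo_header(), would look roughly like the following; the numbers are illustrative, chosen so that 32 order-0 slabs each hold 64 objects of 64 bytes:

    kmalloc-64          1536   2048     64   64    1 : tunables 1024   64    0 : slabdata     32     32      0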
3363
3364static const struct seq_operations slabinfo_op = {
3365 .start = s_start,
3366 .next = s_next,
3367 .stop = s_stop,
3368 .show = s_show,
3369};
3370
3371static int slabinfo_open(struct inode *inode, struct file *file)
3372{
3373 return seq_open(file, &slabinfo_op);
3374}
3375
3376static const struct file_operations proc_slabinfo_operations = {
3377 .open = slabinfo_open,
3378 .read = seq_read,
3379 .llseek = seq_lseek,
3380 .release = seq_release,
3381};
3382
3383static int __init slab_proc_init(void)
3384{
3385 proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3386 &proc_slabinfo_operations);
3387 return 0;
3388}
3389module_init(slab_proc_init);
3390#endif /* CONFIG_SLABINFO */
3391
3392#ifdef CONFIG_SLQB_SYSFS
3393/*
3394 * sysfs API
3395 */
3396#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3397#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3398
3399struct slab_attribute {
3400 struct attribute attr;
3401 ssize_t (*show)(struct kmem_cache *s, char *buf);
3402 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3403};
3404
3405#define SLAB_ATTR_RO(_name) \
3406 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3407
3408#define SLAB_ATTR(_name) \
3409 static struct slab_attribute _name##_attr = \
3410 __ATTR(_name, 0644, _name##_show, _name##_store)
3411
3412static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3413{
3414 return sprintf(buf, "%d\n", s->size);
3415}
3416SLAB_ATTR_RO(slab_size);
3417
3418static ssize_t align_show(struct kmem_cache *s, char *buf)
3419{
3420 return sprintf(buf, "%d\n", s->align);
3421}
3422SLAB_ATTR_RO(align);
3423
3424static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3425{
3426 return sprintf(buf, "%d\n", s->objsize);
3427}
3428SLAB_ATTR_RO(object_size);
3429
3430static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3431{
3432 return sprintf(buf, "%d\n", s->objects);
3433}
3434SLAB_ATTR_RO(objs_per_slab);
3435
3436static ssize_t order_show(struct kmem_cache *s, char *buf)
3437{
3438 return sprintf(buf, "%d\n", s->order);
3439}
3440SLAB_ATTR_RO(order);
3441
3442static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3443{
3444 if (s->ctor) {
3445 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3446
3447 return n + sprintf(buf + n, "\n");
3448 }
3449 return 0;
3450}
3451SLAB_ATTR_RO(ctor);
3452
3453static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3454{
3455 struct stats_gather stats;
3456
3457 gather_stats(s, &stats);
3458
3459 return sprintf(buf, "%lu\n", stats.nr_slabs);
3460}
3461SLAB_ATTR_RO(slabs);
3462
3463static ssize_t objects_show(struct kmem_cache *s, char *buf)
3464{
3465 struct stats_gather stats;
3466
3467 gather_stats(s, &stats);
3468
3469 return sprintf(buf, "%lu\n", stats.nr_inuse);
3470}
3471SLAB_ATTR_RO(objects);
3472
3473static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3474{
3475 struct stats_gather stats;
3476
3477 gather_stats(s, &stats);
3478
3479 return sprintf(buf, "%lu\n", stats.nr_objects);
3480}
3481SLAB_ATTR_RO(total_objects);
3482
3483#ifdef CONFIG_FAILSLAB
3484static ssize_t failslab_show(struct kmem_cache *s, char *buf)
3485{
3486 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
3487}
3488
3489static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
3490 size_t length)
3491{
3492 s->flags &= ~SLAB_FAILSLAB;
3493 if (buf[0] == '1')
3494 s->flags |= SLAB_FAILSLAB;
3495 return length;
3496}
3497SLAB_ATTR(failslab);
3498#endif
3499
3500static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3501{
3502 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3503}
3504SLAB_ATTR_RO(reclaim_account);
3505
3506static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3507{
3508 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3509}
3510SLAB_ATTR_RO(hwcache_align);
3511
3512#ifdef CONFIG_ZONE_DMA
3513static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3514{
3515 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3516}
3517SLAB_ATTR_RO(cache_dma);
3518#endif
3519
3520static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3521{
3522 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3523}
3524SLAB_ATTR_RO(destroy_by_rcu);
3525
3526static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3527{
3528 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3529}
3530SLAB_ATTR_RO(red_zone);
3531
3532static ssize_t poison_show(struct kmem_cache *s, char *buf)
3533{
3534 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3535}
3536SLAB_ATTR_RO(poison);
3537
3538static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3539{
3540 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3541}
3542SLAB_ATTR_RO(store_user);
3543
3544static ssize_t hiwater_store(struct kmem_cache *s,
3545 const char *buf, size_t length)
3546{
3547 long hiwater;
3548 int err;
3549
3550 err = strict_strtol(buf, 10, &hiwater);
3551 if (err)
3552 return err;
3553
3554 if (hiwater < 0)
3555 return -EINVAL;
3556
3557 s->hiwater = hiwater;
3558
3559 return length;
3560}
3561
3562static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3563{
3564 return sprintf(buf, "%d\n", slab_hiwater(s));
3565}
3566SLAB_ATTR(hiwater);
3567
3568static ssize_t freebatch_store(struct kmem_cache *s,
3569 const char *buf, size_t length)
3570{
3571 long freebatch;
3572 int err;
3573
3574 err = strict_strtol(buf, 10, &freebatch);
3575 if (err)
3576 return err;
3577
3578 if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3579 return -EINVAL;
3580
3581 s->freebatch = freebatch;
3582
3583 return length;
3584}
3585
3586static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3587{
3588 return sprintf(buf, "%d\n", slab_freebatch(s));
3589}
3590SLAB_ATTR(freebatch);
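Because slab_sysfs_init() below registers the "slab" kset under kernel_kobj, the hiwater and freebatch attributes defined above surface as /sys/kernel/slab/<cache>/hiwater and .../freebatch. A user-space sketch of writing one of them follows; the cache name and the value are illustrative, and the write lands in hiwater_store() above, which rejects negative values via strict_strtol().

    /* User-space sketch: raise the free-list high watermark of one cache. */
    #include <fcntl.h>
    #include <unistd.h>

    static int set_slqb_hiwater(const char *val, size_t len)
    {
    	int fd = open("/sys/kernel/slab/kmalloc-64/hiwater", O_WRONLY);
    	int ret = -1;

    	if (fd < 0)
    		return -1;
    	if (write(fd, val, len) == (ssize_t)len)
    		ret = 0;
    	close(fd);
    	return ret;
    }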
3591
3592#ifdef CONFIG_SLQB_STATS
3593static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3594{
3595 struct stats_gather stats;
3596 int len;
3597#ifdef CONFIG_SMP
3598 int cpu;
3599#endif
3600
3601 gather_stats(s, &stats);
3602
3603 len = sprintf(buf, "%lu", stats.stats[si]);
3604
3605#ifdef CONFIG_SMP
3606 for_each_online_cpu(cpu) {
3607 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3608 struct kmem_cache_list *l = &c->list;
3609
3610 if (len < PAGE_SIZE - 20)
3611 len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3612 }
3613#endif
3614 return len + sprintf(buf + len, "\n");
3615}
3616
3617#define STAT_ATTR(si, text) \
3618static ssize_t text##_show(struct kmem_cache *s, char *buf) \
3619{ \
3620 return show_stat(s, buf, si); \
3621} \
3622SLAB_ATTR_RO(text); \
3623
3624STAT_ATTR(ALLOC, alloc);
3625STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3626STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3627STAT_ATTR(FREE, free);
3628STAT_ATTR(FREE_REMOTE, free_remote);
3629STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3630STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3631STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3632STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3633STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3634STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3635STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3636STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3637STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3638#endif
3639
3640static struct attribute *slab_attrs[] = {
3641 &slab_size_attr.attr,
3642 &object_size_attr.attr,
3643 &objs_per_slab_attr.attr,
3644 &order_attr.attr,
3645 &objects_attr.attr,
3646 &total_objects_attr.attr,
3647 &slabs_attr.attr,
3648 &ctor_attr.attr,
3649 &align_attr.attr,
3650 &hwcache_align_attr.attr,
3651 &reclaim_account_attr.attr,
3652 &destroy_by_rcu_attr.attr,
3653 &red_zone_attr.attr,
3654 &poison_attr.attr,
3655 &store_user_attr.attr,
3656 &hiwater_attr.attr,
3657 &freebatch_attr.attr,
3658#ifdef CONFIG_ZONE_DMA
3659 &cache_dma_attr.attr,
3660#endif
3661#ifdef CONFIG_SLQB_STATS
3662 &alloc_attr.attr,
3663 &alloc_slab_fill_attr.attr,
3664 &alloc_slab_new_attr.attr,
3665 &free_attr.attr,
3666 &free_remote_attr.attr,
3667 &flush_free_list_attr.attr,
3668 &flush_free_list_objects_attr.attr,
3669 &flush_free_list_remote_attr.attr,
3670 &flush_slab_partial_attr.attr,
3671 &flush_slab_free_attr.attr,
3672 &flush_rfree_list_attr.attr,
3673 &flush_rfree_list_objects_attr.attr,
3674 &claim_remote_list_attr.attr,
3675 &claim_remote_list_objects_attr.attr,
3676#endif
3677#ifdef CONFIG_FAILSLAB
3678 &failslab_attr.attr,
3679#endif
3680
3681 NULL
3682};
3683
3684static struct attribute_group slab_attr_group = {
3685 .attrs = slab_attrs,
3686};
3687
3688static ssize_t slab_attr_show(struct kobject *kobj,
3689 struct attribute *attr, char *buf)
3690{
3691 struct slab_attribute *attribute;
3692 struct kmem_cache *s;
3693 int err;
3694
3695 attribute = to_slab_attr(attr);
3696 s = to_slab(kobj);
3697
3698 if (!attribute->show)
3699 return -EIO;
3700
3701 err = attribute->show(s, buf);
3702
3703 return err;
3704}
3705
3706static ssize_t slab_attr_store(struct kobject *kobj,
3707 struct attribute *attr, const char *buf, size_t len)
3708{
3709 struct slab_attribute *attribute;
3710 struct kmem_cache *s;
3711 int err;
3712
3713 attribute = to_slab_attr(attr);
3714 s = to_slab(kobj);
3715
3716 if (!attribute->store)
3717 return -EIO;
3718
3719 err = attribute->store(s, buf, len);
3720
3721 return err;
3722}
3723
3724static void kmem_cache_release(struct kobject *kobj)
3725{
3726 struct kmem_cache *s = to_slab(kobj);
3727
3728 kmem_cache_free(&kmem_cache_cache, s);
3729}
3730
3731static struct sysfs_ops slab_sysfs_ops = {
3732 .show = slab_attr_show,
3733 .store = slab_attr_store,
3734};
3735
3736static struct kobj_type slab_ktype = {
3737 .sysfs_ops = &slab_sysfs_ops,
3738 .release = kmem_cache_release
3739};
3740
3741static int uevent_filter(struct kset *kset, struct kobject *kobj)
3742{
3743 struct kobj_type *ktype = get_ktype(kobj);
3744
3745 if (ktype == &slab_ktype)
3746 return 1;
3747 return 0;
3748}
3749
3750static struct kset_uevent_ops slab_uevent_ops = {
3751 .filter = uevent_filter,
3752};
3753
3754static struct kset *slab_kset;
3755
3756static int sysfs_available __read_mostly;
3757
3758static int sysfs_slab_add(struct kmem_cache *s)
3759{
3760 int err;
3761
3762 if (!sysfs_available)
3763 return 0;
3764
3765 s->kobj.kset = slab_kset;
3766 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
3767 if (err) {
3768 kobject_put(&s->kobj);
3769 return err;
3770 }
3771
3772 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3773 if (err)
3774 return err;
3775
3776 kobject_uevent(&s->kobj, KOBJ_ADD);
3777
3778 return 0;
3779}
3780
3781static void sysfs_slab_remove(struct kmem_cache *s)
3782{
3783 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3784 kobject_del(&s->kobj);
3785 kobject_put(&s->kobj);
3786}
3787
3788static int __init slab_sysfs_init(void)
3789{
3790 struct kmem_cache *s;
3791 int err;
3792
3793 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3794 if (!slab_kset) {
3795 printk(KERN_ERR "Cannot register slab subsystem.\n");
3796 return -ENOSYS;
3797 }
3798
3799 down_write(&slqb_lock);
3800
3801 sysfs_available = 1;
3802
3803 list_for_each_entry(s, &slab_caches, list) {
3804 err = sysfs_slab_add(s);
3805 if (err)
3806 printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3807 " to sysfs\n", s->name);
3808 }
3809
3810 up_write(&slqb_lock);
3811
3812 return 0;
3813}
3814device_initcall(slab_sysfs_init);
3815
3816#endif
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f2619..1796b6513ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1457 struct zone *zone; 1457 struct zone *zone;
1458 enum zone_type high_zoneidx = gfp_zone(flags); 1458 enum zone_type high_zoneidx = gfp_zone(flags);
1459 struct page *page; 1459 struct page *page;
1460 unsigned int cpuset_mems_cookie;
1460 1461
1461 /* 1462 /*
1462 * The defrag ratio allows a configuration of the tradeoffs between 1463 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1480 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1481 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1481 return NULL; 1482 return NULL;
1482 1483
1483 get_mems_allowed(); 1484 do {
1484 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1485 cpuset_mems_cookie = get_mems_allowed();
1485 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1486 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1486 struct kmem_cache_node *n; 1487 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1487 1488 struct kmem_cache_node *n;
1488 n = get_node(s, zone_to_nid(zone)); 1489
1489 1490 n = get_node(s, zone_to_nid(zone));
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1491
1491 n->nr_partial > s->min_partial) { 1492 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1492 page = get_partial_node(n); 1493 n->nr_partial > s->min_partial) {
1493 if (page) { 1494 page = get_partial_node(n);
1494 put_mems_allowed(); 1495 if (page) {
1495 return page; 1496 /*
1497 * Return the object even if
1498 * put_mems_allowed indicated that
1499 * the cpuset mems_allowed was
1500 * updated in parallel. It's a
1501 * harmless race between the alloc
1502 * and the cpuset update.
1503 */
1504 put_mems_allowed(cpuset_mems_cookie);
1505 return page;
1506 }
1496 } 1507 }
1497 } 1508 }
1498 } 1509 } while (!put_mems_allowed(cpuset_mems_cookie));
1499 put_mems_allowed();
1500#endif 1510#endif
1501 return NULL; 1511 return NULL;
1502} 1512}
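The hunk above replaces the plain get_mems_allowed()/put_mems_allowed() pair with a sequence-count style cookie so the zonelist walk is retried if the cpuset's mems_allowed changes mid-walk. A minimal sketch of that pattern, where try_alloc_from_allowed_nodes() is a hypothetical stand-in for the walk in get_any_partial():

    /*
     * Sketch (not from the patch): cookie-based retry around a
     * cpuset-constrained allocation attempt.
     */
    static void *alloc_with_cpuset_retry(void)
    {
    	unsigned int cpuset_mems_cookie;
    	void *obj;

    	do {
    		cpuset_mems_cookie = get_mems_allowed();
    		obj = try_alloc_from_allowed_nodes();	/* hypothetical walk */
    		if (obj) {
    			/* Keep the result even if mems_allowed changed meanwhile. */
    			put_mems_allowed(cpuset_mems_cookie);
    			return obj;
    		}
    	} while (!put_mems_allowed(cpuset_mems_cookie));

    	return NULL;
    }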
@@ -1818,6 +1828,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1818 if (unlikely(!node_match(c, node))) 1828 if (unlikely(!node_match(c, node)))
1819 goto another_slab; 1829 goto another_slab;
1820 1830
1831 /* must check again c->freelist in case of cpu migration or IRQ */
1832 object = c->freelist;
1833 if (object)
1834 goto update_freelist;
1835
1821 stat(s, ALLOC_REFILL); 1836 stat(s, ALLOC_REFILL);
1822 1837
1823load_freelist: 1838load_freelist:
@@ -1827,6 +1842,7 @@ load_freelist:
1827 if (kmem_cache_debug(s)) 1842 if (kmem_cache_debug(s))
1828 goto debug; 1843 goto debug;
1829 1844
1845update_freelist:
1830 c->freelist = get_freepointer(s, object); 1846 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects; 1847 page->inuse = page->objects;
1832 page->freelist = NULL; 1848 page->freelist = NULL;
@@ -2163,7 +2179,7 @@ EXPORT_SYMBOL(kmem_cache_free);
2163 * take the list_lock. 2179 * take the list_lock.
2164 */ 2180 */
2165static int slub_min_order; 2181static int slub_min_order;
2166static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2182static int slub_max_order;
2167static int slub_min_objects; 2183static int slub_min_objects;
2168 2184
2169/* 2185/*
@@ -3433,13 +3449,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3433 if (kmem_cache_open(s, n, 3449 if (kmem_cache_open(s, n,
3434 size, align, flags, ctor)) { 3450 size, align, flags, ctor)) {
3435 list_add(&s->list, &slab_caches); 3451 list_add(&s->list, &slab_caches);
3452 up_write(&slub_lock);
3436 if (sysfs_slab_add(s)) { 3453 if (sysfs_slab_add(s)) {
3454 down_write(&slub_lock);
3437 list_del(&s->list); 3455 list_del(&s->list);
3438 kfree(n); 3456 kfree(n);
3439 kfree(s); 3457 kfree(s);
3440 goto err; 3458 goto err;
3441 } 3459 }
3442 up_write(&slub_lock);
3443 return s; 3460 return s;
3444 } 3461 }
3445 kfree(n); 3462 kfree(n);
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a..4cd05e5f2f4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b..4a1fc6db89e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
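As context for the exported slow path above, the matching get_page() fast path (assumed here; it lives in include/linux/mm.h, not in this hunk) would look roughly like:

    /* Sketch: assumed get_page() counterpart of __get_page_tail() above. */
    static inline void get_page_sketch(struct page *page)
    {
    	if (unlikely(PageTail(page)))
    		if (likely(__get_page_tail(page)))
    			return;
    	/*
    	 * Head and order-0 pages must already have an elevated
    	 * refcount; just bump it.
    	 */
    	VM_BUG_ON(atomic_read(&page->_count) <= 0);
    	atomic_inc(&page->_count);
    }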
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
@@ -644,7 +667,7 @@ void lru_add_page_tail(struct zone* zone,
644 VM_BUG_ON(!PageHead(page)); 667 VM_BUG_ON(!PageHead(page));
645 VM_BUG_ON(PageCompound(page_tail)); 668 VM_BUG_ON(PageCompound(page_tail));
646 VM_BUG_ON(PageLRU(page_tail)); 669 VM_BUG_ON(PageLRU(page_tail));
647 VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); 670 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
648 671
649 SetPageLRU(page_tail); 672 SetPageLRU(page_tail);
650 673
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785..10e9198778c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,7 @@
28 */ 28 */
29static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
30 .writepage = swap_writepage, 30 .writepage = swap_writepage,
31 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_no_writeback,
32 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33}; 33};
34 34
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb..c8f4338848d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a..bdb70042c12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
256 struct rb_node rb_node; /* address sorted rbtree */ 256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */ 257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */ 258 struct list_head purge_list; /* "lazy purge" list */
259 void *private; 259 struct vm_struct *vm;
260 struct rcu_head rcu_head; 260 struct rcu_head rcu_head;
261}; 261};
262 262
@@ -732,9 +732,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
735#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 735#define VMAP_BBMAP_BITS \
736 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 736 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
737 VMALLOC_PAGES / NR_CPUS / 16)) 737 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
738 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
738 739
739#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 740#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
740 741
@@ -1173,9 +1174,10 @@ void __init vmalloc_init(void)
1173 /* Import existing vmlist entries. */ 1174 /* Import existing vmlist entries. */
1174 for (tmp = vmlist; tmp; tmp = tmp->next) { 1175 for (tmp = vmlist; tmp; tmp = tmp->next) {
1175 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1176 va->flags = tmp->flags | VM_VM_AREA; 1177 va->flags = VM_VM_AREA;
1177 va->va_start = (unsigned long)tmp->addr; 1178 va->va_start = (unsigned long)tmp->addr;
1178 va->va_end = va->va_start + tmp->size; 1179 va->va_end = va->va_start + tmp->size;
1180 va->vm = tmp;
1179 __insert_vmap_area(va); 1181 __insert_vmap_area(va);
1180 } 1182 }
1181 1183
@@ -1266,18 +1268,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1266DEFINE_RWLOCK(vmlist_lock); 1268DEFINE_RWLOCK(vmlist_lock);
1267struct vm_struct *vmlist; 1269struct vm_struct *vmlist;
1268 1270
1269static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1271static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1270 unsigned long flags, void *caller) 1272 unsigned long flags, void *caller)
1271{ 1273{
1272 struct vm_struct *tmp, **p;
1273
1274 vm->flags = flags; 1274 vm->flags = flags;
1275 vm->addr = (void *)va->va_start; 1275 vm->addr = (void *)va->va_start;
1276 vm->size = va->va_end - va->va_start; 1276 vm->size = va->va_end - va->va_start;
1277 vm->caller = caller; 1277 vm->caller = caller;
1278 va->private = vm; 1278 va->vm = vm;
1279 va->flags |= VM_VM_AREA; 1279 va->flags |= VM_VM_AREA;
1280}
1280 1281
1282static void insert_vmalloc_vmlist(struct vm_struct *vm)
1283{
1284 struct vm_struct *tmp, **p;
1285
1286 vm->flags &= ~VM_UNLIST;
1281 write_lock(&vmlist_lock); 1287 write_lock(&vmlist_lock);
1282 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1288 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1283 if (tmp->addr >= vm->addr) 1289 if (tmp->addr >= vm->addr)
@@ -1288,6 +1294,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1288 write_unlock(&vmlist_lock); 1294 write_unlock(&vmlist_lock);
1289} 1295}
1290 1296
1297static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1298 unsigned long flags, void *caller)
1299{
1300 setup_vmalloc_vm(vm, va, flags, caller);
1301 insert_vmalloc_vmlist(vm);
1302}
1303
1291static struct vm_struct *__get_vm_area_node(unsigned long size, 1304static struct vm_struct *__get_vm_area_node(unsigned long size,
1292 unsigned long align, unsigned long flags, unsigned long start, 1305 unsigned long align, unsigned long flags, unsigned long start,
1293 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1306 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1326,7 +1339,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1326 return NULL; 1339 return NULL;
1327 } 1340 }
1328 1341
1329 insert_vmalloc_vm(area, va, flags, caller); 1342 /*
1343 * When this function is called from __vmalloc_node_range,
1344 * we do not add vm_struct to vmlist here to avoid
1345 * accessing uninitialized members of vm_struct such as
1346 * pages and nr_pages fields. They will be set later.
1347 * To distinguish it from others, we use a VM_UNLIST flag.
1348 */
1349 if (flags & VM_UNLIST)
1350 setup_vmalloc_vm(area, va, flags, caller);
1351 else
1352 insert_vmalloc_vm(area, va, flags, caller);
1353
1330 return area; 1354 return area;
1331} 1355}
1332 1356
@@ -1374,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr)
1374 1398
1375 va = find_vmap_area((unsigned long)addr); 1399 va = find_vmap_area((unsigned long)addr);
1376 if (va && va->flags & VM_VM_AREA) 1400 if (va && va->flags & VM_VM_AREA)
1377 return va->private; 1401 return va->vm;
1378 1402
1379 return NULL; 1403 return NULL;
1380} 1404}
@@ -1393,18 +1417,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1393 1417
1394 va = find_vmap_area((unsigned long)addr); 1418 va = find_vmap_area((unsigned long)addr);
1395 if (va && va->flags & VM_VM_AREA) { 1419 if (va && va->flags & VM_VM_AREA) {
1396 struct vm_struct *vm = va->private; 1420 struct vm_struct *vm = va->vm;
1397 struct vm_struct *tmp, **p; 1421
1398 /* 1422 if (!(vm->flags & VM_UNLIST)) {
1399 * remove from list and disallow access to this vm_struct 1423 struct vm_struct *tmp, **p;
1400 * before unmap. (address range confliction is maintained by 1424 /*
1401 * vmap.) 1425 * remove from list and disallow access to
1402 */ 1426 * this vm_struct before unmap. (address range
1403 write_lock(&vmlist_lock); 1427 * confliction is maintained by vmap.)
1404 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1428 */
1405 ; 1429 write_lock(&vmlist_lock);
1406 *p = tmp->next; 1430 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1407 write_unlock(&vmlist_lock); 1431 ;
1432 *p = tmp->next;
1433 write_unlock(&vmlist_lock);
1434 }
1408 1435
1409 vmap_debug_free_range(va->va_start, va->va_end); 1436 vmap_debug_free_range(va->va_start, va->va_end);
1410 free_unmap_vmap_area(va); 1437 free_unmap_vmap_area(va);
@@ -1615,13 +1642,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1642 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1616 return NULL; 1643 return NULL;
1617 1644
1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, 1645 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1619 gfp_mask, caller); 1646 start, end, node, gfp_mask, caller);
1620 1647
1621 if (!area) 1648 if (!area)
1622 return NULL; 1649 return NULL;
1623 1650
1624 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1651 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1652 if (!addr)
1653 return NULL;
1654
1655 /*
1656 * In this function, the newly allocated vm_struct is not added
1657 * to vmlist by __get_vm_area_node(), so it is added here.
1658 */
1659 insert_vmalloc_vmlist(area);
1625 1660
1626 /* 1661 /*
1627 * A ref_count = 3 is needed because the vm_struct and vmap_area 1662 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2153,6 +2188,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2153 return NULL; 2188 return NULL;
2154 } 2189 }
2155 2190
2191 /*
2192 * If the allocated address space is passed to a hypercall
2193 * before being used then we cannot rely on a page fault to
2194 * trigger an update of the page tables. So sync all the page
2195 * tables here.
2196 */
2197 vmalloc_sync_all();
2198
2156 return area; 2199 return area;
2157} 2200}
2158EXPORT_SYMBOL_GPL(alloc_vm_area); 2201EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d036e59d302..1eb3edf7920 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
248 248
249 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 long total_scan;
252 unsigned long max_pass; 252 long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
253 256
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 257 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
258 if (max_pass <= 0)
259 continue;
260
261 /*
262 * copy the current shrinker scan count into a local variable
263 * and zero it so that other concurrent shrinker invocations
264 * don't also do this scanning work.
265 */
266 do {
267 nr = shrinker->nr;
268 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
269
270 total_scan = nr;
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 271 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 272 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 273 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 274 total_scan += delta;
259 if (shrinker->nr < 0) { 275 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 276 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 277 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 278 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 279 total_scan = max_pass;
264 } 280 }
265 281
266 /* 282 /*
283 * We need to avoid excessive windup on filesystem shrinkers
284 * due to large numbers of GFP_NOFS allocations causing the
285 * shrinkers to return -1 all the time. This results in a large
286 * nr being built up so when a shrink that can do some work
287 * comes along it empties the entire cache due to nr >>>
288 * max_pass. This is bad for sustaining a working set in
289 * memory.
290 *
291 * Hence only allow the shrinker to scan the entire cache when
292 * a large delta change is calculated directly.
293 */
294 if (delta < max_pass / 4)
295 total_scan = min(total_scan, max_pass / 2);
296
297 /*
267 * Avoid risking looping forever due to too large nr value: 298 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 299 * never try to free more than twice the estimate number of
269 * freeable entries. 300 * freeable entries.
270 */ 301 */
271 if (shrinker->nr > max_pass * 2) 302 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 303 total_scan = max_pass * 2;
273 304
274 total_scan = shrinker->nr; 305 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 306 nr_pages_scanned, lru_pages,
307 max_pass, delta, total_scan);
276 308
277 while (total_scan >= SHRINK_BATCH) { 309 while (total_scan >= SHRINK_BATCH) {
278 long this_scan = SHRINK_BATCH; 310 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 311 int nr_before;
281 312
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 313 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
292 cond_resched(); 323 cond_resched();
293 } 324 }
294 325
295 shrinker->nr += total_scan; 326 /*
327 * move the unused scan count back into the shrinker in a
328 * manner that handles concurrent updates. If we exhausted the
329 * scan, there is no need to do an update.
330 */
331 do {
332 nr = shrinker->nr;
333 new_nr = total_scan + nr;
334 if (total_scan <= 0)
335 break;
336 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
337
338 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 339 }
297 up_read(&shrinker_rwsem); 340 up_read(&shrinker_rwsem);
298out: 341out:
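The cmpxchg() pairs above implement a small lock-free "claim and give back" protocol on shrinker->nr. A stand-alone sketch of that protocol, with the actual shrinking abstracted into a hypothetical do_scan() that returns how much was scanned:

    /*
     * Sketch (not from the patch): claim the deferred count atomically,
     * do the work, then return any unused scan count in a way that
     * tolerates concurrent shrink_slab() callers.
     */
    static void shrink_one(struct shrinker *shrinker, long delta)
    {
    	long nr, new_nr, total_scan, scanned;

    	/* Take the whole deferred count for ourselves. */
    	do {
    		nr = shrinker->nr;
    	} while (cmpxchg(&shrinker->nr, nr, 0) != nr);

    	total_scan = nr + delta;
    	scanned = do_scan(shrinker, total_scan);	/* hypothetical */

    	/* Hand back whatever we did not scan, allowing for concurrent adds. */
    	total_scan -= scanned;
    	do {
    		nr = shrinker->nr;
    		new_nr = total_scan + nr;
    		if (total_scan <= 0)
    			break;
    	} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
    }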
@@ -455,15 +498,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
455 return PAGE_ACTIVATE; 498 return PAGE_ACTIVATE;
456 } 499 }
457 500
458 /*
459 * Wait on writeback if requested to. This happens when
460 * direct reclaiming a large contiguous area and the
461 * first attempt to free a range of pages fails.
462 */
463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
465 wait_on_page_writeback(page);
466
467 if (!PageWriteback(page)) { 501 if (!PageWriteback(page)) {
468 /* synchronous write or broken a_ops? */ 502 /* synchronous write or broken a_ops? */
469 ClearPageReclaim(page); 503 ClearPageReclaim(page);
@@ -581,6 +615,10 @@ void putback_lru_page(struct page *page)
581 int was_unevictable = PageUnevictable(page); 615 int was_unevictable = PageUnevictable(page);
582 616
583 VM_BUG_ON(PageLRU(page)); 617 VM_BUG_ON(PageLRU(page));
618#ifdef CONFIG_CLEANCACHE
619 if (active)
620 SetPageWasActive(page);
621#endif
584 622
585redo: 623redo:
586 ClearPageUnevictable(page); 624 ClearPageUnevictable(page);
@@ -665,7 +703,7 @@ static enum page_references page_check_references(struct page *page,
665 return PAGEREF_RECLAIM; 703 return PAGEREF_RECLAIM;
666 704
667 if (referenced_ptes) { 705 if (referenced_ptes) {
668 if (PageAnon(page)) 706 if (PageSwapBacked(page))
669 return PAGEREF_ACTIVATE; 707 return PAGEREF_ACTIVATE;
670 /* 708 /*
671 * All mapped pages start out with page table 709 * All mapped pages start out with page table
@@ -683,7 +721,13 @@ static enum page_references page_check_references(struct page *page,
683 */ 721 */
684 SetPageReferenced(page); 722 SetPageReferenced(page);
685 723
686 if (referenced_page) 724 if (referenced_page || referenced_ptes > 1)
725 return PAGEREF_ACTIVATE;
726
727 /*
728 * Activate file-backed executable pages after first usage.
729 */
730 if (vm_flags & VM_EXEC)
687 return PAGEREF_ACTIVATE; 731 return PAGEREF_ACTIVATE;
688 732
689 return PAGEREF_KEEP; 733 return PAGEREF_KEEP;
@@ -719,7 +763,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
719 */ 763 */
720static unsigned long shrink_page_list(struct list_head *page_list, 764static unsigned long shrink_page_list(struct list_head *page_list,
721 struct zone *zone, 765 struct zone *zone,
722 struct scan_control *sc) 766 struct scan_control *sc,
767 int priority,
768 unsigned long *ret_nr_dirty,
769 unsigned long *ret_nr_writeback)
723{ 770{
724 LIST_HEAD(ret_pages); 771 LIST_HEAD(ret_pages);
725 LIST_HEAD(free_pages); 772 LIST_HEAD(free_pages);
@@ -727,6 +774,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
727 unsigned long nr_dirty = 0; 774 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0; 775 unsigned long nr_congested = 0;
729 unsigned long nr_reclaimed = 0; 776 unsigned long nr_reclaimed = 0;
777 unsigned long nr_writeback = 0;
730 778
731 cond_resched(); 779 cond_resched();
732 780
@@ -763,13 +811,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
763 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 811 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
764 812
765 if (PageWriteback(page)) { 813 if (PageWriteback(page)) {
814 nr_writeback++;
766 /* 815 /*
767 * Synchronous reclaim is performed in two passes, 816 * Synchronous reclaim cannot queue pages for
768 * first an asynchronous pass over the list to 817 * writeback due to the possibility of stack overflow
769 * start parallel writeback, and a second synchronous 818 * but if it encounters a page under writeback, wait
770 * pass to wait for the IO to complete. Wait here 819 * for the IO to complete.
771 * for any page for which writeback has already
772 * started.
773 */ 820 */
774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 821 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs) 822 may_enter_fs)
@@ -825,6 +872,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
825 if (PageDirty(page)) { 872 if (PageDirty(page)) {
826 nr_dirty++; 873 nr_dirty++;
827 874
875 /*
876 * Only kswapd can writeback filesystem pages to
877 * avoid risk of stack overflow but do not writeback
878 * unless under significant pressure.
879 */
880 if (page_is_file_cache(page) &&
881 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
882 /*
883 * Immediately reclaim when written back.
884 * Similar in principle to deactivate_page()
885 * except we already have the page isolated
886 * and know it's dirty
887 */
888 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
889 SetPageReclaim(page);
890
891 goto keep_locked;
892 }
893
828 if (references == PAGEREF_RECLAIM_CLEAN) 894 if (references == PAGEREF_RECLAIM_CLEAN)
829 goto keep_locked; 895 goto keep_locked;
830 if (!may_enter_fs) 896 if (!may_enter_fs)
@@ -959,6 +1025,8 @@ keep_lumpy:
959 1025
960 list_splice(&ret_pages, page_list); 1026 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1027 count_vm_events(PGACTIVATE, pgactivate);
1028 *ret_nr_dirty += nr_dirty;
1029 *ret_nr_writeback += nr_writeback;
962 return nr_reclaimed; 1030 return nr_reclaimed;
963} 1031}
964 1032
@@ -972,23 +1040,27 @@ keep_lumpy:
972 * 1040 *
973 * returns 0 on success, -ve errno on failure. 1041 * returns 0 on success, -ve errno on failure.
974 */ 1042 */
975int __isolate_lru_page(struct page *page, int mode, int file) 1043int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
976{ 1044{
1045 bool all_lru_mode;
977 int ret = -EINVAL; 1046 int ret = -EINVAL;
978 1047
979 /* Only take pages on the LRU. */ 1048 /* Only take pages on the LRU. */
980 if (!PageLRU(page)) 1049 if (!PageLRU(page))
981 return ret; 1050 return ret;
982 1051
1052 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1053 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1054
983 /* 1055 /*
984 * When checking the active state, we need to be sure we are 1056 * When checking the active state, we need to be sure we are
985 * dealing with comparable boolean values. Take the logical not 1057
986 * of each. 1058 * of each.
987 */ 1059 */
988 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1060 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
989 return ret; 1061 return ret;
990 1062
991 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1063 if (!all_lru_mode && !!page_is_file_cache(page) != file)
992 return ret; 1064 return ret;
993 1065
994 /* 1066 /*
@@ -1001,6 +1073,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1001 1073
1002 ret = -EBUSY; 1074 ret = -EBUSY;
1003 1075
1076 /*
1077 * To minimise LRU disruption, the caller can indicate that it only
1078 * wants to isolate pages it will be able to operate on without
1079 * blocking - clean pages for the most part.
1080 *
1081 * ISOLATE_CLEAN means that only clean pages should be isolated. This
1082 * is used by reclaim when it cannot write to backing storage
1083 *
1084 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1085 * that it is possible to migrate without blocking
1086 */
1087 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1088 /* All the caller can do on PageWriteback is block */
1089 if (PageWriteback(page))
1090 return ret;
1091
1092 if (PageDirty(page)) {
1093 struct address_space *mapping;
1094
1095 /* ISOLATE_CLEAN means only clean pages */
1096 if (mode & ISOLATE_CLEAN)
1097 return ret;
1098
1099 /*
1100 * Only pages without mappings or that have a
1101 * ->migratepage callback are possible to migrate
1102 * without blocking
1103 */
1104 mapping = page_mapping(page);
1105 if (mapping && !mapping->a_ops->migratepage)
1106 return ret;
1107 }
1108 }
1109
1110 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1111 return ret;
1112
1004 if (likely(get_page_unless_zero(page))) { 1113 if (likely(get_page_unless_zero(page))) {
1005 /* 1114 /*
1006 * Be careful not to clear PageLRU until after we're 1115 * Be careful not to clear PageLRU until after we're
@@ -1036,7 +1145,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1036 */ 1145 */
1037static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1146static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1038 struct list_head *src, struct list_head *dst, 1147 struct list_head *src, struct list_head *dst,
1039 unsigned long *scanned, int order, int mode, int file) 1148 unsigned long *scanned, int order, isolate_mode_t mode,
1149 int file)
1040{ 1150{
1041 unsigned long nr_taken = 0; 1151 unsigned long nr_taken = 0;
1042 unsigned long nr_lumpy_taken = 0; 1152 unsigned long nr_lumpy_taken = 0;
@@ -1111,7 +1221,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1111 * anon page which don't already have a swap slot is 1221 * anon page which don't already have a swap slot is
1112 * pointless. 1222 * pointless.
1113 */ 1223 */
1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1224 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1115 !PageSwapCache(cursor_page)) 1225 !PageSwapCache(cursor_page))
1116 break; 1226 break;
1117 1227
@@ -1161,8 +1271,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1161static unsigned long isolate_pages_global(unsigned long nr, 1271static unsigned long isolate_pages_global(unsigned long nr,
1162 struct list_head *dst, 1272 struct list_head *dst,
1163 unsigned long *scanned, int order, 1273 unsigned long *scanned, int order,
1164 int mode, struct zone *z, 1274 isolate_mode_t mode,
1165 int active, int file) 1275 struct zone *z, int active, int file)
1166{ 1276{
1167 int lru = LRU_BASE; 1277 int lru = LRU_BASE;
1168 if (active) 1278 if (active)
@@ -1190,6 +1300,9 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1190 if (PageActive(page)) { 1300 if (PageActive(page)) {
1191 lru += LRU_ACTIVE; 1301 lru += LRU_ACTIVE;
1192 ClearPageActive(page); 1302 ClearPageActive(page);
1303#ifdef CONFIG_CLEANCACHE
1304 SetPageWasActive(page);
1305#endif
1193 nr_active += numpages; 1306 nr_active += numpages;
1194 } 1307 }
1195 if (count) 1308 if (count)
@@ -1354,7 +1467,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1354} 1467}
1355 1468
1356/* 1469/*
1357 * Returns true if the caller should wait to clean dirty/writeback pages. 1470 * Returns true if a direct reclaim should wait on pages under writeback.
1358 * 1471 *
1359 * If we are direct reclaiming for contiguous pages and we do not reclaim 1472 * If we are direct reclaiming for contiguous pages and we do not reclaim
1360 * everything in the list, try again and wait for writeback IO to complete. 1473 * everything in the list, try again and wait for writeback IO to complete.
@@ -1408,6 +1521,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1408 unsigned long nr_taken; 1521 unsigned long nr_taken;
1409 unsigned long nr_anon; 1522 unsigned long nr_anon;
1410 unsigned long nr_file; 1523 unsigned long nr_file;
1524 unsigned long nr_dirty = 0;
1525 unsigned long nr_writeback = 0;
1526
1527 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1411 1528
1412 while (unlikely(too_many_isolated(zone, file, sc))) { 1529 while (unlikely(too_many_isolated(zone, file, sc))) {
1413 congestion_wait(BLK_RW_ASYNC, HZ/10); 1530 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1535,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1418 } 1535 }
1419 1536
1420 set_reclaim_mode(priority, sc, false); 1537 set_reclaim_mode(priority, sc, false);
1538 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1539 reclaim_mode |= ISOLATE_ACTIVE;
1540
1421 lru_add_drain(); 1541 lru_add_drain();
1542
1543 if (!sc->may_unmap)
1544 reclaim_mode |= ISOLATE_UNMAPPED;
1545 if (!sc->may_writepage)
1546 reclaim_mode |= ISOLATE_CLEAN;
1547
1422 spin_lock_irq(&zone->lru_lock); 1548 spin_lock_irq(&zone->lru_lock);
1423 1549
1424 if (scanning_global_lru(sc)) { 1550 if (scanning_global_lru(sc)) {
1425 nr_taken = isolate_pages_global(nr_to_scan, 1551 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1426 &page_list, &nr_scanned, sc->order, 1552 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1429 zone, 0, file);
1430 zone->pages_scanned += nr_scanned; 1553 zone->pages_scanned += nr_scanned;
1431 if (current_is_kswapd()) 1554 if (current_is_kswapd())
1432 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1555 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1558,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1435 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1558 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1436 nr_scanned); 1559 nr_scanned);
1437 } else { 1560 } else {
1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1561 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1439 &page_list, &nr_scanned, sc->order, 1562 &nr_scanned, sc->order, reclaim_mode, zone,
1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1563 sc->mem_cgroup, 0, file);
1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1442 zone, sc->mem_cgroup,
1443 0, file);
1444 /* 1564 /*
1445 * mem_cgroup_isolate_pages() keeps track of 1565 * mem_cgroup_isolate_pages() keeps track of
1446 * scanned pages on its own. 1566 * scanned pages on its own.
@@ -1456,12 +1576,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1456 1576
1457 spin_unlock_irq(&zone->lru_lock); 1577 spin_unlock_irq(&zone->lru_lock);
1458 1578
1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1579 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1580 &nr_dirty, &nr_writeback);
1460 1581
1461 /* Check if we should synchronously wait for writeback */ 1582
1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1583 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1463 set_reclaim_mode(priority, sc, true); 1584 set_reclaim_mode(priority, sc, true);
1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1585 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1586 priority, &nr_dirty, &nr_writeback);
1465 } 1587 }
1466 1588
1467 local_irq_disable(); 1589 local_irq_disable();
@@ -1471,6 +1593,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1471 1593
1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1594 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473 1595
1596 /*
1597 * If reclaim is isolating dirty pages under writeback, it implies
1598 * that the long-lived page allocation rate is exceeding the page
1599 * laundering rate. Either the global limits are not being effective
1600 * at throttling processes due to the page distribution throughout
1601 * zones or there is heavy usage of a slow backing device. The
1602 * only option is to throttle from reclaim context which is not ideal
1603 * as there is no guarantee the dirtying process is throttled in the
1604 * same way balance_dirty_pages() manages.
1605 *
1606 * This scales the number of dirty pages that must be under writeback
1607 * before throttling depending on priority. It is a simple backoff
1608 * function that has the most effect in the range DEF_PRIORITY to
1609 * DEF_PRIORITY-2, the range in which reclaim is considered to be
1610 * in trouble and kswapd is struggling to keep up.
1611 *
1612 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1613 * DEF_PRIORITY-1 50% must be PageWriteback
1614 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1615 * ...
1616 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1617 * isolated page is PageWriteback
1618 */
1619 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1620 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1621
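
[Editor's note] As a worked example of the backoff described in the comment above (assuming DEF_PRIORITY is 12 and SWAP_CLUSTER_MAX is 32, as in mainline at the time), the writeback threshold halves with each priority drop until any PageWriteback page is enough to trigger the wait_iff_congested() stall. A standalone sketch:

#include <stdio.h>

#define DEF_PRIORITY 12		/* mainline value, assumed for this sketch */

int main(void)
{
	unsigned long nr_taken = 32;	/* e.g. SWAP_CLUSTER_MAX pages isolated */
	int priority;

	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
		unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);
		/* At DEF_PRIORITY-6 the shift yields 0, so a single
		 * PageWriteback page is enough to throttle. */
		printf("priority %2d: throttle once nr_writeback >= %lu\n",
		       priority, threshold);
	}
	return 0;
}
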
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1622 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone), 1623 zone_idx(zone),
1476 nr_scanned, nr_reclaimed, 1624 nr_scanned, nr_reclaimed,
@@ -1542,19 +1690,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1542 struct page *page; 1690 struct page *page;
1543 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1691 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1544 unsigned long nr_rotated = 0; 1692 unsigned long nr_rotated = 0;
1693 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1545 1694
1546 lru_add_drain(); 1695 lru_add_drain();
1696
1697 if (!sc->may_unmap)
1698 reclaim_mode |= ISOLATE_UNMAPPED;
1699 if (!sc->may_writepage)
1700 reclaim_mode |= ISOLATE_CLEAN;
1701
1547 spin_lock_irq(&zone->lru_lock); 1702 spin_lock_irq(&zone->lru_lock);
1548 if (scanning_global_lru(sc)) { 1703 if (scanning_global_lru(sc)) {
1549 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1704 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1550 &pgscanned, sc->order, 1705 &pgscanned, sc->order,
1551 ISOLATE_ACTIVE, zone, 1706 reclaim_mode, zone,
1552 1, file); 1707 1, file);
1553 zone->pages_scanned += pgscanned; 1708 zone->pages_scanned += pgscanned;
1554 } else { 1709 } else {
1555 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1710 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1556 &pgscanned, sc->order, 1711 &pgscanned, sc->order,
1557 ISOLATE_ACTIVE, zone, 1712 reclaim_mode, zone,
1558 sc->mem_cgroup, 1, file); 1713 sc->mem_cgroup, 1, file);
1559 /* 1714 /*
1560 * mem_cgroup_isolate_pages() keeps track of 1715 * mem_cgroup_isolate_pages() keeps track of
@@ -1600,6 +1755,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1600 } 1755 }
1601 1756
1602 ClearPageActive(page); /* we are de-activating */ 1757 ClearPageActive(page); /* we are de-activating */
1758#ifdef CONFIG_CLEANCACHE
1759 SetPageWasActive(page);
1760#endif
1603 list_add(&page->lru, &l_inactive); 1761 list_add(&page->lru, &l_inactive);
1604 } 1762 }
1605 1763
@@ -1747,22 +1905,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1905 u64 fraction[2], denominator;
1748 enum lru_list l; 1906 enum lru_list l;
1749 int noswap = 0; 1907 int noswap = 0;
1750 int force_scan = 0; 1908 bool force_scan = false;
1751 1909 unsigned long nr_force_scan[2];
1752 1910
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1911 /* kswapd does zone balancing and needs to scan this zone */
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1912 if (scanning_global_lru(sc) && current_is_kswapd() &&
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1913 zone->all_unreclaimable)
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1914 force_scan = true;
1757 1915 /* memcg may have small limit and need to avoid priority drop */
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1916 if (!scanning_global_lru(sc))
1759 /* kswapd does zone balancing and need to scan this zone */ 1917 force_scan = true;
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1766 1918
1767 /* If we have no swap space, do not bother scanning anon pages. */ 1919 /* If we have no swap space, do not bother scanning anon pages. */
1768 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1920 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1770,9 +1922,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1770 fraction[0] = 0; 1922 fraction[0] = 0;
1771 fraction[1] = 1; 1923 fraction[1] = 1;
1772 denominator = 1; 1924 denominator = 1;
1925 nr_force_scan[0] = 0;
1926 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1773 goto out; 1927 goto out;
1774 } 1928 }
1775 1929
1930 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1931 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1932 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1933 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1934
1776 if (scanning_global_lru(sc)) { 1935 if (scanning_global_lru(sc)) {
1777 free = zone_page_state(zone, NR_FREE_PAGES); 1936 free = zone_page_state(zone, NR_FREE_PAGES);
1778 /* If we have very few page cache pages, 1937 /* If we have very few page cache pages,
@@ -1781,6 +1940,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1781 fraction[0] = 1; 1940 fraction[0] = 1;
1782 fraction[1] = 0; 1941 fraction[1] = 0;
1783 denominator = 1; 1942 denominator = 1;
1943 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1944 nr_force_scan[1] = 0;
1784 goto out; 1945 goto out;
1785 } 1946 }
1786 } 1947 }
@@ -1829,6 +1990,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1829 fraction[0] = ap; 1990 fraction[0] = ap;
1830 fraction[1] = fp; 1991 fraction[1] = fp;
1831 denominator = ap + fp + 1; 1992 denominator = ap + fp + 1;
1993 if (force_scan) {
1994 unsigned long scan = SWAP_CLUSTER_MAX;
1995 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1996 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1997 }
1832out: 1998out:
1833 for_each_evictable_lru(l) { 1999 for_each_evictable_lru(l) {
1834 int file = is_file_lru(l); 2000 int file = is_file_lru(l);
@@ -1849,12 +2015,8 @@ out:
1849 * memcg, priority drop can cause big latency. So, it's better 2015 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above. 2016 * to scan small amount. See may_noscan above.
1851 */ 2017 */
1852 if (!scan && force_scan) { 2018 if (!scan && force_scan)
1853 if (file) 2019 scan = nr_force_scan[file];
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan; 2020 nr[l] = scan;
1859 } 2021 }
1860} 2022}
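
[Editor's note] The force_scan path added above precomputes how a forced SWAP_CLUSTER_MAX worth of scanning is split between the anon and file lists, using the same ap/fp weights as the normal fractions. A standalone sketch of that arithmetic, with made-up weights and a plain 64-bit division standing in for the kernel's div64_u64():

#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32	/* mainline value, assumed for this sketch */

/* Userspace stand-in for the kernel's div64_u64() helper. */
static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t ap = 300, fp = 100;		/* hypothetical anon/file weights */
	uint64_t denominator = ap + fp + 1;
	uint64_t scan = SWAP_CLUSTER_MAX;
	uint64_t nr_force_scan[2];

	nr_force_scan[0] = div64_u64(scan * ap, denominator);	/* anon share */
	nr_force_scan[1] = div64_u64(scan * fp, denominator);	/* file share */

	printf("anon %llu, file %llu of %llu forced pages\n",
	       (unsigned long long)nr_force_scan[0],
	       (unsigned long long)nr_force_scan[1],
	       (unsigned long long)scan);
	return 0;
}
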
@@ -1906,8 +2068,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
1906 * inactive lists are large enough, continue reclaiming 2068 * inactive lists are large enough, continue reclaiming
1907 */ 2069 */
1908 pages_for_compaction = (2UL << sc->order); 2070 pages_for_compaction = (2UL << sc->order);
1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2071 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2072 if (nr_swap_pages > 0)
2073 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1911 if (sc->nr_reclaimed < pages_for_compaction && 2074 if (sc->nr_reclaimed < pages_for_compaction &&
1912 inactive_lru_pages > pages_for_compaction) 2075 inactive_lru_pages > pages_for_compaction)
1913 return true; 2076 return true;
@@ -1979,6 +2142,42 @@ restart:
1979 throttle_vm_writeout(sc->gfp_mask); 2142 throttle_vm_writeout(sc->gfp_mask);
1980} 2143}
1981 2144
2145/* Returns true if compaction should go ahead for a high-order request */
2146static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2147{
2148 unsigned long balance_gap, watermark;
2149 bool watermark_ok;
2150
2151 /* Do not consider compaction for orders reclaim is meant to satisfy */
2152 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2153 return false;
2154
2155 /*
2156 * Compaction takes time to run and there are potentially other
2157 * callers using the pages just freed. Continue reclaiming until
2158 * there is a buffer of free pages available to give compaction
2159 * a reasonable chance of completing and allocating the page
2160 */
2161 balance_gap = min(low_wmark_pages(zone),
2162 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2163 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2164 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2165 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2166
2167 /*
2168 * If compaction is deferred, reclaim up to a point where
2169 * compaction will have a chance of success when re-enabled
2170 */
2171 if (compaction_deferred(zone))
2172 return watermark_ok;
2173
2174 /* If compaction is not ready to start, keep reclaiming */
2175 if (!compaction_suitable(zone, sc->order))
2176 return false;
2177
2178 return watermark_ok;
2179}
2180
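
[Editor's note] To make the watermark test in compaction_ready() above concrete, here is the same arithmetic with hypothetical zone figures. KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 in mainline; every other number below is invented for the example.

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* mainline value */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical 1GB zone (256k 4KB pages), order-9 THP allocation. */
	unsigned long present_pages = 262144;
	unsigned long low_wmark = 2048, high_wmark = 3072;
	int order = 9;

	unsigned long balance_gap = min_ul(low_wmark,
			(present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	/* Reclaim keeps running until free pages clear this raised watermark,
	 * giving compaction headroom before it is attempted. */
	printf("balance_gap = %lu pages, compaction watermark = %lu pages\n",
	       balance_gap, watermark);
	return 0;
}
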
1982/* 2181/*
1983 * This is the direct reclaim path, for page-allocating processes. We only 2182 * This is the direct reclaim path, for page-allocating processes. We only
1984 * try to reclaim pages from zones which will satisfy the caller's allocation 2183 * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1994,14 +2193,20 @@ restart:
1994 * 2193 *
1995 * If a zone is deemed to be full of pinned pages then just give it a light 2194 * If a zone is deemed to be full of pinned pages then just give it a light
1996 * scan then give up on it. 2195 * scan then give up on it.
2196 *
2197 * This function returns true if a zone is being reclaimed for a costly
2198 * high-order allocation and compaction is ready to begin. This indicates to
2199 * the caller that it should consider retrying the allocation instead of
2200 * further reclaim.
1997 */ 2201 */
1998static void shrink_zones(int priority, struct zonelist *zonelist, 2202static bool shrink_zones(int priority, struct zonelist *zonelist,
1999 struct scan_control *sc) 2203 struct scan_control *sc)
2000{ 2204{
2001 struct zoneref *z; 2205 struct zoneref *z;
2002 struct zone *zone; 2206 struct zone *zone;
2003 unsigned long nr_soft_reclaimed; 2207 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned; 2208 unsigned long nr_soft_scanned;
2209 bool aborted_reclaim = false;
2005 2210
2006 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2211 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2007 gfp_zone(sc->gfp_mask), sc->nodemask) { 2212 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2016,6 +2221,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2016 continue; 2221 continue;
2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2222 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2018 continue; /* Let kswapd poll it */ 2223 continue; /* Let kswapd poll it */
2224 if (COMPACTION_BUILD) {
2225 /*
2226 * If we already have plenty of memory free for
2227 * compaction in this zone, don't free any more.
2228 * Even though compaction is invoked for any
2229 * non-zero order, only frequent costly order
2230 * reclamation is disruptive enough to become a
2231 * noticeable problem, like transparent huge page
2232 * allocations.
2233 */
2234 if (compaction_ready(zone, sc)) {
2235 aborted_reclaim = true;
2236 continue;
2237 }
2238 }
2019 /* 2239 /*
2020 * This steals pages from memory cgroups over softlimit 2240 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and 2241 * and returns the number of reclaimed pages and
@@ -2033,6 +2253,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2033 2253
2034 shrink_zone(priority, zone, sc); 2254 shrink_zone(priority, zone, sc);
2035 } 2255 }
2256
2257 return aborted_reclaim;
2036} 2258}
2037 2259
2038static bool zone_reclaimable(struct zone *zone) 2260static bool zone_reclaimable(struct zone *zone)
@@ -2086,8 +2308,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2086 struct zoneref *z; 2308 struct zoneref *z;
2087 struct zone *zone; 2309 struct zone *zone;
2088 unsigned long writeback_threshold; 2310 unsigned long writeback_threshold;
2311 bool aborted_reclaim;
2089 2312
2090 get_mems_allowed();
2091 delayacct_freepages_start(); 2313 delayacct_freepages_start();
2092 2314
2093 if (scanning_global_lru(sc)) 2315 if (scanning_global_lru(sc))
@@ -2097,7 +2319,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2097 sc->nr_scanned = 0; 2319 sc->nr_scanned = 0;
2098 if (!priority) 2320 if (!priority)
2099 disable_swap_token(sc->mem_cgroup); 2321 disable_swap_token(sc->mem_cgroup);
2100 shrink_zones(priority, zonelist, sc); 2322 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2323
2101 /* 2324 /*
2102 * Don't shrink slabs when reclaiming memory from 2325 * Don't shrink slabs when reclaiming memory from
2103 * over limit cgroups 2326 * over limit cgroups
@@ -2131,7 +2354,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2131 */ 2354 */
2132 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2355 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2133 if (total_scanned > writeback_threshold) { 2356 if (total_scanned > writeback_threshold) {
2134 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2357 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2358 WB_REASON_TRY_TO_FREE_PAGES);
2135 sc->may_writepage = 1; 2359 sc->may_writepage = 1;
2136 } 2360 }
2137 2361
@@ -2149,7 +2373,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2149 2373
2150out: 2374out:
2151 delayacct_freepages_end(); 2375 delayacct_freepages_end();
2152 put_mems_allowed();
2153 2376
2154 if (sc->nr_reclaimed) 2377 if (sc->nr_reclaimed)
2155 return sc->nr_reclaimed; 2378 return sc->nr_reclaimed;
@@ -2162,6 +2385,10 @@ out:
2162 if (oom_killer_disabled) 2385 if (oom_killer_disabled)
2163 return 0; 2386 return 0;
2164 2387
2388 /* Aborted reclaim to try compaction? don't OOM, then */
2389 if (aborted_reclaim)
2390 return 1;
2391
2165 /* top priority shrink_zones still had more to do? don't OOM, then */ 2392 /* top priority shrink_zones still had more to do? don't OOM, then */
2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2393 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2167 return 1; 2394 return 1;
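
[Editor's note] The effect of the new aborted_reclaim flag on the return value can be summarised in a small decision sketch (simplified names, not the kernel's exact code): reporting nominal progress when reclaim was aborted for compaction lets the page allocator retry with compaction instead of falling through to the OOM killer.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the return decisions at the end of
 * do_try_to_free_pages(); purely illustrative. */
static unsigned long reclaim_result(unsigned long nr_reclaimed,
				    bool aborted_reclaim,
				    bool more_to_do)
{
	if (nr_reclaimed)
		return nr_reclaimed;	/* made real progress */
	if (aborted_reclaim)
		return 1;		/* stopped early for compaction: no OOM */
	if (more_to_do)
		return 1;		/* top-priority pass not exhausted: no OOM */
	return 0;			/* genuinely stuck, OOM path may follow */
}

int main(void)
{
	printf("aborted for compaction -> %lu\n", reclaim_result(0, true, false));
	printf("truly stuck            -> %lu\n", reclaim_result(0, false, false));
	return 0;
}
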
@@ -2453,6 +2680,9 @@ loop_again:
2453 high_wmark_pages(zone), 0, 0)) { 2680 high_wmark_pages(zone), 0, 0)) {
2454 end_zone = i; 2681 end_zone = i;
2455 break; 2682 break;
2683 } else {
2684 /* If balanced, clear the congested flag */
2685 zone_clear_flag(zone, ZONE_CONGESTED);
2456 } 2686 }
2457 } 2687 }
2458 if (i < 0) 2688 if (i < 0)
@@ -2689,7 +2919,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2689 * them before going back to sleep. 2919 * them before going back to sleep.
2690 */ 2920 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2921 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule(); 2922
2923 if (!kthread_should_stop())
2924 schedule();
2925
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2926 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else { 2927 } else {
2695 if (remaining) 2928 if (remaining)
@@ -2716,7 +2949,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2716static int kswapd(void *p) 2949static int kswapd(void *p)
2717{ 2950{
2718 unsigned long order, new_order; 2951 unsigned long order, new_order;
2952 unsigned balanced_order;
2719 int classzone_idx, new_classzone_idx; 2953 int classzone_idx, new_classzone_idx;
2954 int balanced_classzone_idx;
2720 pg_data_t *pgdat = (pg_data_t*)p; 2955 pg_data_t *pgdat = (pg_data_t*)p;
2721 struct task_struct *tsk = current; 2956 struct task_struct *tsk = current;
2722 2957
@@ -2747,7 +2982,9 @@ static int kswapd(void *p)
2747 set_freezable(); 2982 set_freezable();
2748 2983
2749 order = new_order = 0; 2984 order = new_order = 0;
2985 balanced_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2986 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2987 balanced_classzone_idx = classzone_idx;
2751 for ( ; ; ) { 2988 for ( ; ; ) {
2752 int ret; 2989 int ret;
2753 2990
@@ -2756,7 +2993,8 @@ static int kswapd(void *p)
2756 * new request of a similar or harder type will succeed soon 2993 * new request of a similar or harder type will succeed soon
2757 * so consider going to sleep on the basis we reclaimed at 2994 * so consider going to sleep on the basis we reclaimed at
2758 */ 2995 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) { 2996 if (balanced_classzone_idx >= new_classzone_idx &&
2997 balanced_order == new_order) {
2760 new_order = pgdat->kswapd_max_order; 2998 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx; 2999 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0; 3000 pgdat->kswapd_max_order = 0;
@@ -2771,9 +3009,12 @@ static int kswapd(void *p)
2771 order = new_order; 3009 order = new_order;
2772 classzone_idx = new_classzone_idx; 3010 classzone_idx = new_classzone_idx;
2773 } else { 3011 } else {
2774 kswapd_try_to_sleep(pgdat, order, classzone_idx); 3012 kswapd_try_to_sleep(pgdat, balanced_order,
3013 balanced_classzone_idx);
2775 order = pgdat->kswapd_max_order; 3014 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx; 3015 classzone_idx = pgdat->classzone_idx;
3016 new_order = order;
3017 new_classzone_idx = classzone_idx;
2777 pgdat->kswapd_max_order = 0; 3018 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1; 3019 pgdat->classzone_idx = pgdat->nr_zones - 1;
2779 } 3020 }
@@ -2788,7 +3029,9 @@ static int kswapd(void *p)
2788 */ 3029 */
2789 if (!ret) { 3030 if (!ret) {
2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 3031 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2791 order = balance_pgdat(pgdat, order, &classzone_idx); 3032 balanced_classzone_idx = classzone_idx;
3033 balanced_order = balance_pgdat(pgdat, order,
3034 &balanced_classzone_idx);
2792 } 3035 }
2793 } 3036 }
2794 return 0; 3037 return 0;
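
[Editor's note] The kswapd loop changes above boil down to remembering what balance_pgdat() actually achieved (balanced_order, balanced_classzone_idx) and comparing that, rather than the original request, against any newly arrived request. A tiny sketch of that comparison with illustrative numbers:

#include <stdio.h>

int main(void)
{
	/* Hypothetical state: kswapd was asked for order-3 but only managed
	 * to balance the node for order-0. */
	unsigned long new_order = 3, balanced_order = 0;
	int new_classzone_idx = 1, balanced_classzone_idx = 2;

	if (balanced_classzone_idx >= new_classzone_idx &&
	    balanced_order == new_order)
		printf("previous request satisfied: pick up the next one\n");
	else
		printf("previous request unmet: keep reclaiming before sleeping\n");
	return 0;
}
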
@@ -2946,14 +3189,17 @@ int kswapd_run(int nid)
2946} 3189}
2947 3190
2948/* 3191/*
2949 * Called by memory hotplug when all memory in a node is offlined. 3192 * Called by memory hotplug when all memory in a node is offlined. Caller must
3193 * hold lock_memory_hotplug().
2950 */ 3194 */
2951void kswapd_stop(int nid) 3195void kswapd_stop(int nid)
2952{ 3196{
2953 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3197 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2954 3198
2955 if (kswapd) 3199 if (kswapd) {
2956 kthread_stop(kswapd); 3200 kthread_stop(kswapd);
3201 NODE_DATA(nid)->kswapd = NULL;
3202 }
2957} 3203}
2958 3204
2959static int __init kswapd_init(void) 3205static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..9c001a268ab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
702 "nr_unstable", 702 "nr_unstable",
703 "nr_bounce", 703 "nr_bounce",
704 "nr_vmscan_write", 704 "nr_vmscan_write",
705 "nr_vmscan_immediate_reclaim",
705 "nr_writeback_temp", 706 "nr_writeback_temp",
706 "nr_isolated_anon", 707 "nr_isolated_anon",
707 "nr_isolated_file", 708 "nr_isolated_file",