Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    2
-rw-r--r--  mm/Makefile           |    1
-rw-r--r--  mm/ashmem.c           |  748
-rw-r--r--  mm/backing-dev.c      |  123
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/failslab.c         |   39
-rw-r--r--  mm/filemap.c          |  160
-rw-r--r--  mm/highmem.c          |    4
-rw-r--r--  mm/huge_memory.c      |   43
-rw-r--r--  mm/hugetlb.c          |   46
-rw-r--r--  mm/init-mm.c          |    2
-rw-r--r--  mm/internal.h         |   46
-rw-r--r--  mm/kmemleak.c         |    2
-rw-r--r--  mm/madvise.c          |    2
-rw-r--r--  mm/memblock.c         |    8
-rw-r--r--  mm/memcontrol.c       |  500
-rw-r--r--  mm/memory-failure.c   |   92
-rw-r--r--  mm/memory.c           |  127
-rw-r--r--  mm/memory_hotplug.c   |   68
-rw-r--r--  mm/mempolicy.c        |   32
-rw-r--r--  mm/migrate.c          |    8
-rw-r--r--  mm/mincore.c          |   11
-rw-r--r--  mm/mmap.c             |   34
-rw-r--r--  mm/nommu.c            |   37
-rw-r--r--  mm/oom_kill.c         |   11
-rw-r--r--  mm/page-writeback.c   |  280
-rw-r--r--  mm/page_alloc.c       |  207
-rw-r--r--  mm/page_cgroup.c      |   10
-rw-r--r--  mm/pagewalk.c         |   49
-rw-r--r--  mm/percpu-vm.c        |   12
-rw-r--r--  mm/percpu.c           |   40
-rw-r--r--  mm/rmap.c             |   11
-rw-r--r--  mm/shmem.c            | 1828
-rw-r--r--  mm/slab.c             |  121
-rw-r--r--  mm/slob.c             |    8
-rw-r--r--  mm/slub.c             |  882
-rw-r--r--  mm/sparse.c           |    2
-rw-r--r--  mm/swap.c             |   83
-rw-r--r--  mm/swapfile.c         |   49
-rw-r--r--  mm/thrash.c           |   17
-rw-r--r--  mm/truncate.c         |  154
-rw-r--r--  mm/vmalloc.c          |  102
-rw-r--r--  mm/vmscan.c           |  145
-rw-r--r--  mm/vmstat.c           |    4
44 files changed, 3794 insertions(+), 2358 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c..f2f1ca19ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE
   for clean pages that the kernel's pageframe replacement algorithm
   (PFRA) would like to keep around, but can't since there isn't enough
   memory. So when the PFRA "evicts" a page, it first attempts to use
-  cleancacne code to put the data contained in that page into
+  cleancache code to put the data contained in that page into
   "transcendent memory", memory that is not directly accessible or
   addressable by the kernel and is of unknown and possibly
   time-varying size. And when a cleancache-enabled
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1b..2d00bf57ca4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+obj-$(CONFIG_ASHMEM) += ashmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_COMPACTION) += compaction.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/ashmem.c b/mm/ashmem.c
new file mode 100644
index 00000000000..66e3f23ee33
--- /dev/null
+++ b/mm/ashmem.c
@@ -0,0 +1,748 @@
1/* mm/ashmem.c
2**
3** Anonymous Shared Memory Subsystem, ashmem
4**
5** Copyright (C) 2008 Google, Inc.
6**
7** Robert Love <rlove@google.com>
8**
9** This software is licensed under the terms of the GNU General Public
10** License version 2, as published by the Free Software Foundation, and
11** may be copied, distributed, and modified under those terms.
12**
13** This program is distributed in the hope that it will be useful,
14** but WITHOUT ANY WARRANTY; without even the implied warranty of
15** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16** GNU General Public License for more details.
17*/
18
19#include <linux/module.h>
20#include <linux/file.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/security.h>
24#include <linux/mm.h>
25#include <linux/mman.h>
26#include <linux/uaccess.h>
27#include <linux/personality.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/shmem_fs.h>
31#include <linux/ashmem.h>
32
33#define ASHMEM_NAME_PREFIX "dev/ashmem/"
34#define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1)
35#define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN)
36
37/*
38 * ashmem_area - anonymous shared memory area
39 * Lifecycle: From our parent file's open() until its release()
40 * Locking: Protected by `ashmem_mutex'
41 * Big Note: Mappings do NOT pin this structure; it dies on close()
42 */
43struct ashmem_area {
44 char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */
45 struct list_head unpinned_list; /* list of all ashmem areas */
46 struct file *file; /* the shmem-based backing file */
47 size_t size; /* size of the mapping, in bytes */
48 unsigned long prot_mask; /* allowed prot bits, as vm_flags */
49};
50
51/*
52 * ashmem_range - represents an interval of unpinned (evictable) pages
53 * Lifecycle: From unpin to pin
54 * Locking: Protected by `ashmem_mutex'
55 */
56struct ashmem_range {
57 struct list_head lru; /* entry in LRU list */
58 struct list_head unpinned; /* entry in its area's unpinned list */
59 struct ashmem_area *asma; /* associated area */
60 size_t pgstart; /* starting page, inclusive */
61 size_t pgend; /* ending page, inclusive */
62 unsigned int purged; /* ASHMEM_NOT or ASHMEM_WAS_PURGED */
63};
64
65/* LRU list of unpinned pages, protected by ashmem_mutex */
66static LIST_HEAD(ashmem_lru_list);
67
68/* Count of pages on our LRU list, protected by ashmem_mutex */
69static unsigned long lru_count;
70
71/*
72 * ashmem_mutex - protects the list of and each individual ashmem_area
73 *
 74 * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem
75 */
76static DEFINE_MUTEX(ashmem_mutex);
77
78static struct kmem_cache *ashmem_area_cachep __read_mostly;
79static struct kmem_cache *ashmem_range_cachep __read_mostly;
80
81#define range_size(range) \
82 ((range)->pgend - (range)->pgstart + 1)
83
84#define range_on_lru(range) \
85 ((range)->purged == ASHMEM_NOT_PURGED)
86
87#define page_range_subsumes_range(range, start, end) \
88 (((range)->pgstart >= (start)) && ((range)->pgend <= (end)))
89
90#define page_range_subsumed_by_range(range, start, end) \
91 (((range)->pgstart <= (start)) && ((range)->pgend >= (end)))
92
93#define page_in_range(range, page) \
94 (((range)->pgstart <= (page)) && ((range)->pgend >= (page)))
95
96#define page_range_in_range(range, start, end) \
97 (page_in_range(range, start) || page_in_range(range, end) || \
98 page_range_subsumes_range(range, start, end))
99
100#define range_before_page(range, page) \
101 ((range)->pgend < (page))
102
103#define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE)
104
105static inline void lru_add(struct ashmem_range *range)
106{
107 list_add_tail(&range->lru, &ashmem_lru_list);
108 lru_count += range_size(range);
109}
110
111static inline void lru_del(struct ashmem_range *range)
112{
113 list_del(&range->lru);
114 lru_count -= range_size(range);
115}
116
117/*
118 * range_alloc - allocate and initialize a new ashmem_range structure
119 *
120 * 'asma' - associated ashmem_area
121 * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list
 122 * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED)
123 * 'start' - starting page, inclusive
124 * 'end' - ending page, inclusive
125 *
126 * Caller must hold ashmem_mutex.
127 */
128static int range_alloc(struct ashmem_area *asma,
129 struct ashmem_range *prev_range, unsigned int purged,
130 size_t start, size_t end)
131{
132 struct ashmem_range *range;
133
134 range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL);
135 if (unlikely(!range))
136 return -ENOMEM;
137
138 range->asma = asma;
139 range->pgstart = start;
140 range->pgend = end;
141 range->purged = purged;
142
143 list_add_tail(&range->unpinned, &prev_range->unpinned);
144
145 if (range_on_lru(range))
146 lru_add(range);
147
148 return 0;
149}
150
151static void range_del(struct ashmem_range *range)
152{
153 list_del(&range->unpinned);
154 if (range_on_lru(range))
155 lru_del(range);
156 kmem_cache_free(ashmem_range_cachep, range);
157}
158
159/*
160 * range_shrink - shrinks a range
161 *
162 * Caller must hold ashmem_mutex.
163 */
164static inline void range_shrink(struct ashmem_range *range,
165 size_t start, size_t end)
166{
167 size_t pre = range_size(range);
168
169 range->pgstart = start;
170 range->pgend = end;
171
172 if (range_on_lru(range))
173 lru_count -= pre - range_size(range);
174}
175
176static int ashmem_open(struct inode *inode, struct file *file)
177{
178 struct ashmem_area *asma;
179 int ret;
180
181 ret = generic_file_open(inode, file);
182 if (unlikely(ret))
183 return ret;
184
185 asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL);
186 if (unlikely(!asma))
187 return -ENOMEM;
188
189 INIT_LIST_HEAD(&asma->unpinned_list);
190 memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN);
191 asma->prot_mask = PROT_MASK;
192 file->private_data = asma;
193
194 return 0;
195}
196
197static int ashmem_release(struct inode *ignored, struct file *file)
198{
199 struct ashmem_area *asma = file->private_data;
200 struct ashmem_range *range, *next;
201
202 mutex_lock(&ashmem_mutex);
203 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned)
204 range_del(range);
205 mutex_unlock(&ashmem_mutex);
206
207 if (asma->file)
208 fput(asma->file);
209 kmem_cache_free(ashmem_area_cachep, asma);
210
211 return 0;
212}
213
214static ssize_t ashmem_read(struct file *file, char __user *buf,
215 size_t len, loff_t *pos)
216{
217 struct ashmem_area *asma = file->private_data;
218 int ret = 0;
219
220 mutex_lock(&ashmem_mutex);
221
222 /* If size is not set, or set to 0, always return EOF. */
223 if (asma->size == 0) {
224 goto out;
225 }
226
227 if (!asma->file) {
228 ret = -EBADF;
229 goto out;
230 }
231
232 ret = asma->file->f_op->read(asma->file, buf, len, pos);
233 if (ret < 0) {
234 goto out;
235 }
236
237 /** Update backing file pos, since f_ops->read() doesn't */
238 asma->file->f_pos = *pos;
239
240out:
241 mutex_unlock(&ashmem_mutex);
242 return ret;
243}
244
245static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
246{
247 struct ashmem_area *asma = file->private_data;
248 int ret;
249
250 mutex_lock(&ashmem_mutex);
251
252 if (asma->size == 0) {
253 ret = -EINVAL;
254 goto out;
255 }
256
257 if (!asma->file) {
258 ret = -EBADF;
259 goto out;
260 }
261
262 ret = asma->file->f_op->llseek(asma->file, offset, origin);
263 if (ret < 0) {
264 goto out;
265 }
266
267 /** Copy f_pos from backing file, since f_ops->llseek() sets it */
268 file->f_pos = asma->file->f_pos;
269
270out:
271 mutex_unlock(&ashmem_mutex);
272 return ret;
273}
274
275static inline unsigned long
276calc_vm_may_flags(unsigned long prot)
277{
278 return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) |
279 _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) |
280 _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC);
281}
282
283static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
284{
285 struct ashmem_area *asma = file->private_data;
286 int ret = 0;
287
288 mutex_lock(&ashmem_mutex);
289
290 /* user needs to SET_SIZE before mapping */
291 if (unlikely(!asma->size)) {
292 ret = -EINVAL;
293 goto out;
294 }
295
296 /* requested protection bits must match our allowed protection mask */
297 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
298 calc_vm_prot_bits(PROT_MASK))) {
299 ret = -EPERM;
300 goto out;
301 }
302 vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask);
303
304 if (!asma->file) {
305 char *name = ASHMEM_NAME_DEF;
306 struct file *vmfile;
307
308 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0')
309 name = asma->name;
310
311 /* ... and allocate the backing shmem file */
312 vmfile = shmem_file_setup(name, asma->size, vma->vm_flags);
313 if (unlikely(IS_ERR(vmfile))) {
314 ret = PTR_ERR(vmfile);
315 goto out;
316 }
317 asma->file = vmfile;
318 }
319 get_file(asma->file);
320
321 if (vma->vm_flags & VM_SHARED)
322 shmem_set_file(vma, asma->file);
323 else {
324 if (vma->vm_file)
325 fput(vma->vm_file);
326 vma->vm_file = asma->file;
327 }
328 vma->vm_flags |= VM_CAN_NONLINEAR;
329
330out:
331 mutex_unlock(&ashmem_mutex);
332 return ret;
333}
334
335/*
336 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
337 *
338 * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
339 * many objects (pages) we have in total.
340 *
341 * 'gfp_mask' is the mask of the allocation that got us into this mess.
342 *
343 * Return value is the number of objects (pages) remaining, or -1 if we cannot
344 * proceed without risk of deadlock (due to gfp_mask).
345 *
346 * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
347 * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
348 * pages freed.
349 */
350static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
351{
352 struct ashmem_range *range, *next;
353
354 /* We might recurse into filesystem code, so bail out if necessary */
355 if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
356 return -1;
357 if (!sc->nr_to_scan)
358 return lru_count;
359
360 mutex_lock(&ashmem_mutex);
361 list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
362 struct inode *inode = range->asma->file->f_dentry->d_inode;
363 loff_t start = range->pgstart * PAGE_SIZE;
364 loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
365
366 vmtruncate_range(inode, start, end);
367 range->purged = ASHMEM_WAS_PURGED;
368 lru_del(range);
369
370 sc->nr_to_scan -= range_size(range);
371 if (sc->nr_to_scan <= 0)
372 break;
373 }
374 mutex_unlock(&ashmem_mutex);
375
376 return lru_count;
377}
378
379static struct shrinker ashmem_shrinker = {
380 .shrink = ashmem_shrink,
381 .seeks = DEFAULT_SEEKS * 4,
382};
383
384static int set_prot_mask(struct ashmem_area *asma, unsigned long prot)
385{
386 int ret = 0;
387
388 mutex_lock(&ashmem_mutex);
389
390 /* the user can only remove, not add, protection bits */
391 if (unlikely((asma->prot_mask & prot) != prot)) {
392 ret = -EINVAL;
393 goto out;
394 }
395
396 /* does the application expect PROT_READ to imply PROT_EXEC? */
397 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
398 prot |= PROT_EXEC;
399
400 asma->prot_mask = prot;
401
402out:
403 mutex_unlock(&ashmem_mutex);
404 return ret;
405}
406
407static int set_name(struct ashmem_area *asma, void __user *name)
408{
409 int ret = 0;
410
411 mutex_lock(&ashmem_mutex);
412
413 /* cannot change an existing mapping's name */
414 if (unlikely(asma->file)) {
415 ret = -EINVAL;
416 goto out;
417 }
418
419 if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN,
420 name, ASHMEM_NAME_LEN)))
421 ret = -EFAULT;
422 asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0';
423
424out:
425 mutex_unlock(&ashmem_mutex);
426
427 return ret;
428}
429
430static int get_name(struct ashmem_area *asma, void __user *name)
431{
432 int ret = 0;
433
434 mutex_lock(&ashmem_mutex);
435 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') {
436 size_t len;
437
438 /*
439 * Copying only `len', instead of ASHMEM_NAME_LEN, bytes
440 * prevents us from revealing one user's stack to another.
441 */
442 len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1;
443 if (unlikely(copy_to_user(name,
444 asma->name + ASHMEM_NAME_PREFIX_LEN, len)))
445 ret = -EFAULT;
446 } else {
447 if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF,
448 sizeof(ASHMEM_NAME_DEF))))
449 ret = -EFAULT;
450 }
451 mutex_unlock(&ashmem_mutex);
452
453 return ret;
454}
455
456/*
457 * ashmem_pin - pin the given ashmem region, returning whether it was
458 * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED).
459 *
460 * Caller must hold ashmem_mutex.
461 */
462static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
463{
464 struct ashmem_range *range, *next;
465 int ret = ASHMEM_NOT_PURGED;
466
467 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
468 /* moved past last applicable page; we can short circuit */
469 if (range_before_page(range, pgstart))
470 break;
471
472 /*
473 * The user can ask us to pin pages that span multiple ranges,
474 * or to pin pages that aren't even unpinned, so this is messy.
475 *
476 * Four cases:
477 * 1. The requested range subsumes an existing range, so we
478 * just remove the entire matching range.
479 * 2. The requested range overlaps the start of an existing
480 * range, so we just update that range.
481 * 3. The requested range overlaps the end of an existing
482 * range, so we just update that range.
483 * 4. The requested range punches a hole in an existing range,
484 * so we have to update one side of the range and then
485 * create a new range for the other side.
486 */
487 if (page_range_in_range(range, pgstart, pgend)) {
488 ret |= range->purged;
489
490 /* Case #1: Easy. Just nuke the whole thing. */
491 if (page_range_subsumes_range(range, pgstart, pgend)) {
492 range_del(range);
493 continue;
494 }
495
496 /* Case #2: We overlap from the start, so adjust it */
497 if (range->pgstart >= pgstart) {
498 range_shrink(range, pgend + 1, range->pgend);
499 continue;
500 }
501
502 /* Case #3: We overlap from the rear, so adjust it */
503 if (range->pgend <= pgend) {
504 range_shrink(range, range->pgstart, pgstart-1);
505 continue;
506 }
507
508 /*
509 * Case #4: We eat a chunk out of the middle. A bit
510 * more complicated, we allocate a new range for the
511 * second half and adjust the first chunk's endpoint.
512 */
513 range_alloc(asma, range, range->purged,
514 pgend + 1, range->pgend);
515 range_shrink(range, range->pgstart, pgstart - 1);
516 break;
517 }
518 }
519
520 return ret;
521}
522
523/*
524 * ashmem_unpin - unpin the given range of pages. Returns zero on success.
525 *
526 * Caller must hold ashmem_mutex.
527 */
528static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
529{
530 struct ashmem_range *range, *next;
531 unsigned int purged = ASHMEM_NOT_PURGED;
532
533restart:
534 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
535 /* short circuit: this is our insertion point */
536 if (range_before_page(range, pgstart))
537 break;
538
539 /*
540 * The user can ask us to unpin pages that are already entirely
541 * or partially pinned. We handle those two cases here.
542 */
543 if (page_range_subsumed_by_range(range, pgstart, pgend))
544 return 0;
545 if (page_range_in_range(range, pgstart, pgend)) {
546 pgstart = min_t(size_t, range->pgstart, pgstart),
547 pgend = max_t(size_t, range->pgend, pgend);
548 purged |= range->purged;
549 range_del(range);
550 goto restart;
551 }
552 }
553
554 return range_alloc(asma, range, purged, pgstart, pgend);
555}
556
557/*
558 * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the
559 * given interval are unpinned and ASHMEM_IS_PINNED otherwise.
560 *
561 * Caller must hold ashmem_mutex.
562 */
563static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart,
564 size_t pgend)
565{
566 struct ashmem_range *range;
567 int ret = ASHMEM_IS_PINNED;
568
569 list_for_each_entry(range, &asma->unpinned_list, unpinned) {
570 if (range_before_page(range, pgstart))
571 break;
572 if (page_range_in_range(range, pgstart, pgend)) {
573 ret = ASHMEM_IS_UNPINNED;
574 break;
575 }
576 }
577
578 return ret;
579}
580
581static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd,
582 void __user *p)
583{
584 struct ashmem_pin pin;
585 size_t pgstart, pgend;
586 int ret = -EINVAL;
587
588 if (unlikely(!asma->file))
589 return -EINVAL;
590
591 if (unlikely(copy_from_user(&pin, p, sizeof(pin))))
592 return -EFAULT;
593
594 /* per custom, you can pass zero for len to mean "everything onward" */
595 if (!pin.len)
596 pin.len = PAGE_ALIGN(asma->size) - pin.offset;
597
598 if (unlikely((pin.offset | pin.len) & ~PAGE_MASK))
599 return -EINVAL;
600
601 if (unlikely(((__u32) -1) - pin.offset < pin.len))
602 return -EINVAL;
603
604 if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len))
605 return -EINVAL;
606
607 pgstart = pin.offset / PAGE_SIZE;
608 pgend = pgstart + (pin.len / PAGE_SIZE) - 1;
609
610 mutex_lock(&ashmem_mutex);
611
612 switch (cmd) {
613 case ASHMEM_PIN:
614 ret = ashmem_pin(asma, pgstart, pgend);
615 break;
616 case ASHMEM_UNPIN:
617 ret = ashmem_unpin(asma, pgstart, pgend);
618 break;
619 case ASHMEM_GET_PIN_STATUS:
620 ret = ashmem_get_pin_status(asma, pgstart, pgend);
621 break;
622 }
623
624 mutex_unlock(&ashmem_mutex);
625
626 return ret;
627}
628
629static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
630{
631 struct ashmem_area *asma = file->private_data;
632 long ret = -ENOTTY;
633
634 switch (cmd) {
635 case ASHMEM_SET_NAME:
636 ret = set_name(asma, (void __user *) arg);
637 break;
638 case ASHMEM_GET_NAME:
639 ret = get_name(asma, (void __user *) arg);
640 break;
641 case ASHMEM_SET_SIZE:
642 ret = -EINVAL;
643 if (!asma->file) {
644 ret = 0;
645 asma->size = (size_t) arg;
646 }
647 break;
648 case ASHMEM_GET_SIZE:
649 ret = asma->size;
650 break;
651 case ASHMEM_SET_PROT_MASK:
652 ret = set_prot_mask(asma, arg);
653 break;
654 case ASHMEM_GET_PROT_MASK:
655 ret = asma->prot_mask;
656 break;
657 case ASHMEM_PIN:
658 case ASHMEM_UNPIN:
659 case ASHMEM_GET_PIN_STATUS:
660 ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg);
661 break;
662 case ASHMEM_PURGE_ALL_CACHES:
663 ret = -EPERM;
664 if (capable(CAP_SYS_ADMIN)) {
665 struct shrink_control sc = {
666 .gfp_mask = GFP_KERNEL,
667 .nr_to_scan = 0,
668 };
669 ret = ashmem_shrink(&ashmem_shrinker, &sc);
670 sc.nr_to_scan = ret;
671 ashmem_shrink(&ashmem_shrinker, &sc);
672 }
673 break;
674 }
675
676 return ret;
677}
678
679static struct file_operations ashmem_fops = {
680 .owner = THIS_MODULE,
681 .open = ashmem_open,
682 .release = ashmem_release,
683 .read = ashmem_read,
684 .llseek = ashmem_llseek,
685 .mmap = ashmem_mmap,
686 .unlocked_ioctl = ashmem_ioctl,
687 .compat_ioctl = ashmem_ioctl,
688};
689
690static struct miscdevice ashmem_misc = {
691 .minor = MISC_DYNAMIC_MINOR,
692 .name = "ashmem",
693 .fops = &ashmem_fops,
694};
695
696static int __init ashmem_init(void)
697{
698 int ret;
699
700 ashmem_area_cachep = kmem_cache_create("ashmem_area_cache",
701 sizeof(struct ashmem_area),
702 0, 0, NULL);
703 if (unlikely(!ashmem_area_cachep)) {
704 printk(KERN_ERR "ashmem: failed to create slab cache\n");
705 return -ENOMEM;
706 }
707
708 ashmem_range_cachep = kmem_cache_create("ashmem_range_cache",
709 sizeof(struct ashmem_range),
710 0, 0, NULL);
711 if (unlikely(!ashmem_range_cachep)) {
712 printk(KERN_ERR "ashmem: failed to create slab cache\n");
713 return -ENOMEM;
714 }
715
716 ret = misc_register(&ashmem_misc);
717 if (unlikely(ret)) {
718 printk(KERN_ERR "ashmem: failed to register misc device!\n");
719 return ret;
720 }
721
722 register_shrinker(&ashmem_shrinker);
723
724 printk(KERN_INFO "ashmem: initialized\n");
725
726 return 0;
727}
728
729static void __exit ashmem_exit(void)
730{
731 int ret;
732
733 unregister_shrinker(&ashmem_shrinker);
734
735 ret = misc_deregister(&ashmem_misc);
736 if (unlikely(ret))
737 printk(KERN_ERR "ashmem: failed to unregister misc device!\n");
738
739 kmem_cache_destroy(ashmem_range_cachep);
740 kmem_cache_destroy(ashmem_area_cachep);
741
742 printk(KERN_INFO "ashmem: unloaded\n");
743}
744
745module_init(ashmem_init);
746module_exit(ashmem_exit);
747
748MODULE_LICENSE("GPL");
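
The new driver is exercised purely through /dev/ashmem and the ioctls handled in ashmem_ioctl() above. The following userspace sketch is not part of the patch; it only illustrates the intended flow, assuming the ASHMEM_* constants and struct ashmem_pin come from the <linux/ashmem.h> header this file includes, with error handling kept minimal and a 4 KiB page size assumed.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/ashmem.h>          /* ASHMEM_* ioctls, struct ashmem_pin */

#define REGION_SIZE (4 * 4096)     /* four pages, assuming 4 KiB pages */

int main(void)
{
        char name[ASHMEM_NAME_LEN] = "example-region";
        struct ashmem_pin pin = { .offset = 3 * 4096, .len = 4096 };
        char *p;

        int fd = open("/dev/ashmem", O_RDWR);
        if (fd < 0)
                return 1;

        /* Name and size must be set before the first mmap(). */
        ioctl(fd, ASHMEM_SET_NAME, name);
        ioctl(fd, ASHMEM_SET_SIZE, REGION_SIZE);

        p = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, REGION_SIZE);

        /* Unpin the last page: the shrinker may now purge it under pressure. */
        ioctl(fd, ASHMEM_UNPIN, &pin);

        /* Re-pin before touching it again; ASHMEM_WAS_PURGED means the
         * contents were discarded and must be regenerated. */
        if (ioctl(fd, ASHMEM_PIN, &pin) == ASHMEM_WAS_PURGED)
                memset(p + 3 * 4096, 0, 4096);

        munmap(p, REGION_SIZE);
        close(fd);
        return 0;
}

The pin/unpin pair mirrors ashmem_pin_unpin(): unpinned pages may be reclaimed by ashmem_shrink(), and a later ASHMEM_PIN returning ASHMEM_WAS_PURGED tells the caller the contents are gone and must be rebuilt.
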
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09..253b071b7d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 118#undef K
100 119
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250} 269}
251 270
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 271/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -352,6 +359,17 @@ static unsigned long bdi_longest_inactive(void)
352 return max(5UL * 60 * HZ, interval); 359 return max(5UL * 60 * HZ, interval);
353} 360}
354 361
362/*
363 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
364 * shutdown
365 */
366static void bdi_clear_pending(struct backing_dev_info *bdi)
367{
368 clear_bit(BDI_pending, &bdi->state);
369 smp_mb__after_clear_bit();
370 wake_up_bit(&bdi->state, BDI_pending);
371}
372
355static int bdi_forker_thread(void *ptr) 373static int bdi_forker_thread(void *ptr)
356{ 374{
357 struct bdi_writeback *me = ptr; 375 struct bdi_writeback *me = ptr;
@@ -383,6 +401,13 @@ static int bdi_forker_thread(void *ptr)
383 } 401 }
384 402
385 spin_lock_bh(&bdi_lock); 403 spin_lock_bh(&bdi_lock);
404 /*
405 * In the following loop we are going to check whether we have
406 * some work to do without any synchronization with tasks
407 * waking us up to do work for them. So we have to set task
408 * state already here so that we don't miss wakeups coming
409 * after we verify some condition.
410 */
386 set_current_state(TASK_INTERRUPTIBLE); 411 set_current_state(TASK_INTERRUPTIBLE);
387 412
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 413 list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -446,9 +471,10 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 471 if (IS_ERR(task)) {
447 /* 472 /*
448 * If thread creation fails, force writeout of 473 * If thread creation fails, force writeout of
449 * the bdi from the thread. 474 * the bdi from the thread. Hopefully 1024 is
475 * large enough for efficient IO.
450 */ 476 */
451 bdi_flush_io(bdi); 477 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 478 } else {
453 /* 479 /*
454 * The spinlock makes sure we do not lose 480 * The spinlock makes sure we do not lose
@@ -461,11 +487,13 @@ static int bdi_forker_thread(void *ptr)
461 spin_unlock_bh(&bdi->wb_lock); 487 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 488 wake_up_process(task);
463 } 489 }
490 bdi_clear_pending(bdi);
464 break; 491 break;
465 492
466 case KILL_THREAD: 493 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 494 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 495 kthread_stop(task);
496 bdi_clear_pending(bdi);
469 break; 497 break;
470 498
471 case NO_ACTION: 499 case NO_ACTION:
@@ -481,16 +509,8 @@ static int bdi_forker_thread(void *ptr)
481 else 509 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 510 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 511 try_to_freeze();
484 /* Back to the main loop */ 512 break;
485 continue;
486 } 513 }
487
488 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */
491 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending);
494 } 514 }
495 515
496 return 0; 516 return 0;
@@ -505,7 +525,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 525 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 526 spin_unlock_bh(&bdi_lock);
507 527
508 synchronize_rcu(); 528 synchronize_rcu_expedited();
509} 529}
510 530
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 531int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +626,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 626void bdi_unregister(struct backing_dev_info *bdi)
607{ 627{
608 if (bdi->dev) { 628 if (bdi->dev) {
629 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 630 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 631 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 632 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +649,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 649 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 650 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 651 INIT_LIST_HEAD(&wb->b_more_io);
652 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 653 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 654}
633 655
656/*
657 * Initial write bandwidth: 100 MB/s
658 */
659#define INIT_BW (100 << (20 - PAGE_SHIFT))
660
634int bdi_init(struct backing_dev_info *bdi) 661int bdi_init(struct backing_dev_info *bdi)
635{ 662{
636 int i, err; 663 int i, err;
@@ -653,6 +680,13 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 680 }
654 681
655 bdi->dirty_exceeded = 0; 682 bdi->dirty_exceeded = 0;
683
684 bdi->bw_time_stamp = jiffies;
685 bdi->written_stamp = 0;
686
687 bdi->write_bandwidth = INIT_BW;
688 bdi->avg_write_bandwidth = INIT_BW;
689
656 err = prop_local_init_percpu(&bdi->completions); 690 err = prop_local_init_percpu(&bdi->completions);
657 691
658 if (err) { 692 if (err) {
@@ -676,15 +710,24 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 710 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 711 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 712
679 spin_lock(&inode_wb_list_lock); 713 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 714 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 715 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 716 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 717 spin_unlock(&bdi->wb.list_lock);
718 spin_unlock(&dst->list_lock);
684 } 719 }
685 720
686 bdi_unregister(bdi); 721 bdi_unregister(bdi);
687 722
723 /*
724 * If bdi_unregister() had already been called earlier, the
725 * wakeup_timer could still be armed because bdi_prune_sb()
726 * can race with the bdi_wakeup_thread_delayed() calls from
727 * __mark_inode_dirty().
728 */
729 del_timer_sync(&bdi->wb.wakeup_timer);
730
688 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 731 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
689 percpu_counter_destroy(&bdi->bdi_stat[i]); 732 percpu_counter_destroy(&bdi->bdi_stat[i]);
690 733
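
The bdi_lock_two() helper added above takes the two list_locks in address order (spin_lock_nested() is purely a lockdep annotation for the second, same-class lock), so any two callers agree on acquisition order and cannot deadlock against each other. A userspace analogue of the same idea, sketched here with pthread mutexes purely for illustration, looks like this:

#include <pthread.h>
#include <stdio.h>

static void lock_two(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
        /* Always take the lock at the lower address first, so callers that
         * pass the same pair in either order can never deadlock ABBA-style. */
        if (m1 < m2) {
                pthread_mutex_lock(m1);
                pthread_mutex_lock(m2);
        } else {
                pthread_mutex_lock(m2);
                pthread_mutex_lock(m1);
        }
}

static void unlock_two(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
        pthread_mutex_unlock(m1);
        pthread_mutex_unlock(m2);
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        lock_two(&a, &b);       /* same acquisition order ... */
        unlock_two(&a, &b);
        lock_two(&b, &a);       /* ... no matter how the caller orders them */
        unlock_two(&b, &a);
        printf("locked and unlocked both pairs without deadlock\n");
        return 0;
}
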
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519..fbb58e34688 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
 {
        struct device *dev = pool->dev;

-       dma_pool_destroy(pool);
        WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+       dma_pool_destroy(pool);
 }
 EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240dd..0dd7b8fec71 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
        struct fault_attr attr;
        u32 ignore_gfp_wait;
        int cache_filter;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *cache_filter_file;
-#endif
 } failslab = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
-
-       err = init_fault_attr_dentries(&failslab.attr, "failslab");
-       if (err)
-               return err;
-       dir = failslab.attr.dentries.dir;
+       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;

-       failslab.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &failslab.ignore_gfp_wait);
+       dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);

-       failslab.cache_filter_file =
-               debugfs_create_bool("cache-filter", mode, dir,
-                               &failslab.cache_filter);
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &failslab.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("cache-filter", mode, dir,
+                               &failslab.cache_filter))
+               goto fail;

-       if (!failslab.ignore_gfp_wait_file ||
-           !failslab.cache_filter_file) {
-               err = -ENOMEM;
-               debugfs_remove(failslab.cache_filter_file);
-               debugfs_remove(failslab.ignore_gfp_wait_file);
-               cleanup_fault_attr_dentries(&failslab.attr);
-       }
+       return 0;
+fail:
+       debugfs_remove_recursive(dir);

-       return err;
+       return -ENOMEM;
 }

 late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d345..0eedbf85062 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h> 36#include <linux/cleancache.h>
38#include "internal.h" 37#include "internal.h"
39 38
@@ -78,10 +77,7 @@
78 * ->i_mutex (generic_file_buffered_write) 77 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 78 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 79 *
81 * ->i_mutex 80 * bdi->wb.list_lock
82 * ->i_alloc_sem (various)
83 *
84 * inode_wb_list_lock
85 * sb_lock (fs/fs-writeback.c) 81 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 82 * ->mapping->tree_lock (__sync_single_inode)
87 * 83 *
@@ -99,9 +95,9 @@
99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 95 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
100 * ->private_lock (page_remove_rmap->set_page_dirty) 96 * ->private_lock (page_remove_rmap->set_page_dirty)
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 97 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 98 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 99 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 100 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
107 * 103 *
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page)
131 127
132 radix_tree_delete(&mapping->page_tree, page->index); 128 radix_tree_delete(&mapping->page_tree, page->index);
133 page->mapping = NULL; 129 page->mapping = NULL;
130 /* Leave page->index set: truncation lookup relies upon it */
134 mapping->nrpages--; 131 mapping->nrpages--;
135 __dec_zone_page_state(page, NR_FILE_PAGES); 132 __dec_zone_page_state(page, NR_FILE_PAGES);
136 if (PageSwapBacked(page)) 133 if (PageSwapBacked(page))
@@ -396,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{ 394{
398 int error; 395 int error;
399 struct mem_cgroup *memcg = NULL;
400 396
401 VM_BUG_ON(!PageLocked(old)); 397 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new)); 398 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping); 399 VM_BUG_ON(new->mapping);
404 400
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 401 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) { 402 if (!error) {
419 struct address_space *mapping = old->mapping; 403 struct address_space *mapping = old->mapping;
@@ -435,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
435 if (PageSwapBacked(new)) 419 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM); 420 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock); 421 spin_unlock_irq(&mapping->tree_lock);
422 /* mem_cgroup codes must not be called under tree_lock */
423 mem_cgroup_replace_page_cache(old, new);
438 radix_tree_preload_end(); 424 radix_tree_preload_end();
439 if (freepage) 425 if (freepage)
440 freepage(old); 426 freepage(old);
441 page_cache_release(old); 427 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 } 428 }
446 429
447 return error; 430 return error;
@@ -464,6 +447,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
464 int error; 447 int error;
465 448
466 VM_BUG_ON(!PageLocked(page)); 449 VM_BUG_ON(!PageLocked(page));
450 VM_BUG_ON(PageSwapBacked(page));
467 451
468 error = mem_cgroup_cache_charge(page, current->mm, 452 error = mem_cgroup_cache_charge(page, current->mm,
469 gfp_mask & GFP_RECLAIM_MASK); 453 gfp_mask & GFP_RECLAIM_MASK);
@@ -481,11 +465,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
481 if (likely(!error)) { 465 if (likely(!error)) {
482 mapping->nrpages++; 466 mapping->nrpages++;
483 __inc_zone_page_state(page, NR_FILE_PAGES); 467 __inc_zone_page_state(page, NR_FILE_PAGES);
484 if (PageSwapBacked(page))
485 __inc_zone_page_state(page, NR_SHMEM);
486 spin_unlock_irq(&mapping->tree_lock); 468 spin_unlock_irq(&mapping->tree_lock);
487 } else { 469 } else {
488 page->mapping = NULL; 470 page->mapping = NULL;
471 /* Leave page->index set: truncation relies upon it */
489 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
490 mem_cgroup_uncharge_cache_page(page); 473 mem_cgroup_uncharge_cache_page(page);
491 page_cache_release(page); 474 page_cache_release(page);
@@ -503,22 +486,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
503{ 486{
504 int ret; 487 int ret;
505 488
506 /*
507 * Splice_read and readahead add shmem/tmpfs pages into the page cache
508 * before shmem_readpage has a chance to mark them as SwapBacked: they
509 * need to go on the anon lru below, and mem_cgroup_cache_charge
510 * (called in add_to_page_cache) needs to know where they're going too.
511 */
512 if (mapping_cap_swap_backed(mapping))
513 SetPageSwapBacked(page);
514
515 ret = add_to_page_cache(page, mapping, offset, gfp_mask); 489 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
516 if (ret == 0) { 490 if (ret == 0)
517 if (page_is_file_cache(page)) 491 lru_cache_add_file(page);
518 lru_cache_add_file(page);
519 else
520 lru_cache_add_anon(page);
521 }
522 return ret; 492 return ret;
523} 493}
524EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 494EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -715,9 +685,16 @@ repeat:
715 page = radix_tree_deref_slot(pagep); 685 page = radix_tree_deref_slot(pagep);
716 if (unlikely(!page)) 686 if (unlikely(!page))
717 goto out; 687 goto out;
718 if (radix_tree_deref_retry(page)) 688 if (radix_tree_exception(page)) {
719 goto repeat; 689 if (radix_tree_deref_retry(page))
720 690 goto repeat;
691 /*
692 * Otherwise, shmem/tmpfs must be storing a swap entry
693 * here as an exceptional entry: so return it without
694 * attempting to raise page count.
695 */
696 goto out;
697 }
721 if (!page_cache_get_speculative(page)) 698 if (!page_cache_get_speculative(page))
722 goto repeat; 699 goto repeat;
723 700
@@ -754,7 +731,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
754 731
755repeat: 732repeat:
756 page = find_get_page(mapping, offset); 733 page = find_get_page(mapping, offset);
757 if (page) { 734 if (page && !radix_tree_exception(page)) {
758 lock_page(page); 735 lock_page(page);
759 /* Has the page been truncated? */ 736 /* Has the page been truncated? */
760 if (unlikely(page->mapping != mapping)) { 737 if (unlikely(page->mapping != mapping)) {
@@ -836,13 +813,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
836{ 813{
837 unsigned int i; 814 unsigned int i;
838 unsigned int ret; 815 unsigned int ret;
839 unsigned int nr_found; 816 unsigned int nr_found, nr_skip;
840 817
841 rcu_read_lock(); 818 rcu_read_lock();
842restart: 819restart:
843 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 820 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
844 (void ***)pages, start, nr_pages); 821 (void ***)pages, NULL, start, nr_pages);
845 ret = 0; 822 ret = 0;
823 nr_skip = 0;
846 for (i = 0; i < nr_found; i++) { 824 for (i = 0; i < nr_found; i++) {
847 struct page *page; 825 struct page *page;
848repeat: 826repeat:
@@ -850,13 +828,23 @@ repeat:
850 if (unlikely(!page)) 828 if (unlikely(!page))
851 continue; 829 continue;
852 830
853 /* 831 if (radix_tree_exception(page)) {
854 * This can only trigger when the entry at index 0 moves out 832 if (radix_tree_deref_retry(page)) {
855 * of or back to the root: none yet gotten, safe to restart. 833 /*
856 */ 834 * Transient condition which can only trigger
857 if (radix_tree_deref_retry(page)) { 835 * when entry at index 0 moves out of or back
858 WARN_ON(start | i); 836 * to root: none yet gotten, safe to restart.
859 goto restart; 837 */
838 WARN_ON(start | i);
839 goto restart;
840 }
841 /*
842 * Otherwise, shmem/tmpfs must be storing a swap entry
843 * here as an exceptional entry: so skip over it -
844 * we only reach this from invalidate_mapping_pages().
845 */
846 nr_skip++;
847 continue;
860 } 848 }
861 849
862 if (!page_cache_get_speculative(page)) 850 if (!page_cache_get_speculative(page))
@@ -876,7 +864,7 @@ repeat:
876 * If all entries were removed before we could secure them, 864 * If all entries were removed before we could secure them,
877 * try again, because callers stop trying once 0 is returned. 865 * try again, because callers stop trying once 0 is returned.
878 */ 866 */
879 if (unlikely(!ret && nr_found)) 867 if (unlikely(!ret && nr_found > nr_skip))
880 goto restart; 868 goto restart;
881 rcu_read_unlock(); 869 rcu_read_unlock();
882 return ret; 870 return ret;
@@ -904,7 +892,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
904 rcu_read_lock(); 892 rcu_read_lock();
905restart: 893restart:
906 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 894 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
907 (void ***)pages, index, nr_pages); 895 (void ***)pages, NULL, index, nr_pages);
908 ret = 0; 896 ret = 0;
909 for (i = 0; i < nr_found; i++) { 897 for (i = 0; i < nr_found; i++) {
910 struct page *page; 898 struct page *page;
@@ -913,12 +901,22 @@ repeat:
913 if (unlikely(!page)) 901 if (unlikely(!page))
914 continue; 902 continue;
915 903
916 /* 904 if (radix_tree_exception(page)) {
917 * This can only trigger when the entry at index 0 moves out 905 if (radix_tree_deref_retry(page)) {
918 * of or back to the root: none yet gotten, safe to restart. 906 /*
919 */ 907 * Transient condition which can only trigger
920 if (radix_tree_deref_retry(page)) 908 * when entry at index 0 moves out of or back
921 goto restart; 909 * to root: none yet gotten, safe to restart.
910 */
911 goto restart;
912 }
913 /*
914 * Otherwise, shmem/tmpfs must be storing a swap entry
915 * here as an exceptional entry: so stop looking for
916 * contiguous pages.
917 */
918 break;
919 }
922 920
923 if (!page_cache_get_speculative(page)) 921 if (!page_cache_get_speculative(page))
924 goto repeat; 922 goto repeat;
@@ -978,12 +976,21 @@ repeat:
978 if (unlikely(!page)) 976 if (unlikely(!page))
979 continue; 977 continue;
980 978
981 /* 979 if (radix_tree_exception(page)) {
982 * This can only trigger when the entry at index 0 moves out 980 if (radix_tree_deref_retry(page)) {
983 * of or back to the root: none yet gotten, safe to restart. 981 /*
984 */ 982 * Transient condition which can only trigger
985 if (radix_tree_deref_retry(page)) 983 * when entry at index 0 moves out of or back
986 goto restart; 984 * to root: none yet gotten, safe to restart.
985 */
986 goto restart;
987 }
988 /*
989 * This function is never used on a shmem/tmpfs
990 * mapping, so a swap entry won't be found here.
991 */
992 BUG();
993 }
987 994
988 if (!page_cache_get_speculative(page)) 995 if (!page_cache_get_speculative(page))
989 goto repeat; 996 goto repeat;
@@ -1795,7 +1802,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1795 1802
1796static struct page *__read_cache_page(struct address_space *mapping, 1803static struct page *__read_cache_page(struct address_space *mapping,
1797 pgoff_t index, 1804 pgoff_t index,
1798 int (*filler)(void *,struct page*), 1805 int (*filler)(void *, struct page *),
1799 void *data, 1806 void *data,
1800 gfp_t gfp) 1807 gfp_t gfp)
1801{ 1808{
@@ -1807,7 +1814,7 @@ repeat:
1807 page = __page_cache_alloc(gfp | __GFP_COLD); 1814 page = __page_cache_alloc(gfp | __GFP_COLD);
1808 if (!page) 1815 if (!page)
1809 return ERR_PTR(-ENOMEM); 1816 return ERR_PTR(-ENOMEM);
1810 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1817 err = add_to_page_cache_lru(page, mapping, index, gfp);
1811 if (unlikely(err)) { 1818 if (unlikely(err)) {
1812 page_cache_release(page); 1819 page_cache_release(page);
1813 if (err == -EEXIST) 1820 if (err == -EEXIST)
@@ -1826,7 +1833,7 @@ repeat:
1826 1833
1827static struct page *do_read_cache_page(struct address_space *mapping, 1834static struct page *do_read_cache_page(struct address_space *mapping,
1828 pgoff_t index, 1835 pgoff_t index,
1829 int (*filler)(void *,struct page*), 1836 int (*filler)(void *, struct page *),
1830 void *data, 1837 void *data,
1831 gfp_t gfp) 1838 gfp_t gfp)
1832 1839
@@ -1866,7 +1873,7 @@ out:
1866 * @mapping: the page's address_space 1873 * @mapping: the page's address_space
1867 * @index: the page index 1874 * @index: the page index
1868 * @filler: function to perform the read 1875 * @filler: function to perform the read
1869 * @data: destination for read data 1876 * @data: first arg to filler(data, page) function, often left as NULL
1870 * 1877 *
1871 * Same as read_cache_page, but don't wait for page to become unlocked 1878 * Same as read_cache_page, but don't wait for page to become unlocked
1872 * after submitting it to the filler. 1879 * after submitting it to the filler.
@@ -1878,7 +1885,7 @@ out:
1878 */ 1885 */
1879struct page *read_cache_page_async(struct address_space *mapping, 1886struct page *read_cache_page_async(struct address_space *mapping,
1880 pgoff_t index, 1887 pgoff_t index,
1881 int (*filler)(void *,struct page*), 1888 int (*filler)(void *, struct page *),
1882 void *data) 1889 void *data)
1883{ 1890{
1884 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1891 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1904,10 +1911,7 @@ static struct page *wait_on_page_read(struct page *page)
1904 * @gfp: the page allocator flags to use if allocating 1911 * @gfp: the page allocator flags to use if allocating
1905 * 1912 *
1906 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1913 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1907 * any new page allocations done using the specified allocation flags. Note 1914 * any new page allocations done using the specified allocation flags.
1908 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1909 * expect to do this atomically or anything like that - but you can pass in
1910 * other page requirements.
1911 * 1915 *
1912 * If the page does not get brought uptodate, return -EIO. 1916 * If the page does not get brought uptodate, return -EIO.
1913 */ 1917 */
@@ -1926,7 +1930,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1926 * @mapping: the page's address_space 1930 * @mapping: the page's address_space
1927 * @index: the page index 1931 * @index: the page index
1928 * @filler: function to perform the read 1932 * @filler: function to perform the read
1929 * @data: destination for read data 1933 * @data: first arg to filler(data, page) function, often left as NULL
1930 * 1934 *
1931 * Read into the page cache. If a page already exists, and PageUptodate() is 1935 * Read into the page cache. If a page already exists, and PageUptodate() is
1932 * not set, try to fill the page then wait for it to become unlocked. 1936 * not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1939,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1935 */ 1939 */
1936struct page *read_cache_page(struct address_space *mapping, 1940struct page *read_cache_page(struct address_space *mapping,
1937 pgoff_t index, 1941 pgoff_t index,
1938 int (*filler)(void *,struct page*), 1942 int (*filler)(void *, struct page *),
1939 void *data) 1943 void *data)
1940{ 1944{
1941 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1945 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
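
All of the lookup changes above follow one pattern: a page-cache slot may now hold a real page pointer, a transient retry marker, or a shmem/tmpfs swap entry stored as an exceptional entry, and callers must test radix_tree_exception() before taking a page reference. The sketch below only illustrates the underlying low-bit tagging idea in userspace; the bit layout is made up for the example and is not the kernel's exact encoding.

#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL   /* illustrative tag bit, not the kernel's */

/* Pack a small value into a slot that normally holds an aligned pointer. */
static void *make_exceptional(unsigned long value)
{
        return (void *)(uintptr_t)((value << 2) | EXCEPTIONAL_BIT);
}

/* Real pointers are at least 4-byte aligned, so their low bits are clear. */
static int is_exceptional(const void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long exceptional_value(const void *entry)
{
        return (unsigned long)((uintptr_t)entry >> 2);
}

int main(void)
{
        int page = 42;                  /* stands in for a struct page */
        void *slot[2] = { &page, make_exceptional(12345) };

        for (int i = 0; i < 2; i++) {
                if (is_exceptional(slot[i]))
                        printf("slot %d: swap-style entry %lu\n",
                               i, exceptional_value(slot[i]));
                else
                        printf("slot %d: real pointer %p\n", i, slot[i]);
        }
        return 0;
}
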
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2e..5ef672c07f7 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot {
        spinlock_t lock;                /* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];

-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
        return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
  *
  * Returns the page's virtual address.
  */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
        unsigned long flags;
        void *ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd..d819d938288 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 990 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 991 if (flags & FOLL_GET)
992 get_page(page); 992 get_page_foll(page);
993 993
994out: 994out:
995 return page; 995 return page;
@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1156 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1157 struct zone *zone = page_zone(page);
1158 int zonestat; 1158 int zonestat;
1159 int tail_count = 0;
1159 1160
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1161 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1162 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1165 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1166 struct page *page_tail = page + i;
1166 1167
1167 /* tail_page->_count cannot change */ 1168 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1169 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1170 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1171 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1172 BUG_ON(tail_count < 0);
1173 BUG_ON(atomic_read(&page_tail->_count) != 0);
1174 /*
1175 * tail_page->_count is zero and not changing from
1176 * under us. But get_page_unless_zero() may be running
1177 * from under us on the tail_page. If we used
1178 * atomic_set() below instead of atomic_add(), we
1179 * would then run atomic_set() concurrently with
1180 * get_page_unless_zero(), and atomic_set() is
1181 * implemented in C not using locked ops. spin_unlock
1182 * on x86 sometime uses locked ops because of PPro
1183 * errata 66, 92, so unless somebody can guarantee
1184 * atomic_set() here would be safe on all archs (and
1185 * not only on x86), it's safer to use atomic_add().
1186 */
1187 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1188 &page_tail->_count);
1172 1189
1173 /* after clearing PageTail the gup refcount can be released */ 1190 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1191 smp_mb();
@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1203 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1204 page_tail->flags |= (1L << PG_dirty);
1188 1205
1189 /* 1206 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1207 smp_wmb();
1194 1208
1195 /* 1209 /*
@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1220 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1221 * pmd, not by clearing the present bit.
1208 */ 1222 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1223 page_tail->_mapcount = page->_mapcount;
1211 1224
1212 BUG_ON(page_tail->mapping); 1225 BUG_ON(page_tail->mapping);
@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1236
1224 lru_add_page_tail(zone, page, page_tail); 1237 lru_add_page_tail(zone, page, page_tail);
1225 } 1238 }
1239 atomic_sub(tail_count, &page->_count);
1240 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1241
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1242 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1243 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1596,14 +1611,13 @@ void __khugepaged_exit(struct mm_struct *mm)
1596 list_del(&mm_slot->mm_node); 1611 list_del(&mm_slot->mm_node);
1597 free = 1; 1612 free = 1;
1598 } 1613 }
1614 spin_unlock(&khugepaged_mm_lock);
1599 1615
1600 if (free) { 1616 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1617 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot); 1618 free_mm_slot(mm_slot);
1604 mmdrop(mm); 1619 mmdrop(mm);
1605 } else if (mm_slot) { 1620 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /* 1621 /*
1608 * This is required to serialize against 1622 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run 1623 * khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1628,7 @@ void __khugepaged_exit(struct mm_struct *mm)
1614 */ 1628 */
1615 down_write(&mm->mmap_sem); 1629 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem); 1630 up_write(&mm->mmap_sem);
1617 } else 1631 }
1618 spin_unlock(&khugepaged_mm_lock);
1619} 1632}
1620 1633
1621static void release_pte_page(struct page *page) 1634static void release_pte_page(struct page *page)
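For orientation before the hugetlb hunks: the refcount rework above follows one invariant. While a THP is mapped, every tail page keeps _count == 0 and any gup pin taken on a tail is recorded in that tail's _mapcount (plus a matching reference on the head); only at split time does each tail receive a real refcount, and the head releases, in one step, the pins it was holding on the tails' behalf. A compressed sketch of that accounting ('head' below stands for the compound page the hunk calls 'page'; kernel context assumed, simplified from the loop above):

	int i, tail_pins = 0;

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *tail = head + i;

		BUG_ON(atomic_read(&tail->_count) != 0);  /* invariant while mapped */
		tail_pins += page_mapcount(tail);         /* gup pins parked in _mapcount */
		/* head's mapcount, the tail's own pins, plus one base reference */
		atomic_add(page_mapcount(head) + page_mapcount(tail) + 1,
			   &tail->_count);
	}
	atomic_sub(tail_pins, &head->_count);             /* one adjustment, outside the loop */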
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc82..2316840b337 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/io.h> 27#include <linux/io.h>
28 28
29#include <linux/hugetlb.h> 29#include <linux/hugetlb.h>
30#include <linux/node.h> 30#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
62 * must either hold the mmap_sem for write, or the mmap_sem for read and 62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex: 63 * the hugetlb_instantiation mutex:
64 * 64 *
65 * down_write(&mm->mmap_sem); 65 * down_write(&mm->mmap_sem);
66 * or 66 * or
67 * down_read(&mm->mmap_sem); 67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex); 68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */ 69 */
70struct file_region { 70struct file_region {
71 struct list_head link; 71 struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
503 h->nr_huge_pages--; 503 h->nr_huge_pages--;
504 h->nr_huge_pages_node[page_to_nid(page)]--; 504 h->nr_huge_pages_node[page_to_nid(page)]--;
505 for (i = 0; i < pages_per_huge_page(h); i++) { 505 for (i = 0; i < pages_per_huge_page(h); i++) {
506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
507 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 507 1 << PG_referenced | 1 << PG_dirty |
508 1 << PG_private | 1<< PG_writeback); 508 1 << PG_active | 1 << PG_reserved |
509 1 << PG_private | 1 << PG_writeback);
509 } 510 }
510 set_compound_page_dtor(page, NULL); 511 set_compound_page_dtor(page, NULL);
511 set_page_refcounted(page); 512 set_page_refcounted(page);
@@ -575,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 __SetPageHead(page); 576 __SetPageHead(page);
576 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 577 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
577 __SetPageTail(p); 578 __SetPageTail(p);
579 set_page_count(p, 0);
578 p->first_page = page; 580 p->first_page = page;
579 } 581 }
580} 582}
@@ -591,7 +593,6 @@ int PageHuge(struct page *page)
591 593
592 return dtor == free_huge_page; 594 return dtor == free_huge_page;
593} 595}
594
595EXPORT_SYMBOL_GPL(PageHuge); 596EXPORT_SYMBOL_GPL(PageHuge);
596 597
597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 598static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -900,7 +901,6 @@ retry:
900 h->resv_huge_pages += delta; 901 h->resv_huge_pages += delta;
901 ret = 0; 902 ret = 0;
902 903
903 spin_unlock(&hugetlb_lock);
904 /* Free the needed pages to the hugetlb pool */ 904 /* Free the needed pages to the hugetlb pool */
905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
906 if ((--needed) < 0) 906 if ((--needed) < 0)
@@ -914,6 +914,7 @@ retry:
914 VM_BUG_ON(page_count(page)); 914 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 915 enqueue_huge_page(h, page);
916 } 916 }
917 spin_unlock(&hugetlb_lock);
917 918
918 /* Free unnecessary surplus pages to the buddy allocator */ 919 /* Free unnecessary surplus pages to the buddy allocator */
919free: 920free:
@@ -1105,8 +1106,16 @@ static void __init gather_bootmem_prealloc(void)
1105 struct huge_bootmem_page *m; 1106 struct huge_bootmem_page *m;
1106 1107
1107 list_for_each_entry(m, &huge_boot_pages, list) { 1108 list_for_each_entry(m, &huge_boot_pages, list) {
1108 struct page *page = virt_to_page(m);
1109 struct hstate *h = m->hstate; 1109 struct hstate *h = m->hstate;
1110 struct page *page;
1111
1112#ifdef CONFIG_HIGHMEM
1113 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1114 free_bootmem_late((unsigned long)m,
1115 sizeof(struct huge_bootmem_page));
1116#else
1117 page = virt_to_page(m);
1118#endif
1110 __ClearPageReserved(page); 1119 __ClearPageReserved(page);
1111 WARN_ON(page_count(page) != 1); 1120 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1121 prep_compound_huge_page(page, h->order);
@@ -2124,9 +2133,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2124 pte_t entry; 2133 pte_t entry;
2125 2134
2126 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2135 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2136 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2128 update_mmu_cache(vma, address, ptep); 2137 update_mmu_cache(vma, address, ptep);
2129 }
2130} 2138}
2131 2139
2132 2140
@@ -2181,9 +2189,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
2181 if (huge_pte_none(pte) || pte_present(pte)) 2189 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0; 2190 return 0;
2183 swp = pte_to_swp_entry(pte); 2191 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2192 if (non_swap_entry(swp) && is_migration_entry(swp))
2185 return 1; 2193 return 1;
2186 } else 2194 else
2187 return 0; 2195 return 0;
2188} 2196}
2189 2197
@@ -2194,9 +2202,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2194 if (huge_pte_none(pte) || pte_present(pte)) 2202 if (huge_pte_none(pte) || pte_present(pte))
2195 return 0; 2203 return 0;
2196 swp = pte_to_swp_entry(pte); 2204 swp = pte_to_swp_entry(pte);
2197 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2205 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2198 return 1; 2206 return 1;
2199 } else 2207 else
2200 return 0; 2208 return 0;
2201} 2209}
2202 2210
@@ -2415,6 +2423,8 @@ retry_avoidcopy:
2415 * anon_vma prepared. 2423 * anon_vma prepared.
2416 */ 2424 */
2417 if (unlikely(anon_vma_prepare(vma))) { 2425 if (unlikely(anon_vma_prepare(vma))) {
2426 page_cache_release(new_page);
2427 page_cache_release(old_page);
2418 /* Caller expects lock to be held */ 2428 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock); 2429 spin_lock(&mm->page_table_lock);
2420 return VM_FAULT_OOM; 2430 return VM_FAULT_OOM;
@@ -2559,7 +2569,7 @@ retry:
2559 * So we need to block hugepage fault by PG_hwpoison bit check. 2569 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */ 2570 */
2561 if (unlikely(PageHWPoison(page))) { 2571 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON | 2572 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates); 2573 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked; 2574 goto backout_unlocked;
2565 } 2575 }
@@ -2627,7 +2637,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2627 migration_entry_wait(mm, (pmd_t *)ptep, address); 2637 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0; 2638 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2639 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE | 2640 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates); 2641 VM_FAULT_SET_HINDEX(h - hstates);
2632 } 2642 }
2633 2643
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b263..a56a851908d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7 7
8#include <asm/atomic.h> 8#include <linux/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h> 10#include <asm/mmu.h>
11 11
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb4..2189af49178 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
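These helpers also set the calling convention for FOLL_GET in the rest of the series: the page table lock must still pin the pte (or huge pmd) that maps the page, and the reference is taken with get_page_foll() rather than get_page(), so a tail page pins its head's _count and bumps its own _mapcount. A minimal sketch of a conforming caller (the function below is hypothetical; kernel context assumed):

	static struct page *grab_mapped_page(struct vm_area_struct *vma,
					     unsigned long addr, pte_t *ptep,
					     spinlock_t *ptl, unsigned int flags)
	{
		struct page *page;

		assert_spin_locked(ptl);              /* pte still maps the page */
		page = vm_normal_page(vma, addr, *ptep);
		if (page && (flags & FOLL_GET))
			get_page_foll(page);          /* head, tail and normal pages alike */
		return page;
	}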
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616f..d6880f542f9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
96 96
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/processor.h> 98#include <asm/processor.h>
99#include <asm/atomic.h> 99#include <linux/atomic.h>
100 100
101#include <linux/kmemcheck.h> 101#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..74bf193eff0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 218 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 220
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 /* vmtruncate_range needs to take i_mutex */
222 up_read(&current->mm->mmap_sem); 222 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 223 error = vmtruncate_range(mapping->host, offset, endoff);
224 down_read(&current->mm->mmap_sem); 224 down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad..ccbf9733959 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
758 758
759 /* Check marker in the unused last array entry */ 759 /* Check marker in the unused last array entry */
760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE); 761 != MEMBLOCK_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base 762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE); 763 != MEMBLOCK_INACTIVE);
764 764
765 memblock.memory_size = 0; 765 memblock.memory_size = 0;
766 766
@@ -786,8 +786,8 @@ void __init memblock_init(void)
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS; 786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787 787
788 /* Write a marker in the unused last array entry */ 788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
791 791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below... 793 * This simplifies the memblock_add() code below...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d2..dd81ddc64b4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/rbtree.h> 37#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
40#include <linux/swap.h> 39#include <linux/swap.h>
41#include <linux/swapops.h> 40#include <linux/swapops.h>
@@ -246,10 +245,13 @@ struct mem_cgroup {
246 * Should the accounting and control be hierarchical, per subtree? 245 * Should the accounting and control be hierarchical, per subtree?
247 */ 246 */
248 bool use_hierarchy; 247 bool use_hierarchy;
249 atomic_t oom_lock; 248
249 bool oom_lock;
250 atomic_t under_oom;
251
250 atomic_t refcnt; 252 atomic_t refcnt;
251 253
252 unsigned int swappiness; 254 int swappiness;
253 /* OOM-Killer disable */ 255 /* OOM-Killer disable */
254 int oom_kill_disable; 256 int oom_kill_disable;
255 257
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
636 preempt_enable(); 638 preempt_enable();
637} 639}
638 640
639static unsigned long 641unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) 642mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
643 unsigned int lru_mask)
641{ 644{
642 struct mem_cgroup_per_zone *mz; 645 struct mem_cgroup_per_zone *mz;
646 enum lru_list l;
647 unsigned long ret = 0;
648
649 mz = mem_cgroup_zoneinfo(mem, nid, zid);
650
651 for_each_lru(l) {
652 if (BIT(l) & lru_mask)
653 ret += MEM_CGROUP_ZSTAT(mz, l);
654 }
655 return ret;
656}
657
658static unsigned long
659mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
660 int nid, unsigned int lru_mask)
661{
643 u64 total = 0; 662 u64 total = 0;
644 int zid; 663 int zid;
645 664
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 665 for (zid = 0; zid < MAX_NR_ZONES; zid++)
647 mz = mem_cgroup_zoneinfo(mem, nid, zid); 666 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
648 total += MEM_CGROUP_ZSTAT(mz, idx); 667
649 }
650 return total; 668 return total;
651} 669}
652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 670
653 enum lru_list idx) 671static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
672 unsigned int lru_mask)
654{ 673{
655 int nid; 674 int nid;
656 u64 total = 0; 675 u64 total = 0;
657 676
658 for_each_online_node(nid) 677 for_each_node_state(nid, N_HIGH_MEMORY)
659 total += mem_cgroup_get_zonestat_node(mem, nid, idx); 678 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
660 return total; 679 return total;
661} 680}
662 681
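These two helpers replace the per-LRU accessors deleted further down: callers describe which LRU lists to sum as a bitmask of BIT(lru) values (LRU_ALL, LRU_ALL_FILE, LRU_ALL_ANON, or a single BIT(LRU_UNEVICTABLE)), and only the matching per-zone counters are added up. Two call shapes matching the signatures above (illustrative only):

	/* file-backed pages (active + inactive) charged to this memcg on one node */
	unsigned long file = mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE);

	/* unevictable pages across all N_HIGH_MEMORY nodes */
	unsigned long unevictable = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));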
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page,
1043 mem_cgroup_add_lru_list(page, to); 1062 mem_cgroup_add_lru_list(page, to);
1044} 1063}
1045 1064
1065/*
1066 * Checks whether given mem is same or in the root_mem's
1067 * hierarchy subtree
1068 */
1069static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1070 struct mem_cgroup *mem)
1071{
1072 if (root_mem != mem) {
1073 return (root_mem->use_hierarchy &&
1074 css_is_ancestor(&mem->css, &root_mem->css));
1075 }
1076
1077 return true;
1078}
1079
1046int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1080int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1047{ 1081{
1048 int ret; 1082 int ret;
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1062 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1096 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
1063 * hierarchy(even if use_hierarchy is disabled in "mem"). 1097 * hierarchy(even if use_hierarchy is disabled in "mem").
1064 */ 1098 */
1065 if (mem->use_hierarchy) 1099 ret = mem_cgroup_same_or_subtree(mem, curr);
1066 ret = css_is_ancestor(&curr->css, &mem->css);
1067 else
1068 ret = (curr == mem);
1069 css_put(&curr->css); 1100 css_put(&curr->css);
1070 return ret; 1101 return ret;
1071} 1102}
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1077 unsigned long gb; 1108 unsigned long gb;
1078 unsigned long inactive_ratio; 1109 unsigned long inactive_ratio;
1079 1110
1080 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1111 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1081 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1112 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1082 1113
1083 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1114 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1084 if (gb) 1115 if (gb)
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1117 unsigned long active; 1148 unsigned long active;
1118 unsigned long inactive; 1149 unsigned long inactive;
1119 1150
1120 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1151 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1121 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1152 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1122 1153
1123 return (active > inactive); 1154 return (active > inactive);
1124} 1155}
1125 1156
1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1127 struct zone *zone,
1128 enum lru_list lru)
1129{
1130 int nid = zone_to_nid(zone);
1131 int zid = zone_idx(zone);
1132 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1133
1134 return MEM_CGROUP_ZSTAT(mz, lru);
1135}
1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215
1216 for_each_node_state(nid, N_HIGH_MEMORY)
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1218
1219 return total;
1220}
1221#endif /* CONFIG_NUMA */
1222
1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1157struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1224 struct zone *zone) 1158 struct zone *zone)
1225{ 1159{
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1329 return margin >> PAGE_SHIFT; 1263 return margin >> PAGE_SHIFT;
1330} 1264}
1331 1265
1332static unsigned int get_swappiness(struct mem_cgroup *memcg) 1266int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1333{ 1267{
1334 struct cgroup *cgrp = memcg->css.cgroup; 1268 struct cgroup *cgrp = memcg->css.cgroup;
1335 1269
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1401 to = mc.to; 1335 to = mc.to;
1402 if (!from) 1336 if (!from)
1403 goto unlock; 1337 goto unlock;
1404 if (from == mem || to == mem 1338
1405 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1339 ret = mem_cgroup_same_or_subtree(mem, from)
1406 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1340 || mem_cgroup_same_or_subtree(mem, to);
1407 ret = true;
1408unlock: 1341unlock:
1409 spin_unlock(&mc.lock); 1342 spin_unlock(&mc.lock);
1410 return ret; 1343 return ret;
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1509static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap) 1510 int nid, bool noswap)
1578{ 1511{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) 1512 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1580 return true; 1513 return true;
1581 if (noswap || !total_swap_pages) 1514 if (noswap || !total_swap_pages)
1582 return false; 1515 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) 1516 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1584 return true; 1517 return true;
1585 return false; 1518 return false;
1586 1519
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1731 1664
1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1665 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1666 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1734 noswap = true; 1667 noswap = true;
1735 1668
1736 while (1) { 1669 while (1) {
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1776 /* we use swappiness of local cgroup */ 1709 /* we use swappiness of local cgroup */
1777 if (check_soft) { 1710 if (check_soft) {
1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1711 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1779 noswap, get_swappiness(victim), zone, 1712 noswap, zone, &nr_scanned);
1780 &nr_scanned);
1781 *total_scanned += nr_scanned; 1713 *total_scanned += nr_scanned;
1782 } else 1714 } else
1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1715 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1784 noswap, get_swappiness(victim)); 1716 noswap);
1785 css_put(&victim->css); 1717 css_put(&victim->css);
1786 /* 1718 /*
1787 * At shrinking usage, we can't check we should stop here or 1719 * At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1803/* 1735/*
1804 * Check OOM-Killer is already running under our hierarchy. 1736 * Check OOM-Killer is already running under our hierarchy.
1805 * If someone is running, return false. 1737 * If someone is running, return false.
1738 * Has to be called with memcg_oom_lock
1806 */ 1739 */
1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1740static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1808{ 1741{
1809 int x, lock_count = 0; 1742 struct mem_cgroup *iter, *failed = NULL;
1810 struct mem_cgroup *iter; 1743 bool cond = true;
1811 1744
1812 for_each_mem_cgroup_tree(iter, mem) { 1745 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1813 x = atomic_inc_return(&iter->oom_lock); 1746 if (iter->oom_lock) {
1814 lock_count = max(x, lock_count); 1747 /*
1748 * this subtree of our hierarchy is already locked
1749 * so we cannot give a lock.
1750 */
1751 failed = iter;
1752 cond = false;
1753 } else
1754 iter->oom_lock = true;
1815 } 1755 }
1816 1756
1817 if (lock_count == 1) 1757 if (!failed)
1818 return true; 1758 return true;
1759
1760 /*
1761 * OK, we failed to lock the whole subtree so we have to clean up
1762 * what we set up to the failing subtree
1763 */
1764 cond = true;
1765 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1766 if (iter == failed) {
1767 cond = false;
1768 continue;
1769 }
1770 iter->oom_lock = false;
1771 }
1819 return false; 1772 return false;
1820} 1773}
1821 1774
1775/*
1776 * Has to be called with memcg_oom_lock
1777 */
1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1778static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1823{ 1779{
1824 struct mem_cgroup *iter; 1780 struct mem_cgroup *iter;
1825 1781
1782 for_each_mem_cgroup_tree(iter, mem)
1783 iter->oom_lock = false;
1784 return 0;
1785}
1786
1787static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1788{
1789 struct mem_cgroup *iter;
1790
1791 for_each_mem_cgroup_tree(iter, mem)
1792 atomic_inc(&iter->under_oom);
1793}
1794
1795static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1796{
1797 struct mem_cgroup *iter;
1798
1826 /* 1799 /*
1827 * When a new child is created while the hierarchy is under oom, 1800 * When a new child is created while the hierarchy is under oom,
1828 * mem_cgroup_oom_lock() may not be called. We have to use 1801 * mem_cgroup_oom_lock() may not be called. We have to use
1829 * atomic_add_unless() here. 1802 * atomic_add_unless() here.
1830 */ 1803 */
1831 for_each_mem_cgroup_tree(iter, mem) 1804 for_each_mem_cgroup_tree(iter, mem)
1832 atomic_add_unless(&iter->oom_lock, -1, 0); 1805 atomic_add_unless(&iter->under_oom, -1, 0);
1833 return 0;
1834} 1806}
1835 1807
1836 1808static DEFINE_SPINLOCK(memcg_oom_lock);
1837static DEFINE_MUTEX(memcg_oom_mutex);
1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1809static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1839 1810
1840struct oom_wait_info { 1811struct oom_wait_info {
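The new mem_cgroup_oom_lock() is a two-pass, try-lock-with-rollback walk: claim oom_lock on each memcg of the subtree in walk order until one is found already locked, then walk again and release only what was claimed before the failing node. The same shape reduced to a generic sketch over an ordered array ('struct node' and its 'locked' flag are hypothetical):

	static bool try_lock_all(struct node **set, int n)
	{
		int i, failed = -1;

		for (i = 0; i < n; i++) {
			if (set[i]->locked) {        /* someone holds part of the set */
				failed = i;
				break;
			}
			set[i]->locked = true;
		}
		if (failed < 0)
			return true;
		while (--failed >= 0)                /* roll back only our claims */
			set[failed]->locked = false;
		return false;
	}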
@@ -1845,25 +1816,20 @@ struct oom_wait_info {
1845static int memcg_oom_wake_function(wait_queue_t *wait, 1816static int memcg_oom_wake_function(wait_queue_t *wait,
1846 unsigned mode, int sync, void *arg) 1817 unsigned mode, int sync, void *arg)
1847{ 1818{
1848 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1819 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1820 *oom_wait_mem;
1849 struct oom_wait_info *oom_wait_info; 1821 struct oom_wait_info *oom_wait_info;
1850 1822
1851 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1823 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1824 oom_wait_mem = oom_wait_info->mem;
1852 1825
1853 if (oom_wait_info->mem == wake_mem)
1854 goto wakeup;
1855 /* if no hierarchy, no match */
1856 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1857 return 0;
1858 /* 1826 /*
1859 * Both of oom_wait_info->mem and wake_mem are stable under us. 1827 * Both of oom_wait_info->mem and wake_mem are stable under us.
1860 * Then we can use css_is_ancestor without taking care of RCU. 1828 * Then we can use css_is_ancestor without taking care of RCU.
1861 */ 1829 */
1862 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1830 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1863 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1831 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1864 return 0; 1832 return 0;
1865
1866wakeup:
1867 return autoremove_wake_function(wait, mode, sync, arg); 1833 return autoremove_wake_function(wait, mode, sync, arg);
1868} 1834}
1869 1835
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1875 1841
1876static void memcg_oom_recover(struct mem_cgroup *mem) 1842static void memcg_oom_recover(struct mem_cgroup *mem)
1877{ 1843{
1878 if (mem && atomic_read(&mem->oom_lock)) 1844 if (mem && atomic_read(&mem->under_oom))
1879 memcg_wakeup_oom(mem); 1845 memcg_wakeup_oom(mem);
1880} 1846}
1881 1847
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1893 owait.wait.private = current; 1859 owait.wait.private = current;
1894 INIT_LIST_HEAD(&owait.wait.task_list); 1860 INIT_LIST_HEAD(&owait.wait.task_list);
1895 need_to_kill = true; 1861 need_to_kill = true;
1862 mem_cgroup_mark_under_oom(mem);
1863
1896 /* At first, try to OOM lock hierarchy under mem.*/ 1864 /* At first, try to OOM lock hierarchy under mem.*/
1897 mutex_lock(&memcg_oom_mutex); 1865 spin_lock(&memcg_oom_lock);
1898 locked = mem_cgroup_oom_lock(mem); 1866 locked = mem_cgroup_oom_lock(mem);
1899 /* 1867 /*
1900 * Even if signal_pending(), we can't quit charge() loop without 1868 * Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1906 need_to_kill = false; 1874 need_to_kill = false;
1907 if (locked) 1875 if (locked)
1908 mem_cgroup_oom_notify(mem); 1876 mem_cgroup_oom_notify(mem);
1909 mutex_unlock(&memcg_oom_mutex); 1877 spin_unlock(&memcg_oom_lock);
1910 1878
1911 if (need_to_kill) { 1879 if (need_to_kill) {
1912 finish_wait(&memcg_oom_waitq, &owait.wait); 1880 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1915 schedule(); 1883 schedule();
1916 finish_wait(&memcg_oom_waitq, &owait.wait); 1884 finish_wait(&memcg_oom_waitq, &owait.wait);
1917 } 1885 }
1918 mutex_lock(&memcg_oom_mutex); 1886 spin_lock(&memcg_oom_lock);
1919 mem_cgroup_oom_unlock(mem); 1887 if (locked)
1888 mem_cgroup_oom_unlock(mem);
1920 memcg_wakeup_oom(mem); 1889 memcg_wakeup_oom(mem);
1921 mutex_unlock(&memcg_oom_mutex); 1890 spin_unlock(&memcg_oom_lock);
1891
1892 mem_cgroup_unmark_under_oom(mem);
1922 1893
1923 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1894 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1924 return false; 1895 return false;
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2079} 2050}
2080 2051
2081/* 2052/*
2082 * Tries to drain stocked charges in other cpus. This function is asynchronous 2053 * Drains all per-CPU charge caches for given root_mem resp. subtree
2083 * and just put a work per cpu for draining localy on each cpu. Caller can 2054 * of the hierarchy under it. sync flag says whether we should block
2084 * expects some charges will be back to res_counter later but cannot wait for 2055 * until the work is done.
2085 * it.
2086 */ 2056 */
2087static void drain_all_stock_async(struct mem_cgroup *root_mem) 2057static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2088{ 2058{
2089 int cpu, curcpu; 2059 int cpu, curcpu;
2090 /* 2060
2091 * If someone calls draining, avoid adding more kworker runs.
2092 */
2093 if (!mutex_trylock(&percpu_charge_mutex))
2094 return;
2095 /* Notify other cpus that system-wide "drain" is running */ 2061 /* Notify other cpus that system-wide "drain" is running */
2096 get_online_cpus(); 2062 get_online_cpus();
2097 /* 2063 curcpu = get_cpu();
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
2104 for_each_online_cpu(cpu) { 2064 for_each_online_cpu(cpu) {
2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2065 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2106 struct mem_cgroup *mem; 2066 struct mem_cgroup *mem;
2107 2067
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached; 2068 mem = stock->cached;
2112 if (!mem) 2069 if (!mem || !stock->nr_pages)
2070 continue;
2071 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2113 continue; 2072 continue;
2114 if (mem != root_mem) { 2073 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2115 if (!root_mem->use_hierarchy) 2074 if (cpu == curcpu)
2116 continue; 2075 drain_local_stock(&stock->work);
2117 /* check whether "mem" is under tree of "root_mem" */ 2076 else
2118 if (!css_is_ancestor(&mem->css, &root_mem->css)) 2077 schedule_work_on(cpu, &stock->work);
2119 continue;
2120 } 2078 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
2123 } 2079 }
2080 put_cpu();
2081
2082 if (!sync)
2083 goto out;
2084
2085 for_each_online_cpu(cpu) {
2086 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2087 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2088 flush_work(&stock->work);
2089 }
2090out:
2124 put_online_cpus(); 2091 put_online_cpus();
2092}
2093
2094/*
2095 * Tries to drain stocked charges in other cpus. This function is asynchronous
 2096 * and just puts a work per cpu for draining locally on each cpu. Caller can
 2097 * expect some charges will be back to res_counter later but cannot wait for
2098 * it.
2099 */
2100static void drain_all_stock_async(struct mem_cgroup *root_mem)
2101{
2102 /*
2103 * If someone calls draining, avoid adding more kworker runs.
2104 */
2105 if (!mutex_trylock(&percpu_charge_mutex))
2106 return;
2107 drain_all_stock(root_mem, false);
2125 mutex_unlock(&percpu_charge_mutex); 2108 mutex_unlock(&percpu_charge_mutex);
2126 /* We don't wait for flush_work */
2127} 2109}
2128 2110
2129/* This is a synchronous drain interface. */ 2111/* This is a synchronous drain interface. */
2130static void drain_all_stock_sync(void) 2112static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2131{ 2113{
2132 /* called when force_empty is called */ 2114 /* called when force_empty is called */
2133 mutex_lock(&percpu_charge_mutex); 2115 mutex_lock(&percpu_charge_mutex);
2134 schedule_on_each_cpu(drain_local_stock); 2116 drain_all_stock(root_mem, true);
2135 mutex_unlock(&percpu_charge_mutex); 2117 mutex_unlock(&percpu_charge_mutex);
2136} 2118}
2137 2119
@@ -2784,30 +2766,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2784 return 0; 2766 return 0;
2785 if (PageCompound(page)) 2767 if (PageCompound(page))
2786 return 0; 2768 return 0;
2787 /*
2788 * Corner case handling. This is called from add_to_page_cache()
2789 * in usual. But some FS (shmem) precharges this page before calling it
2790 * and call add_to_page_cache() with GFP_NOWAIT.
2791 *
2792 * For GFP_NOWAIT case, the page may be pre-charged before calling
2793 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
2794 * charge twice. (It works but has to pay a bit larger cost.)
2795 * And when the page is SwapCache, it should take swap information
2796 * into account. This is under lock_page() now.
2797 */
2798 if (!(gfp_mask & __GFP_WAIT)) {
2799 struct page_cgroup *pc;
2800
2801 pc = lookup_page_cgroup(page);
2802 if (!pc)
2803 return 0;
2804 lock_page_cgroup(pc);
2805 if (PageCgroupUsed(pc)) {
2806 unlock_page_cgroup(pc);
2807 return 0;
2808 }
2809 unlock_page_cgroup(pc);
2810 }
2811 2769
2812 if (unlikely(!mm)) 2770 if (unlikely(!mm))
2813 mm = &init_mm; 2771 mm = &init_mm;
@@ -3398,28 +3356,47 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
3398} 3356}
3399 3357
3400/* 3358/*
3401 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3359 * At replace page cache, newpage is not under any memcg but it's on
3402 * Calling hierarchical_reclaim is not enough because we should update 3360 * LRU. So, this function doesn't touch res_counter but handles LRU
3403 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3361 * in correct way. Both pages are locked so we cannot race with uncharge.
3404 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3405 * not from the memcg which this page would be charged to.
3406 * try_charge_swapin does all of these works properly.
3407 */ 3362 */
3408int mem_cgroup_shmem_charge_fallback(struct page *page, 3363void mem_cgroup_replace_page_cache(struct page *oldpage,
3409 struct mm_struct *mm, 3364 struct page *newpage)
3410 gfp_t gfp_mask)
3411{ 3365{
3412 struct mem_cgroup *mem; 3366 struct mem_cgroup *memcg;
3413 int ret; 3367 struct page_cgroup *pc;
3368 struct zone *zone;
3369 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3370 unsigned long flags;
3414 3371
3415 if (mem_cgroup_disabled()) 3372 if (mem_cgroup_disabled())
3416 return 0; 3373 return;
3417 3374
3418 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3375 pc = lookup_page_cgroup(oldpage);
3419 if (!ret) 3376 /* fix accounting on old pages */
3420 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3377 lock_page_cgroup(pc);
3378 memcg = pc->mem_cgroup;
3379 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3380 ClearPageCgroupUsed(pc);
3381 unlock_page_cgroup(pc);
3421 3382
3422 return ret; 3383 if (PageSwapBacked(oldpage))
3384 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3385
3386 zone = page_zone(newpage);
3387 pc = lookup_page_cgroup(newpage);
3388 /*
3389 * Even if newpage->mapping was NULL before starting replacement,
3390 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3391 * LRU while we overwrite pc->mem_cgroup.
3392 */
3393 spin_lock_irqsave(&zone->lru_lock, flags);
3394 if (PageLRU(newpage))
3395 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3396 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3397 if (PageLRU(newpage))
3398 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3399 spin_unlock_irqrestore(&zone->lru_lock, flags);
3423} 3400}
3424 3401
3425#ifdef CONFIG_DEBUG_VM 3402#ifdef CONFIG_DEBUG_VM
@@ -3780,7 +3757,7 @@ move_account:
3780 goto out; 3757 goto out;
3781 /* This is for making all *used* pages to be on LRU. */ 3758 /* This is for making all *used* pages to be on LRU. */
3782 lru_add_drain_all(); 3759 lru_add_drain_all();
3783 drain_all_stock_sync(); 3760 drain_all_stock_sync(mem);
3784 ret = 0; 3761 ret = 0;
3785 mem_cgroup_start_move(mem); 3762 mem_cgroup_start_move(mem);
3786 for_each_node_state(node, N_HIGH_MEMORY) { 3763 for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3826,7 +3803,7 @@ try_to_free:
3826 goto out; 3803 goto out;
3827 } 3804 }
3828 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3805 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3829 false, get_swappiness(mem)); 3806 false);
3830 if (!progress) { 3807 if (!progress) {
3831 nr_retries--; 3808 nr_retries--;
3832 /* maybe some writeback is necessary */ 3809 /* maybe some writeback is necessary */
@@ -4152,15 +4129,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4152 s->stat[MCS_PGMAJFAULT] += val; 4129 s->stat[MCS_PGMAJFAULT] += val;
4153 4130
4154 /* per zone stat */ 4131 /* per zone stat */
4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4132 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4156 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4133 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4157 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 4134 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4158 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4135 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4159 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 4136 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4160 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4137 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4161 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 4138 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4139 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4140 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4141 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4165} 4142}
4166 4143
@@ -4182,35 +4159,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4182 struct cgroup *cont = m->private; 4159 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4160 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184 4161
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont); 4162 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4186 seq_printf(m, "total=%lu", total_nr); 4163 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) { 4164 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); 4165 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4189 seq_printf(m, " N%d=%lu", nid, node_nr); 4166 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 } 4167 }
4191 seq_putc(m, '\n'); 4168 seq_putc(m, '\n');
4192 4169
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); 4170 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4194 seq_printf(m, "file=%lu", file_nr); 4171 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) { 4172 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); 4173 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4174 LRU_ALL_FILE);
4197 seq_printf(m, " N%d=%lu", nid, node_nr); 4175 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 } 4176 }
4199 seq_putc(m, '\n'); 4177 seq_putc(m, '\n');
4200 4178
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); 4179 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4202 seq_printf(m, "anon=%lu", anon_nr); 4180 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) { 4181 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); 4182 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4183 LRU_ALL_ANON);
4205 seq_printf(m, " N%d=%lu", nid, node_nr); 4184 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 } 4185 }
4207 seq_putc(m, '\n'); 4186 seq_putc(m, '\n');
4208 4187
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); 4188 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4210 seq_printf(m, "unevictable=%lu", unevictable_nr); 4189 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) { 4190 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, 4191 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4213 nid); 4192 BIT(LRU_UNEVICTABLE));
4214 seq_printf(m, " N%d=%lu", nid, node_nr); 4193 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 } 4194 }
4216 seq_putc(m, '\n'); 4195 seq_putc(m, '\n');
@@ -4288,7 +4267,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4288{ 4267{
4289 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4268 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4290 4269
4291 return get_swappiness(memcg); 4270 return mem_cgroup_swappiness(memcg);
4292} 4271}
4293 4272
4294static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4273static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4578,15 +4557,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4578 if (!event) 4557 if (!event)
4579 return -ENOMEM; 4558 return -ENOMEM;
4580 4559
4581 mutex_lock(&memcg_oom_mutex); 4560 spin_lock(&memcg_oom_lock);
4582 4561
4583 event->eventfd = eventfd; 4562 event->eventfd = eventfd;
4584 list_add(&event->list, &memcg->oom_notify); 4563 list_add(&event->list, &memcg->oom_notify);
4585 4564
4586 /* already in OOM ? */ 4565 /* already in OOM ? */
4587 if (atomic_read(&memcg->oom_lock)) 4566 if (atomic_read(&memcg->under_oom))
4588 eventfd_signal(eventfd, 1); 4567 eventfd_signal(eventfd, 1);
4589 mutex_unlock(&memcg_oom_mutex); 4568 spin_unlock(&memcg_oom_lock);
4590 4569
4591 return 0; 4570 return 0;
4592} 4571}
@@ -4600,7 +4579,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4600 4579
4601 BUG_ON(type != _OOM_TYPE); 4580 BUG_ON(type != _OOM_TYPE);
4602 4581
4603 mutex_lock(&memcg_oom_mutex); 4582 spin_lock(&memcg_oom_lock);
4604 4583
4605 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4584 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4606 if (ev->eventfd == eventfd) { 4585 if (ev->eventfd == eventfd) {
@@ -4609,7 +4588,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4609 } 4588 }
4610 } 4589 }
4611 4590
4612 mutex_unlock(&memcg_oom_mutex); 4591 spin_unlock(&memcg_oom_lock);
4613} 4592}
4614 4593
4615static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4594static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4619,7 +4598,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4619 4598
4620 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4599 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4621 4600
4622 if (atomic_read(&mem->oom_lock)) 4601 if (atomic_read(&mem->under_oom))
4623 cb->fill(cb, "under_oom", 1); 4602 cb->fill(cb, "under_oom", 1);
4624 else 4603 else
4625 cb->fill(cb, "under_oom", 0); 4604 cb->fill(cb, "under_oom", 0);
@@ -4963,9 +4942,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4963 int cpu; 4942 int cpu;
4964 enable_swap_cgroup(); 4943 enable_swap_cgroup();
4965 parent = NULL; 4944 parent = NULL;
4966 root_mem_cgroup = mem;
4967 if (mem_cgroup_soft_limit_tree_init()) 4945 if (mem_cgroup_soft_limit_tree_init())
4968 goto free_out; 4946 goto free_out;
4947 root_mem_cgroup = mem;
4969 for_each_possible_cpu(cpu) { 4948 for_each_possible_cpu(cpu) {
4970 struct memcg_stock_pcp *stock = 4949 struct memcg_stock_pcp *stock =
4971 &per_cpu(memcg_stock, cpu); 4950 &per_cpu(memcg_stock, cpu);
@@ -4997,14 +4976,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4997 INIT_LIST_HEAD(&mem->oom_notify); 4976 INIT_LIST_HEAD(&mem->oom_notify);
4998 4977
4999 if (parent) 4978 if (parent)
5000 mem->swappiness = get_swappiness(parent); 4979 mem->swappiness = mem_cgroup_swappiness(parent);
5001 atomic_set(&mem->refcnt, 1); 4980 atomic_set(&mem->refcnt, 1);
5002 mem->move_charge_at_immigrate = 0; 4981 mem->move_charge_at_immigrate = 0;
5003 mutex_init(&mem->thresholds_lock); 4982 mutex_init(&mem->thresholds_lock);
5004 return &mem->css; 4983 return &mem->css;
5005free_out: 4984free_out:
5006 __mem_cgroup_free(mem); 4985 __mem_cgroup_free(mem);
5007 root_mem_cgroup = NULL;
5008 return ERR_PTR(error); 4986 return ERR_PTR(error);
5009} 4987}
5010 4988
@@ -5181,15 +5159,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5181 pgoff = pte_to_pgoff(ptent); 5159 pgoff = pte_to_pgoff(ptent);
5182 5160
5183 /* page is moved even if it's not RSS of this task(page-faulted). */ 5161 /* page is moved even if it's not RSS of this task(page-faulted). */
5184 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5162 page = find_get_page(mapping, pgoff);
5185 page = find_get_page(mapping, pgoff); 5163
5186 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5164#ifdef CONFIG_SWAP
5187 swp_entry_t ent; 5165 /* shmem/tmpfs may report page out on swap: account for that too. */
5188 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5166 if (radix_tree_exceptional_entry(page)) {
5167 swp_entry_t swap = radix_to_swp_entry(page);
5189 if (do_swap_account) 5168 if (do_swap_account)
5190 entry->val = ent.val; 5169 *entry = swap;
5170 page = find_get_page(&swapper_space, swap.val);
5191 } 5171 }
5192 5172#endif
5193 return page; 5173 return page;
5194} 5174}
5195 5175
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2b43ba051ac 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 55#include <linux/mm_inline.h>
56#include <linux/kfifo.h>
56#include "internal.h" 57#include "internal.h"
57 58
58int sysctl_memory_failure_early_kill __read_mostly = 0; 59int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1179 __memory_failure(pfn, trapno, 0);
1179} 1180}
1180 1181
1182#define MEMORY_FAILURE_FIFO_ORDER 4
1183#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1184
1185struct memory_failure_entry {
1186 unsigned long pfn;
1187 int trapno;
1188 int flags;
1189};
1190
1191struct memory_failure_cpu {
1192 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1193 MEMORY_FAILURE_FIFO_SIZE);
1194 spinlock_t lock;
1195 struct work_struct work;
1196};
1197
1198static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1199
1200/**
1201 * memory_failure_queue - Schedule handling memory failure of a page.
1202 * @pfn: Page Number of the corrupted page
1203 * @trapno: Trap number reported in the signal to user space.
1204 * @flags: Flags for memory failure handling
1205 *
1206 * This function is called by the low level hardware error handler
1207 * when it detects hardware memory corruption of a page. It schedules
1208 * the recovering of error page, including dropping pages, killing
1209 * processes etc.
1210 *
1211 * The function is primarily of use for corruptions that
1212 * happen outside the current execution context (e.g. when
1213 * detected by a background scrubber)
1214 *
1215 * Can run in IRQ context.
1216 */
1217void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1218{
1219 struct memory_failure_cpu *mf_cpu;
1220 unsigned long proc_flags;
1221 struct memory_failure_entry entry = {
1222 .pfn = pfn,
1223 .trapno = trapno,
1224 .flags = flags,
1225 };
1226
1227 mf_cpu = &get_cpu_var(memory_failure_cpu);
1228 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1229 if (kfifo_put(&mf_cpu->fifo, &entry))
1230 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1231 else
1232 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1233 pfn);
1234 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1235 put_cpu_var(memory_failure_cpu);
1236}
1237EXPORT_SYMBOL_GPL(memory_failure_queue);
1238
1239static void memory_failure_work_func(struct work_struct *work)
1240{
1241 struct memory_failure_cpu *mf_cpu;
1242 struct memory_failure_entry entry = { 0, };
1243 unsigned long proc_flags;
1244 int gotten;
1245
1246 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1247 for (;;) {
1248 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1249 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1250 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1251 if (!gotten)
1252 break;
1253 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1254 }
1255}
1256
1257static int __init memory_failure_init(void)
1258{
1259 struct memory_failure_cpu *mf_cpu;
1260 int cpu;
1261
1262 for_each_possible_cpu(cpu) {
1263 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1264 spin_lock_init(&mf_cpu->lock);
1265 INIT_KFIFO(mf_cpu->fifo);
1266 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1267 }
1268
1269 return 0;
1270}
1271core_initcall(memory_failure_init);
1272
1181/** 1273/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1274 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1275 * @pfn: Page number of the to be unpoisoned page
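memory_failure_queue() gives error handlers that may run in interrupt context a way to hand off a corrupted pfn: the entry is pushed into a per-CPU kfifo under an irq-safe spinlock, and memory_failure_work_func() later replays it through __memory_failure() in process context. A hedged usage sketch (the function below is hypothetical; the intended consumers are hardware error handlers such as APEI/GHES):

	/* called from a hardware error handler, possibly in IRQ context */
	static void report_corrupted_page(u64 phys_addr)
	{
		unsigned long pfn = phys_addr >> PAGE_SHIFT;

		/* flags == 0: default recovery policy; trapno is only used for
		 * the signal later delivered to user space */
		memory_failure_queue(pfn, 0, 0);
	}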
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d941c..b2b87315cdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1290 return addr; 1290 return addr;
1291} 1291}
1292 1292
1293#ifdef CONFIG_PREEMPT
1294# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1295#else
1296/* No preempt: go for improved straight-line efficiency */
1297# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1298#endif
1299
1300/** 1293/**
1301 * unmap_vmas - unmap a range of memory covered by a list of vma's 1294 * unmap_vmas - unmap a range of memory covered by a list of vma's
1302 * @tlb: address of the caller's struct mmu_gather 1295 * @tlb: address of the caller's struct mmu_gather
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1310 * 1303 *
1311 * Unmap all pages in the vma list. 1304 * Unmap all pages in the vma list.
1312 * 1305 *
1313 * We aim to not hold locks for too long (for scheduling latency reasons).
1314 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1315 * return the ending mmu_gather to the caller.
1316 *
1317 * Only addresses between `start' and `end' will be unmapped. 1306 * Only addresses between `start' and `end' will be unmapped.
1318 * 1307 *
1319 * The VMA list must be sorted in ascending virtual address order. 1308 * The VMA list must be sorted in ascending virtual address order.
@@ -1514,7 +1503,7 @@ split_fallthrough:
1514 } 1503 }
1515 1504
1516 if (flags & FOLL_GET) 1505 if (flags & FOLL_GET)
1517 get_page(page); 1506 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1507 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1508 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1509 !pte_dirty(pte) && !PageDirty(page))
@@ -1816,7 +1805,63 @@ next_page:
1816} 1805}
1817EXPORT_SYMBOL(__get_user_pages); 1806EXPORT_SYMBOL(__get_user_pages);
1818 1807
1819/** 1808/*
1809 * fixup_user_fault() - manually resolve a user page fault
1810 * @tsk: the task_struct to use for page fault accounting, or
1811 * NULL if faults are not to be recorded.
1812 * @mm: mm_struct of target mm
1813 * @address: user address
 1814 * @fault_flags: flags to pass down to handle_mm_fault()
1815 *
1816 * This is meant to be called in the specific scenario where for locking reasons
 1817 * This is meant to be called in the specific scenario where, for locking
 1818 * reasons, we try to access user memory in atomic context (within a
 1819 * pagefault_disable() section); that access returns -EFAULT, and we want to
 1820 * resolve the user fault before trying again.
1821 * Typically this is meant to be used by the futex code.
1822 *
1823 * The main difference with get_user_pages() is that this function will
1824 * unconditionally call handle_mm_fault() which will in turn perform all the
1825 * necessary SW fixup of the dirty and young bits in the PTE, while
 1826 * get_user_pages() only guarantees to update these in the struct page.
1827 *
1828 * This is important for some architectures where those bits also gate the
1829 * access permission to the page because they are maintained in software. On
1830 * such architectures, gup() will not be enough to make a subsequent access
1831 * succeed.
1832 *
 1833 * This should be called with the mmap_sem held for read.
1834 */
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864/*
1820 * get_user_pages() - pin user pages in memory 1865 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1866 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1867 * NULL if faults are not to be recorded.
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3104 pte_t *page_table; 3149 pte_t *page_table;
3105 spinlock_t *ptl; 3150 spinlock_t *ptl;
3106 struct page *page; 3151 struct page *page;
3152 struct page *cow_page;
3107 pte_t entry; 3153 pte_t entry;
3108 int anon = 0; 3154 int anon = 0;
3109 int charged = 0;
3110 struct page *dirty_page = NULL; 3155 struct page *dirty_page = NULL;
3111 struct vm_fault vmf; 3156 struct vm_fault vmf;
3112 int ret; 3157 int ret;
3113 int page_mkwrite = 0; 3158 int page_mkwrite = 0;
3114 3159
3160 /*
 3161 * If we do COW later, allocate the page before taking lock_page()
3162 * on the file cache page. This will reduce lock holding time.
3163 */
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3115 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3116 vmf.pgoff = pgoff; 3181 vmf.pgoff = pgoff;
3117 vmf.flags = flags; 3182 vmf.flags = flags;
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3120 ret = vma->vm_ops->fault(vma, &vmf); 3185 ret = vma->vm_ops->fault(vma, &vmf);
3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY))) 3187 VM_FAULT_RETRY)))
3123 return ret; 3188 goto uncharge_out;
3124 3189
3125 if (unlikely(PageHWPoison(vmf.page))) { 3190 if (unlikely(PageHWPoison(vmf.page))) {
3126 if (ret & VM_FAULT_LOCKED) 3191 if (ret & VM_FAULT_LOCKED)
3127 unlock_page(vmf.page); 3192 unlock_page(vmf.page);
3128 return VM_FAULT_HWPOISON; 3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3129 } 3195 }
3130 3196
3131 /* 3197 /*
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 page = vmf.page; 3209 page = vmf.page;
3144 if (flags & FAULT_FLAG_WRITE) { 3210 if (flags & FAULT_FLAG_WRITE) {
3145 if (!(vma->vm_flags & VM_SHARED)) { 3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3146 anon = 1; 3213 anon = 1;
3147 if (unlikely(anon_vma_prepare(vma))) {
3148 ret = VM_FAULT_OOM;
3149 goto out;
3150 }
3151 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3152 vma, address);
3153 if (!page) {
3154 ret = VM_FAULT_OOM;
3155 goto out;
3156 }
3157 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3158 ret = VM_FAULT_OOM;
3159 page_cache_release(page);
3160 goto out;
3161 }
3162 charged = 1;
3163 copy_user_highpage(page, vmf.page, address, vma); 3214 copy_user_highpage(page, vmf.page, address, vma);
3164 __SetPageUptodate(page); 3215 __SetPageUptodate(page);
3165 } else { 3216 } else {
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 /* no need to invalidate: a not-present page won't be cached */ 3279 /* no need to invalidate: a not-present page won't be cached */
3229 update_mmu_cache(vma, address, page_table); 3280 update_mmu_cache(vma, address, page_table);
3230 } else { 3281 } else {
3231 if (charged) 3282 if (cow_page)
3232 mem_cgroup_uncharge_page(page); 3283 mem_cgroup_uncharge_page(cow_page);
3233 if (anon) 3284 if (anon)
3234 page_cache_release(page); 3285 page_cache_release(page);
3235 else 3286 else
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3238 3289
3239 pte_unmap_unlock(page_table, ptl); 3290 pte_unmap_unlock(page_table, ptl);
3240 3291
3241out:
3242 if (dirty_page) { 3292 if (dirty_page) {
3243 struct address_space *mapping = page->mapping; 3293 struct address_space *mapping = page->mapping;
3244 3294
@@ -3268,6 +3318,13 @@ out:
3268unwritable_page: 3318unwritable_page:
3269 page_cache_release(page); 3319 page_cache_release(page);
3270 return ret; 3320 return ret;
3321uncharge_out:
 3322 /* the filesystem's fault handler returned an error */
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3271} 3328}
3272 3329
3273static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
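
The __do_fault() change above moves the COW page allocation and memcg charge in front of lock_page() on the file cache page, so the potentially slow allocation no longer happens while the page lock is held, and the error paths uncharge and release the pre-allocated page. A small userspace sketch of the same "allocate before taking the lock" ordering, with a pthread mutex standing in for the page lock (hypothetical names, not kernel code):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static char cached_page[4096];          /* stands in for the locked file cache page */

    /* Allocate the private copy first, then hold the lock only for the copy itself. */
    static char *cow_fault(void)
    {
        char *cow_page = malloc(sizeof(cached_page));  /* may be slow; done without the lock */

        if (!cow_page)
            return NULL;                               /* VM_FAULT_OOM equivalent */

        pthread_mutex_lock(&page_lock);
        memcpy(cow_page, cached_page, sizeof(cached_page));
        pthread_mutex_unlock(&page_lock);
        return cow_page;
    }
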
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..6e7d8b21dbf 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37/*
 38 * online_page_callback contains a pointer to the current page-onlining
 39 * function. Initially it is generic_online_page(). If required, it can be
 40 * changed by calling set_online_page_callback() to register a callback and
 41 * restore_online_page_callback() to restore the generic one.
42 */
43
44static void generic_online_page(struct page *page);
45
46static online_page_callback_t online_page_callback = generic_online_page;
47
37DEFINE_MUTEX(mem_hotplug_mutex); 48DEFINE_MUTEX(mem_hotplug_mutex);
38 49
39void lock_memory_hotplug(void) 50void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361} 372}
362EXPORT_SYMBOL_GPL(__remove_pages); 373EXPORT_SYMBOL_GPL(__remove_pages);
363 374
364void online_page(struct page *page) 375int set_online_page_callback(online_page_callback_t callback)
376{
377 int rc = -EINVAL;
378
379 lock_memory_hotplug();
380
381 if (online_page_callback == generic_online_page) {
382 online_page_callback = callback;
383 rc = 0;
384 }
385
386 unlock_memory_hotplug();
387
388 return rc;
389}
390EXPORT_SYMBOL_GPL(set_online_page_callback);
391
392int restore_online_page_callback(online_page_callback_t callback)
393{
394 int rc = -EINVAL;
395
396 lock_memory_hotplug();
397
398 if (online_page_callback == callback) {
399 online_page_callback = generic_online_page;
400 rc = 0;
401 }
402
403 unlock_memory_hotplug();
404
405 return rc;
406}
407EXPORT_SYMBOL_GPL(restore_online_page_callback);
408
409void __online_page_set_limits(struct page *page)
365{ 410{
366 unsigned long pfn = page_to_pfn(page); 411 unsigned long pfn = page_to_pfn(page);
367 412
368 totalram_pages++;
369 if (pfn >= num_physpages) 413 if (pfn >= num_physpages)
370 num_physpages = pfn + 1; 414 num_physpages = pfn + 1;
415}
416EXPORT_SYMBOL_GPL(__online_page_set_limits);
417
418void __online_page_increment_counters(struct page *page)
419{
420 totalram_pages++;
371 421
372#ifdef CONFIG_HIGHMEM 422#ifdef CONFIG_HIGHMEM
373 if (PageHighMem(page)) 423 if (PageHighMem(page))
374 totalhigh_pages++; 424 totalhigh_pages++;
375#endif 425#endif
426}
427EXPORT_SYMBOL_GPL(__online_page_increment_counters);
376 428
429void __online_page_free(struct page *page)
430{
377 ClearPageReserved(page); 431 ClearPageReserved(page);
378 init_page_count(page); 432 init_page_count(page);
379 __free_page(page); 433 __free_page(page);
380} 434}
435EXPORT_SYMBOL_GPL(__online_page_free);
436
437static void generic_online_page(struct page *page)
438{
439 __online_page_set_limits(page);
440 __online_page_increment_counters(page);
441 __online_page_free(page);
442}
381 443
382static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 444static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
383 void *arg) 445 void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
388 if (PageReserved(pfn_to_page(start_pfn))) 450 if (PageReserved(pfn_to_page(start_pfn)))
389 for (i = 0; i < nr_pages; i++) { 451 for (i = 0; i < nr_pages; i++) {
390 page = pfn_to_page(start_pfn + i); 452 page = pfn_to_page(start_pfn + i);
391 online_page(page); 453 (*online_page_callback)(page);
392 onlined_pages++; 454 onlined_pages++;
393 } 455 }
394 *(unsigned long *)arg = onlined_pages; 456 *(unsigned long *)arg = onlined_pages;
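
set_online_page_callback()/restore_online_page_callback() above implement a single-slot callback registration: the slot can only be replaced while it still holds the default, and only the current owner can restore it, all under the hotplug lock. A userspace sketch of that contract with a pthread mutex instead of the hotplug mutex (the names are illustrative, not kernel APIs):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    typedef void (*online_page_cb_t)(unsigned long pfn);

    static void generic_online_page_cb(unsigned long pfn)
    {
        printf("generic onlining of pfn %lu\n", pfn);
    }

    static online_page_cb_t online_page_cb = generic_online_page_cb;
    static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

    static int set_online_page_cb(online_page_cb_t cb)
    {
        int rc = -EINVAL;

        pthread_mutex_lock(&hotplug_lock);
        if (online_page_cb == generic_online_page_cb) {  /* only one registration at a time */
            online_page_cb = cb;
            rc = 0;
        }
        pthread_mutex_unlock(&hotplug_lock);
        return rc;
    }

    static int restore_online_page_cb(online_page_cb_t cb)
    {
        int rc = -EINVAL;

        pthread_mutex_lock(&hotplug_lock);
        if (online_page_cb == cb) {                      /* only the current owner may restore */
            online_page_cb = generic_online_page_cb;
            rc = 0;
        }
        pthread_mutex_unlock(&hotplug_lock);
        return rc;
    }
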
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54..2775fd04924 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,7 @@
93 93
94#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 95#include <asm/uaccess.h>
96#include <linux/random.h>
96 97
97#include "internal.h" 98#include "internal.h"
98 99
@@ -643,14 +644,22 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
643 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
644 return -EFAULT; 645 return -EFAULT;
645 646
647 if (start > vma->vm_start)
648 prev = vma;
649
646 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 650 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647 next = vma->vm_next; 651 next = vma->vm_next;
648 vmstart = max(start, vma->vm_start); 652 vmstart = max(start, vma->vm_start);
649 vmend = min(end, vma->vm_end); 653 vmend = min(end, vma->vm_end);
650 654
651 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 655 if (mpol_equal(vma_policy(vma), new_pol))
656 continue;
657
658 pgoff = vma->vm_pgoff +
659 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
652 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 660 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653 vma->anon_vma, vma->vm_file, pgoff, new_pol); 661 vma->anon_vma, vma->vm_file, pgoff,
662 new_pol);
654 if (prev) { 663 if (prev) {
655 vma = prev; 664 vma = prev;
656 next = vma->vm_next; 665 next = vma->vm_next;
@@ -1411,7 +1420,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1420 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412 1421
1413 if (!err && nmask) { 1422 if (!err && nmask) {
1414 err = copy_from_user(bm, nm, alloc_size); 1423 unsigned long copy_size;
1424 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1425 err = copy_from_user(bm, nm, copy_size);
1415 /* ensure entire bitmap is zeroed */ 1426 /* ensure entire bitmap is zeroed */
1416 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1427 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417 err |= compat_put_bitmap(nmask, bm, nr_bits); 1428 err |= compat_put_bitmap(nmask, bm, nr_bits);
@@ -1645,6 +1656,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1645 return interleave_nodes(pol); 1656 return interleave_nodes(pol);
1646} 1657}
1647 1658
1659/*
1660 * Return the bit number of a random bit set in the nodemask.
1661 * (returns -1 if nodemask is empty)
1662 */
1663int node_random(const nodemask_t *maskp)
1664{
1665 int w, bit = -1;
1666
1667 w = nodes_weight(*maskp);
1668 if (w)
1669 bit = bitmap_ord_to_pos(maskp->bits,
1670 get_random_int() % w, MAX_NUMNODES);
1671 return bit;
1672}
1673
1648#ifdef CONFIG_HUGETLBFS 1674#ifdef CONFIG_HUGETLBFS
1649/* 1675/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1676 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
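
node_random() picks a uniformly random node from a nodemask by counting the set bits (the weight), drawing a random index below that weight, and mapping the ordinal back to a bit position. A userspace sketch of the same ordinal-to-position walk over a plain unsigned long mask, with rand() and an explicit loop standing in for get_random_int() and bitmap_ord_to_pos():

    #include <stdio.h>
    #include <stdlib.h>

    /* Return the position of the ord-th set bit in mask, or -1 if there is none. */
    static int bit_ord_to_pos(unsigned long mask, int ord)
    {
        int pos;

        for (pos = 0; mask; pos++, mask >>= 1) {
            if (!(mask & 1))
                continue;
            if (ord-- == 0)
                return pos;
        }
        return -1;
    }

    /* Analogue of node_random(): -1 for an empty mask, else a random set bit. */
    static int mask_random(unsigned long mask)
    {
        int w = __builtin_popcountl(mask);

        return w ? bit_ord_to_pos(mask, rand() % w) : -1;
    }

    int main(void)
    {
        printf("%d\n", mask_random(0x29UL));    /* prints one of 0, 3 or 5 */
        return 0;
    }
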
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e67741..14d0a6a632f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..636a86876ff 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
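
The mincore.c hunk above teaches mincore_page() to follow a shmem/tmpfs swap entry into the swap cache, so tmpfs pages that have moved to swapcache are still reported as in core. From userspace the result is simply what mincore(2) returns; a minimal example that maps a file and prints per-page residency (error handling trimmed, file path is just an example):

    #define _DEFAULT_SOURCE         /* for mincore() with strict -std flags */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
        struct stat st;
        unsigned char *vec;
        void *map;
        size_t pages, i;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
            return 1;

        pages = (st.st_size + page - 1) / page;
        map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        vec = malloc(pages);
        if (map == MAP_FAILED || !vec)
            return 1;

        if (mincore(map, st.st_size, vec) == 0)
            for (i = 0; i < pages; i++)
                printf("page %zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

        free(vec);
        munmap(map, st.st_size);
        close(fd);
        return 0;
    }
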
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8..a65efd4db3e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
122 return 0; 122 return 0;
123 123
124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
125 unsigned long n; 125 free = global_page_state(NR_FREE_PAGES);
126 free += global_page_state(NR_FILE_PAGES);
127
128 /*
129 * shmem pages shouldn't be counted as free in this
 130 * case: they can't be purged, only swapped out, and
131 * that won't affect the overall amount of available
132 * memory in the system.
133 */
134 free -= global_page_state(NR_SHMEM);
126 135
127 free = global_page_state(NR_FILE_PAGES);
128 free += nr_swap_pages; 136 free += nr_swap_pages;
129 137
130 /* 138 /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
136 free += global_page_state(NR_SLAB_RECLAIMABLE); 144 free += global_page_state(NR_SLAB_RECLAIMABLE);
137 145
138 /* 146 /*
139 * Leave the last 3% for root
140 */
141 if (!cap_sys_admin)
142 free -= free / 32;
143
144 if (free > pages)
145 return 0;
146
147 /*
148 * nr_free_pages() is very expensive on large systems,
149 * only call if we're about to fail.
150 */
151 n = nr_free_pages();
152
153 /*
154 * Leave reserved pages. The pages are not for anonymous pages. 147 * Leave reserved pages. The pages are not for anonymous pages.
155 */ 148 */
156 if (n <= totalreserve_pages) 149 if (free <= totalreserve_pages)
157 goto error; 150 goto error;
158 else 151 else
159 n -= totalreserve_pages; 152 free -= totalreserve_pages;
160 153
161 /* 154 /*
162 * Leave the last 3% for root 155 * Leave the last 3% for root
163 */ 156 */
164 if (!cap_sys_admin) 157 if (!cap_sys_admin)
165 n -= n / 32; 158 free -= free / 32;
166 free += n;
167 159
168 if (free > pages) 160 if (free > pages)
169 return 0; 161 return 0;
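
After this change the OVERCOMMIT_GUESS estimate in __vm_enough_memory() is built once from the vmstat counters instead of falling back to the expensive nr_free_pages(): free pages plus page cache minus shmem, plus swap and reclaimable slab, minus the reserves, with the last 3% (1/32) held back from unprivileged callers. The same arithmetic as a standalone sketch; the input values are made up, in the kernel they come from global_page_state() and friends:

    #include <stdbool.h>
    #include <stdio.h>

    struct vm_counters {
        unsigned long nr_free_pages;
        unsigned long nr_file_pages;
        unsigned long nr_shmem;
        unsigned long nr_swap_pages;
        unsigned long nr_slab_reclaimable;
        unsigned long totalreserve_pages;
    };

    /* Rough analogue of the OVERCOMMIT_GUESS branch of __vm_enough_memory(). */
    static bool guess_enough_memory(const struct vm_counters *c,
                                    unsigned long pages, bool cap_sys_admin)
    {
        unsigned long free = c->nr_free_pages + c->nr_file_pages;

        free -= c->nr_shmem;                   /* shmem can only be swapped, not dropped */
        free += c->nr_swap_pages;
        free += c->nr_slab_reclaimable;

        if (free <= c->totalreserve_pages)     /* reserves are not for anonymous memory */
            return false;
        free -= c->totalreserve_pages;

        if (!cap_sys_admin)                    /* leave the last 3% for root */
            free -= free / 32;

        return free > pages;
    }

    int main(void)
    {
        struct vm_counters c = {
            .nr_free_pages = 20000, .nr_file_pages = 50000, .nr_shmem = 5000,
            .nr_swap_pages = 100000, .nr_slab_reclaimable = 3000,
            .totalreserve_pages = 4000,
        };

        printf("%s\n", guess_enough_memory(&c, 150000, false) ? "allow" : "deny");
        return 0;
    }
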
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a397..4358032566e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
1087 * it's being traced - otherwise breakpoints set in it may interfere 1086 * it's being traced - otherwise breakpoints set in it may interfere
1088 * with another untraced process 1087 * with another untraced process
1089 */ 1088 */
1090 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1089 if ((flags & MAP_PRIVATE) && current->ptrace)
1091 vm_flags &= ~VM_MAYSHARE; 1090 vm_flags &= ~VM_MAYSHARE;
1092 1091
1093 return vm_flags; 1092 return vm_flags;
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1885 return 0; 1884 return 0;
1886 1885
1887 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1886 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1888 unsigned long n; 1887 free = global_page_state(NR_FREE_PAGES);
1888 free += global_page_state(NR_FILE_PAGES);
1889
1890 /*
1891 * shmem pages shouldn't be counted as free in this
 1892 * case: they can't be purged, only swapped out, and
1893 * that won't affect the overall amount of available
1894 * memory in the system.
1895 */
1896 free -= global_page_state(NR_SHMEM);
1889 1897
1890 free = global_page_state(NR_FILE_PAGES);
1891 free += nr_swap_pages; 1898 free += nr_swap_pages;
1892 1899
1893 /* 1900 /*
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1899 free += global_page_state(NR_SLAB_RECLAIMABLE); 1906 free += global_page_state(NR_SLAB_RECLAIMABLE);
1900 1907
1901 /* 1908 /*
1902 * Leave the last 3% for root
1903 */
1904 if (!cap_sys_admin)
1905 free -= free / 32;
1906
1907 if (free > pages)
1908 return 0;
1909
1910 /*
1911 * nr_free_pages() is very expensive on large systems,
1912 * only call if we're about to fail.
1913 */
1914 n = nr_free_pages();
1915
1916 /*
1917 * Leave reserved pages. The pages are not for anonymous pages. 1909 * Leave reserved pages. The pages are not for anonymous pages.
1918 */ 1910 */
1919 if (n <= totalreserve_pages) 1911 if (free <= totalreserve_pages)
1920 goto error; 1912 goto error;
1921 else 1913 else
1922 n -= totalreserve_pages; 1914 free -= totalreserve_pages;
1923 1915
1924 /* 1916 /*
1925 * Leave the last 3% for root 1917 * Leave the last 3% for root
1926 */ 1918 */
1927 if (!cap_sys_admin) 1919 if (!cap_sys_admin)
1928 n -= n / 32; 1920 free -= free / 32;
1929 free += n;
1930 1921
1931 if (free > pages) 1922 if (free > pages)
1932 return 0; 1923 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca35..e9a17857a20 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p,
162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
163 const nodemask_t *nodemask, unsigned long totalpages) 163 const nodemask_t *nodemask, unsigned long totalpages)
164{ 164{
165 int points; 165 long points;
166 166
167 if (oom_unkillable_task(p, mem, nodemask)) 167 if (oom_unkillable_task(p, mem, nodemask))
168 return 0; 168 return 0;
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 303 do_each_thread(g, p) {
304 unsigned int points; 304 unsigned int points;
305 305
306 if (!p->mm) 306 if (p->exit_state)
307 continue; 307 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 309 continue;
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 */ 319 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
321 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
322 if (!p->mm)
323 continue;
322 324
323 if (p->flags & PF_EXITING) { 325 if (p->flags & PF_EXITING) {
324 /* 326 /*
@@ -339,8 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339 * then wait for it to finish before killing 341 * then wait for it to finish before killing
340 * some other task unnecessarily. 342 * some other task unnecessarily.
341 */ 343 */
342 if (!(task_ptrace(p->group_leader) & 344 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL); 345 return ERR_PTR(-1UL);
345 } 346 }
346 } 347 }
@@ -488,7 +489,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
488 489
489 /* 490 /*
490 * If any of p's children has a different mm and is eligible for kill, 491 * If any of p's children has a different mm and is eligible for kill,
491 * the one with the highest badness() score is sacrificed for its 492 * the one with the highest oom_badness() score is sacrificed for its
492 * parent. This attempts to lose the minimal amount of work done while 493 * parent. This attempts to lose the minimal amount of work done while
493 * still freeing memory. 494 * still freeing memory.
494 */ 495 */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f69886242..0e309cd1b5b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42#define MAX_PAUSE max(HZ/5, 1)
43
44/*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
111 121
112/* End of sysctl-exported parameters */ 122/* End of sysctl-exported parameters */
113 123
124unsigned long global_dirty_limit;
114 125
115/* 126/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 230 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 232{
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224} 236}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246{ 258{
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254} 261}
255 262
256static inline void task_dirties_fraction(struct task_struct *tsk, 263static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284#define TASK_LIMIT_FRACTION 8
277static unsigned long task_dirty_limit(struct task_struct *tsk, 285static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279{ 287{
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291} 299}
292 300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
293/* 307/*
294 * 308 *
295 */ 309 */
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398} 412}
399 413
414static unsigned long hard_dirty_limit(unsigned long thresh)
415{
416 return max(thresh, global_dirty_limit);
417}
418
400/* 419/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438} 458}
439 459
440/* 460/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
442 * 464 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * And the "limit" in the name is not seriously taken as hard limit in
468 * balance_dirty_pages().
469 *
470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
468 return bdi_dirty; 495 return bdi_dirty;
469} 496}
470 497
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501{
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536}
537
538/*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, maintain
543 * global_dirty_limit for tracking slowly down to the knocked down dirty
544 * threshold.
545 */
546static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547{
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569update:
570 global_dirty_limit = limit;
571}
572
573static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576{
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
 581 * check locklessly first to optimize away locking most of the time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592}
593
594void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600{
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628}
629
630static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636{
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643}
644
471/* 645/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
478static void balance_dirty_pages(struct address_space *mapping, 652static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480{ 654{
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
538 * bdi or process from holding back light ones; The latter is 712 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard. 713 * the last resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,29 @@ static void balance_dirty_pages(struct address_space *mapping,
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh &&
758 bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
571 761
572 /* 762 /*
573 * Increase the delay for each loop, up to our previous 763 * Increase the delay for each loop, up to our previous
@@ -578,7 +768,8 @@ static void balance_dirty_pages(struct address_space *mapping,
578 pause = HZ / 10; 768 pause = HZ / 10;
579 } 769 }
580 770
581 if (!dirty_exceeded && bdi->dirty_exceeded) 771 /* Clear dirty_exceeded flag only when no task can exceed the limit */
772 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 773 bdi->dirty_exceeded = 0;
583 774
584 if (writeback_in_progress(bdi)) 775 if (writeback_in_progress(bdi))
@@ -626,9 +817,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 817void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 818 unsigned long nr_pages_dirtied)
628{ 819{
820 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 821 unsigned long ratelimit;
630 unsigned long *p; 822 unsigned long *p;
631 823
824 if (!bdi_cap_account_dirty(bdi))
825 return;
826
632 ratelimit = ratelimit_pages; 827 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 828 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 829 ratelimit = 8;
@@ -892,12 +1087,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1087 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1088 cycled = 1; /* ignore range_cyclic tests */
894 } 1089 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1090 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1091 tag = PAGECACHE_TAG_TOWRITE;
897 else 1092 else
898 tag = PAGECACHE_TAG_DIRTY; 1093 tag = PAGECACHE_TAG_DIRTY;
899retry: 1094retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1095 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1096 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1097 done_index = index;
903 while (!done && (index <= end)) { 1098 while (!done && (index <= end)) {
@@ -1141,7 +1336,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1336void account_page_writeback(struct page *page)
1142{ 1337{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1338 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1339}
1146EXPORT_SYMBOL(account_page_writeback); 1340EXPORT_SYMBOL(account_page_writeback);
1147 1341
@@ -1358,8 +1552,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1552 } else {
1359 ret = TestClearPageWriteback(page); 1553 ret = TestClearPageWriteback(page);
1360 } 1554 }
1361 if (ret) 1555 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1556 dec_zone_page_state(page, NR_WRITEBACK);
1557 inc_zone_page_state(page, NR_WRITTEN);
1558 }
1363 return ret; 1559 return ret;
1364} 1560}
1365 1561
@@ -1405,10 +1601,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1601 */
1406int mapping_tagged(struct address_space *mapping, int tag) 1602int mapping_tagged(struct address_space *mapping, int tag)
1407{ 1603{
1408 int ret; 1604 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 1605}
1414EXPORT_SYMBOL(mapping_tagged); 1606EXPORT_SYMBOL(mapping_tagged);
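
bdi_update_write_bandwidth() above estimates per-device write bandwidth every BANDWIDTH_INTERVAL by blending the rate observed in the last interval into the previous estimate over a roughly 3-second period, with a second smoothing stage on avg_write_bandwidth that only follows confirmed trends and so filters out sudden spikes. A standalone sketch of the two-stage estimator in plain integer math; HZ, the period and the sample values here are assumptions for illustration, not the kernel's configuration:

    #include <stdio.h>

    #define HZ 100UL
    #define BW_PERIOD 512UL                 /* roundup_pow_of_two(3 * HZ) for HZ=100 */

    struct bw_state {
        unsigned long write_bandwidth;      /* pages per second, last-interval weighted */
        unsigned long avg_write_bandwidth;  /* extra smoothing against sudden spikes */
        unsigned long written_stamp;        /* pages written at the previous update */
    };

    static void update_write_bandwidth(struct bw_state *s, unsigned long elapsed,
                                       unsigned long written)
    {
        unsigned long avg = s->avg_write_bandwidth;
        unsigned long old = s->write_bandwidth;
        unsigned long long bw = (unsigned long long)(written - s->written_stamp) * HZ;

        if (elapsed > BW_PERIOD) {
            bw /= elapsed;                  /* interval too long: take the sample as-is */
            avg = bw;
        } else {
            /* bw = (sample * elapsed + old * (period - elapsed)) / period */
            bw += (unsigned long long)old * (BW_PERIOD - elapsed);
            bw >>= __builtin_ctzl(BW_PERIOD);

            /* move avg an eighth of the way only when the new sample confirms the trend */
            if (avg > old && old >= (unsigned long)bw)
                avg -= (avg - old) >> 3;
            if (avg < old && old <= (unsigned long)bw)
                avg += (old - avg) >> 3;
        }

        s->write_bandwidth = bw;
        s->avg_write_bandwidth = avg;
        s->written_stamp = written;
    }

    int main(void)
    {
        struct bw_state s = { .write_bandwidth = 25600, .avg_write_bandwidth = 25600 };
        unsigned long t, written = 0;

        for (t = 1; t <= 5; t++) {
            written += 4000;                /* 4000 pages per 20-jiffy interval */
            update_write_bandwidth(&s, 20, written);
            printf("bw=%lu avg=%lu\n", s.write_bandwidth, s.avg_write_bandwidth);
        }
        return 0;
    }
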
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab..8859578e4bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 128 gfp_allowed_mask &= ~GFP_IOFS;
129} 129}
130
131static bool pm_suspending(void)
132{
133 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
134 return false;
135 return true;
136}
137
138#else
139
140static bool pm_suspending(void)
141{
142 return false;
143}
130#endif /* CONFIG_PM_SLEEP */ 144#endif /* CONFIG_PM_SLEEP */
131 145
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 146#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
176}; 190};
177 191
178int min_free_kbytes = 1024; 192int min_free_kbytes = 1024;
193int min_free_order_shift = 1;
179 194
180static unsigned long __meminitdata nr_kernel_pages; 195static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 196static unsigned long __meminitdata nr_all_pages;
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order)
355 __SetPageHead(page); 370 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 372 struct page *p = page + i;
358
359 __SetPageTail(p); 373 __SetPageTail(p);
374 set_page_count(p, 0);
360 p->first_page = page; 375 p->first_page = page;
361 } 376 }
362} 377}
@@ -1370,21 +1385,12 @@ failed:
1370 1385
1371#ifdef CONFIG_FAIL_PAGE_ALLOC 1386#ifdef CONFIG_FAIL_PAGE_ALLOC
1372 1387
1373static struct fail_page_alloc_attr { 1388static struct {
1374 struct fault_attr attr; 1389 struct fault_attr attr;
1375 1390
1376 u32 ignore_gfp_highmem; 1391 u32 ignore_gfp_highmem;
1377 u32 ignore_gfp_wait; 1392 u32 ignore_gfp_wait;
1378 u32 min_order; 1393 u32 min_order;
1379
1380#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1381
1382 struct dentry *ignore_gfp_highmem_file;
1383 struct dentry *ignore_gfp_wait_file;
1384 struct dentry *min_order_file;
1385
1386#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1387
1388} fail_page_alloc = { 1394} fail_page_alloc = {
1389 .attr = FAULT_ATTR_INITIALIZER, 1395 .attr = FAULT_ATTR_INITIALIZER,
1390 .ignore_gfp_wait = 1, 1396 .ignore_gfp_wait = 1,
@@ -1418,36 +1424,27 @@ static int __init fail_page_alloc_debugfs(void)
1418{ 1424{
1419 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1425 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1420 struct dentry *dir; 1426 struct dentry *dir;
1421 int err;
1422
1423 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1424 "fail_page_alloc");
1425 if (err)
1426 return err;
1427 dir = fail_page_alloc.attr.dentries.dir;
1428
1429 fail_page_alloc.ignore_gfp_wait_file =
1430 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1431 &fail_page_alloc.ignore_gfp_wait);
1432
1433 fail_page_alloc.ignore_gfp_highmem_file =
1434 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1435 &fail_page_alloc.ignore_gfp_highmem);
1436 fail_page_alloc.min_order_file =
1437 debugfs_create_u32("min-order", mode, dir,
1438 &fail_page_alloc.min_order);
1439
1440 if (!fail_page_alloc.ignore_gfp_wait_file ||
1441 !fail_page_alloc.ignore_gfp_highmem_file ||
1442 !fail_page_alloc.min_order_file) {
1443 err = -ENOMEM;
1444 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1445 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1446 debugfs_remove(fail_page_alloc.min_order_file);
1447 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1448 }
1449 1427
1450 return err; 1428 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1429 &fail_page_alloc.attr);
1430 if (IS_ERR(dir))
1431 return PTR_ERR(dir);
1432
1433 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1434 &fail_page_alloc.ignore_gfp_wait))
1435 goto fail;
1436 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1437 &fail_page_alloc.ignore_gfp_highmem))
1438 goto fail;
1439 if (!debugfs_create_u32("min-order", mode, dir,
1440 &fail_page_alloc.min_order))
1441 goto fail;
1442
1443 return 0;
1444fail:
1445 debugfs_remove_recursive(dir);
1446
1447 return -ENOMEM;
1451} 1448}
1452 1449
1453late_initcall(fail_page_alloc_debugfs); 1450late_initcall(fail_page_alloc_debugfs);
@@ -1487,7 +1484,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1487 free_pages -= z->free_area[o].nr_free << o; 1484 free_pages -= z->free_area[o].nr_free << o;
1488 1485
1489 /* Require fewer higher order pages to be free */ 1486 /* Require fewer higher order pages to be free */
1490 min >>= 1; 1487 min >>= min_free_order_shift;
1491 1488
1492 if (free_pages <= min) 1489 if (free_pages <= min)
1493 return false; 1490 return false;
@@ -1616,6 +1613,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1613 set_bit(i, zlc->fullzones);
1617} 1614}
1618 1615
1616/*
1617 * clear all zones full, called after direct reclaim makes progress so that
1618 * a zone that was recently full is not skipped over for up to a second
1619 */
1620static void zlc_clear_zones_full(struct zonelist *zonelist)
1621{
1622 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1623
1624 zlc = zonelist->zlcache_ptr;
1625 if (!zlc)
1626 return;
1627
1628 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1629}
1630
1619#else /* CONFIG_NUMA */ 1631#else /* CONFIG_NUMA */
1620 1632
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1633static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1644,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1644static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1645{
1634} 1646}
1647
1648static void zlc_clear_zones_full(struct zonelist *zonelist)
1649{
1650}
1635#endif /* CONFIG_NUMA */ 1651#endif /* CONFIG_NUMA */
1636 1652
1637/* 1653/*
@@ -1664,7 +1680,7 @@ zonelist_scan:
1664 continue; 1680 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1681 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1682 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1683 continue;
1668 1684
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1685 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1686 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1692,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1692 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1693 goto try_this_zone;
1678 1694
1695 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1696 /*
1697 * we do zlc_setup if there are multiple nodes
1698 * and before considering the first zone allowed
1699 * by the cpuset.
1700 */
1701 allowednodes = zlc_setup(zonelist, alloc_flags);
1702 zlc_active = 1;
1703 did_zlc_setup = 1;
1704 }
1705
1679 if (zone_reclaim_mode == 0) 1706 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1707 goto this_zone_full;
1681 1708
1709 /*
1710 * As we may have just activated ZLC, check if the first
1711 * eligible zone has failed zone_reclaim recently.
1712 */
1713 if (NUMA_BUILD && zlc_active &&
1714 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1715 continue;
1716
1682 ret = zone_reclaim(zone, gfp_mask, order); 1717 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1718 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1719 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1720 /* did not scan */
1686 goto try_next_zone; 1721 continue;
1687 case ZONE_RECLAIM_FULL: 1722 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1723 /* scanned but unreclaimable */
1689 goto this_zone_full; 1724 continue;
1690 default: 1725 default:
1691 /* did we reclaim enough */ 1726 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1727 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1738,6 @@ try_this_zone:
1703this_zone_full: 1738this_zone_full:
1704 if (NUMA_BUILD) 1739 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1740 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1741 }
1717 1742
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1743 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1979,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 1979 if (unlikely(!(*did_some_progress)))
1955 return NULL; 1980 return NULL;
1956 1981
1982 /* After successful reclaim, reconsider all zones for allocation */
1983 if (NUMA_BUILD)
1984 zlc_clear_zones_full(zonelist);
1985
1957retry: 1986retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 1987 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 1988 zonelist, high_zoneidx,
@@ -2193,6 +2222,14 @@ rebalance:
2193 2222
2194 goto restart; 2223 goto restart;
2195 } 2224 }
2225
2226 /*
2227 * Suspend converts GFP_KERNEL to __GFP_WAIT which can
2228 * prevent reclaim making forward progress without
2229 * invoking OOM. Bail if we are suspending
2230 */
2231 if (pm_suspending())
2232 goto nopage;
2196 } 2233 }
2197 2234
2198 /* Check if we should retry the allocation */ 2235 /* Check if we should retry the allocation */
@@ -3356,9 +3393,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3356 unsigned long block_migratetype; 3393 unsigned long block_migratetype;
3357 int reserve; 3394 int reserve;
3358 3395
3359 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3396 /*
 3397 * Get the start pfn, end pfn and the number of blocks to reserve.
 3398 * We have to be careful to align start_pfn to pageblock_nr_pages to
3399 * make sure that we always check pfn_valid for the first page in
3400 * the block.
3401 */
3360 start_pfn = zone->zone_start_pfn; 3402 start_pfn = zone->zone_start_pfn;
3361 end_pfn = start_pfn + zone->spanned_pages; 3403 end_pfn = start_pfn + zone->spanned_pages;
3404 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3362 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3405 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3363 pageblock_order; 3406 pageblock_order;
3364 3407
@@ -4585,6 +4628,60 @@ void __init sort_node_map(void)
4585 cmp_node_active_region, NULL); 4628 cmp_node_active_region, NULL);
4586} 4629}
4587 4630
4631/**
4632 * node_map_pfn_alignment - determine the maximum internode alignment
4633 *
4634 * This function should be called after node map is populated and sorted.
4635 * It calculates the maximum power of two alignment which can distinguish
4636 * all the nodes.
4637 *
4638 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4639 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4640 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4641 * shifted, 1GiB is enough and this function will indicate so.
4642 *
4643 * This is used to test whether pfn -> nid mapping of the chosen memory
4644 * model has fine enough granularity to avoid incorrect mapping for the
4645 * populated node map.
4646 *
4647 * Returns the determined alignment in pfn's. 0 if there is no alignment
4648 * requirement (single node).
4649 */
4650unsigned long __init node_map_pfn_alignment(void)
4651{
4652 unsigned long accl_mask = 0, last_end = 0;
4653 int last_nid = -1;
4654 int i;
4655
4656 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4657 int nid = early_node_map[i].nid;
4658 unsigned long start = early_node_map[i].start_pfn;
4659 unsigned long end = early_node_map[i].end_pfn;
4660 unsigned long mask;
4661
4662 if (!start || last_nid < 0 || last_nid == nid) {
4663 last_nid = nid;
4664 last_end = end;
4665 continue;
4666 }
4667
4668 /*
4669 * Start with a mask granular enough to pin-point to the
4670 * start pfn and tick off bits one-by-one until it becomes
4671 * too coarse to separate the current node from the last.
4672 */
4673 mask = ~((1 << __ffs(start)) - 1);
4674 while (mask && last_end <= (start & (mask << 1)))
4675 mask <<= 1;
4676
4677 /* accumulate all internode masks */
4678 accl_mask |= mask;
4679 }
4680
4681 /* convert mask to number of pages */
4682 return ~accl_mask + 1;
4683}
4684
4588/* Find the lowest pfn for a node */ 4685/* Find the lowest pfn for a node */
4589static unsigned long __init find_min_pfn_for_node(int nid) 4686static unsigned long __init find_min_pfn_for_node(int nid)
4590{ 4687{
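
The one-line change to __zone_watermark_ok() replaces the hard-coded halving of the watermark per order with the new min_free_order_shift tunable. The per-order loop works like the sketch below: pages at orders lower than the request are subtracted from the free count and the required minimum shrinks by the shift at each step. This is a toy model only; lowmem reserves and the ALLOC_* flags handled by the real function are left out:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11

    static int min_free_order_shift = 1;    /* the new tunable; 1 keeps the old behaviour */

    static bool zone_watermark_ok(unsigned long free_pages, unsigned long mark,
                                  unsigned int order,
                                  const unsigned long nr_free[MAX_ORDER])
    {
        unsigned long min = mark;
        unsigned int o;

        if (free_pages <= min)              /* order-0 check */
            return false;

        for (o = 0; o < order; o++) {
            /* blocks of this order are useless for a higher-order request */
            free_pages -= nr_free[o] << o;

            /* require fewer higher-order pages to be free */
            min >>= min_free_order_shift;

            if (free_pages <= min)
                return false;
        }
        return true;
    }

    int main(void)
    {
        unsigned long nr_free[MAX_ORDER] = { 600, 100, 30, 8, 2 };
        unsigned long free_pages = 600 + 100 * 2 + 30 * 4 + 8 * 8 + 2 * 16;

        printf("order-3 ok: %d\n", zone_watermark_ok(free_pages, 256, 3, nr_free));
        return 0;
    }
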
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293..39d216d535e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
225 unsigned long start, end, pfn; 225 unsigned long start, end, pfn;
226 int fail = 0; 226 int fail = 0;
227 227
228 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = SECTION_ALIGN_DOWN(start_pfn);
229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230 230
231 if (nid == -1) { 231 if (nid == -1) {
232 /* 232 /*
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
258{ 258{
259 unsigned long start, end, pfn; 259 unsigned long start, end, pfn;
260 260
261 start = start_pfn & ~(PAGES_PER_SECTION - 1); 261 start = SECTION_ALIGN_DOWN(start_pfn);
262 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 262 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
263 263
264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
265 __free_page_cgroup(pfn); 265 __free_page_cgroup(pfn);
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
537nomem: 537nomem:
538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
539 printk(KERN_INFO 539 printk(KERN_INFO
540 "swap_cgroup can be disabled by noswapaccount boot option\n"); 540 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
541 return -ENOMEM; 541 return -ENOMEM;
542} 542}
543 543
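
The page_cgroup.c hunk above is a pure cleanup: SECTION_ALIGN_DOWN()/SECTION_ALIGN_UP() compute the same section bounds the open-coded masking and ALIGN() did. A hedged standalone C sketch of that equivalence, with a made-up PAGES_PER_SECTION and macro bodies written from my reading of the helpers:

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION	0x8000UL	/* invented, power of two */
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start_pfn = 0x12345, nr_pages = 0x2000;

	/* the helpers reproduce the expressions the hunk replaced */
	assert(SECTION_ALIGN_DOWN(start_pfn) ==
	       (start_pfn & ~(PAGES_PER_SECTION - 1)));
	assert(SECTION_ALIGN_UP(start_pfn + nr_pages) ==
	       ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION));

	printf("start=%#lx end=%#lx\n",
	       SECTION_ALIGN_DOWN(start_pfn),
	       SECTION_ALIGN_UP(start_pfn + nr_pages));
	return 0;
}
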
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d53361..2f5cf10ff66 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
126 126
127 return 0; 127 return 0;
128} 128}
129#endif 129
130static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131{
132 struct vm_area_struct *vma;
133
134 /* We don't need vma lookup at all. */
135 if (!walk->hugetlb_entry)
136 return NULL;
137
138 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139 vma = find_vma(walk->mm, addr);
140 if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141 return vma;
142
143 return NULL;
144}
145
146#else /* CONFIG_HUGETLB_PAGE */
147static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148{
149 return NULL;
150}
151
152static int walk_hugetlb_range(struct vm_area_struct *vma,
153 unsigned long addr, unsigned long end,
154 struct mm_walk *walk)
155{
156 return 0;
157}
158
159#endif /* CONFIG_HUGETLB_PAGE */
160
161
130 162
131/** 163/**
132 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
144 * associated range, and a copy of the original mm_walk for access to 176 * associated range, and a copy of the original mm_walk for access to
145 * the ->private or ->mm fields. 177 * the ->private or ->mm fields.
146 * 178 *
147 * No locks are taken, but the bottom level iterator will map PTE 179 * Usually no locks are taken, but splitting transparent huge page may
180 * take page table lock. And the bottom level iterator will map PTE
148 * directories from highmem if necessary. 181 * directories from highmem if necessary.
149 * 182 *
150 * If any callback returns a non-zero value, the walk is aborted and 183 * If any callback returns a non-zero value, the walk is aborted and
151 * the return value is propagated back to the caller. Otherwise 0 is returned. 184 * the return value is propagated back to the caller. Otherwise 0 is returned.
185 *
186 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
187 * is !NULL.
152 */ 188 */
153int walk_page_range(unsigned long addr, unsigned long end, 189int walk_page_range(unsigned long addr, unsigned long end,
154 struct mm_walk *walk) 190 struct mm_walk *walk)
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
165 201
166 pgd = pgd_offset(walk->mm, addr); 202 pgd = pgd_offset(walk->mm, addr);
167 do { 203 do {
168 struct vm_area_struct *uninitialized_var(vma); 204 struct vm_area_struct *vma;
169 205
170 next = pgd_addr_end(addr, end); 206 next = pgd_addr_end(addr, end);
171 207
172#ifdef CONFIG_HUGETLB_PAGE
173 /* 208 /*
174 * handle hugetlb vma individually because pagetable walk for 209 * handle hugetlb vma individually because pagetable walk for
175 * the hugetlb page is dependent on the architecture and 210 * the hugetlb page is dependent on the architecture and
176 * we can't handle it in the same manner as non-huge pages. 211 * we can't handle it in the same manner as non-huge pages.
177 */ 212 */
178 vma = find_vma(walk->mm, addr); 213 vma = hugetlb_vma(addr, walk);
179 if (vma && is_vm_hugetlb_page(vma)) { 214 if (vma) {
180 if (vma->vm_end < next) 215 if (vma->vm_end < next)
181 next = vma->vm_end; 216 next = vma->vm_end;
182 /* 217 /*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
189 pgd = pgd_offset(walk->mm, next); 224 pgd = pgd_offset(walk->mm, next);
190 continue; 225 continue;
191 } 226 }
192#endif 227
193 if (pgd_none_or_clear_bad(pgd)) { 228 if (pgd_none_or_clear_bad(pgd)) {
194 if (walk->pte_hole) 229 if (walk->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 230 err = walk->pte_hole(addr, next, walk);
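
The pagewalk.c change above folds the hugetlb handling behind a hugetlb_vma() helper and documents that mmap_sem must be held for at least read whenever a hugetlb_entry callback is supplied. Below is a kernel-style sketch of a caller honoring that rule; it is not from the patch, the callback name and counting logic are invented, and the mm_walk/hugetlb_entry signatures follow my reading of include/linux/mm.h from this period.

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>

/* invented callback: count populated hugetlb PTEs in a range */
static int count_huge_pte(pte_t *pte, unsigned long hmask,
			  unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(*pte))
		(*count)++;
	return 0;
}

static unsigned long count_hugetlb_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.hugetlb_entry	= count_huge_pte,
		.mm		= mm,
		.private	= &count,
	};

	/* required whenever ->hugetlb_entry is set: hugetlb_vma() asserts it */
	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return count;
}
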
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04..bfad7246665 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end) 143 int page_start, int page_end)
144{ 144{
145 flush_cache_vunmap( 145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 146 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 147 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
148} 148}
149 149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end) 206 int page_start, int page_end)
207{ 207{
208 flush_tlb_kernel_range( 208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 209 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 210 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
211} 211}
212 212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages, 213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end) 284 int page_start, int page_end)
285{ 285{
286 flush_cache_vmap( 286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 287 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 288 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
289} 289}
290 290
291/** 291/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..0ae7a09141e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
116static int pcpu_nr_slots __read_mostly; 116static int pcpu_nr_slots __read_mostly;
117static size_t pcpu_chunk_struct_size __read_mostly; 117static size_t pcpu_chunk_struct_size __read_mostly;
118 118
119/* cpus with the lowest and highest unit numbers */ 119/* cpus with the lowest and highest unit addresses */
120static unsigned int pcpu_first_unit_cpu __read_mostly; 120static unsigned int pcpu_low_unit_cpu __read_mostly;
121static unsigned int pcpu_last_unit_cpu __read_mostly; 121static unsigned int pcpu_high_unit_cpu __read_mostly;
122 122
123/* the address of the first chunk which starts with the kernel static area */ 123/* the address of the first chunk which starts with the kernel static area */
124void *pcpu_base_addr __read_mostly; 124void *pcpu_base_addr __read_mostly;
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
984{ 984{
985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
986 bool in_first_chunk = false; 986 bool in_first_chunk = false;
987 unsigned long first_start, first_end; 987 unsigned long first_low, first_high;
988 unsigned int cpu; 988 unsigned int cpu;
989 989
990 /* 990 /*
991 * The following test on first_start/end isn't strictly 991 * The following test on unit_low/high isn't strictly
992 * necessary but will speed up lookups of addresses which 992 * necessary but will speed up lookups of addresses which
993 * aren't in the first chunk. 993 * aren't in the first chunk.
994 */ 994 */
995 first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); 995 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
996 first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, 996 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
997 pcpu_unit_pages); 997 pcpu_unit_pages);
998 if ((unsigned long)addr >= first_start && 998 if ((unsigned long)addr >= first_low &&
999 (unsigned long)addr < first_end) { 999 (unsigned long)addr < first_high) {
1000 for_each_possible_cpu(cpu) { 1000 for_each_possible_cpu(cpu) {
1001 void *start = per_cpu_ptr(base, cpu); 1001 void *start = per_cpu_ptr(base, cpu);
1002 1002
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1011 if (!is_vmalloc_addr(addr)) 1011 if (!is_vmalloc_addr(addr))
1012 return __pa(addr); 1012 return __pa(addr);
1013 else 1013 else
1014 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr)) +
1015 offset_in_page(addr);
1015 } else 1016 } else
1016 return page_to_phys(pcpu_addr_to_page(addr)); 1017 return page_to_phys(pcpu_addr_to_page(addr)) +
1018 offset_in_page(addr);
1017} 1019}
1018 1020
1019/** 1021/**
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1233 1235
1234 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1236 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1235 unit_map[cpu] = UINT_MAX; 1237 unit_map[cpu] = UINT_MAX;
1236 pcpu_first_unit_cpu = NR_CPUS; 1238
1239 pcpu_low_unit_cpu = NR_CPUS;
1240 pcpu_high_unit_cpu = NR_CPUS;
1237 1241
1238 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1242 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1239 const struct pcpu_group_info *gi = &ai->groups[group]; 1243 const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1253 unit_map[cpu] = unit + i; 1257 unit_map[cpu] = unit + i;
1254 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1258 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1255 1259
1256 if (pcpu_first_unit_cpu == NR_CPUS) 1260 /* determine low/high unit_cpu */
1257 pcpu_first_unit_cpu = cpu; 1261 if (pcpu_low_unit_cpu == NR_CPUS ||
1258 pcpu_last_unit_cpu = cpu; 1262 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1263 pcpu_low_unit_cpu = cpu;
1264 if (pcpu_high_unit_cpu == NR_CPUS ||
1265 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1266 pcpu_high_unit_cpu = cpu;
1259 } 1267 }
1260 } 1268 }
1261 pcpu_nr_units = unit; 1269 pcpu_nr_units = unit;
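
Two independent fixes sit in the percpu hunks above: per_cpu_ptr_to_phys() now adds offset_in_page(addr) so addresses that are not page-aligned translate correctly, and the cached low/high CPUs are chosen by unit offset rather than by assignment order, since group base offsets need not be monotonic. The standalone userspace sketch below illustrates only the second point, with invented offsets.

#include <stdio.h>

int main(void)
{
	/* invented unit_off[cpu]: byte offset of each cpu's unit in a chunk */
	unsigned long unit_off[] = { 0x30000, 0x00000, 0x10000, 0x20000 };
	int nr_cpus = 4, cpu, low = -1, high = -1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (low < 0 || unit_off[cpu] < unit_off[low])
			low = cpu;
		if (high < 0 || unit_off[cpu] > unit_off[high])
			high = cpu;
	}

	/* the old first/last-assigned logic would have cached cpu 0 and cpu 3,
	 * whose units do not bound the chunk's address range */
	printf("low unit: cpu %d (off %#lx), high unit: cpu %d (off %#lx)\n",
	       low, unit_off[low], high, unit_off[high]);
	return 0;
}
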
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae4..8005080fb9e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 24 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
@@ -32,11 +31,11 @@
32 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
36 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
37 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
38 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
39 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
40 * 39 *
41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
42 * ->tasklist_lock 41 * ->tasklist_lock
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
870 vm_flags); 869 vm_flags);
871 if (we_locked) 870 if (we_locked)
872 unlock_page(page); 871 unlock_page(page);
872
873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++;
873 } 875 }
874out: 876out:
875 if (page_test_and_clear_young(page_to_pfn(page)))
876 referenced++;
877
878 return referenced; 877 return referenced;
879} 878}
880 879
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb..fba53caba0d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -28,7 +29,6 @@
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
56#include <linux/splice.h>
54#include <linux/security.h> 57#include <linux/security.h>
55#include <linux/swapops.h> 58#include <linux/swapops.h>
56#include <linux/mempolicy.h> 59#include <linux/mempolicy.h>
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt;
62#include <linux/magic.h> 65#include <linux/magic.h>
63 66
64#include <asm/uaccess.h> 67#include <asm/uaccess.h>
65#include <asm/div64.h>
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67 69
68/*
69 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
70 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
71 *
72 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
73 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
74 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
75 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
76 *
77 * We use / and * instead of shifts in the definitions below, so that the swap
78 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
79 */
80#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
81#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
82
83#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
84#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
85
86#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
87#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
88
89#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
90#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
91 72
92/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
93#define SHMEM_PAGEIN VM_READ
94#define SHMEM_TRUNCATE VM_WRITE
95
96/* Definition to limit shmem_truncate's steps between cond_rescheds */
97#define LATENCY_LIMIT 64
98
99/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
101 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
102struct shmem_xattr { 79struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -106,7 +83,7 @@ struct shmem_xattr {
106 char value[0]; 83 char value[0];
107}; 84};
108 85
109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
110enum sgp_type { 87enum sgp_type {
111 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
112 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void)
126} 103}
127#endif 104#endif
128 105
129static int shmem_getpage(struct inode *inode, unsigned long idx, 106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
131
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{
134 /*
135 * The above definition of ENTRIES_PER_PAGE, and the use of
136 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
137 * might be reconsidered if it ever diverges from PAGE_SIZE.
138 *
139 * Mobility flags are masked out as swap vectors cannot move
140 */
141 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
142 PAGE_CACHE_SHIFT-PAGE_SHIFT);
143}
144
145static inline void shmem_dir_free(struct page *page)
146{
147 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
148}
149
150static struct page **shmem_dir_map(struct page *page)
151{
152 return (struct page **)kmap_atomic(page, KM_USER0);
153}
154
155static inline void shmem_dir_unmap(struct page **dir)
156{
157 kunmap_atomic(dir, KM_USER0);
158}
159
160static swp_entry_t *shmem_swp_map(struct page *page)
161{
162 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
163}
164
165static inline void shmem_swp_balance_unmap(void)
166{
167 /*
168 * When passing a pointer to an i_direct entry, to code which
169 * also handles indirect entries and so will shmem_swp_unmap,
170 * we must arrange for the preempt count to remain in balance.
171 * What kmap_atomic of a lowmem page does depends on config
172 * and architecture, so pretend to kmap_atomic some lowmem page.
173 */
174 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
175}
176 108
177static inline void shmem_swp_unmap(swp_entry_t *entry) 109static inline int shmem_getpage(struct inode *inode, pgoff_t index,
110 struct page **pagep, enum sgp_type sgp, int *fault_type)
178{ 111{
179 kunmap_atomic(entry, KM_USER1); 112 return shmem_getpage_gfp(inode, index, pagep, sgp,
113 mapping_gfp_mask(inode->i_mapping), fault_type);
180} 114}
181 115
182static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
236static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
237static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
238 172
239static void shmem_free_blocks(struct inode *inode, long pages)
240{
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 }
248}
249
250static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
251{ 174{
252 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
273} 196}
274 197
275/** 198/**
276 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
277 * @inode: inode to recalc 200 * @inode: inode to recalc
278 * 201 *
279 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
291 214
292 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
293 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
294 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
295 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
296 shmem_free_blocks(inode, freed);
297 } 223 }
298} 224}
299 225
300/** 226/*
301 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
302 * @info: info structure for the inode 228 */
303 * @index: index of the page to find 229static int shmem_radix_tree_replace(struct address_space *mapping,
304 * @page: optional page to add to the structure. Has to be preset to 230 pgoff_t index, void *expected, void *replacement)
305 * all zeros 231{
306 * 232 void **pslot;
307 * If there is no space allocated yet it will return NULL when 233 void *item = NULL;
308 * page is NULL, else it will use the page for the needed block, 234
309 * setting it to NULL on return to indicate that it has been used. 235 VM_BUG_ON(!expected);
310 * 236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
311 * The swap vector is organized the following way: 237 if (pslot)
312 * 238 item = radix_tree_deref_slot_protected(pslot,
313 * There are SHMEM_NR_DIRECT entries directly stored in the 239 &mapping->tree_lock);
314 * shmem_inode_info structure. So small files do not need an addional 240 if (item != expected)
315 * allocation. 241 return -ENOENT;
316 * 242 if (replacement)
317 * For pages with index > SHMEM_NR_DIRECT there is the pointer 243 radix_tree_replace_slot(pslot, replacement);
318 * i_indirect which points to a page which holds in the first half 244 else
319 * doubly indirect blocks, in the second half triple indirect blocks: 245 radix_tree_delete(&mapping->page_tree, index);
320 * 246 return 0;
321 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the 247}
322 * following layout (for SHMEM_NR_DIRECT == 16): 248
323 * 249/*
324 * i_indirect -> dir --> 16-19 250 * Like add_to_page_cache_locked, but error if expected item has gone.
325 * | +-> 20-23
326 * |
327 * +-->dir2 --> 24-27
328 * | +-> 28-31
329 * | +-> 32-35
330 * | +-> 36-39
331 * |
332 * +-->dir3 --> 40-43
333 * +-> 44-47
334 * +-> 48-51
335 * +-> 52-55
336 */ 251 */
337static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 252static int shmem_add_to_page_cache(struct page *page,
253 struct address_space *mapping,
254 pgoff_t index, gfp_t gfp, void *expected)
338{ 255{
339 unsigned long offset; 256 int error = 0;
340 struct page **dir;
341 struct page *subdir;
342 257
343 if (index < SHMEM_NR_DIRECT) { 258 VM_BUG_ON(!PageLocked(page));
344 shmem_swp_balance_unmap(); 259 VM_BUG_ON(!PageSwapBacked(page));
345 return info->i_direct+index;
346 }
347 if (!info->i_indirect) {
348 if (page) {
349 info->i_indirect = *page;
350 *page = NULL;
351 }
352 return NULL; /* need another page */
353 }
354 260
355 index -= SHMEM_NR_DIRECT; 261 if (!expected)
356 offset = index % ENTRIES_PER_PAGE; 262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
357 index /= ENTRIES_PER_PAGE; 263 if (!error) {
358 dir = shmem_dir_map(info->i_indirect); 264 page_cache_get(page);
359 265 page->mapping = mapping;
360 if (index >= ENTRIES_PER_PAGE/2) { 266 page->index = index;
361 index -= ENTRIES_PER_PAGE/2; 267
362 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; 268 spin_lock_irq(&mapping->tree_lock);
363 index %= ENTRIES_PER_PAGE; 269 if (!expected)
364 subdir = *dir; 270 error = radix_tree_insert(&mapping->page_tree,
365 if (!subdir) { 271 index, page);
366 if (page) { 272 else
367 *dir = *page; 273 error = shmem_radix_tree_replace(mapping, index,
368 *page = NULL; 274 expected, page);
369 } 275 if (!error) {
370 shmem_dir_unmap(dir); 276 mapping->nrpages++;
371 return NULL; /* need another page */ 277 __inc_zone_page_state(page, NR_FILE_PAGES);
372 } 278 __inc_zone_page_state(page, NR_SHMEM);
373 shmem_dir_unmap(dir); 279 spin_unlock_irq(&mapping->tree_lock);
374 dir = shmem_dir_map(subdir); 280 } else {
375 } 281 page->mapping = NULL;
376 282 spin_unlock_irq(&mapping->tree_lock);
377 dir += index; 283 page_cache_release(page);
378 subdir = *dir;
379 if (!subdir) {
380 if (!page || !(subdir = *page)) {
381 shmem_dir_unmap(dir);
382 return NULL; /* need a page */
383 } 284 }
384 *dir = subdir; 285 if (!expected)
385 *page = NULL; 286 radix_tree_preload_end();
386 } 287 }
387 shmem_dir_unmap(dir); 288 if (error)
388 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
389} 291}
390 292
391static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
392{ 297{
393 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
394 300
395 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
396 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
397 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
398 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
399 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
400 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
401} 310}
402 311
403/** 312/*
404 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
405 * @info: info structure for the inode
406 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation?
408 *
409 * If the entry does not exist, allocate it.
410 */ 314 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
412{ 316 pgoff_t start, unsigned int nr_pages,
413 struct inode *inode = &info->vfs_inode; 317 struct page **pages, pgoff_t *indices)
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 318{
415 struct page *page = NULL; 319 unsigned int i;
416 swp_entry_t *entry; 320 unsigned int ret;
417 321 unsigned int nr_found;
418 if (sgp != SGP_WRITE && 322
419 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 323 rcu_read_lock();
420 return ERR_PTR(-EINVAL); 324restart:
421 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
422 while (!(entry = shmem_swp_entry(info, index, &page))) { 326 (void ***)pages, indices, start, nr_pages);
423 if (sgp == SGP_READ) 327 ret = 0;
424 return shmem_swp_map(ZERO_PAGE(0)); 328 for (i = 0; i < nr_found; i++) {
425 /* 329 struct page *page;
426 * Test used_blocks against 1 less max_blocks, since we have 1 data 330repeat:
427 * page (and perhaps indirect index pages) yet to allocate: 331 page = radix_tree_deref_slot((void **)pages[i]);
428 * a waste to allocate index if we cannot allocate data. 332 if (unlikely(!page))
429 */ 333 continue;
430 if (sbinfo->max_blocks) { 334 if (radix_tree_exception(page)) {
431 if (percpu_counter_compare(&sbinfo->used_blocks, 335 if (radix_tree_deref_retry(page))
432 sbinfo->max_blocks - 1) >= 0) 336 goto restart;
433 return ERR_PTR(-ENOSPC); 337 /*
434 percpu_counter_inc(&sbinfo->used_blocks); 338 * Otherwise, we must be storing a swap entry
435 spin_lock(&inode->i_lock); 339 * here as an exceptional entry: so return it
436 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
437 spin_unlock(&inode->i_lock); 341 */
342 goto export;
438 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
439 346
440 spin_unlock(&info->lock); 347 /* Has the page moved? */
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 348 if (unlikely(page != *((void **)pages[i]))) {
442 spin_lock(&info->lock); 349 page_cache_release(page);
443 350 goto repeat;
444 if (!page) {
445 shmem_free_blocks(inode, 1);
446 return ERR_PTR(-ENOMEM);
447 }
448 if (sgp != SGP_WRITE &&
449 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
450 entry = ERR_PTR(-EINVAL);
451 break;
452 } 351 }
453 if (info->next_index <= index) 352export:
454 info->next_index = index + 1; 353 indices[ret] = indices[i];
455 } 354 pages[ret] = page;
456 if (page) { 355 ret++;
457 /* another task gave its page, or truncated the file */ 356 }
458 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
459 shmem_dir_free(page); 358 goto restart;
460 } 359 rcu_read_unlock();
461 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
462 info->next_index = index + 1;
463 return entry;
464} 361}
465 362
466/** 363/*
467 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
468 * @dir: pointer to the directory
469 * @edir: pointer after last entry of the directory
470 * @punch_lock: pointer to spinlock when needed for the holepunch case
471 */ 365 */
472static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
473 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
474{ 368{
475 spinlock_t *punch_unlock = NULL; 369 int error;
476 swp_entry_t *ptr; 370
477 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
478 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
479 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
480 if (ptr->val) { 374 if (!error)
481 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
482 punch_unlock = punch_lock; 376 return error;
483 punch_lock = NULL;
484 spin_lock(punch_unlock);
485 if (!ptr->val)
486 continue;
487 }
488 free_swap_and_cache(*ptr);
489 *ptr = (swp_entry_t){0};
490 freed++;
491 }
492 }
493 if (punch_unlock)
494 spin_unlock(punch_unlock);
495 return freed;
496}
497
498static int shmem_map_and_free_swp(struct page *subdir, int offset,
499 int limit, struct page ***dir, spinlock_t *punch_lock)
500{
501 swp_entry_t *ptr;
502 int freed = 0;
503
504 ptr = shmem_swp_map(subdir);
505 for (; offset < limit; offset += LATENCY_LIMIT) {
506 int size = limit - offset;
507 if (size > LATENCY_LIMIT)
508 size = LATENCY_LIMIT;
509 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
510 punch_lock);
511 if (need_resched()) {
512 shmem_swp_unmap(ptr);
513 if (*dir) {
514 shmem_dir_unmap(*dir);
515 *dir = NULL;
516 }
517 cond_resched();
518 ptr = shmem_swp_map(subdir);
519 }
520 }
521 shmem_swp_unmap(ptr);
522 return freed;
523} 377}
524 378
525static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
526{ 383{
527 struct page *page; 384 int i, j;
528 int freed = 0; 385
529 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
530 do { 387 struct page *page = pvec->pages[i];
531 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
532 next = next->next; 389 pvec->pages[j++] = page;
533 shmem_dir_free(page); 390 }
534 freed++; 391 pvec->nr = j;
535 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
536 cond_resched();
537 freed = 0;
538 }
539 } while (next);
540} 393}
541 394
542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
543{ 399{
400 struct address_space *mapping = inode->i_mapping;
544 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
545 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
546 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
547 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
548 unsigned long stage; 405 struct pagevec pvec;
549 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
550 struct page **dir;
551 struct page *topdir;
552 struct page *middir;
553 struct page *subdir;
554 swp_entry_t *ptr;
555 LIST_HEAD(pages_to_free);
556 long nr_pages_to_free = 0;
557 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
558 int offset; 408 pgoff_t index;
559 int freed; 409 int i;
560 int punch_hole;
561 spinlock_t *needs_lock;
562 spinlock_t *punch_lock;
563 unsigned long upper_limit;
564 410
565 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
566 412
567 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
569 if (idx >= info->next_index) 415 while (index <= end) {
570 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
571 424
572 spin_lock(&info->lock); 425 index = indices[i];
573 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
574 if (likely(end == (loff_t) -1)) { 427 break;
575 limit = info->next_index;
576 upper_limit = SHMEM_MAX_INDEX;
577 info->next_index = idx;
578 needs_lock = NULL;
579 punch_hole = 0;
580 } else {
581 if (end + 1 >= inode->i_size) { /* we may free a little more */
582 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
583 PAGE_CACHE_SHIFT;
584 upper_limit = SHMEM_MAX_INDEX;
585 } else {
586 limit = (end + 1) >> PAGE_CACHE_SHIFT;
587 upper_limit = limit;
588 }
589 needs_lock = &info->lock;
590 punch_hole = 1;
591 }
592 428
593 topdir = info->i_indirect; 429 if (radix_tree_exceptional_entry(page)) {
594 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 430 nr_swaps_freed += !shmem_free_swap(mapping,
595 info->i_indirect = NULL; 431 index, page);
596 nr_pages_to_free++; 432 continue;
597 list_add(&topdir->lru, &pages_to_free); 433 }
434
435 if (!trylock_page(page))
436 continue;
437 if (page->mapping == mapping) {
438 VM_BUG_ON(PageWriteback(page));
439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
598 } 447 }
599 spin_unlock(&info->lock);
600 448
601 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
602 ptr = info->i_direct; 450 struct page *page = NULL;
603 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
604 if (size > SHMEM_NR_DIRECT) 452 if (page) {
605 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
606 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
607 } 458 }
608 459
609 /* 460 index = start;
610 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
611 * below indirect blocks, nothing to be done. 462 cond_resched();
612 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
613 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
614 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
615 479
616 /* 480 index = indices[i];
617 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
618 * because i_size and next_index have already been lowered, preventing 482 break;
619 * access beyond. But in the punch_hole case, we still need to take
620 * the lock when updating the swap directory, because there might be
621 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
622 * shmem_writepage. However, whenever we find we can remove a whole
623 * directory page (not at the misaligned start or end of the range),
624 * we first NULLify its pointer in the level above, and then have no
625 * need to take the lock when updating its contents: needs_lock and
626 * punch_lock (either pointing to info->lock or NULL) manage this.
627 */
628 483
629 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
630 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
631 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
632 offset = idx % ENTRIES_PER_PAGE; 487 continue;
633 idx -= offset;
634
635 dir = shmem_dir_map(topdir);
636 stage = ENTRIES_PER_PAGEPAGE/2;
637 if (idx < ENTRIES_PER_PAGEPAGE/2) {
638 middir = topdir;
639 diroff = idx/ENTRIES_PER_PAGE;
640 } else {
641 dir += ENTRIES_PER_PAGE/2;
642 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
643 while (stage <= idx)
644 stage += ENTRIES_PER_PAGEPAGE;
645 middir = *dir;
646 if (*dir) {
647 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
648 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
649 if (!diroff && !offset && upper_limit >= stage) {
650 if (needs_lock) {
651 spin_lock(needs_lock);
652 *dir = NULL;
653 spin_unlock(needs_lock);
654 needs_lock = NULL;
655 } else
656 *dir = NULL;
657 nr_pages_to_free++;
658 list_add(&middir->lru, &pages_to_free);
659 } 488 }
660 shmem_dir_unmap(dir);
661 dir = shmem_dir_map(middir);
662 } else {
663 diroff = 0;
664 offset = 0;
665 idx = stage;
666 }
667 }
668 489
669 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
670 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
671 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
672 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
673 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
674 while (!*dir) {
675 dir++;
676 idx += ENTRIES_PER_PAGEPAGE;
677 if (idx >= limit)
678 goto done1;
679 } 494 }
680 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
681 middir = *dir;
682 if (punch_hole)
683 needs_lock = &info->lock;
684 if (upper_limit >= stage) {
685 if (needs_lock) {
686 spin_lock(needs_lock);
687 *dir = NULL;
688 spin_unlock(needs_lock);
689 needs_lock = NULL;
690 } else
691 *dir = NULL;
692 nr_pages_to_free++;
693 list_add(&middir->lru, &pages_to_free);
694 }
695 shmem_dir_unmap(dir);
696 cond_resched();
697 dir = shmem_dir_map(middir);
698 diroff = 0;
699 }
700 punch_lock = needs_lock;
701 subdir = dir[diroff];
702 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
703 if (needs_lock) {
704 spin_lock(needs_lock);
705 dir[diroff] = NULL;
706 spin_unlock(needs_lock);
707 punch_lock = NULL;
708 } else
709 dir[diroff] = NULL;
710 nr_pages_to_free++;
711 list_add(&subdir->lru, &pages_to_free);
712 }
713 if (subdir && page_private(subdir) /* has swap entries */) {
714 size = limit - idx;
715 if (size > ENTRIES_PER_PAGE)
716 size = ENTRIES_PER_PAGE;
717 freed = shmem_map_and_free_swp(subdir,
718 offset, size, &dir, punch_lock);
719 if (!dir)
720 dir = shmem_dir_map(middir);
721 nr_swaps_freed += freed;
722 if (offset || punch_lock) {
723 spin_lock(&info->lock);
724 set_page_private(subdir,
725 page_private(subdir) - freed);
726 spin_unlock(&info->lock);
727 } else
728 BUG_ON(page_private(subdir) != freed);
729 } 496 }
730 offset = 0; 497 shmem_pagevec_release(&pvec);
731 } 498 mem_cgroup_uncharge_end();
732done1: 499 index++;
733 shmem_dir_unmap(dir);
734done2:
735 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
736 /*
737 * Call truncate_inode_pages again: racing shmem_unuse_inode
738 * may have swizzled a page in from swap since
739 * truncate_pagecache or generic_delete_inode did it, before we
740 * lowered next_index. Also, though shmem_getpage checks
741 * i_size before adding to cache, no recheck after: so fix the
742 * narrow window there too.
743 */
744 truncate_inode_pages_range(inode->i_mapping, start, end);
745 } 500 }
746 501
747 spin_lock(&info->lock); 502 spin_lock(&info->lock);
748 info->flags &= ~SHMEM_TRUNCATE;
749 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
750 if (nr_pages_to_free)
751 shmem_free_blocks(inode, nr_pages_to_free);
752 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
753 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
754 506
755 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
756 * Empty swap vector directory pages to be freed?
757 */
758 if (!list_empty(&pages_to_free)) {
759 pages_to_free.prev->next = NULL;
760 shmem_free_pages(pages_to_free.next);
761 }
762} 508}
763EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
764 510
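
The rewritten truncate path above rests on one idea: when a tmpfs page goes out to swap, its page-cache slot is not emptied but refilled with the swap entry encoded as a radix-tree "exceptional" value, which is why shmem_find_get_pages_and_swap() and shmem_free_swap() must tell pages and swap entries apart. The standalone userspace sketch below mimics that encoding; the shift and tag-bit values follow my reading of swp_to_radix_entry()/radix_to_swp_entry() in this series, and the swap value is invented.

#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_ENTRY	2UL	/* tag bit: slot is not a page pointer */
#define EXCEPTIONAL_SHIFT	2

static void *swp_to_radix_entry(unsigned long swp_val)
{
	return (void *)((swp_val << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_ENTRY);
}

static int radix_tree_exceptional_entry(void *slot)
{
	return (unsigned long)slot & EXCEPTIONAL_ENTRY;
}

static unsigned long radix_to_swp_entry(void *slot)
{
	return (unsigned long)slot >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	unsigned long swp_val = 0x1234;		/* invented swap entry value */
	void *slot = swp_to_radix_entry(swp_val);

	/* a real struct page * is at least 4-byte aligned, so bit 1 is free */
	assert(radix_tree_exceptional_entry(slot));
	assert(radix_to_swp_entry(slot) == swp_val);

	printf("slot %p decodes back to swap value %#lx\n", slot, swp_val);
	return 0;
}
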
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
775 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
777 struct page *page = NULL;
778 523
779 if (newsize < oldsize) {
780 /*
781 * If truncating down to a partial page, then
782 * if that page is already allocated, hold it
783 * in memory until the truncation is over, so
784 * truncate_partial_page cannot miss it were
785 * it assigned to swap.
786 */
787 if (newsize & (PAGE_CACHE_SIZE-1)) {
788 (void) shmem_getpage(inode,
789 newsize >> PAGE_CACHE_SHIFT,
790 &page, SGP_READ, NULL);
791 if (page)
792 unlock_page(page);
793 }
794 /*
795 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
796 * detect if any pages might have been added to cache
797 * after truncate_inode_pages. But we needn't bother
798 * if it's being fully truncated to zero-length: the
799 * nrpages check is efficient enough in that case.
800 */
801 if (newsize) {
802 struct shmem_inode_info *info = SHMEM_I(inode);
803 spin_lock(&info->lock);
804 info->flags &= ~SHMEM_PAGEIN;
805 spin_unlock(&info->lock);
806 }
807 }
808 if (newsize != oldsize) { 524 if (newsize != oldsize) {
809 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
810 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
816 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 } 534 }
819 if (page)
820 page_cache_release(page);
821 } 535 }
822 536
823 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
842 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
843 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
844 } 558 }
845 } 559 } else
560 kfree(info->symlink);
846 561
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name); 563 kfree(xattr->name);
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
853 end_writeback(inode); 568 end_writeback(inode);
854} 569}
855 570
856static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
857{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
858 swp_entry_t *ptr; 573 */
859 574static int shmem_unuse_inode(struct shmem_inode_info *info,
860 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
861 if (ptr->val == entry.val)
862 return ptr - dir;
863 }
864 return -1;
865}
866
867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
868{ 576{
869 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
870 unsigned long idx; 578 void *radswap;
871 unsigned long size; 579 pgoff_t index;
872 unsigned long limit;
873 unsigned long stage;
874 struct page **dir;
875 struct page *subdir;
876 swp_entry_t *ptr;
877 int offset;
878 int error; 580 int error;
879 581
880 idx = 0; 582 radswap = swp_to_radix_entry(swap);
881 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
882 spin_lock(&info->lock); 584 if (index == -1)
883 if (!info->swapped) { 585 return 0;
884 list_del_init(&info->swaplist);
885 goto lost2;
886 }
887 limit = info->next_index;
888 size = limit;
889 if (size > SHMEM_NR_DIRECT)
890 size = SHMEM_NR_DIRECT;
891 offset = shmem_find_swp(entry, ptr, ptr+size);
892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
894 goto found;
895 }
896 if (!info->i_indirect)
897 goto lost2;
898
899 dir = shmem_dir_map(info->i_indirect);
900 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
901
902 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
903 if (unlikely(idx == stage)) {
904 shmem_dir_unmap(dir-1);
905 if (cond_resched_lock(&info->lock)) {
906 /* check it has not been truncated */
907 if (limit > info->next_index) {
908 limit = info->next_index;
909 if (idx >= limit)
910 goto lost2;
911 }
912 }
913 dir = shmem_dir_map(info->i_indirect) +
914 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
915 while (!*dir) {
916 dir++;
917 idx += ENTRIES_PER_PAGEPAGE;
918 if (idx >= limit)
919 goto lost1;
920 }
921 stage = idx + ENTRIES_PER_PAGEPAGE;
922 subdir = *dir;
923 shmem_dir_unmap(dir);
924 dir = shmem_dir_map(subdir);
925 }
926 subdir = *dir;
927 if (subdir && page_private(subdir)) {
928 ptr = shmem_swp_map(subdir);
929 size = limit - idx;
930 if (size > ENTRIES_PER_PAGE)
931 size = ENTRIES_PER_PAGE;
932 offset = shmem_find_swp(entry, ptr, ptr+size);
933 shmem_swp_unmap(ptr);
934 if (offset >= 0) {
935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
937 goto found;
938 }
939 }
940 }
941lost1:
942 shmem_dir_unmap(dir-1);
943lost2:
944 spin_unlock(&info->lock);
945 return 0;
946found:
947 idx += offset;
948 ptr += offset;
949 586
950 /* 587 /*
951 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
952 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
953 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
954 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
955 * could avoid doing it if inode NULL; or use this minor optimization.
956 */ 592 */
957 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
958 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -962,42 +598,34 @@ found:
962 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
963 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
964 */ 600 */
965 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
967 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
968 604
969 if (error == -EEXIST) { 605 if (error != -ENOMEM) {
970 struct page *filepage = find_get_page(mapping, idx); 606 /*
971 error = 1; 607 * Truncation and eviction use free_swap_and_cache(), which
972 if (filepage) { 608 * only does trylock page: if we raced, best clean up here.
973 /* 609 */
974 * There might be a more uptodate page coming down
975 * from a stacked writepage: forget our swappage if so.
976 */
977 if (PageUptodate(filepage))
978 error = 0;
979 page_cache_release(filepage);
980 }
981 }
982 if (!error) {
983 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
984 set_page_dirty(page); 611 set_page_dirty(page);
985 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
986 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
987 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
988 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
989 } 619 }
990 shmem_swp_unmap(ptr);
991 spin_unlock(&info->lock);
992 return error; 620 return error;
993} 621}
994 622
995/* 623/*
996 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
997 */ 625 */
998int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
999{ 627{
1000 struct list_head *p, *next; 628 struct list_head *this, *next;
1001 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
1002 int found = 0; 630 int found = 0;
1003 int error; 631 int error;
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1006 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */ 637 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error) 639 if (error)
1013 goto out; 640 goto out;
1014 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1023 642
1024 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1025 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1026 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1027 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1028 cond_resched(); 650 cond_resched();
1029 if (found) 651 if (found)
1030 break; 652 break;
1031 } 653 }
1032 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1033 655
1034uncharge:
1035 if (!found) 656 if (!found)
1036 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0) 658 if (found < 0)
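
shmem_unuse_inode(), earlier in this file's diff, no longer walks a private swap vector; it asks the page-cache radix tree for the index whose slot holds the encoded swap entry (radix_tree_locate_item()) and returns 0 if none is found. A hedged userspace sketch of that locate-by-value semantic, with a flat array standing in for the radix tree and an invented swap value:

#include <stdio.h>

/* flat array standing in for the inode's page-cache radix tree */
static long locate_item(void **slots, long nr_slots, void *item)
{
	long index;

	for (index = 0; index < nr_slots; index++)
		if (slots[index] == item)
			return index;
	return -1;
}

int main(void)
{
	void *slots[8] = { 0 };
	void *radswap = (void *)((0x1234UL << 2) | 2);	/* encoded swap entry */

	slots[5] = radswap;	/* pretend the page at index 5 was swapped out */
	printf("swap entry found at file index %ld\n",
	       locate_item(slots, 8, radswap));
	return 0;
}
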
@@ -1048,10 +669,10 @@ out:
1048static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1049{ 670{
1050 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1051 swp_entry_t *entry, swap;
1052 struct address_space *mapping; 672 struct address_space *mapping;
1053 unsigned long index;
1054 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1055 676
1056 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1057 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1066 /* 687 /*
1067 * shmem_backing_dev_info's capabilities prevent regular writeback or 688 * shmem_backing_dev_info's capabilities prevent regular writeback or
1068 * sync from ever calling shmem_writepage; but a stacking filesystem 689 * sync from ever calling shmem_writepage; but a stacking filesystem
1069 * may use the ->writepage of its underlying filesystem, in which case 690 * might use ->writepage of its underlying filesystem, in which case
1070 * tmpfs should write out to swap only in response to memory pressure, 691 * tmpfs should write out to swap only in response to memory pressure,
1071 * and not for the writeback threads or sync. However, in those cases, 692 * and not for the writeback threads or sync.
1072 * we do still want to check if there's a redundant swappage to be
1073 * discarded.
1074 */ 693 */
1075 if (wbc->for_reclaim) 694 if (!wbc->for_reclaim) {
1076 swap = get_swap_page(); 695 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1077 else 696 goto redirty;
1078 swap.val = 0; 697 }
698 swap = get_swap_page();
699 if (!swap.val)
700 goto redirty;
1079 701
1080 /* 702 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1083 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1088 */ 709 */
1089 if (swap.val) { 710 mutex_lock(&shmem_swaplist_mutex);
1090 mutex_lock(&shmem_swaplist_mutex); 711 if (list_empty(&info->swaplist))
1091 if (list_empty(&info->swaplist)) 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1099 if (index >= info->next_index) {
1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1101 goto unlock;
1102 }
1103 entry = shmem_swp_entry(info, index, NULL);
1104 if (entry->val) {
1105 /*
1106 * The more uptodate page coming down from a stacked
1107 * writepage should replace our old swappage.
1108 */
1109 free_swap_and_cache(*entry);
1110 shmem_swp_set(info, entry, 0);
1111 }
1112 shmem_recalc_inode(inode);
1113 713
1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115 delete_from_page_cache(page);
1116 shmem_swp_set(info, entry, swap.val);
1117 shmem_swp_unmap(entry);
1118 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1119 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1120 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1121 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1122 return 0; 726 return 0;
1123 } 727 }
1124 728
1125 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1126unlock:
1127 spin_unlock(&info->lock);
1128 /*
1129 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1130 * clear SWAP_HAS_CACHE flag.
1131 */
1132 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1133redirty: 731redirty:
1134 set_page_dirty(page); 732 set_page_dirty(page);
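Note on the hunk above: the rewritten shmem_writepage() no longer keeps a per-inode swap vector; the swap entry is stashed directly in the mapping's radix tree as an "exceptional" entry via swp_to_radix_entry(), and recovered later by shmem_getpage_gfp(). Those helpers live in include/linux/swapops.h rather than in this hunk; a minimal sketch of the encoding they are assumed to use (swap entry value shifted above the radix-tree exceptional-entry flag bits) is:

static inline void *swp_to_radix_entry(swp_entry_t entry)
{
	/* shift the swap entry value above the radix-tree flag bits... */
	unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

	/* ...and tag it so lookups can tell it apart from a struct page * */
	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
	swp_entry_t entry;

	entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
	return entry;
}

With this representation, find_lock_page() can hand back either a locked page or an exceptional entry, which is why shmem_getpage_gfp() below tests radix_tree_exceptional_entry() on the lookup result before deciding whether to swap in.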
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1165} 763}
1166#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1167 765
1168static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1169 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1170{ 768{
1171 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1172 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1173 struct page *page;
1174 771
1175 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1176 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1177 774
1178 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1179 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1180 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1181 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1182 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1183 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1184 return page;
1185} 781}
1186 782
1187static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1188 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1189{ 785{
1190 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1191 787
1192 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1193 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1194 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1195 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1196 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1197 793
1198 /* 794 /*
1199 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1202} 798}
1203#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1204#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1205static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1206{ 802{
1207} 803}
1208#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1209 805
1210static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1211 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1212{ 808{
1213 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1214} 810}
1215 811
1216static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1217 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1218{ 814{
1219 return alloc_page(gfp); 815 return alloc_page(gfp);
1220} 816}
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 824#endif
1229 825
1230/* 826/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 827 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 828 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 829 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1236 */ 832 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1238 struct page **pagep, enum sgp_type sgp, int *type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 835{
1240 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1242 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 839 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry;
1247 swp_entry_t swap; 840 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 841 int error;
842 int once = 0;
1250 843
1251 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1252 return -EFBIG; 845 return -EFBIG;
846repeat:
847 swap.val = 0;
848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
1253 853
1254 if (type) 854 if (sgp != SGP_WRITE &&
1255 *type = 0; 855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
1256 859
1257 /* 860 if (page || (sgp == SGP_READ && !swap.val)) {
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat:
1266 if (!filepage)
1267 filepage = find_lock_page(mapping, idx);
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 861 /*
1273 * Try to preload while we can wait, to not make a habit of 862 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 863 * if there were an error in reading back from swap,
864 * the page would not be inserted into the filecache.
1275 */ 865 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 866 BUG_ON(page && !PageUptodate(page));
1277 if (error) 867 *pagep = page;
1278 goto failed; 868 return 0;
1279 radix_tree_preload_end();
1280 if (sgp != SGP_READ && !prealloc_page) {
1281 /* We don't care if this fails */
1282 prealloc_page = shmem_alloc_page(gfp, info, idx);
1283 if (prealloc_page) {
1284 if (mem_cgroup_cache_charge(prealloc_page,
1285 current->mm, GFP_KERNEL)) {
1286 page_cache_release(prealloc_page);
1287 prealloc_page = NULL;
1288 }
1289 }
1290 }
1291 } 869 }
1292 error = 0;
1293 870
1294 spin_lock(&info->lock); 871 /*
1295 shmem_recalc_inode(inode); 872 * Fast cache lookup did not find it:
1296 entry = shmem_swp_alloc(info, idx, sgp); 873 * bring it back from swap or allocate.
1297 if (IS_ERR(entry)) { 874 */
1298 spin_unlock(&info->lock); 875 info = SHMEM_I(inode);
1299 error = PTR_ERR(entry); 876 sbinfo = SHMEM_SB(inode->i_sb);
1300 goto failed;
1301 }
1302 swap = *entry;
1303 877
1304 if (swap.val) { 878 if (swap.val) {
1305 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1307 if (!swappage) { 881 if (!page) {
1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 882 /* here we actually do the io */
1311 if (type) 883 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1314 if (!swappage) { 886 if (!page) {
1315 spin_lock(&info->lock); 887 error = -ENOMEM;
1316 entry = shmem_swp_alloc(info, idx, sgp); 888 goto failed;
1317 if (IS_ERR(entry))
1318 error = PTR_ERR(entry);
1319 else {
1320 if (entry->val == swap.val)
1321 error = -ENOMEM;
1322 shmem_swp_unmap(entry);
1323 }
1324 spin_unlock(&info->lock);
1325 if (error)
1326 goto failed;
1327 goto repeat;
1328 } 889 }
1329 wait_on_page_locked(swappage);
1330 page_cache_release(swappage);
1331 goto repeat;
1332 } 890 }
1333 891
1334 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 893 lock_page(page);
1336 shmem_swp_unmap(entry); 894 if (!PageUptodate(page)) {
1337 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage);
1339 page_cache_release(swappage);
1340 goto repeat;
1341 }
1342 if (PageWriteback(swappage)) {
1343 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage);
1346 unlock_page(swappage);
1347 page_cache_release(swappage);
1348 goto repeat;
1349 }
1350 if (!PageUptodate(swappage)) {
1351 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock);
1353 unlock_page(swappage);
1354 page_cache_release(swappage);
1355 error = -EIO; 895 error = -EIO;
1356 goto failed; 896 goto failed;
1357 } 897 }
1358 898 wait_on_page_writeback(page);
1359 if (filepage) { 899
1360 shmem_swp_set(info, entry, 0); 900 /* Someone may have already done it for us */
1361 shmem_swp_unmap(entry); 901 if (page->mapping) {
1362 delete_from_swap_cache(swappage); 902 if (page->mapping == mapping &&
1363 spin_unlock(&info->lock); 903 page->index == index)
1364 copy_highpage(filepage, swappage); 904 goto done;
1365 unlock_page(swappage); 905 error = -EEXIST;
1366 page_cache_release(swappage); 906 goto failed;
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) {
1385 /*
1386 * reclaim from proper memory cgroup and
1387 * call memcg's OOM if needed.
1388 */
1389 error = mem_cgroup_shmem_charge_fallback(
1390 swappage,
1391 current->mm,
1392 gfp);
1393 if (error) {
1394 unlock_page(swappage);
1395 page_cache_release(swappage);
1396 goto failed;
1397 }
1398 }
1399 unlock_page(swappage);
1400 page_cache_release(swappage);
1401 goto repeat;
1402 }
1403 } else if (sgp == SGP_READ && !filepage) {
1404 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx);
1406 if (filepage &&
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1408 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage);
1410 page_cache_release(filepage);
1411 filepage = NULL;
1412 goto repeat;
1413 } 907 }
908
909 error = mem_cgroup_cache_charge(page, current->mm,
910 gfp & GFP_RECLAIM_MASK);
911 if (!error)
912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1414 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
923 set_page_dirty(page);
924 swap_free(swap);
925
1415 } else { 926 } else {
1416 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1417 sbinfo = SHMEM_SB(inode->i_sb); 928 error = -ENOSPC;
929 goto failed;
930 }
1418 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1419 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1420 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1421 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1422 goto nospace; 935 goto unacct;
1423 percpu_counter_inc(&sbinfo->used_blocks);
1424 spin_lock(&inode->i_lock);
1425 inode->i_blocks += BLOCKS_PER_PAGE;
1426 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags))
1428 goto nospace;
1429
1430 if (!filepage) {
1431 int ret;
1432
1433 if (!prealloc_page) {
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 } 936 }
937 percpu_counter_inc(&sbinfo->used_blocks);
938 }
1464 939
1465 entry = shmem_swp_alloc(info, idx, sgp); 940 page = shmem_alloc_page(gfp, info, index);
1466 if (IS_ERR(entry)) 941 if (!page) {
1467 error = PTR_ERR(entry); 942 error = -ENOMEM;
1468 else { 943 goto decused;
1469 swap = *entry;
1470 shmem_swp_unmap(entry);
1471 }
1472 ret = error || swap.val;
1473 if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage);
1475 else
1476 ret = add_to_page_cache_lru(filepage, mapping,
1477 idx, GFP_NOWAIT);
1478 /*
1479 * At add_to_page_cache_lru() failure, uncharge will
1480 * be done automatically.
1481 */
1482 if (ret) {
1483 spin_unlock(&info->lock);
1484 page_cache_release(filepage);
1485 shmem_unacct_blocks(info->flags, 1);
1486 shmem_free_blocks(inode, 1);
1487 filepage = NULL;
1488 if (error)
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 944 }
1494 945
946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1495 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1496 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 962
1498 flush_dcache_page(filepage); 963 clear_highpage(page);
1499 SetPageUptodate(filepage); 964 flush_dcache_page(page);
965 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 967 set_page_dirty(page);
1502 } 968 }
1503done: 969done:
1504 *pagep = filepage; 970 /* Perhaps the file has been truncated since we checked */
1505 error = 0; 971 if (sgp != SGP_WRITE &&
1506 goto out; 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
973 error = -EINVAL;
974 goto trunc;
975 }
976 *pagep = page;
977 return 0;
1507 978
1508nospace:
1509 /* 979 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 981 */
1516 if (!filepage) { 982trunc:
1517 struct page *page = find_get_page(mapping, idx); 983 ClearPageDirty(page);
1518 if (page) { 984 delete_from_page_cache(page);
1519 spin_unlock(&info->lock); 985 spin_lock(&info->lock);
1520 page_cache_release(page); 986 info->alloced--;
1521 goto repeat; 987 inode->i_blocks -= BLOCKS_PER_PAGE;
1522 }
1523 }
1524 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
1525 error = -ENOSPC; 989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
1526failed: 994failed:
1527 if (*pagep != filepage) { 995 if (swap.val && error != -EINVAL) {
1528 unlock_page(filepage); 996 struct page *test = find_get_page(mapping, index);
1529 page_cache_release(filepage); 997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1530 } 1002 }
1531out: 1003 if (page) {
1532 if (prealloc_page) { 1004 unlock_page(page);
1533 mem_cgroup_uncharge_cache_page(prealloc_page); 1005 page_cache_release(page);
1534 page_cache_release(prealloc_page);
1535 } 1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1012 goto repeat;
1013 }
1014 if (error == -EEXIST)
1015 goto repeat;
1536 return error; 1016 return error;
1537} 1017}
1538 1018
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1020{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1021 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1022 int error;
1543 int ret; 1023 int ret = VM_FAULT_LOCKED;
1544
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS;
1547 1024
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1026 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1028
1551 if (ret & VM_FAULT_MAJOR) { 1029 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1030 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1031 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1032 }
1555 return ret | VM_FAULT_LOCKED; 1033 return ret;
1556} 1034}
1557 1035
1558#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1559static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1560{ 1038{
1561 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1562 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1563} 1041}
1564 1042
1565static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1566 unsigned long addr) 1044 unsigned long addr)
1567{ 1045{
1568 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1569 unsigned long idx; 1047 pgoff_t index;
1570 1048
1571 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1572 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1573} 1051}
1574#endif 1052#endif
1575 1053
@@ -1667,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1667 1145
1668#ifdef CONFIG_TMPFS 1146#ifdef CONFIG_TMPFS
1669static const struct inode_operations shmem_symlink_inode_operations; 1147static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1148static const struct inode_operations shmem_short_symlink_operations;
1671
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684 1149
1685static int 1150static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1151shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1689,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1154{
1690 struct inode *inode = mapping->host; 1155 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1156 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1157 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1158}
1695 1159
@@ -1714,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1714{ 1178{
1715 struct inode *inode = filp->f_path.dentry->d_inode; 1179 struct inode *inode = filp->f_path.dentry->d_inode;
1716 struct address_space *mapping = inode->i_mapping; 1180 struct address_space *mapping = inode->i_mapping;
1717 unsigned long index, offset; 1181 pgoff_t index;
1182 unsigned long offset;
1718 enum sgp_type sgp = SGP_READ; 1183 enum sgp_type sgp = SGP_READ;
1719 1184
1720 /* 1185 /*
@@ -1730,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1730 1195
1731 for (;;) { 1196 for (;;) {
1732 struct page *page = NULL; 1197 struct page *page = NULL;
1733 unsigned long end_index, nr, ret; 1198 pgoff_t end_index;
1199 unsigned long nr, ret;
1734 loff_t i_size = i_size_read(inode); 1200 loff_t i_size = i_size_read(inode);
1735 1201
1736 end_index = i_size >> PAGE_CACHE_SHIFT; 1202 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1846,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1312 return retval;
1847} 1313}
1848 1314
1315static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, size_t len,
1317 unsigned int flags)
1318{
1319 struct address_space *mapping = in->f_mapping;
1320 struct inode *inode = mapping->host;
1321 unsigned int loff, nr_pages, req_pages;
1322 struct page *pages[PIPE_DEF_BUFFERS];
1323 struct partial_page partial[PIPE_DEF_BUFFERS];
1324 struct page *page;
1325 pgoff_t index, end_index;
1326 loff_t isize, left;
1327 int error, page_nr;
1328 struct splice_pipe_desc spd = {
1329 .pages = pages,
1330 .partial = partial,
1331 .flags = flags,
1332 .ops = &page_cache_pipe_buf_ops,
1333 .spd_release = spd_release_page,
1334 };
1335
1336 isize = i_size_read(inode);
1337 if (unlikely(*ppos >= isize))
1338 return 0;
1339
1340 left = isize - *ppos;
1341 if (unlikely(left < len))
1342 len = left;
1343
1344 if (splice_grow_spd(pipe, &spd))
1345 return -ENOMEM;
1346
1347 index = *ppos >> PAGE_CACHE_SHIFT;
1348 loff = *ppos & ~PAGE_CACHE_MASK;
1349 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1350 nr_pages = min(req_pages, pipe->buffers);
1351
1352 spd.nr_pages = find_get_pages_contig(mapping, index,
1353 nr_pages, spd.pages);
1354 index += spd.nr_pages;
1355 error = 0;
1356
1357 while (spd.nr_pages < nr_pages) {
1358 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1359 if (error)
1360 break;
1361 unlock_page(page);
1362 spd.pages[spd.nr_pages++] = page;
1363 index++;
1364 }
1365
1366 index = *ppos >> PAGE_CACHE_SHIFT;
1367 nr_pages = spd.nr_pages;
1368 spd.nr_pages = 0;
1369
1370 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1371 unsigned int this_len;
1372
1373 if (!len)
1374 break;
1375
1376 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1377 page = spd.pages[page_nr];
1378
1379 if (!PageUptodate(page) || page->mapping != mapping) {
1380 error = shmem_getpage(inode, index, &page,
1381 SGP_CACHE, NULL);
1382 if (error)
1383 break;
1384 unlock_page(page);
1385 page_cache_release(spd.pages[page_nr]);
1386 spd.pages[page_nr] = page;
1387 }
1388
1389 isize = i_size_read(inode);
1390 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1391 if (unlikely(!isize || index > end_index))
1392 break;
1393
1394 if (end_index == index) {
1395 unsigned int plen;
1396
1397 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1398 if (plen <= loff)
1399 break;
1400
1401 this_len = min(this_len, plen - loff);
1402 len = this_len;
1403 }
1404
1405 spd.partial[page_nr].offset = loff;
1406 spd.partial[page_nr].len = this_len;
1407 len -= this_len;
1408 loff = 0;
1409 spd.nr_pages++;
1410 index++;
1411 }
1412
1413 while (page_nr < nr_pages)
1414 page_cache_release(spd.pages[page_nr++]);
1415
1416 if (spd.nr_pages)
1417 error = splice_to_pipe(pipe, &spd);
1418
1419 splice_shrink_spd(pipe, &spd);
1420
1421 if (error > 0) {
1422 *ppos += error;
1423 file_accessed(in);
1424 }
1425 return error;
1426}
1427
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1428static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1429{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1430 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1855,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1855 buf->f_namelen = NAME_MAX; 1434 buf->f_namelen = NAME_MAX;
1856 if (sbinfo->max_blocks) { 1435 if (sbinfo->max_blocks) {
1857 buf->f_blocks = sbinfo->max_blocks; 1436 buf->f_blocks = sbinfo->max_blocks;
1858 buf->f_bavail = buf->f_bfree = 1437 buf->f_bavail =
1859 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1438 buf->f_bfree = sbinfo->max_blocks -
1439 percpu_counter_sum(&sbinfo->used_blocks);
1860 } 1440 }
1861 if (sbinfo->max_inodes) { 1441 if (sbinfo->max_inodes) {
1862 buf->f_files = sbinfo->max_inodes; 1442 buf->f_files = sbinfo->max_inodes;
@@ -2006,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 1586 int error;
2007 int len; 1587 int len;
2008 struct inode *inode; 1588 struct inode *inode;
2009 struct page *page = NULL; 1589 struct page *page;
2010 char *kaddr; 1590 char *kaddr;
2011 struct shmem_inode_info *info; 1591 struct shmem_inode_info *info;
2012 1592
@@ -2030,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2030 1610
2031 info = SHMEM_I(inode); 1611 info = SHMEM_I(inode);
2032 inode->i_size = len-1; 1612 inode->i_size = len-1;
2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1613 if (len <= SHORT_SYMLINK_LEN) {
2034 /* do it inline */ 1614 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2035 memcpy(info->inline_symlink, symname, len); 1615 if (!info->symlink) {
2036 inode->i_op = &shmem_symlink_inline_operations; 1616 iput(inode);
1617 return -ENOMEM;
1618 }
1619 inode->i_op = &shmem_short_symlink_operations;
2037 } else { 1620 } else {
2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1621 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2039 if (error) { 1622 if (error) {
@@ -2056,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2056 return 0; 1639 return 0;
2057} 1640}
2058 1641
2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1642static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2060{ 1643{
2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1644 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2062 return NULL; 1645 return NULL;
2063} 1646}
2064 1647
2065static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2066{ 1649{
2067 struct page *page = NULL; 1650 struct page *page = NULL;
2068 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1651 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2069 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1652 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2070 if (page) 1653 if (page)
2071 unlock_page(page); 1654 unlock_page(page);
2072 return page; 1655 return page;
@@ -2177,7 +1760,6 @@ out:
2177 return err; 1760 return err;
2178} 1761}
2179 1762
2180
2181static const struct xattr_handler *shmem_xattr_handlers[] = { 1763static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL 1764#ifdef CONFIG_TMPFS_POSIX_ACL
2183 &generic_acl_access_handler, 1765 &generic_acl_access_handler,
@@ -2307,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2307} 1889}
2308#endif /* CONFIG_TMPFS_XATTR */ 1890#endif /* CONFIG_TMPFS_XATTR */
2309 1891
2310static const struct inode_operations shmem_symlink_inline_operations = { 1892static const struct inode_operations shmem_short_symlink_operations = {
2311 .readlink = generic_readlink, 1893 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline, 1894 .follow_link = shmem_follow_short_symlink,
2313#ifdef CONFIG_TMPFS_XATTR 1895#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr, 1896 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr, 1897 .getxattr = shmem_getxattr,
@@ -2509,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2509 if (config.max_inodes < inodes) 2091 if (config.max_inodes < inodes)
2510 goto out; 2092 goto out;
2511 /* 2093 /*
2512 * Those tests also disallow limited->unlimited while any are in 2094 * Those tests disallow limited->unlimited while any are in use;
2513 * use, so i_blocks will always be zero when max_blocks is zero;
2514 * but we must separately disallow unlimited->limited, because 2095 * but we must separately disallow unlimited->limited, because
2515 * in that case we have no record of how much is already in use. 2096 * in that case we have no record of how much is already in use.
2516 */ 2097 */
@@ -2602,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2602 goto failed; 2183 goto failed;
2603 sbinfo->free_inodes = sbinfo->max_inodes; 2184 sbinfo->free_inodes = sbinfo->max_inodes;
2604 2185
2605 sb->s_maxbytes = SHMEM_MAX_BYTES; 2186 sb->s_maxbytes = MAX_LFS_FILESIZE;
2606 sb->s_blocksize = PAGE_CACHE_SIZE; 2187 sb->s_blocksize = PAGE_CACHE_SIZE;
2607 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2188 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2608 sb->s_magic = TMPFS_MAGIC; 2189 sb->s_magic = TMPFS_MAGIC;
@@ -2637,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
2637 2218
2638static struct inode *shmem_alloc_inode(struct super_block *sb) 2219static struct inode *shmem_alloc_inode(struct super_block *sb)
2639{ 2220{
2640 struct shmem_inode_info *p; 2221 struct shmem_inode_info *info;
2641 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2222 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2642 if (!p) 2223 if (!info)
2643 return NULL; 2224 return NULL;
2644 return &p->vfs_inode; 2225 return &info->vfs_inode;
2645} 2226}
2646 2227
2647static void shmem_i_callback(struct rcu_head *head) 2228static void shmem_destroy_callback(struct rcu_head *head)
2648{ 2229{
2649 struct inode *inode = container_of(head, struct inode, i_rcu); 2230 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry); 2231 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2653,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
2653 2234
2654static void shmem_destroy_inode(struct inode *inode) 2235static void shmem_destroy_inode(struct inode *inode)
2655{ 2236{
2656 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2237 if ((inode->i_mode & S_IFMT) == S_IFREG)
2657 /* only struct inode is valid if it's an inline symlink */
2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2238 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2659 } 2239 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2661} 2240}
2662 2241
2663static void init_once(void *foo) 2242static void shmem_init_inode(void *foo)
2664{ 2243{
2665 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2244 struct shmem_inode_info *info = foo;
2666 2245 inode_init_once(&info->vfs_inode);
2667 inode_init_once(&p->vfs_inode);
2668} 2246}
2669 2247
2670static int init_inodecache(void) 2248static int shmem_init_inodecache(void)
2671{ 2249{
2672 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2250 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2673 sizeof(struct shmem_inode_info), 2251 sizeof(struct shmem_inode_info),
2674 0, SLAB_PANIC, init_once); 2252 0, SLAB_PANIC, shmem_init_inode);
2675 return 0; 2253 return 0;
2676} 2254}
2677 2255
2678static void destroy_inodecache(void) 2256static void shmem_destroy_inodecache(void)
2679{ 2257{
2680 kmem_cache_destroy(shmem_inode_cachep); 2258 kmem_cache_destroy(shmem_inode_cachep);
2681} 2259}
@@ -2684,7 +2262,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2262 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2263 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2264#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2265 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2266 .write_end = shmem_write_end,
2690#endif 2267#endif
@@ -2701,7 +2278,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2278 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2279 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2280 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2281 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2282 .splice_write = generic_file_splice_write,
2706#endif 2283#endif
2707}; 2284};
@@ -2715,10 +2292,6 @@ static const struct inode_operations shmem_inode_operations = {
2715 .listxattr = shmem_listxattr, 2292 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr, 2293 .removexattr = shmem_removexattr,
2717#endif 2294#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719 .check_acl = generic_check_acl,
2720#endif
2721
2722}; 2295};
2723 2296
2724static const struct inode_operations shmem_dir_inode_operations = { 2297static const struct inode_operations shmem_dir_inode_operations = {
@@ -2741,7 +2314,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
2741#endif 2314#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL 2315#ifdef CONFIG_TMPFS_POSIX_ACL
2743 .setattr = shmem_setattr, 2316 .setattr = shmem_setattr,
2744 .check_acl = generic_check_acl,
2745#endif 2317#endif
2746}; 2318};
2747 2319
@@ -2754,7 +2326,6 @@ static const struct inode_operations shmem_special_inode_operations = {
2754#endif 2326#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL 2327#ifdef CONFIG_TMPFS_POSIX_ACL
2756 .setattr = shmem_setattr, 2328 .setattr = shmem_setattr,
2757 .check_acl = generic_check_acl,
2758#endif 2329#endif
2759}; 2330};
2760 2331
@@ -2779,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2779#endif 2350#endif
2780}; 2351};
2781 2352
2782
2783static struct dentry *shmem_mount(struct file_system_type *fs_type, 2353static struct dentry *shmem_mount(struct file_system_type *fs_type,
2784 int flags, const char *dev_name, void *data) 2354 int flags, const char *dev_name, void *data)
2785{ 2355{
2786 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2356 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2787} 2357}
2788 2358
2789static struct file_system_type tmpfs_fs_type = { 2359static struct file_system_type shmem_fs_type = {
2790 .owner = THIS_MODULE, 2360 .owner = THIS_MODULE,
2791 .name = "tmpfs", 2361 .name = "tmpfs",
2792 .mount = shmem_mount, 2362 .mount = shmem_mount,
2793 .kill_sb = kill_litter_super, 2363 .kill_sb = kill_litter_super,
2794}; 2364};
2795 2365
2796int __init init_tmpfs(void) 2366int __init shmem_init(void)
2797{ 2367{
2798 int error; 2368 int error;
2799 2369
@@ -2801,18 +2371,18 @@ int __init init_tmpfs(void)
2801 if (error) 2371 if (error)
2802 goto out4; 2372 goto out4;
2803 2373
2804 error = init_inodecache(); 2374 error = shmem_init_inodecache();
2805 if (error) 2375 if (error)
2806 goto out3; 2376 goto out3;
2807 2377
2808 error = register_filesystem(&tmpfs_fs_type); 2378 error = register_filesystem(&shmem_fs_type);
2809 if (error) { 2379 if (error) {
2810 printk(KERN_ERR "Could not register tmpfs\n"); 2380 printk(KERN_ERR "Could not register tmpfs\n");
2811 goto out2; 2381 goto out2;
2812 } 2382 }
2813 2383
2814 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2384 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2815 tmpfs_fs_type.name, NULL); 2385 shmem_fs_type.name, NULL);
2816 if (IS_ERR(shm_mnt)) { 2386 if (IS_ERR(shm_mnt)) {
2817 error = PTR_ERR(shm_mnt); 2387 error = PTR_ERR(shm_mnt);
2818 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2388 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2821,9 +2391,9 @@ int __init init_tmpfs(void)
2821 return 0; 2391 return 0;
2822 2392
2823out1: 2393out1:
2824 unregister_filesystem(&tmpfs_fs_type); 2394 unregister_filesystem(&shmem_fs_type);
2825out2: 2395out2:
2826 destroy_inodecache(); 2396 shmem_destroy_inodecache();
2827out3: 2397out3:
2828 bdi_destroy(&shmem_backing_dev_info); 2398 bdi_destroy(&shmem_backing_dev_info);
2829out4: 2399out4:
@@ -2831,45 +2401,6 @@ out4:
2831 return error; 2401 return error;
2832} 2402}
2833 2403
2834#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2835/**
2836 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2837 * @inode: the inode to be searched
2838 * @pgoff: the offset to be searched
2839 * @pagep: the pointer for the found page to be stored
2840 * @ent: the pointer for the found swap entry to be stored
2841 *
2842 * If a page is found, refcount of it is incremented. Callers should handle
2843 * these refcount.
2844 */
2845void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2846 struct page **pagep, swp_entry_t *ent)
2847{
2848 swp_entry_t entry = { .val = 0 }, *ptr;
2849 struct page *page = NULL;
2850 struct shmem_inode_info *info = SHMEM_I(inode);
2851
2852 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2853 goto out;
2854
2855 spin_lock(&info->lock);
2856 ptr = shmem_swp_entry(info, pgoff, NULL);
2857#ifdef CONFIG_SWAP
2858 if (ptr && ptr->val) {
2859 entry.val = ptr->val;
2860 page = find_get_page(&swapper_space, entry.val);
2861 } else
2862#endif
2863 page = find_get_page(inode->i_mapping, pgoff);
2864 if (ptr)
2865 shmem_swp_unmap(ptr);
2866 spin_unlock(&info->lock);
2867out:
2868 *pagep = page;
2869 *ent = entry;
2870}
2871#endif
2872
2873#else /* !CONFIG_SHMEM */ 2404#else /* !CONFIG_SHMEM */
2874 2405
2875/* 2406/*
@@ -2883,23 +2414,23 @@ out:
2883 2414
2884#include <linux/ramfs.h> 2415#include <linux/ramfs.h>
2885 2416
2886static struct file_system_type tmpfs_fs_type = { 2417static struct file_system_type shmem_fs_type = {
2887 .name = "tmpfs", 2418 .name = "tmpfs",
2888 .mount = ramfs_mount, 2419 .mount = ramfs_mount,
2889 .kill_sb = kill_litter_super, 2420 .kill_sb = kill_litter_super,
2890}; 2421};
2891 2422
2892int __init init_tmpfs(void) 2423int __init shmem_init(void)
2893{ 2424{
2894 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2425 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2895 2426
2896 shm_mnt = kern_mount(&tmpfs_fs_type); 2427 shm_mnt = kern_mount(&shmem_fs_type);
2897 BUG_ON(IS_ERR(shm_mnt)); 2428 BUG_ON(IS_ERR(shm_mnt));
2898 2429
2899 return 0; 2430 return 0;
2900} 2431}
2901 2432
2902int shmem_unuse(swp_entry_t entry, struct page *page) 2433int shmem_unuse(swp_entry_t swap, struct page *page)
2903{ 2434{
2904 return 0; 2435 return 0;
2905} 2436}
@@ -2909,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2909 return 0; 2440 return 0;
2910} 2441}
2911 2442
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2443void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2913{ 2444{
2914 truncate_inode_pages_range(inode->i_mapping, start, end); 2445 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2915} 2446}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range); 2447EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917 2448
2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2919/**
2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2921 * @inode: the inode to be searched
2922 * @pgoff: the offset to be searched
2923 * @pagep: the pointer for the found page to be stored
2924 * @ent: the pointer for the found swap entry to be stored
2925 *
2926 * If a page is found, refcount of it is incremented. Callers should handle
2927 * these refcount.
2928 */
2929void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2930 struct page **pagep, swp_entry_t *ent)
2931{
2932 struct page *page = NULL;
2933
2934 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2935 goto out;
2936 page = find_get_page(inode->i_mapping, pgoff);
2937out:
2938 *pagep = page;
2939 *ent = (swp_entry_t){ .val = 0 };
2940}
2941#endif
2942
2943#define shmem_vm_ops generic_file_vm_ops 2449#define shmem_vm_ops generic_file_vm_ops
2944#define shmem_file_operations ramfs_file_operations 2450#define shmem_file_operations ramfs_file_operations
2945#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2451#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2946#define shmem_acct_size(flags, size) 0 2452#define shmem_acct_size(flags, size) 0
2947#define shmem_unacct_size(flags, size) do {} while (0) 2453#define shmem_unacct_size(flags, size) do {} while (0)
2948#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2949 2454
2950#endif /* CONFIG_SHMEM */ 2455#endif /* CONFIG_SHMEM */
2951 2456
@@ -2969,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2969 if (IS_ERR(shm_mnt)) 2474 if (IS_ERR(shm_mnt))
2970 return (void *)shm_mnt; 2475 return (void *)shm_mnt;
2971 2476
2972 if (size < 0 || size > SHMEM_MAX_BYTES) 2477 if (size < 0 || size > MAX_LFS_FILESIZE)
2973 return ERR_PTR(-EINVAL); 2478 return ERR_PTR(-EINVAL);
2974 2479
2975 if (shmem_acct_size(flags, size)) 2480 if (shmem_acct_size(flags, size))
@@ -3015,6 +2520,15 @@ put_memory:
3015} 2520}
3016EXPORT_SYMBOL_GPL(shmem_file_setup); 2521EXPORT_SYMBOL_GPL(shmem_file_setup);
3017 2522
2523void shmem_set_file(struct vm_area_struct *vma, struct file *file)
2524{
2525 if (vma->vm_file)
2526 fput(vma->vm_file);
2527 vma->vm_file = file;
2528 vma->vm_ops = &shmem_vm_ops;
2529 vma->vm_flags |= VM_CAN_NONLINEAR;
2530}
2531
3018/** 2532/**
3019 * shmem_zero_setup - setup a shared anonymous mapping 2533 * shmem_zero_setup - setup a shared anonymous mapping
3020 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 2534 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3028,11 +2542,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 if (IS_ERR(file)) 2542 if (IS_ERR(file))
3029 return PTR_ERR(file); 2543 return PTR_ERR(file);
3030 2544
3031 if (vma->vm_file) 2545 shmem_set_file(vma, file);
3032 fput(vma->vm_file);
3033 vma->vm_file = file;
3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3036 return 0; 2546 return 0;
3037} 2547}
3038 2548
@@ -3048,13 +2558,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2558 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2559 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 2560 *
3051 * Provide a stub for those callers to start using now, then later 2561 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 2562 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 2563 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2564struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 2565 pgoff_t index, gfp_t gfp)
3057{ 2566{
2567#ifdef CONFIG_SHMEM
2568 struct inode *inode = mapping->host;
2569 struct page *page;
2570 int error;
2571
2572 BUG_ON(mapping->a_ops != &shmem_aops);
2573 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2574 if (error)
2575 page = ERR_PTR(error);
2576 else
2577 unlock_page(page);
2578 return page;
2579#else
2580 /*
2581 * The tiny !SHMEM case uses ramfs without swap
2582 */
3058 return read_cache_page_gfp(mapping, index, gfp); 2583 return read_cache_page_gfp(mapping, index, gfp);
2584#endif
3059} 2585}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2586EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
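A hypothetical driver-side caller of the helper above, following the i915 pattern the comment describes (driver_get_backing_page() is an illustrative name, not part of this patch): take the mapping's own gfp mask and mix in __GFP_NORETRY | __GFP_NOWARN, so a failed read returns an error the driver can recover from instead of invoking the OOM killer.

static struct page *driver_get_backing_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}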
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..893c76df924 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
575 575
576/* internal cache of cache description objs */ 576/* internal cache of cache description objs */
577static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
577static struct kmem_cache cache_cache = { 578static struct kmem_cache cache_cache = {
579 .nodelists = cache_cache_nodelists,
578 .batchcount = 1, 580 .batchcount = 1,
579 .limit = BOOT_CPUCACHE_ENTRIES, 581 .limit = BOOT_CPUCACHE_ENTRIES,
580 .shared = 1, 582 .shared = 1,
@@ -593,6 +595,7 @@ static enum {
593 PARTIAL_AC, 595 PARTIAL_AC,
594 PARTIAL_L3, 596 PARTIAL_L3,
595 EARLY, 597 EARLY,
598 LATE,
596 FULL 599 FULL
597} g_cpucache_up; 600} g_cpucache_up;
598 601
@@ -620,37 +623,67 @@ int slab_is_available(void)
620static struct lock_class_key on_slab_l3_key; 623static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 624static struct lock_class_key on_slab_alc_key;
622 625
626static struct lock_class_key debugobj_l3_key;
627static struct lock_class_key debugobj_alc_key;
628
629static void slab_set_lock_classes(struct kmem_cache *cachep,
630 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
631 int q)
632{
633 struct array_cache **alc;
634 struct kmem_list3 *l3;
635 int r;
636
637 l3 = cachep->nodelists[q];
638 if (!l3)
639 return;
640
641 lockdep_set_class(&l3->list_lock, l3_key);
642 alc = l3->alien;
643 /*
644 * FIXME: This check for BAD_ALIEN_MAGIC
645 * should go away when common slab code is taught to
646 * work even without alien caches.
 647 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
 648 * for alloc_alien_cache().
649 */
650 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
651 return;
652 for_each_node(r) {
653 if (alc[r])
654 lockdep_set_class(&alc[r]->lock, alc_key);
655 }
656}
657
658static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
659{
660 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
661}
662
663static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
664{
665 int node;
666
667 for_each_online_node(node)
668 slab_set_debugobj_lock_classes_node(cachep, node);
669}
670
623static void init_node_lock_keys(int q) 671static void init_node_lock_keys(int q)
624{ 672{
625 struct cache_sizes *s = malloc_sizes; 673 struct cache_sizes *s = malloc_sizes;
626 674
627 if (g_cpucache_up != FULL) 675 if (g_cpucache_up < LATE)
628 return; 676 return;
629 677
630 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 678 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
631 struct array_cache **alc;
632 struct kmem_list3 *l3; 679 struct kmem_list3 *l3;
633 int r;
634 680
635 l3 = s->cs_cachep->nodelists[q]; 681 l3 = s->cs_cachep->nodelists[q];
636 if (!l3 || OFF_SLAB(s->cs_cachep)) 682 if (!l3 || OFF_SLAB(s->cs_cachep))
637 continue; 683 continue;
638 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 684
639 alc = l3->alien; 685 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
640 /* 686 &on_slab_alc_key, q);
641 * FIXME: This check for BAD_ALIEN_MAGIC
642 * should go away when common slab code is taught to
643 * work even without alien caches.
644 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
645 * for alloc_alien_cache,
646 */
647 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
648 continue;
649 for_each_node(r) {
650 if (alc[r])
651 lockdep_set_class(&alc[r]->lock,
652 &on_slab_alc_key);
653 }
654 } 687 }
655} 688}
656 689
@@ -669,6 +702,14 @@ static void init_node_lock_keys(int q)
669static inline void init_lock_keys(void) 702static inline void init_lock_keys(void)
670{ 703{
671} 704}
705
706static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
707{
708}
709
710static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
711{
712}
672#endif 713#endif
673 714
674/* 715/*
@@ -1262,6 +1303,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1262 spin_unlock_irq(&l3->list_lock); 1303 spin_unlock_irq(&l3->list_lock);
1263 kfree(shared); 1304 kfree(shared);
1264 free_alien_cache(alien); 1305 free_alien_cache(alien);
1306 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1307 slab_set_debugobj_lock_classes_node(cachep, node);
1265 } 1308 }
1266 init_node_lock_keys(node); 1309 init_node_lock_keys(node);
1267 1310
@@ -1492,11 +1535,10 @@ void __init kmem_cache_init(void)
1492 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1535 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1493 1536
1494 /* 1537 /*
1495 * struct kmem_cache size depends on nr_node_ids, which 1538 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1496 * can be less than MAX_NUMNODES.
1497 */ 1539 */
1498 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1540 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1499 nr_node_ids * sizeof(struct kmem_list3 *); 1541 nr_node_ids * sizeof(struct kmem_list3 *);
1500#if DEBUG 1542#if DEBUG
1501 cache_cache.obj_size = cache_cache.buffer_size; 1543 cache_cache.obj_size = cache_cache.buffer_size;
1502#endif 1544#endif
@@ -1625,6 +1667,11 @@ void __init kmem_cache_init_late(void)
1625{ 1667{
1626 struct kmem_cache *cachep; 1668 struct kmem_cache *cachep;
1627 1669
1670 g_cpucache_up = LATE;
1671
1672 /* Annotate slab for lockdep -- annotate the malloc caches */
1673 init_lock_keys();
1674
1628 /* 6) resize the head arrays to their final sizes */ 1675 /* 6) resize the head arrays to their final sizes */
1629 mutex_lock(&cache_chain_mutex); 1676 mutex_lock(&cache_chain_mutex);
1630 list_for_each_entry(cachep, &cache_chain, next) 1677 list_for_each_entry(cachep, &cache_chain, next)
@@ -1635,9 +1682,6 @@ void __init kmem_cache_init_late(void)
1635 /* Done! */ 1682 /* Done! */
1636 g_cpucache_up = FULL; 1683 g_cpucache_up = FULL;
1637 1684
1638 /* Annotate slab for lockdep -- annotate the malloc caches */
1639 init_lock_keys();
1640
1641 /* 1685 /*
1642 * Register a cpu startup notifier callback that initializes 1686 * Register a cpu startup notifier callback that initializes
1643 * cpu_cache_get for all new cpus 1687 * cpu_cache_get for all new cpus
@@ -2308,6 +2352,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2308 if (!cachep) 2352 if (!cachep)
2309 goto oops; 2353 goto oops;
2310 2354
2355 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2311#if DEBUG 2356#if DEBUG
2312 cachep->obj_size = size; 2357 cachep->obj_size = size;
2313 2358
@@ -2424,6 +2469,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2424 goto oops; 2469 goto oops;
2425 } 2470 }
2426 2471
2472 if (flags & SLAB_DEBUG_OBJECTS) {
2473 /*
2474 * Would deadlock through slab_destroy()->call_rcu()->
2475 * debug_object_activate()->kmem_cache_alloc().
2476 */
2477 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2478
2479 slab_set_debugobj_lock_classes(cachep);
2480 }
2481
2427 /* cache setup completed, link it into the list */ 2482 /* cache setup completed, link it into the list */
2428 list_add(&cachep->next, &cache_chain); 2483 list_add(&cachep->next, &cache_chain);
2429oops: 2484oops:
@@ -3153,12 +3208,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3153 objp += obj_offset(cachep); 3208 objp += obj_offset(cachep);
3154 if (cachep->ctor && cachep->flags & SLAB_POISON) 3209 if (cachep->ctor && cachep->flags & SLAB_POISON)
3155 cachep->ctor(objp); 3210 cachep->ctor(objp);
3156#if ARCH_SLAB_MINALIGN 3211 if (ARCH_SLAB_MINALIGN &&
3157 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3212 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3158 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3213 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3159 objp, ARCH_SLAB_MINALIGN); 3214 objp, (int)ARCH_SLAB_MINALIGN);
3160 } 3215 }
3161#endif
3162 return objp; 3216 return objp;
3163} 3217}
3164#else 3218#else
@@ -3402,7 +3456,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3402 cache_alloc_debugcheck_before(cachep, flags); 3456 cache_alloc_debugcheck_before(cachep, flags);
3403 local_irq_save(save_flags); 3457 local_irq_save(save_flags);
3404 3458
3405 if (nodeid == -1) 3459 if (nodeid == NUMA_NO_NODE)
3406 nodeid = slab_node; 3460 nodeid = slab_node;
3407 3461
3408 if (unlikely(!cachep->nodelists[nodeid])) { 3462 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3933,7 +3987,7 @@ fail:
3933 3987
3934struct ccupdate_struct { 3988struct ccupdate_struct {
3935 struct kmem_cache *cachep; 3989 struct kmem_cache *cachep;
3936 struct array_cache *new[NR_CPUS]; 3990 struct array_cache *new[0];
3937}; 3991};
3938 3992
3939static void do_ccupdate_local(void *info) 3993static void do_ccupdate_local(void *info)
@@ -3955,7 +4009,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3955 struct ccupdate_struct *new; 4009 struct ccupdate_struct *new;
3956 int i; 4010 int i;
3957 4011
3958 new = kzalloc(sizeof(*new), gfp); 4012 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4013 gfp);
3959 if (!new) 4014 if (!new)
3960 return -ENOMEM; 4015 return -ENOMEM;
3961 4016
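The ccupdate_struct change above replaces a fixed struct array_cache *new[NR_CPUS] with a trailing array sized by nr_cpu_ids at allocation time, so the allocation only pays for the cpus that can actually exist. A minimal sketch of the same pattern (struct percpu_ptrs and alloc_percpu_ptrs are illustrative names, not from the patch):

struct percpu_ptrs {
	void *owner;
	void *ptrs[0];		/* sized when the struct is allocated */
};

static struct percpu_ptrs *alloc_percpu_ptrs(int nr, gfp_t gfp)
{
	/* one header plus nr pointer slots, zeroed */
	return kzalloc(sizeof(struct percpu_ptrs) + nr * sizeof(void *), gfp);
}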
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a2..bf391818716 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -70,7 +70,7 @@
70 70
71#include <trace/events/kmem.h> 71#include <trace/events/kmem.h>
72 72
73#include <asm/atomic.h> 73#include <linux/atomic.h>
74 74
75/* 75/*
76 * slob_block has a field 'units', which indicates size of block if +ve, 76 * slob_block has a field 'units', which indicates size of block if +ve,
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
483 void *ret; 483 void *ret;
484 484
485 gfp &= gfp_allowed_mask;
486
485 lockdep_trace_alloc(gfp); 487 lockdep_trace_alloc(gfp);
486 488
487 if (size < PAGE_SIZE - align) { 489 if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
608{ 610{
609 void *b; 611 void *b;
610 612
613 flags &= gfp_allowed_mask;
614
615 lockdep_trace_alloc(flags);
616
611 if (c->size < PAGE_SIZE) { 617 if (c->size < PAGE_SIZE) {
612 b = slob_alloc(c->size, flags, c->align, node); 618 b = slob_alloc(c->size, flags, c->align, node);
613 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 619 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f2619..f73234db904 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
 5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -27,20 +28,33 @@
27#include <linux/memory.h> 28#include <linux/memory.h>
28#include <linux/math64.h> 29#include <linux/math64.h>
29#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <linux/stacktrace.h>
30 32
31#include <trace/events/kmem.h> 33#include <trace/events/kmem.h>
32 34
33/* 35/*
34 * Lock order: 36 * Lock order:
35 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
36 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
37 * 40 *
38 * The slab_lock protects operations on the object of a particular 41 * slub_lock
39 * slab and its metadata in the page struct. If the slab lock 42 *
40 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
41 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
42 * from the partial or full lists since this would mean modifying 45 *
43 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of objects free in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
44 * 58 *
45 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
46 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -53,20 +67,6 @@
53 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
54 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
55 * the list lock. 69 * the list lock.
56 *
57 * The lock order is sometimes inverted when we are trying to get a slab
58 * off a list. We take the list_lock and then look for a page on the list
59 * to use. While we do that objects in the slabs may be freed. We can
60 * only operate on the slab if we have also taken the slab_lock. So we use
61 * a slab_trylock() on the slab. If trylock was successful then no frees
62 * can occur anymore and we can use the slab for allocations etc. If the
63 * slab_trylock() does not succeed then frees are in progress in the slab and
64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention.
69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
71 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
72 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
131/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
132#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
133 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
134/* 137/*
135 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
136 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
166 169
167#define OO_SHIFT 16 170#define OO_SHIFT 16
168#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
169#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
170 173
171/* Internal SLUB flags */ 174/* Internal SLUB flags */
172#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
173 177
174static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
175 179
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches);
191/* 195/*
192 * Tracking user of a slab. 196 * Tracking user of a slab.
193 */ 197 */
198#define TRACK_ADDRS_COUNT 16
194struct track { 199struct track {
195 unsigned long addr; /* Called from address */ 200 unsigned long addr; /* Called from address */
201#ifdef CONFIG_STACKTRACE
202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
203#endif
196 int cpu; /* Was running on cpu */ 204 int cpu; /* Was running on cpu */
197 int pid; /* Pid context */ 205 int pid; /* Pid context */
198 unsigned long when; /* When did the operation occur */ 206 unsigned long when; /* When did the operation occur */
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 return x.x & OO_MASK; 346 return x.x & OO_MASK;
339} 347}
340 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
341#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
342/* 438/*
343 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
344 * 440 *
345 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
346 * not vanish from under us. 442 * not vanish from under us.
347 */ 443 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
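The new __cmpxchg_double_slab()/cmpxchg_double_slab() helpers above try a double-word compare-and-exchange on page->freelist and page->counters together, and fall back to the page bit-lock plus a by-hand comparison of both words when __CMPXCHG_DOUBLE is not set for the cache. A userspace sketch of the same compare-both-words-or-fail semantics, using GCC's generic __atomic_compare_exchange on a two-word struct (whether this compiles to a true lock-free cmpxchg16b or a libatomic fallback is platform dependent; the struct and function names are invented):

#include <stdbool.h>
#include <stdio.h>

struct slab_words {
        void *freelist;
        unsigned long counters;
};

/* Atomically replace (freelist, counters) only if both still match. */
static bool cmpxchg_double_sketch(struct slab_words *s,
                                  void *old_free, unsigned long old_cnt,
                                  void *new_free, unsigned long new_cnt)
{
        struct slab_words expected = { old_free, old_cnt };
        struct slab_words desired  = { new_free, new_cnt };

        /* May need -latomic; strong CAS, sequentially consistent. */
        return __atomic_compare_exchange(s, &expected, &desired,
                                         false, __ATOMIC_SEQ_CST,
                                         __ATOMIC_SEQ_CST);
}

int main(void)
{
        int obj;
        struct slab_words s = { NULL, 3 };

        if (cmpxchg_double_sketch(&s, NULL, 3, &obj, 4))
                printf("swapped: counters=%lu\n", s.counters);
        if (!cmpxchg_double_sketch(&s, NULL, 3, NULL, 0))
                printf("stale copy rejected\n");
        return 0;
}

The kernel variants additionally bump the CMPXCHG_DOUBLE_FAIL statistic and retry after cpu_relax(); the sketch only shows the core update rule that both words must be unchanged for the swap to commit.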
@@ -420,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object,
420 struct track *p = get_track(s, object, alloc); 516 struct track *p = get_track(s, object, alloc);
421 517
422 if (addr) { 518 if (addr) {
519#ifdef CONFIG_STACKTRACE
520 struct stack_trace trace;
521 int i;
522
523 trace.nr_entries = 0;
524 trace.max_entries = TRACK_ADDRS_COUNT;
525 trace.entries = p->addrs;
526 trace.skip = 3;
527 save_stack_trace(&trace);
528
529 /* See rant in lockdep.c */
530 if (trace.nr_entries != 0 &&
531 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
532 trace.nr_entries--;
533
534 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
535 p->addrs[i] = 0;
536#endif
423 p->addr = addr; 537 p->addr = addr;
424 p->cpu = smp_processor_id(); 538 p->cpu = smp_processor_id();
425 p->pid = current->pid; 539 p->pid = current->pid;
@@ -444,6 +558,16 @@ static void print_track(const char *s, struct track *t)
444 558
445 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 559 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
446 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 560 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
561#ifdef CONFIG_STACKTRACE
562 {
563 int i;
564 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
565 if (t->addrs[i])
566 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
567 else
568 break;
569 }
570#endif
447} 571}
448 572
449static void print_tracking(struct kmem_cache *s, void *object) 573static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
557 memset(p + s->objsize, val, s->inuse - s->objsize); 681 memset(p + s->objsize, val, s->inuse - s->objsize);
558} 682}
559 683
560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 684static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
561{ 685{
562 while (bytes) { 686 while (bytes) {
563 if (*start != (u8)value) 687 if (*start != value)
564 return start; 688 return start;
565 start++; 689 start++;
566 bytes--; 690 bytes--;
@@ -568,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
568 return NULL; 692 return NULL;
569} 693}
570 694
695static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
696{
697 u64 value64;
698 unsigned int words, prefix;
699
700 if (bytes <= 16)
701 return check_bytes8(start, value, bytes);
702
703 value64 = value | value << 8 | value << 16 | value << 24;
704 value64 = (value64 & 0xffffffff) | value64 << 32;
705 prefix = 8 - ((unsigned long)start) % 8;
706
707 if (prefix) {
708 u8 *r = check_bytes8(start, value, prefix);
709 if (r)
710 return r;
711 start += prefix;
712 bytes -= prefix;
713 }
714
715 words = bytes / 8;
716
717 while (words) {
718 if (*(u64 *)start != value64)
719 return check_bytes8(start, value, 8);
720 start += 8;
721 words--;
722 }
723
724 return check_bytes8(start, value, bytes % 8);
725}
726
571static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 727static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
572 void *from, void *to) 728 void *from, void *to)
573{ 729{
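The new check_bytes() above scans a poisoned region a u64 at a time once the start pointer is aligned, falling back to check_bytes8() for the unaligned prefix, for any mismatching word, and for the tail. A self-contained sketch of that scan (userspace C; the 64-bit pattern is built with a multiply rather than the shift chain in the patch, and the aligned-start case is handled slightly differently):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Byte-at-a-time scan; returns first mismatch or NULL. */
static uint8_t *scan8(uint8_t *start, uint8_t value, size_t bytes)
{
        for (; bytes; start++, bytes--)
                if (*start != value)
                        return start;
        return NULL;
}

static uint8_t *scan(uint8_t *start, uint8_t value, size_t bytes)
{
        uint64_t value64 = 0x0101010101010101ULL * value;
        size_t prefix = (8 - (uintptr_t)start % 8) % 8;

        if (bytes <= 16)
                return scan8(start, value, bytes);

        if (prefix) {                                   /* unaligned head */
                uint8_t *r = scan8(start, value, prefix);
                if (r)
                        return r;
                start += prefix;
                bytes -= prefix;
        }
        for (; bytes >= 8; start += 8, bytes -= 8)      /* aligned body */
                if (memcmp(start, &value64, 8))
                        return scan8(start, value, 8);
        return scan8(start, value, bytes);              /* tail */
}

int main(void)
{
        uint8_t buf[64];

        memset(buf, 0x6b, sizeof(buf));         /* POISON_FREE-style fill */
        buf[41] = 0;
        printf("mismatch at offset %td\n", scan(buf, 0x6b, sizeof(buf)) - buf);
        return 0;
}

Because every byte of value64 is identical, the word compare is endian-neutral, and the bytewise rescan of a failing word pins down the exact offending offset for the error report.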
@@ -773,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
773static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 929static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
774{ 930{
775 int nr = 0; 931 int nr = 0;
776 void *fp = page->freelist; 932 void *fp;
777 void *object = NULL; 933 void *object = NULL;
778 unsigned long max_objects; 934 unsigned long max_objects;
779 935
936 fp = page->freelist;
780 while (fp && nr <= page->objects) { 937 while (fp && nr <= page->objects) {
781 if (fp == search) 938 if (fp == search)
782 return 1; 939 return 1;
@@ -881,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
881 1038
882/* 1039/*
883 * Tracking of fully allocated slabs for debugging purposes. 1040 * Tracking of fully allocated slabs for debugging purposes.
1041 *
1042 * list_lock must be held.
884 */ 1043 */
885static void add_full(struct kmem_cache_node *n, struct page *page) 1044static void add_full(struct kmem_cache *s,
1045 struct kmem_cache_node *n, struct page *page)
886{ 1046{
887 spin_lock(&n->list_lock); 1047 if (!(s->flags & SLAB_STORE_USER))
1048 return;
1049
888 list_add(&page->lru, &n->full); 1050 list_add(&page->lru, &n->full);
889 spin_unlock(&n->list_lock);
890} 1051}
891 1052
1053/*
1054 * list_lock must be held.
1055 */
892static void remove_full(struct kmem_cache *s, struct page *page) 1056static void remove_full(struct kmem_cache *s, struct page *page)
893{ 1057{
894 struct kmem_cache_node *n;
895
896 if (!(s->flags & SLAB_STORE_USER)) 1058 if (!(s->flags & SLAB_STORE_USER))
897 return; 1059 return;
898 1060
899 n = get_node(s, page_to_nid(page));
900
901 spin_lock(&n->list_lock);
902 list_del(&page->lru); 1061 list_del(&page->lru);
903 spin_unlock(&n->list_lock);
904} 1062}
905 1063
906/* Tracking of the number of slabs for debugging purposes */ 1064/* Tracking of the number of slabs for debugging purposes */
@@ -956,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
956 if (!check_slab(s, page)) 1114 if (!check_slab(s, page))
957 goto bad; 1115 goto bad;
958 1116
959 if (!on_freelist(s, page, object)) {
960 object_err(s, page, object, "Object already allocated");
961 goto bad;
962 }
963
964 if (!check_valid_pointer(s, page, object)) { 1117 if (!check_valid_pointer(s, page, object)) {
965 object_err(s, page, object, "Freelist Pointer check fails"); 1118 object_err(s, page, object, "Freelist Pointer check fails");
966 goto bad; 1119 goto bad;
@@ -993,6 +1146,12 @@ bad:
993static noinline int free_debug_processing(struct kmem_cache *s, 1146static noinline int free_debug_processing(struct kmem_cache *s,
994 struct page *page, void *object, unsigned long addr) 1147 struct page *page, void *object, unsigned long addr)
995{ 1148{
1149 unsigned long flags;
1150 int rc = 0;
1151
1152 local_irq_save(flags);
1153 slab_lock(page);
1154
996 if (!check_slab(s, page)) 1155 if (!check_slab(s, page))
997 goto fail; 1156 goto fail;
998 1157
@@ -1007,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1007 } 1166 }
1008 1167
1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1010 return 0; 1169 goto out;
1011 1170
1012 if (unlikely(s != page->slab)) { 1171 if (unlikely(s != page->slab)) {
1013 if (!PageSlab(page)) { 1172 if (!PageSlab(page)) {
@@ -1024,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1024 goto fail; 1183 goto fail;
1025 } 1184 }
1026 1185
1027 /* Special debug activities for freeing objects */
1028 if (!PageSlubFrozen(page) && !page->freelist)
1029 remove_full(s, page);
1030 if (s->flags & SLAB_STORE_USER) 1186 if (s->flags & SLAB_STORE_USER)
1031 set_track(s, object, TRACK_FREE, addr); 1187 set_track(s, object, TRACK_FREE, addr);
1032 trace(s, page, object, 0); 1188 trace(s, page, object, 0);
1033 init_object(s, object, SLUB_RED_INACTIVE); 1189 init_object(s, object, SLUB_RED_INACTIVE);
1034 return 1; 1190 rc = 1;
1191out:
1192 slab_unlock(page);
1193 local_irq_restore(flags);
1194 return rc;
1035 1195
1036fail: 1196fail:
1037 slab_fix(s, "Object at 0x%p not freed", object); 1197 slab_fix(s, "Object at 0x%p not freed", object);
1038 return 0; 1198 goto out;
1039} 1199}
1040 1200
1041static int __init setup_slub_debug(char *str) 1201static int __init setup_slub_debug(char *str)
@@ -1135,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1135 { return 1; } 1295 { return 1; }
1136static inline int check_object(struct kmem_cache *s, struct page *page, 1296static inline int check_object(struct kmem_cache *s, struct page *page,
1137 void *object, u8 val) { return 1; } 1297 void *object, u8 val) { return 1; }
1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1298static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1299 struct page *page) {}
1300static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1139static inline unsigned long kmem_cache_flags(unsigned long objsize, 1301static inline unsigned long kmem_cache_flags(unsigned long objsize,
1140 unsigned long flags, const char *name, 1302 unsigned long flags, const char *name,
1141 void (*ctor)(void *)) 1303 void (*ctor)(void *))
@@ -1187,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1187 struct kmem_cache_order_objects oo = s->oo; 1349 struct kmem_cache_order_objects oo = s->oo;
1188 gfp_t alloc_gfp; 1350 gfp_t alloc_gfp;
1189 1351
1352 flags &= gfp_allowed_mask;
1353
1354 if (flags & __GFP_WAIT)
1355 local_irq_enable();
1356
1190 flags |= s->allocflags; 1357 flags |= s->allocflags;
1191 1358
1192 /* 1359 /*
@@ -1203,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1203 * Try a lower order alloc if possible 1370 * Try a lower order alloc if possible
1204 */ 1371 */
1205 page = alloc_slab_page(flags, node, oo); 1372 page = alloc_slab_page(flags, node, oo);
1206 if (!page)
1207 return NULL;
1208 1373
1209 stat(s, ORDER_FALLBACK); 1374 if (page)
1375 stat(s, ORDER_FALLBACK);
1210 } 1376 }
1211 1377
1378 if (flags & __GFP_WAIT)
1379 local_irq_disable();
1380
1381 if (!page)
1382 return NULL;
1383
1212 if (kmemcheck_enabled 1384 if (kmemcheck_enabled
1213 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1214 int pages = 1 << oo_order(oo); 1386 int pages = 1 << oo_order(oo);
@@ -1276,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1276 1448
1277 page->freelist = start; 1449 page->freelist = start;
1278 page->inuse = 0; 1450 page->inuse = 0;
1451 page->frozen = 1;
1279out: 1452out:
1280 return page; 1453 return page;
1281} 1454}
@@ -1353,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1353} 1526}
1354 1527
1355/* 1528/*
1356 * Per slab locking using the pagelock 1529 * Management of partially allocated slabs.
1357 */ 1530 *
1358static __always_inline void slab_lock(struct page *page) 1531 * list_lock must be held.
1359{
1360 bit_spin_lock(PG_locked, &page->flags);
1361}
1362
1363static __always_inline void slab_unlock(struct page *page)
1364{
1365 __bit_spin_unlock(PG_locked, &page->flags);
1366}
1367
1368static __always_inline int slab_trylock(struct page *page)
1369{
1370 int rc = 1;
1371
1372 rc = bit_spin_trylock(PG_locked, &page->flags);
1373 return rc;
1374}
1375
1376/*
1377 * Management of partially allocated slabs
1378 */ 1532 */
1379static void add_partial(struct kmem_cache_node *n, 1533static inline void add_partial(struct kmem_cache_node *n,
1380 struct page *page, int tail) 1534 struct page *page, int tail)
1381{ 1535{
1382 spin_lock(&n->list_lock);
1383 n->nr_partial++; 1536 n->nr_partial++;
1384 if (tail) 1537 if (tail)
1385 list_add_tail(&page->lru, &n->partial); 1538 list_add_tail(&page->lru, &n->partial);
1386 else 1539 else
1387 list_add(&page->lru, &n->partial); 1540 list_add(&page->lru, &n->partial);
1388 spin_unlock(&n->list_lock);
1389} 1541}
1390 1542
1391static inline void __remove_partial(struct kmem_cache_node *n, 1543/*
1544 * list_lock must be held.
1545 */
1546static inline void remove_partial(struct kmem_cache_node *n,
1392 struct page *page) 1547 struct page *page)
1393{ 1548{
1394 list_del(&page->lru); 1549 list_del(&page->lru);
1395 n->nr_partial--; 1550 n->nr_partial--;
1396} 1551}
1397 1552
1398static void remove_partial(struct kmem_cache *s, struct page *page)
1399{
1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1401
1402 spin_lock(&n->list_lock);
1403 __remove_partial(n, page);
1404 spin_unlock(&n->list_lock);
1405}
1406
1407/* 1553/*
1408 * Lock slab and remove from the partial list. 1554 * Lock slab, remove from the partial list and put the object into the
1555 * per cpu freelist.
1409 * 1556 *
1410 * Must hold list_lock. 1557 * Must hold list_lock.
1411 */ 1558 */
1412static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1559static inline int acquire_slab(struct kmem_cache *s,
1413 struct page *page) 1560 struct kmem_cache_node *n, struct page *page)
1414{ 1561{
1415 if (slab_trylock(page)) { 1562 void *freelist;
1416 __remove_partial(n, page); 1563 unsigned long counters;
1417 __SetPageSlubFrozen(page); 1564 struct page new;
1565
1566 /*
1567 * Zap the freelist and set the frozen bit.
1568 * The old freelist is the list of objects for the
1569 * per cpu allocation list.
1570 */
1571 do {
1572 freelist = page->freelist;
1573 counters = page->counters;
1574 new.counters = counters;
1575 new.inuse = page->objects;
1576
1577 VM_BUG_ON(new.frozen);
1578 new.frozen = 1;
1579
1580 } while (!__cmpxchg_double_slab(s, page,
1581 freelist, counters,
1582 NULL, new.counters,
1583 "lock and freeze"));
1584
1585 remove_partial(n, page);
1586
1587 if (freelist) {
1588 /* Populate the per cpu freelist */
1589 this_cpu_write(s->cpu_slab->freelist, freelist);
1590 this_cpu_write(s->cpu_slab->page, page);
1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1418 return 1; 1592 return 1;
1593 } else {
1594 /*
1595 * Slab page came from the wrong list. No object to allocate
1596 * from. Put it onto the correct list and continue partial
1597 * scan.
1598 */
1599 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1600 " partial list\n", s->name);
1601 return 0;
1419 } 1602 }
1420 return 0;
1421} 1603}
1422 1604
1423/* 1605/*
1424 * Try to allocate a partial slab from a specific node. 1606 * Try to allocate a partial slab from a specific node.
1425 */ 1607 */
1426static struct page *get_partial_node(struct kmem_cache_node *n) 1608static struct page *get_partial_node(struct kmem_cache *s,
1609 struct kmem_cache_node *n)
1427{ 1610{
1428 struct page *page; 1611 struct page *page;
1429 1612
@@ -1438,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1438 1621
1439 spin_lock(&n->list_lock); 1622 spin_lock(&n->list_lock);
1440 list_for_each_entry(page, &n->partial, lru) 1623 list_for_each_entry(page, &n->partial, lru)
1441 if (lock_and_freeze_slab(n, page)) 1624 if (acquire_slab(s, n, page))
1442 goto out; 1625 goto out;
1443 page = NULL; 1626 page = NULL;
1444out: 1627out:
@@ -1489,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1489 1672
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1491 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1492 page = get_partial_node(n); 1675 page = get_partial_node(s, n);
1493 if (page) { 1676 if (page) {
1494 put_mems_allowed(); 1677 put_mems_allowed();
1495 return page; 1678 return page;
@@ -1509,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1509 struct page *page; 1692 struct page *page;
1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1511 1694
1512 page = get_partial_node(get_node(s, searchnode)); 1695 page = get_partial_node(s, get_node(s, searchnode));
1513 if (page || node != NUMA_NO_NODE) 1696 if (page || node != NUMA_NO_NODE)
1514 return page; 1697 return page;
1515 1698
1516 return get_any_partial(s, flags); 1699 return get_any_partial(s, flags);
1517} 1700}
1518 1701
1519/*
1520 * Move a page back to the lists.
1521 *
1522 * Must be called with the slab lock held.
1523 *
1524 * On exit the slab lock will have been dropped.
1525 */
1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1528{
1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1530
1531 __ClearPageSlubFrozen(page);
1532 if (page->inuse) {
1533
1534 if (page->freelist) {
1535 add_partial(n, page, tail);
1536 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1537 } else {
1538 stat(s, DEACTIVATE_FULL);
1539 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1540 add_full(n, page);
1541 }
1542 slab_unlock(page);
1543 } else {
1544 stat(s, DEACTIVATE_EMPTY);
1545 if (n->nr_partial < s->min_partial) {
1546 /*
1547 * Adding an empty slab to the partial slabs in order
1548 * to avoid page allocator overhead. This slab needs
1549 * to come after the other slabs with objects in
1550 * so that the others get filled first. That way the
1551 * size of the partial list stays small.
1552 *
1553 * kmem_cache_shrink can reclaim any empty slabs from
1554 * the partial list.
1555 */
1556 add_partial(n, page, 1);
1557 slab_unlock(page);
1558 } else {
1559 slab_unlock(page);
1560 stat(s, FREE_SLAB);
1561 discard_slab(s, page);
1562 }
1563 }
1564}
1565
1566#ifdef CONFIG_PREEMPT 1702#ifdef CONFIG_PREEMPT
1567/* 1703/*
1568 * Calculate the next globally unique transaction for disambiguation 1704 * Calculate the next globally unique transaction for disambiguation
@@ -1632,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1632/* 1768/*
1633 * Remove the cpu slab 1769 * Remove the cpu slab
1634 */ 1770 */
1771
1772/*
1773 * Remove the cpu slab
1774 */
1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1775static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1637{ 1776{
1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1638 struct page *page = c->page; 1778 struct page *page = c->page;
1639 int tail = 1; 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1640 1780 int lock = 0;
1641 if (page->freelist) 1781 enum slab_modes l = M_NONE, m = M_NONE;
1782 void *freelist;
1783 void *nextfree;
1784 int tail = 0;
1785 struct page new;
1786 struct page old;
1787
1788 if (page->freelist) {
1642 stat(s, DEACTIVATE_REMOTE_FREES); 1789 stat(s, DEACTIVATE_REMOTE_FREES);
1790 tail = 1;
1791 }
1792
1793 c->tid = next_tid(c->tid);
1794 c->page = NULL;
1795 freelist = c->freelist;
1796 c->freelist = NULL;
1797
1643 /* 1798 /*
1644 * Merge cpu freelist into slab freelist. Typically we get here 1799 * Stage one: Free all available per cpu objects back
1645 * because both freelists are empty. So this is unlikely 1800 * to the page freelist while it is still frozen. Leave the
1646 * to occur. 1801 * last one.
1802 *
1803 * There is no need to take the list->lock because the page
1804 * is still frozen.
1647 */ 1805 */
1648 while (unlikely(c->freelist)) { 1806 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1649 void **object; 1807 void *prior;
1808 unsigned long counters;
1809
1810 do {
1811 prior = page->freelist;
1812 counters = page->counters;
1813 set_freepointer(s, freelist, prior);
1814 new.counters = counters;
1815 new.inuse--;
1816 VM_BUG_ON(!new.frozen);
1817
1818 } while (!__cmpxchg_double_slab(s, page,
1819 prior, counters,
1820 freelist, new.counters,
1821 "drain percpu freelist"));
1822
1823 freelist = nextfree;
1824 }
1650 1825
1651 tail = 0; /* Hot objects. Put the slab first */ 1826 /*
1827 * Stage two: Ensure that the page is unfrozen while the
1828 * list presence reflects the actual number of objects
1829 * during unfreeze.
1830 *
1831 * We setup the list membership and then perform a cmpxchg
1832 * with the count. If there is a mismatch then the page
1833 * is not unfrozen but the page is on the wrong list.
1834 *
1835 * Then we restart the process which may have to remove
1836 * the page from the list that we just put it on again
1837 * because the number of objects in the slab may have
1838 * changed.
1839 */
1840redo:
1652 1841
1653 /* Retrieve object from cpu_freelist */ 1842 old.freelist = page->freelist;
1654 object = c->freelist; 1843 old.counters = page->counters;
1655 c->freelist = get_freepointer(s, c->freelist); 1844 VM_BUG_ON(!old.frozen);
1656 1845
1657 /* And put onto the regular freelist */ 1846 /* Determine target state of the slab */
1658 set_freepointer(s, object, page->freelist); 1847 new.counters = old.counters;
1659 page->freelist = object; 1848 if (freelist) {
1660 page->inuse--; 1849 new.inuse--;
1850 set_freepointer(s, freelist, old.freelist);
1851 new.freelist = freelist;
1852 } else
1853 new.freelist = old.freelist;
1854
1855 new.frozen = 0;
1856
1857 if (!new.inuse && n->nr_partial > s->min_partial)
1858 m = M_FREE;
1859 else if (new.freelist) {
1860 m = M_PARTIAL;
1861 if (!lock) {
1862 lock = 1;
1863 /*
1864 * Taking the spinlock removes the possibility
1865 * that acquire_slab() will see a slab page that
1866 * is frozen
1867 */
1868 spin_lock(&n->list_lock);
1869 }
1870 } else {
1871 m = M_FULL;
1872 if (kmem_cache_debug(s) && !lock) {
1873 lock = 1;
1874 /*
1875 * This also ensures that the scanning of full
1876 * slabs from diagnostic functions will not see
1877 * any frozen slabs.
1878 */
1879 spin_lock(&n->list_lock);
1880 }
1881 }
1882
1883 if (l != m) {
1884
1885 if (l == M_PARTIAL)
1886
1887 remove_partial(n, page);
1888
1889 else if (l == M_FULL)
1890
1891 remove_full(s, page);
1892
1893 if (m == M_PARTIAL) {
1894
1895 add_partial(n, page, tail);
1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1897
1898 } else if (m == M_FULL) {
1899
1900 stat(s, DEACTIVATE_FULL);
1901 add_full(s, n, page);
1902
1903 }
1904 }
1905
1906 l = m;
1907 if (!__cmpxchg_double_slab(s, page,
1908 old.freelist, old.counters,
1909 new.freelist, new.counters,
1910 "unfreezing slab"))
1911 goto redo;
1912
1913 if (lock)
1914 spin_unlock(&n->list_lock);
1915
1916 if (m == M_FREE) {
1917 stat(s, DEACTIVATE_EMPTY);
1918 discard_slab(s, page);
1919 stat(s, FREE_SLAB);
1661 } 1920 }
1662 c->page = NULL;
1663 c->tid = next_tid(c->tid);
1664 unfreeze_slab(s, page, tail);
1665} 1921}
1666 1922
1667static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1923static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1668{ 1924{
1669 stat(s, CPUSLAB_FLUSH); 1925 stat(s, CPUSLAB_FLUSH);
1670 slab_lock(c->page);
1671 deactivate_slab(s, c); 1926 deactivate_slab(s, c);
1672} 1927}
1673 1928
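The rewritten deactivate_slab() above works in two stages: it first drains the per-cpu freelist back into the still-frozen page with cmpxchg loops, then chooses a target state for the page (discard it, put it on the partial list, or put it on the full list) and commits the unfreeze with one more cmpxchg, retrying the whole placement if the page changed underneath. The placement policy itself is small; a sketch of just that decision, with made-up field names standing in for page->inuse, page->freelist and the node counters:

#include <stdio.h>

enum slab_mode { M_FREE, M_PARTIAL, M_FULL };

struct slab_state {
        unsigned int inuse;     /* objects still allocated from the page */
        int has_free;           /* page->freelist != NULL after the drain */
};

/*
 * Mirror of the policy in deactivate_slab(): empty pages are discarded
 * once the node already holds more than min_partial slabs, pages with
 * free objects go to the partial list, fully used pages go to the full
 * list (which only matters for debug caches).
 */
static enum slab_mode pick_mode(struct slab_state st,
                                unsigned long nr_partial,
                                unsigned long min_partial)
{
        if (!st.inuse && nr_partial > min_partial)
                return M_FREE;
        if (st.has_free)
                return M_PARTIAL;
        return M_FULL;
}

int main(void)
{
        struct slab_state empty = { 0, 1 }, half = { 3, 1 }, full = { 8, 0 };

        printf("%d %d %d\n",
               pick_mode(empty, 10, 5),         /* M_FREE */
               pick_mode(half, 10, 5),          /* M_PARTIAL */
               pick_mode(full, 10, 5));         /* M_FULL */
        return 0;
}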
@@ -1796,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1796 void **object; 2051 void **object;
1797 struct page *page; 2052 struct page *page;
1798 unsigned long flags; 2053 unsigned long flags;
2054 struct page new;
2055 unsigned long counters;
1799 2056
1800 local_irq_save(flags); 2057 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT 2058#ifdef CONFIG_PREEMPT
@@ -1814,72 +2071,102 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1814 if (!page) 2071 if (!page)
1815 goto new_slab; 2072 goto new_slab;
1816 2073
1817 slab_lock(page); 2074 if (unlikely(!node_match(c, node))) {
1818 if (unlikely(!node_match(c, node))) 2075 stat(s, ALLOC_NODE_MISMATCH);
1819 goto another_slab; 2076 deactivate_slab(s, c);
2077 goto new_slab;
2078 }
2079
2080 /* must check again c->freelist in case of cpu migration or IRQ */
2081 object = c->freelist;
2082 if (object)
2083 goto load_freelist;
2084
2085 stat(s, ALLOC_SLOWPATH);
2086
2087 do {
2088 object = page->freelist;
2089 counters = page->counters;
2090 new.counters = counters;
2091 VM_BUG_ON(!new.frozen);
2092
2093 /*
2094 * If there is no object left then we use this loop to
2095 * deactivate the slab which is simple since no objects
2096 * are left in the slab and therefore we do not need to
2097 * put the page back onto the partial list.
2098 *
2099 * If there are objects left then we retrieve them
2100 * and use them to refill the per cpu queue.
2101 */
2102
2103 new.inuse = page->objects;
2104 new.frozen = object != NULL;
2105
2106 } while (!__cmpxchg_double_slab(s, page,
2107 object, counters,
2108 NULL, new.counters,
2109 "__slab_alloc"));
2110
2111 if (unlikely(!object)) {
2112 c->page = NULL;
2113 stat(s, DEACTIVATE_BYPASS);
2114 goto new_slab;
2115 }
1820 2116
1821 stat(s, ALLOC_REFILL); 2117 stat(s, ALLOC_REFILL);
1822 2118
1823load_freelist: 2119load_freelist:
1824 object = page->freelist; 2120 VM_BUG_ON(!page->frozen);
1825 if (unlikely(!object))
1826 goto another_slab;
1827 if (kmem_cache_debug(s))
1828 goto debug;
1829
1830 c->freelist = get_freepointer(s, object); 2121 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects;
1832 page->freelist = NULL;
1833
1834 slab_unlock(page);
1835 c->tid = next_tid(c->tid); 2122 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags); 2123 local_irq_restore(flags);
1837 stat(s, ALLOC_SLOWPATH);
1838 return object; 2124 return object;
1839 2125
1840another_slab:
1841 deactivate_slab(s, c);
1842
1843new_slab: 2126new_slab:
1844 page = get_partial(s, gfpflags, node); 2127 page = get_partial(s, gfpflags, node);
1845 if (page) { 2128 if (page) {
1846 stat(s, ALLOC_FROM_PARTIAL); 2129 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page); 2130 object = c->freelist;
1848 c->page = page; 2131
2132 if (kmem_cache_debug(s))
2133 goto debug;
1849 goto load_freelist; 2134 goto load_freelist;
1850 } 2135 }
1851 2136
1852 gfpflags &= gfp_allowed_mask;
1853 if (gfpflags & __GFP_WAIT)
1854 local_irq_enable();
1855
1856 page = new_slab(s, gfpflags, node); 2137 page = new_slab(s, gfpflags, node);
1857 2138
1858 if (gfpflags & __GFP_WAIT)
1859 local_irq_disable();
1860
1861 if (page) { 2139 if (page) {
1862 c = __this_cpu_ptr(s->cpu_slab); 2140 c = __this_cpu_ptr(s->cpu_slab);
1863 stat(s, ALLOC_SLAB);
1864 if (c->page) 2141 if (c->page)
1865 flush_slab(s, c); 2142 flush_slab(s, c);
1866 2143
1867 slab_lock(page); 2144 /*
1868 __SetPageSlubFrozen(page); 2145 * No other reference to the page yet so we can
2146 * muck around with it freely without cmpxchg
2147 */
2148 object = page->freelist;
2149 page->freelist = NULL;
2150 page->inuse = page->objects;
2151
2152 stat(s, ALLOC_SLAB);
1869 c->node = page_to_nid(page); 2153 c->node = page_to_nid(page);
1870 c->page = page; 2154 c->page = page;
2155
2156 if (kmem_cache_debug(s))
2157 goto debug;
1871 goto load_freelist; 2158 goto load_freelist;
1872 } 2159 }
1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2160 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1874 slab_out_of_memory(s, gfpflags, node); 2161 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags); 2162 local_irq_restore(flags);
1876 return NULL; 2163 return NULL;
2164
1877debug: 2165debug:
1878 if (!alloc_debug_processing(s, page, object, addr)) 2166 if (!object || !alloc_debug_processing(s, page, object, addr))
1879 goto another_slab; 2167 goto new_slab;
1880 2168
1881 page->inuse++; 2169 c->freelist = get_freepointer(s, object);
1882 page->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c); 2170 deactivate_slab(s, c);
1884 c->page = NULL; 2171 c->page = NULL;
1885 c->node = NUMA_NO_NODE; 2172 c->node = NUMA_NO_NODE;
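In the reworked slow path above, a single __cmpxchg_double_slab() takes the entire page->freelist for the local CPU (leaving NULL behind and recording whether the page stays frozen) instead of popping objects one by one under slab_lock(). Detached from slabs, that "grab the whole list in one atomic operation" step looks like an atomic exchange of a list head; a sketch of the idea on a plain singly linked list (a toy with one consumer, so the ABA issues the real code handles via the counters word do not arise here):

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int val;
};

static struct node *head;               /* shared freelist head */

static void push(struct node *n)        /* producer side */
{
        struct node *old;

        do {
                old = __atomic_load_n(&head, __ATOMIC_ACQUIRE);
                n->next = old;
        } while (!__atomic_compare_exchange_n(&head, &old, n, false,
                                              __ATOMIC_RELEASE,
                                              __ATOMIC_RELAXED));
}

static struct node *take_all(void)      /* consumer: one atomic op */
{
        return __atomic_exchange_n(&head, NULL, __ATOMIC_ACQ_REL);
}

int main(void)
{
        for (int i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));

                n->val = i;
                push(n);
        }
        for (struct node *n = take_all(); n; ) {
                struct node *next = n->next;

                printf("%d ", n->val);
                free(n);
                n = next;
        }
        printf("\n");
        return 0;
}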
@@ -2031,52 +2318,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2031{ 2318{
2032 void *prior; 2319 void *prior;
2033 void **object = (void *)x; 2320 void **object = (void *)x;
2034 unsigned long flags; 2321 int was_frozen;
2322 int inuse;
2323 struct page new;
2324 unsigned long counters;
2325 struct kmem_cache_node *n = NULL;
2326 unsigned long uninitialized_var(flags);
2035 2327
2036 local_irq_save(flags);
2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH); 2328 stat(s, FREE_SLOWPATH);
2039 2329
2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2330 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2041 goto out_unlock; 2331 return;
2042 2332
2043 prior = page->freelist; 2333 do {
2044 set_freepointer(s, object, prior); 2334 prior = page->freelist;
2045 page->freelist = object; 2335 counters = page->counters;
2046 page->inuse--; 2336 set_freepointer(s, object, prior);
2337 new.counters = counters;
2338 was_frozen = new.frozen;
2339 new.inuse--;
2340 if ((!new.inuse || !prior) && !was_frozen && !n) {
2341 n = get_node(s, page_to_nid(page));
2342 /*
2343 * Speculatively acquire the list_lock.
2344 * If the cmpxchg does not succeed then we may
2345 * drop the list_lock without any processing.
2346 *
2347 * Otherwise the list_lock will synchronize with
2348 * other processors updating the list of slabs.
2349 */
2350 spin_lock_irqsave(&n->list_lock, flags);
2351 }
2352 inuse = new.inuse;
2047 2353
2048 if (unlikely(PageSlubFrozen(page))) { 2354 } while (!cmpxchg_double_slab(s, page,
2049 stat(s, FREE_FROZEN); 2355 prior, counters,
2050 goto out_unlock; 2356 object, new.counters,
2051 } 2357 "__slab_free"));
2052 2358
2053 if (unlikely(!page->inuse)) 2359 if (likely(!n)) {
2054 goto slab_empty; 2360 /*
2361 * The list lock was not taken therefore no list
2362 * activity can be necessary.
2363 */
2364 if (was_frozen)
2365 stat(s, FREE_FROZEN);
2366 return;
2367 }
2055 2368
2056 /* 2369 /*
2057 * Objects left in the slab. If it was not on the partial list before 2370 * was_frozen may have been set after we acquired the list_lock in
2058 * then add it. 2371 * an earlier loop. So we need to check it here again.
2059 */ 2372 */
2060 if (unlikely(!prior)) { 2373 if (was_frozen)
2061 add_partial(get_node(s, page_to_nid(page)), page, 1); 2374 stat(s, FREE_FROZEN);
2062 stat(s, FREE_ADD_PARTIAL); 2375 else {
2063 } 2376 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2377 goto slab_empty;
2064 2378
2065out_unlock: 2379 /*
2066 slab_unlock(page); 2380 * Objects left in the slab. If it was not on the partial list before
2067 local_irq_restore(flags); 2381 * then add it.
2382 */
2383 if (unlikely(!prior)) {
2384 remove_full(s, page);
2385 add_partial(n, page, 1);
2386 stat(s, FREE_ADD_PARTIAL);
2387 }
2388 }
2389 spin_unlock_irqrestore(&n->list_lock, flags);
2068 return; 2390 return;
2069 2391
2070slab_empty: 2392slab_empty:
2071 if (prior) { 2393 if (prior) {
2072 /* 2394 /*
2073 * Slab still on the partial list. 2395 * Slab on the partial list.
2074 */ 2396 */
2075 remove_partial(s, page); 2397 remove_partial(n, page);
2076 stat(s, FREE_REMOVE_PARTIAL); 2398 stat(s, FREE_REMOVE_PARTIAL);
2077 } 2399 } else
2078 slab_unlock(page); 2400 /* Slab must be on the full list */
2079 local_irq_restore(flags); 2401 remove_full(s, page);
2402
2403 spin_unlock_irqrestore(&n->list_lock, flags);
2080 stat(s, FREE_SLAB); 2404 stat(s, FREE_SLAB);
2081 discard_slab(s, page); 2405 discard_slab(s, page);
2082} 2406}
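__slab_free() above now takes n->list_lock only speculatively, and only when the cheap test on the recomputed counters says the free might need list work (the page becomes empty or gains its first free object) and the page is not frozen; if the cmpxchg then shows no list manipulation is required, the lock is released without touching any list. A sketch of that pattern, take an expensive lock up front only when a cheap predicate says the slow path may be needed and otherwise stay lock-free, with invented names (compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static long counter;                    /* updated lock-free */

/* Cheap predicate: might this update need the locked slow path? */
static bool may_need_list_work(long oldval, long delta)
{
        return oldval + delta == 0;     /* "slab becomes empty" analogue */
}

static void update(long delta)
{
        bool locked = false;
        long old = __atomic_load_n(&counter, __ATOMIC_RELAXED);

        do {
                if (may_need_list_work(old, delta) && !locked) {
                        pthread_mutex_lock(&list_lock);
                        locked = true;  /* speculative: may turn out unneeded */
                }
                /* on failure the CAS refreshes 'old' and we re-decide */
        } while (!__atomic_compare_exchange_n(&counter, &old, old + delta,
                                              false, __ATOMIC_ACQ_REL,
                                              __ATOMIC_RELAXED));

        if (locked) {
                /* ... list manipulation would happen here ... */
                pthread_mutex_unlock(&list_lock);
        }
}

int main(void)
{
        update(5);
        update(-5);                     /* takes the lock speculatively */
        printf("counter=%ld\n", counter);
        return 0;
}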
@@ -2350,7 +2674,6 @@ static void early_kmem_cache_node_alloc(int node)
2350{ 2674{
2351 struct page *page; 2675 struct page *page;
2352 struct kmem_cache_node *n; 2676 struct kmem_cache_node *n;
2353 unsigned long flags;
2354 2677
2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2678 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2356 2679
@@ -2368,6 +2691,7 @@ static void early_kmem_cache_node_alloc(int node)
2368 BUG_ON(!n); 2691 BUG_ON(!n);
2369 page->freelist = get_freepointer(kmem_cache_node, n); 2692 page->freelist = get_freepointer(kmem_cache_node, n);
2370 page->inuse++; 2693 page->inuse++;
2694 page->frozen = 0;
2371 kmem_cache_node->node[node] = n; 2695 kmem_cache_node->node[node] = n;
2372#ifdef CONFIG_SLUB_DEBUG 2696#ifdef CONFIG_SLUB_DEBUG
2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2697 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2376,14 +2700,7 @@ static void early_kmem_cache_node_alloc(int node)
2376 init_kmem_cache_node(n, kmem_cache_node); 2700 init_kmem_cache_node(n, kmem_cache_node);
2377 inc_slabs_node(kmem_cache_node, node, page->objects); 2701 inc_slabs_node(kmem_cache_node, node, page->objects);
2378 2702
2379 /*
2380 * lockdep requires consistent irq usage for each lock
2381 * so even though there cannot be a race this early in
2382 * the boot sequence, we still disable irqs.
2383 */
2384 local_irq_save(flags);
2385 add_partial(n, page, 0); 2703 add_partial(n, page, 0);
2386 local_irq_restore(flags);
2387} 2704}
2388 2705
2389static void free_kmem_cache_nodes(struct kmem_cache *s) 2706static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2589,6 +2906,12 @@ static int kmem_cache_open(struct kmem_cache *s,
2589 } 2906 }
2590 } 2907 }
2591 2908
2909#ifdef CONFIG_CMPXCHG_DOUBLE
2910 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2911 /* Enable fast mode */
2912 s->flags |= __CMPXCHG_DOUBLE;
2913#endif
2914
2592 /* 2915 /*
2593 * The larger the object size is, the more pages we want on the partial 2916 * The larger the object size is, the more pages we want on the partial
2594 * list to avoid pounding the page allocator excessively. 2917 * list to avoid pounding the page allocator excessively.
@@ -2661,7 +2984,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2661 spin_lock_irqsave(&n->list_lock, flags); 2984 spin_lock_irqsave(&n->list_lock, flags);
2662 list_for_each_entry_safe(page, h, &n->partial, lru) { 2985 list_for_each_entry_safe(page, h, &n->partial, lru) {
2663 if (!page->inuse) { 2986 if (!page->inuse) {
2664 __remove_partial(n, page); 2987 remove_partial(n, page);
2665 discard_slab(s, page); 2988 discard_slab(s, page);
2666 } else { 2989 } else {
2667 list_slab_objects(s, page, 2990 list_slab_objects(s, page,
@@ -2928,6 +3251,42 @@ size_t ksize(const void *object)
2928} 3251}
2929EXPORT_SYMBOL(ksize); 3252EXPORT_SYMBOL(ksize);
2930 3253
3254#ifdef CONFIG_SLUB_DEBUG
3255bool verify_mem_not_deleted(const void *x)
3256{
3257 struct page *page;
3258 void *object = (void *)x;
3259 unsigned long flags;
3260 bool rv;
3261
3262 if (unlikely(ZERO_OR_NULL_PTR(x)))
3263 return false;
3264
3265 local_irq_save(flags);
3266
3267 page = virt_to_head_page(x);
3268 if (unlikely(!PageSlab(page))) {
3269 /* maybe it was from stack? */
3270 rv = true;
3271 goto out_unlock;
3272 }
3273
3274 slab_lock(page);
3275 if (on_freelist(page->slab, page, object)) {
3276 object_err(page->slab, page, object, "Object is on free-list");
3277 rv = false;
3278 } else {
3279 rv = true;
3280 }
3281 slab_unlock(page);
3282
3283out_unlock:
3284 local_irq_restore(flags);
3285 return rv;
3286}
3287EXPORT_SYMBOL(verify_mem_not_deleted);
3288#endif
3289
2931void kfree(const void *x) 3290void kfree(const void *x)
2932{ 3291{
2933 struct page *page; 3292 struct page *page;
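verify_mem_not_deleted(), added above, returns false when the pointer refers to an object currently sitting on a SLUB freelist, and is only built under CONFIG_SLUB_DEBUG. A hypothetical caller might use it as a sanity check before dereferencing a pointer whose lifetime is in doubt; in the sketch below struct my_record and read_record() are invented for the example, and this is a kernel-context fragment rather than standalone code:

struct my_record {              /* hypothetical user of the check */
        int value;
};

static int read_record(struct my_record *rec)
{
        if (!verify_mem_not_deleted(rec))       /* object is on a freelist */
                return -EINVAL;
        return rec->value;
}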
@@ -2993,14 +3352,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2993 * list_lock. page->inuse here is the upper limit. 3352 * list_lock. page->inuse here is the upper limit.
2994 */ 3353 */
2995 list_for_each_entry_safe(page, t, &n->partial, lru) { 3354 list_for_each_entry_safe(page, t, &n->partial, lru) {
2996 if (!page->inuse && slab_trylock(page)) { 3355 if (!page->inuse) {
2997 /* 3356 remove_partial(n, page);
2998 * Must hold slab lock here because slab_free
2999 * may have freed the last object and be
3000 * waiting to release the slab.
3001 */
3002 __remove_partial(n, page);
3003 slab_unlock(page);
3004 discard_slab(s, page); 3357 discard_slab(s, page);
3005 } else { 3358 } else {
3006 list_move(&page->lru, 3359 list_move(&page->lru,
@@ -3588,12 +3941,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3588static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3941static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3589 unsigned long *map) 3942 unsigned long *map)
3590{ 3943{
3591 if (slab_trylock(page)) { 3944 slab_lock(page);
3592 validate_slab(s, page, map); 3945 validate_slab(s, page, map);
3593 slab_unlock(page); 3946 slab_unlock(page);
3594 } else
3595 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3596 s->name, page);
3597} 3947}
3598 3948
3599static int validate_slab_node(struct kmem_cache *s, 3949static int validate_slab_node(struct kmem_cache *s,
@@ -4058,7 +4408,7 @@ static int any_slab_objects(struct kmem_cache *s)
4058#endif 4408#endif
4059 4409
4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4410#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4061#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4411#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4062 4412
4063struct slab_attribute { 4413struct slab_attribute {
4064 struct attribute attr; 4414 struct attribute attr;
@@ -4241,8 +4591,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4241 const char *buf, size_t length) 4591 const char *buf, size_t length)
4242{ 4592{
4243 s->flags &= ~SLAB_DEBUG_FREE; 4593 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1') 4594 if (buf[0] == '1') {
4595 s->flags &= ~__CMPXCHG_DOUBLE;
4245 s->flags |= SLAB_DEBUG_FREE; 4596 s->flags |= SLAB_DEBUG_FREE;
4597 }
4246 return length; 4598 return length;
4247} 4599}
4248SLAB_ATTR(sanity_checks); 4600SLAB_ATTR(sanity_checks);
@@ -4256,8 +4608,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length) 4608 size_t length)
4257{ 4609{
4258 s->flags &= ~SLAB_TRACE; 4610 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1') 4611 if (buf[0] == '1') {
4612 s->flags &= ~__CMPXCHG_DOUBLE;
4260 s->flags |= SLAB_TRACE; 4613 s->flags |= SLAB_TRACE;
4614 }
4261 return length; 4615 return length;
4262} 4616}
4263SLAB_ATTR(trace); 4617SLAB_ATTR(trace);
@@ -4274,8 +4628,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4274 return -EBUSY; 4628 return -EBUSY;
4275 4629
4276 s->flags &= ~SLAB_RED_ZONE; 4630 s->flags &= ~SLAB_RED_ZONE;
4277 if (buf[0] == '1') 4631 if (buf[0] == '1') {
4632 s->flags &= ~__CMPXCHG_DOUBLE;
4278 s->flags |= SLAB_RED_ZONE; 4633 s->flags |= SLAB_RED_ZONE;
4634 }
4279 calculate_sizes(s, -1); 4635 calculate_sizes(s, -1);
4280 return length; 4636 return length;
4281} 4637}
@@ -4293,8 +4649,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4293 return -EBUSY; 4649 return -EBUSY;
4294 4650
4295 s->flags &= ~SLAB_POISON; 4651 s->flags &= ~SLAB_POISON;
4296 if (buf[0] == '1') 4652 if (buf[0] == '1') {
4653 s->flags &= ~__CMPXCHG_DOUBLE;
4297 s->flags |= SLAB_POISON; 4654 s->flags |= SLAB_POISON;
4655 }
4298 calculate_sizes(s, -1); 4656 calculate_sizes(s, -1);
4299 return length; 4657 return length;
4300} 4658}
@@ -4312,8 +4670,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4312 return -EBUSY; 4670 return -EBUSY;
4313 4671
4314 s->flags &= ~SLAB_STORE_USER; 4672 s->flags &= ~SLAB_STORE_USER;
4315 if (buf[0] == '1') 4673 if (buf[0] == '1') {
4674 s->flags &= ~__CMPXCHG_DOUBLE;
4316 s->flags |= SLAB_STORE_USER; 4675 s->flags |= SLAB_STORE_USER;
4676 }
4317 calculate_sizes(s, -1); 4677 calculate_sizes(s, -1);
4318 return length; 4678 return length;
4319} 4679}
@@ -4478,6 +4838,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4478STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4838STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4479STAT_ATTR(ALLOC_SLAB, alloc_slab); 4839STAT_ATTR(ALLOC_SLAB, alloc_slab);
4480STAT_ATTR(ALLOC_REFILL, alloc_refill); 4840STAT_ATTR(ALLOC_REFILL, alloc_refill);
4841STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4481STAT_ATTR(FREE_SLAB, free_slab); 4842STAT_ATTR(FREE_SLAB, free_slab);
4482STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4843STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4483STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4844STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4485,7 +4846,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4485STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4846STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4486STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4847STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4487STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4848STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4849STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4488STAT_ATTR(ORDER_FALLBACK, order_fallback); 4850STAT_ATTR(ORDER_FALLBACK, order_fallback);
4851STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4852STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4489#endif 4853#endif
4490 4854
4491static struct attribute *slab_attrs[] = { 4855static struct attribute *slab_attrs[] = {
@@ -4535,6 +4899,7 @@ static struct attribute *slab_attrs[] = {
4535 &alloc_from_partial_attr.attr, 4899 &alloc_from_partial_attr.attr,
4536 &alloc_slab_attr.attr, 4900 &alloc_slab_attr.attr,
4537 &alloc_refill_attr.attr, 4901 &alloc_refill_attr.attr,
4902 &alloc_node_mismatch_attr.attr,
4538 &free_slab_attr.attr, 4903 &free_slab_attr.attr,
4539 &cpuslab_flush_attr.attr, 4904 &cpuslab_flush_attr.attr,
4540 &deactivate_full_attr.attr, 4905 &deactivate_full_attr.attr,
@@ -4542,7 +4907,10 @@ static struct attribute *slab_attrs[] = {
4542 &deactivate_to_head_attr.attr, 4907 &deactivate_to_head_attr.attr,
4543 &deactivate_to_tail_attr.attr, 4908 &deactivate_to_tail_attr.attr,
4544 &deactivate_remote_frees_attr.attr, 4909 &deactivate_remote_frees_attr.attr,
4910 &deactivate_bypass_attr.attr,
4545 &order_fallback_attr.attr, 4911 &order_fallback_attr.attr,
4912 &cmpxchg_double_fail_attr.attr,
4913 &cmpxchg_double_cpu_fail_attr.attr,
4546#endif 4914#endif
4547#ifdef CONFIG_FAILSLAB 4915#ifdef CONFIG_FAILSLAB
4548 &failslab_attr.attr, 4916 &failslab_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a..858e1dff9b2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; 40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41#endif 41#endif
42 42
43int page_to_nid(struct page *page) 43int page_to_nid(const struct page *page)
44{ 44{
45 return section_to_node_table[page_to_section(page)]; 45 return section_to_node_table[page_to_section(page)];
46} 46}
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b..87627f181c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
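Both the simplified put_compound_page() and the new __get_page_tail() above follow the same shape: pin the presumed head with get_page_unless_zero(), take compound_lock_irqsave(), then re-check PageTail() because a concurrent __split_huge_page_refcount() may have changed the page's role in the meantime, and back out if it did. Reduced to its bones that is a "pin, lock, revalidate" pattern; a userspace sketch with a mutex standing in for the compound lock and an invented flag standing in for PageTail():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;   /* stands in for the compound lock */
        int refcount;
        bool still_tail;        /* condition a concurrent splitter may clear */
};

static bool pin_unless_zero(struct obj *o)
{
        int r = __atomic_load_n(&o->refcount, __ATOMIC_RELAXED);

        while (r != 0)
                if (__atomic_compare_exchange_n(&o->refcount, &r, r + 1,
                                                false, __ATOMIC_ACQ_REL,
                                                __ATOMIC_RELAXED))
                        return true;
        return false;
}

static void unpin(struct obj *o)
{
        __atomic_sub_fetch(&o->refcount, 1, __ATOMIC_ACQ_REL);
}

/* Pin, lock, then revalidate the condition; undo the pin on failure. */
static bool get_if_still_tail(struct obj *o)
{
        bool got = false;

        if (!pin_unless_zero(o))
                return false;
        pthread_mutex_lock(&o->lock);
        if (o->still_tail)
                got = true;             /* safe: the splitter is excluded now */
        pthread_mutex_unlock(&o->lock);
        if (!got)
                unpin(o);
        return got;
}

int main(void)
{
        static struct obj o = { PTHREAD_MUTEX_INITIALIZER, 1, true };

        printf("%d\n", get_if_still_tail(&o));  /* prints 1 */
        return 0;
}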
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb..17bc224bce6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
1681} 1681}
1682 1682
1683#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait) 1684static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{ 1685{
1691 struct proc_swaps *s = file->private_data; 1686 struct seq_file *seq = file->private_data;
1692 1687
1693 poll_wait(file, &proc_poll_wait, wait); 1688 poll_wait(file, &proc_poll_wait, wait);
1694 1689
1695 if (s->event != atomic_read(&proc_poll_event)) { 1690 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event); 1691 seq->poll_event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 1692 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 } 1693 }
1699 1694
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
1783 1778
1784static int swaps_open(struct inode *inode, struct file *file) 1779static int swaps_open(struct inode *inode, struct file *file)
1785{ 1780{
1786 struct proc_swaps *s; 1781 struct seq_file *seq;
1787 int ret; 1782 int ret;
1788 1783
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op); 1784 ret = seq_open(file, &swaps_op);
1796 if (ret) { 1785 if (ret)
1797 kfree(s);
1798 return ret; 1786 return ret;
1799 }
1800 1787
1801 s->seq.private = s; 1788 seq = file->private_data;
1802 s->event = atomic_read(&proc_poll_event); 1789 seq->poll_event = atomic_read(&proc_poll_event);
1803 return ret; 1790 return 0;
1804} 1791}
1805 1792
1806static const struct file_operations proc_swaps_operations = { 1793static const struct file_operations proc_swaps_operations = {
@@ -1937,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1937 1924
1938 /* 1925 /*
1939 * Find out how many pages are allowed for a single swap 1926 * Find out how many pages are allowed for a single swap
1940 * device. There are two limiting factors: 1) the number of 1927 * device. There are three limiting factors: 1) the number
1941 * bits for the swap offset in the swp_entry_t type and 1928 * of bits for the swap offset in the swp_entry_t type, and
1942 * bits for the swap offset in the swp_entry_t type and 1928 * of bits for the swap offset in the swp_entry_t type, and
1943 * the different architectures. In order to find the 1930 * the different architectures, and 3) the number of free bits
1944 * largest possible bit mask a swap entry with swap type 0 1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0
1945 * and swap offset ~0UL is created, encoded to a swap pte, 1933 * and swap offset ~0UL is created, encoded to a swap pte,
1946 * decoded to a swp_entry_t again and finally the swap 1934 * decoded to a swp_entry_t again, and finally the swap
1947 * offset is extracted. This will mask all the bits from 1935 * offset is extracted. This will mask all the bits from
1948 * the initial ~0UL mask that can't be encoded in either 1936 * the initial ~0UL mask that can't be encoded in either
1949 * the swp_entry_t or the architecture definition of a 1937 * the swp_entry_t or the architecture definition of a
1950 * swap pte. 1938 * swap pte. Then the same is done for a radix_tree entry.
1951 */ 1939 */
1952 maxpages = swp_offset(pte_to_swp_entry( 1940 maxpages = swp_offset(pte_to_swp_entry(
1953 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1941 swp_entry_to_pte(swp_entry(0, ~0UL))));
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1954 if (maxpages > swap_header->info.last_page) { 1945 if (maxpages > swap_header->info.last_page) {
1955 maxpages = swap_header->info.last_page + 1; 1946 maxpages = swap_header->info.last_page + 1;
1956 /* p->max is an unsigned int: don't overflow it */ 1947 /* p->max is an unsigned int: don't overflow it */
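The reworked calculation above finds the largest usable swap offset by round-tripping an all-ones offset through each representation in turn, first the architecture's swap pte and then the radix-tree exceptional entry, and keeping whatever bits survive. A small standalone C sketch of that idea follows; the 37- and 30-bit widths and the helper names are made up purely for illustration.

/* Toy model of the maxpages calculation: encode an all-ones offset
 * through every representation, decode it again, and the survivor is
 * the largest offset all of them can hold.  The 37/30-bit widths and
 * helper names are illustrative only.
 */
#include <stdio.h>

#define PTE_OFFSET_BITS   37UL   /* pretend arch swap pte limit */
#define RADIX_OFFSET_BITS 30UL   /* pretend radix_tree entry limit */

static unsigned long pte_round_trip(unsigned long offset)
{
	return offset & ((1UL << PTE_OFFSET_BITS) - 1);
}

static unsigned long radix_round_trip(unsigned long offset)
{
	return offset & ((1UL << RADIX_OFFSET_BITS) - 1);
}

int main(void)
{
	unsigned long maxpages;

	/* like swp_offset(pte_to_swp_entry(swp_entry_to_pte(...))) */
	maxpages = pte_round_trip(~0UL);
	/* then the same through the radix_tree representation, +1 */
	maxpages = radix_round_trip(maxpages) + 1;

	printf("maxpages = %lu\n", maxpages);
	return 0;
}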
diff --git a/mm/thrash.c b/mm/thrash.c
index fabf2d0f516..e53f7d02c17 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
6 * Released under the GPL, see the file COPYING for details. 6 * Released under the GPL, see the file COPYING for details.
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 * 10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> 11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token: 12 * Improved algorithm to pass token:
@@ -30,8 +30,6 @@
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32struct mem_cgroup *swap_token_memcg;
33static unsigned int global_faults;
34static unsigned int last_aging;
35 33
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm)
55{ 53{
56 int current_interval; 54 int current_interval;
57 unsigned int old_prio = mm->token_priority; 55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58 58
59 global_faults++; 59 global_faults++;
60 60
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm)
67 if (!swap_token_mm) 67 if (!swap_token_mm)
68 goto replace_token; 68 goto replace_token;
69 69
70 /*
71 * Usually priority aging is unnecessary, because long fault
72 * intervals already make the priority drop quickly. There is one
73 * exception: a sleeping token owner never produces long-interval
74 * faults, so it needs an explicit priority aging mechanism instead.
75 * The requirements for priority aging are:
76 * 1) The aging interval must be reasonably long. Too short an
77 *    interval loses the swap token quickly and hurts performance.
78 * 2) The swap token owner task must still get priority aging even
79 *    while it is asleep.
80 */
70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { 81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2; 82 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults; 83 last_aging = global_faults;
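With the change above, global_faults and last_aging become function-local statics of grab_swap_token(), and the new comment spells out the aging rule: once more than TOKEN_AGING_INTERVAL faults have occurred since the last aging, the current owner's priority is halved, even if that owner is asleep and never produces long fault intervals of its own. The standalone C sketch below replays just that counter logic; the interval value and function names are invented.

/* Toy replay of the priority-aging rule: function-local static
 * counters, and the owner's priority is halved once per aging
 * interval regardless of whether the owner itself is faulting.
 * All names and numbers are illustrative.
 */
#include <stdio.h>

#define TOKEN_AGING_INTERVAL 100	/* made-up value */

static unsigned int owner_priority = 16;

static void grab(void)
{
	static unsigned int global_faults;
	static unsigned int last_aging;

	global_faults++;

	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
		owner_priority /= 2;		/* age the sleeping owner */
		last_aging = global_faults;
	}
}

int main(void)
{
	for (int i = 0; i < 500; i++)
		grab();
	/* 500 faults => four aging steps: 16 -> 8 -> 4 -> 2 -> 1 */
	printf("owner priority after 500 faults: %u\n", owner_priority);
	return 0;
}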
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad..b40ac6d4e86 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
199 * The first pass will remove most pages, so the search cost of the second pass 199 * The first pass will remove most pages, so the search cost of the second pass
200 * is low. 200 * is low.
201 * 201 *
202 * When looking at page->index outside the page lock we need to be careful to
203 * copy it into a local to avoid races (it could change at any time).
204 *
205 * We pass down the cache-hot hint to the page freeing code. Even if the 202 * We pass down the cache-hot hint to the page freeing code. Even if the
206 * mapping is large, it is probably the case that the final pages are the most 203 * mapping is large, it is probably the case that the final pages are the most
207 * recently touched, and freeing happens in ascending file offset order. 204 * recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
210 loff_t lstart, loff_t lend) 207 loff_t lstart, loff_t lend)
211{ 208{
212 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 209 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
213 pgoff_t end;
214 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 210 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
215 struct pagevec pvec; 211 struct pagevec pvec;
216 pgoff_t next; 212 pgoff_t index;
213 pgoff_t end;
217 int i; 214 int i;
218 215
219 cleancache_flush_inode(mapping); 216 cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
224 end = (lend >> PAGE_CACHE_SHIFT); 221 end = (lend >> PAGE_CACHE_SHIFT);
225 222
226 pagevec_init(&pvec, 0); 223 pagevec_init(&pvec, 0);
227 next = start; 224 index = start;
228 while (next <= end && 225 while (index <= end && pagevec_lookup(&pvec, mapping, index,
229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
230 mem_cgroup_uncharge_start(); 227 mem_cgroup_uncharge_start();
231 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
233 pgoff_t page_index = page->index;
234 230
235 if (page_index > end) { 231 /* We rely upon deletion not changing page->index */
236 next = page_index; 232 index = page->index;
233 if (index > end)
237 break; 234 break;
238 }
239 235
240 if (page_index > next)
241 next = page_index;
242 next++;
243 if (!trylock_page(page)) 236 if (!trylock_page(page))
244 continue; 237 continue;
238 WARN_ON(page->index != index);
245 if (PageWriteback(page)) { 239 if (PageWriteback(page)) {
246 unlock_page(page); 240 unlock_page(page);
247 continue; 241 continue;
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
252 pagevec_release(&pvec); 246 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end(); 247 mem_cgroup_uncharge_end();
254 cond_resched(); 248 cond_resched();
249 index++;
255 } 250 }
256 251
257 if (partial) { 252 if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
264 } 259 }
265 } 260 }
266 261
267 next = start; 262 index = start;
268 for ( ; ; ) { 263 for ( ; ; ) {
269 cond_resched(); 264 cond_resched();
270 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 265 if (!pagevec_lookup(&pvec, mapping, index,
271 if (next == start) 266 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
267 if (index == start)
272 break; 268 break;
273 next = start; 269 index = start;
274 continue; 270 continue;
275 } 271 }
276 if (pvec.pages[0]->index > end) { 272 if (index == start && pvec.pages[0]->index > end) {
277 pagevec_release(&pvec); 273 pagevec_release(&pvec);
278 break; 274 break;
279 } 275 }
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
282 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
283 279
284 if (page->index > end) 280 /* We rely upon deletion not changing page->index */
281 index = page->index;
282 if (index > end)
285 break; 283 break;
284
286 lock_page(page); 285 lock_page(page);
286 WARN_ON(page->index != index);
287 wait_on_page_writeback(page); 287 wait_on_page_writeback(page);
288 truncate_inode_page(mapping, page); 288 truncate_inode_page(mapping, page);
289 if (page->index > next)
290 next = page->index;
291 next++;
292 unlock_page(page); 289 unlock_page(page);
293 } 290 }
294 pagevec_release(&pvec); 291 pagevec_release(&pvec);
295 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++;
296 } 294 }
297 cleancache_flush_inode(mapping); 295 cleancache_flush_inode(mapping);
298} 296}
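The rewritten loops above share one shape: trust that deletion never changes page->index, re-read index from every page in the pagevec, stop as soon as it passes end, clamp each lookup to min(end - index, PAGEVEC_SIZE - 1) + 1 entries, and advance index by one only after the whole pagevec has been processed. The standalone C sketch below simulates that iteration over a sparse, sorted set of indices; lookup_batch(), the batch size and the index values are invented for illustration.

/* Toy simulation of the new truncate/invalidate loop shape: re-read the
 * index from each returned entry, break once it passes 'end', and bump
 * the search index only after the whole batch is done.  lookup_batch()
 * and all sizes are invented for illustration.
 */
#include <stdio.h>
#include <stddef.h>

#define BATCH 4				/* stands in for PAGEVEC_SIZE */

/* a sparse, sorted set of "page indices" present in the mapping */
static const unsigned long present[] = { 0, 1, 5, 6, 7, 30, 31, 90 };
static const size_t npresent = sizeof(present) / sizeof(present[0]);

/* return up to 'max' indices >= start, like pagevec_lookup() */
static size_t lookup_batch(unsigned long start, unsigned long *out, size_t max)
{
	size_t n = 0;

	for (size_t i = 0; i < npresent && n < max; i++)
		if (present[i] >= start)
			out[n++] = present[i];
	return n;
}

int main(void)
{
	unsigned long start = 1, end = 31;	/* inclusive range */
	unsigned long batch[BATCH];
	unsigned long index = start;
	size_t n;

	while (index <= end &&
	       (n = lookup_batch(index, batch,
			(end - index < BATCH - 1 ? end - index : BATCH - 1) + 1))) {
		for (size_t i = 0; i < n; i++) {
			index = batch[i];	/* re-read, like page->index */
			if (index > end)
				break;
			printf("process index %lu\n", index);
		}
		index++;			/* only after the whole batch */
	}
	return 0;
}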
@@ -333,35 +331,34 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
333 pgoff_t start, pgoff_t end) 331 pgoff_t start, pgoff_t end)
334{ 332{
335 struct pagevec pvec; 333 struct pagevec pvec;
336 pgoff_t next = start; 334 pgoff_t index = start;
337 unsigned long ret; 335 unsigned long ret;
338 unsigned long count = 0; 336 unsigned long count = 0;
339 int i; 337 int i;
340 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
341 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
342 while (next <= end && 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
343 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
344 mem_cgroup_uncharge_start(); 350 mem_cgroup_uncharge_start();
345 for (i = 0; i < pagevec_count(&pvec); i++) { 351 for (i = 0; i < pagevec_count(&pvec); i++) {
346 struct page *page = pvec.pages[i]; 352 struct page *page = pvec.pages[i];
347 pgoff_t index;
348 int lock_failed;
349 353
350 lock_failed = !trylock_page(page); 354 /* We rely upon deletion not changing page->index */
351
352 /*
353 * We really shouldn't be looking at the ->index of an
354 * unlocked page. But we're not allowed to lock these
355 * pages. So we rely upon nobody altering the ->index
356 * of this (pinned-by-us) page.
357 */
358 index = page->index; 355 index = page->index;
359 if (index > next) 356 if (index > end)
360 next = index; 357 break;
361 next++;
362 if (lock_failed)
363 continue;
364 358
359 if (!trylock_page(page))
360 continue;
361 WARN_ON(page->index != index);
365 ret = invalidate_inode_page(page); 362 ret = invalidate_inode_page(page);
366 unlock_page(page); 363 unlock_page(page);
367 /* 364 /*
@@ -371,12 +368,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
371 if (!ret) 368 if (!ret)
372 deactivate_page(page); 369 deactivate_page(page);
373 count += ret; 370 count += ret;
374 if (next > end)
375 break;
376 } 371 }
377 pagevec_release(&pvec); 372 pagevec_release(&pvec);
378 mem_cgroup_uncharge_end(); 373 mem_cgroup_uncharge_end();
379 cond_resched(); 374 cond_resched();
375 index++;
380 } 376 }
381 return count; 377 return count;
382} 378}
@@ -442,37 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 pgoff_t start, pgoff_t end) 438 pgoff_t start, pgoff_t end)
443{ 439{
444 struct pagevec pvec; 440 struct pagevec pvec;
445 pgoff_t next; 441 pgoff_t index;
446 int i; 442 int i;
447 int ret = 0; 443 int ret = 0;
448 int ret2 = 0; 444 int ret2 = 0;
449 int did_range_unmap = 0; 445 int did_range_unmap = 0;
450 int wrapped = 0;
451 446
452 cleancache_flush_inode(mapping); 447 cleancache_flush_inode(mapping);
453 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
454 next = start; 449 index = start;
455 while (next <= end && !wrapped && 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
456 pagevec_lookup(&pvec, mapping, next, 451 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
457 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
458 mem_cgroup_uncharge_start(); 452 mem_cgroup_uncharge_start();
459 for (i = 0; i < pagevec_count(&pvec); i++) { 453 for (i = 0; i < pagevec_count(&pvec); i++) {
460 struct page *page = pvec.pages[i]; 454 struct page *page = pvec.pages[i];
461 pgoff_t page_index; 455
456 /* We rely upon deletion not changing page->index */
457 index = page->index;
458 if (index > end)
459 break;
462 460
463 lock_page(page); 461 lock_page(page);
462 WARN_ON(page->index != index);
464 if (page->mapping != mapping) { 463 if (page->mapping != mapping) {
465 unlock_page(page); 464 unlock_page(page);
466 continue; 465 continue;
467 } 466 }
468 page_index = page->index;
469 next = page_index + 1;
470 if (next == 0)
471 wrapped = 1;
472 if (page_index > end) {
473 unlock_page(page);
474 break;
475 }
476 wait_on_page_writeback(page); 467 wait_on_page_writeback(page);
477 if (page_mapped(page)) { 468 if (page_mapped(page)) {
478 if (!did_range_unmap) { 469 if (!did_range_unmap) {
@@ -480,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
480 * Zap the rest of the file in one hit. 471 * Zap the rest of the file in one hit.
481 */ 472 */
482 unmap_mapping_range(mapping, 473 unmap_mapping_range(mapping,
483 (loff_t)page_index<<PAGE_CACHE_SHIFT, 474 (loff_t)index << PAGE_CACHE_SHIFT,
484 (loff_t)(end - page_index + 1) 475 (loff_t)(1 + end - index)
485 << PAGE_CACHE_SHIFT, 476 << PAGE_CACHE_SHIFT,
486 0); 477 0);
487 did_range_unmap = 1; 478 did_range_unmap = 1;
488 } else { 479 } else {
@@ -490,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
490 * Just zap this page 481 * Just zap this page
491 */ 482 */
492 unmap_mapping_range(mapping, 483 unmap_mapping_range(mapping,
493 (loff_t)page_index<<PAGE_CACHE_SHIFT, 484 (loff_t)index << PAGE_CACHE_SHIFT,
494 PAGE_CACHE_SIZE, 0); 485 PAGE_CACHE_SIZE, 0);
495 } 486 }
496 } 487 }
497 BUG_ON(page_mapped(page)); 488 BUG_ON(page_mapped(page));
@@ -507,6 +498,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
507 pagevec_release(&pvec); 498 pagevec_release(&pvec);
508 mem_cgroup_uncharge_end(); 499 mem_cgroup_uncharge_end();
509 cond_resched(); 500 cond_resched();
501 index++;
510 } 502 }
511 cleancache_flush_inode(mapping); 503 cleancache_flush_inode(mapping);
512 return ret; 504 return ret;
@@ -531,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
531/** 523/**
532 * truncate_pagecache - unmap and remove pagecache that has been truncated 524 * truncate_pagecache - unmap and remove pagecache that has been truncated
533 * @inode: inode 525 * @inode: inode
534 * @old: old file offset 526 * @oldsize: old file size
535 * @new: new file offset 527 * @newsize: new file size
536 * 528 *
537 * inode's new i_size must already be written before truncate_pagecache 529 * inode's new i_size must already be written before truncate_pagecache
538 * is called. 530 * is called.
@@ -544,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
544 * situations such as writepage being called for a page that has already 536 * situations such as writepage being called for a page that has already
545 * had its underlying blocks deallocated. 537 * had its underlying blocks deallocated.
546 */ 538 */
547void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 539void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
548{ 540{
549 struct address_space *mapping = inode->i_mapping; 541 struct address_space *mapping = inode->i_mapping;
542 loff_t holebegin = round_up(newsize, PAGE_SIZE);
550 543
551 /* 544 /*
552 * unmap_mapping_range is called twice, first simply for 545 * unmap_mapping_range is called twice, first simply for
@@ -557,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
557 * truncate_inode_pages finishes, hence the second 550 * truncate_inode_pages finishes, hence the second
558 * unmap_mapping_range call must be made for correctness. 551 * unmap_mapping_range call must be made for correctness.
559 */ 552 */
560 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 553 unmap_mapping_range(mapping, holebegin, 0, 1);
561 truncate_inode_pages(mapping, new); 554 truncate_inode_pages(mapping, newsize);
562 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 555 unmap_mapping_range(mapping, holebegin, 0, 1);
563} 556}
564EXPORT_SYMBOL(truncate_pagecache); 557EXPORT_SYMBOL(truncate_pagecache);
565 558
@@ -589,29 +582,31 @@ EXPORT_SYMBOL(truncate_setsize);
589/** 582/**
590 * vmtruncate - unmap mappings "freed" by truncate() syscall 583 * vmtruncate - unmap mappings "freed" by truncate() syscall
591 * @inode: inode of the file used 584 * @inode: inode of the file used
592 * @offset: file offset to start truncating 585 * @newsize: file offset to start truncating
593 * 586 *
594 * This function is deprecated and truncate_setsize or truncate_pagecache 587 * This function is deprecated and truncate_setsize or truncate_pagecache
595 * should be used instead, together with filesystem specific block truncation. 588 * should be used instead, together with filesystem specific block truncation.
596 */ 589 */
597int vmtruncate(struct inode *inode, loff_t offset) 590int vmtruncate(struct inode *inode, loff_t newsize)
598{ 591{
599 int error; 592 int error;
600 593
601 error = inode_newsize_ok(inode, offset); 594 error = inode_newsize_ok(inode, newsize);
602 if (error) 595 if (error)
603 return error; 596 return error;
604 597
605 truncate_setsize(inode, offset); 598 truncate_setsize(inode, newsize);
606 if (inode->i_op->truncate) 599 if (inode->i_op->truncate)
607 inode->i_op->truncate(inode); 600 inode->i_op->truncate(inode);
608 return 0; 601 return 0;
609} 602}
610EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
611 604
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
613{ 606{
614 struct address_space *mapping = inode->i_mapping; 607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
615 610
616 /* 611 /*
617 * If the underlying filesystem is not going to provide 612 * If the underlying filesystem is not going to provide
@@ -622,12 +617,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
622 return -ENOSYS; 617 return -ENOSYS;
623 618
624 mutex_lock(&inode->i_mutex); 619 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem); 620 inode_dio_wait(inode);
626 unmap_mapping_range(mapping, offset, (end - offset), 1); 621 unmap_mapping_range(mapping, holebegin, holelen, 1);
627 inode->i_op->truncate_range(inode, offset, end); 622 inode->i_op->truncate_range(inode, lstart, lend);
628 /* unmap again to remove racily COWed private pages */ 623 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1); 624 unmap_mapping_range(mapping, holebegin, holelen, 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex); 625 mutex_unlock(&inode->i_mutex);
632 626
633 return 0; 627 return 0;
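Both truncate_pagecache() and vmtruncate_range() above now round the start of the hole up to a page boundary, and the range variant derives the hole length as 1 + lend - holebegin before calling unmap_mapping_range(). The short standalone C sketch below just evaluates that arithmetic for one example range, assuming a 4096-byte page size for illustration.

/* Check the hole arithmetic used above: round the start of the hole up
 * to a page boundary, and derive the byte length of an inclusive
 * [lstart, lend] range from that rounded start.  PAGE_SIZE is assumed
 * to be 4096 here purely for illustration.
 */
#include <stdio.h>

#define PAGE_SIZE 4096L

static long round_up_page(long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	long lstart = 5000, lend = 20479;	/* inclusive byte range */
	long holebegin = round_up_page(lstart);	/* 8192 */
	long holelen = 1 + lend - holebegin;	/* 12288 = 3 pages */

	printf("holebegin=%ld holelen=%ld\n", holebegin, holelen);
	return 0;
}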
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a..3a65d6f7422 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
26#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
@@ -452,13 +452,6 @@ overflow:
452 return ERR_PTR(-EBUSY); 452 return ERR_PTR(-EBUSY);
453} 453}
454 454
455static void rcu_free_va(struct rcu_head *head)
456{
457 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
458
459 kfree(va);
460}
461
462static void __free_vmap_area(struct vmap_area *va) 455static void __free_vmap_area(struct vmap_area *va)
463{ 456{
464 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 457 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
491 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 484 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
492 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 485 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
493 486
494 call_rcu(&va->rcu_head, rcu_free_va); 487 kfree_rcu(va, rcu_head);
495} 488}
496 489
497/* 490/*
@@ -732,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
735#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
736 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
737 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
738 732
739#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
740 734
@@ -837,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 return vb; 831 return vb;
838} 832}
839 833
840static void rcu_free_vb(struct rcu_head *head)
841{
842 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
843
844 kfree(vb);
845}
846
847static void free_vmap_block(struct vmap_block *vb) 834static void free_vmap_block(struct vmap_block *vb)
848{ 835{
849 struct vmap_block *tmp; 836 struct vmap_block *tmp;
@@ -856,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb)
856 BUG_ON(tmp != vb); 843 BUG_ON(tmp != vb);
857 844
858 free_vmap_area_noflush(vb->va); 845 free_vmap_area_noflush(vb->va);
859 call_rcu(&vb->rcu_head, rcu_free_vb); 846 kfree_rcu(vb, rcu_head);
860} 847}
861 848
862static void purge_fragmented_blocks(int cpu) 849static void purge_fragmented_blocks(int cpu)
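The two hunks above drop the hand-written RCU callbacks (rcu_free_va(), rcu_free_vb()) in favour of kfree_rcu(), which frees an object after a grace period without a dedicated callback, provided the object is released with kfree() and embeds a struct rcu_head. Below is a minimal kernel-style before/after sketch of that conversion; struct foo and its release functions are hypothetical, and the fragment is illustrative rather than a standalone program.

/* Kernel-style sketch (not a standalone program): replacing an
 * open-coded call_rcu() callback that only does kfree() with
 * kfree_rcu().  'struct foo' and the release functions are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int payload;
	struct rcu_head rcu_head;	/* needed by both variants */
};

/* before: a callback whose only job is container_of() + kfree() */
static void foo_rcu_free(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu_head);

	kfree(f);
}

static void release_foo_old(struct foo *f)
{
	call_rcu(&f->rcu_head, foo_rcu_free);
}

/* after: kfree_rcu() takes the object and the rcu_head member name */
static void release_foo(struct foo *f)
{
	kfree_rcu(f, rcu_head);
}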
@@ -1266,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1266DEFINE_RWLOCK(vmlist_lock); 1253DEFINE_RWLOCK(vmlist_lock);
1267struct vm_struct *vmlist; 1254struct vm_struct *vmlist;
1268 1255
1269static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1256static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1270 unsigned long flags, void *caller) 1257 unsigned long flags, void *caller)
1271{ 1258{
1272 struct vm_struct *tmp, **p;
1273
1274 vm->flags = flags; 1259 vm->flags = flags;
1275 vm->addr = (void *)va->va_start; 1260 vm->addr = (void *)va->va_start;
1276 vm->size = va->va_end - va->va_start; 1261 vm->size = va->va_end - va->va_start;
1277 vm->caller = caller; 1262 vm->caller = caller;
1278 va->private = vm; 1263 va->private = vm;
1279 va->flags |= VM_VM_AREA; 1264 va->flags |= VM_VM_AREA;
1265}
1266
1267static void insert_vmalloc_vmlist(struct vm_struct *vm)
1268{
1269 struct vm_struct *tmp, **p;
1280 1270
1271 vm->flags &= ~VM_UNLIST;
1281 write_lock(&vmlist_lock); 1272 write_lock(&vmlist_lock);
1282 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1273 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1283 if (tmp->addr >= vm->addr) 1274 if (tmp->addr >= vm->addr)
@@ -1288,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1288 write_unlock(&vmlist_lock); 1279 write_unlock(&vmlist_lock);
1289} 1280}
1290 1281
1282static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller)
1284{
1285 setup_vmalloc_vm(vm, va, flags, caller);
1286 insert_vmalloc_vmlist(vm);
1287}
1288
1291static struct vm_struct *__get_vm_area_node(unsigned long size, 1289static struct vm_struct *__get_vm_area_node(unsigned long size,
1292 unsigned long align, unsigned long flags, unsigned long start, 1290 unsigned long align, unsigned long flags, unsigned long start,
1293 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1291 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1326,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1326 return NULL; 1324 return NULL;
1327 } 1325 }
1328 1326
1329 insert_vmalloc_vm(area, va, flags, caller); 1327 /*
1328 * When this function is called from __vmalloc_node_range,
1329 * we do not add the vm_struct to vmlist here, to avoid
1330 * accessing uninitialized members of vm_struct such as the
1331 * pages and nr_pages fields; they will be set later.
1332 * The VM_UNLIST flag distinguishes such areas from the rest.
1333 */
1334 if (flags & VM_UNLIST)
1335 setup_vmalloc_vm(area, va, flags, caller);
1336 else
1337 insert_vmalloc_vm(area, va, flags, caller);
1338
1330 return area; 1339 return area;
1331} 1340}
1332 1341
@@ -1394,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
1394 va = find_vmap_area((unsigned long)addr); 1403 va = find_vmap_area((unsigned long)addr);
1395 if (va && va->flags & VM_VM_AREA) { 1404 if (va && va->flags & VM_VM_AREA) {
1396 struct vm_struct *vm = va->private; 1405 struct vm_struct *vm = va->private;
1397 struct vm_struct *tmp, **p; 1406
1398 /* 1407 if (!(vm->flags & VM_UNLIST)) {
1399 * remove from list and disallow access to this vm_struct 1408 struct vm_struct *tmp, **p;
1400 * before unmap. (address range confliction is maintained by 1409 /*
1401 * vmap.) 1410 * remove from list and disallow access to
1402 */ 1411 * this vm_struct before unmap. (address range
1403 write_lock(&vmlist_lock); 1412 * confliction is maintained by vmap.)
1404 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1413 */
1405 ; 1414 write_lock(&vmlist_lock);
1406 *p = tmp->next; 1415 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1407 write_unlock(&vmlist_lock); 1416 ;
1417 *p = tmp->next;
1418 write_unlock(&vmlist_lock);
1419 }
1408 1420
1409 vmap_debug_free_range(va->va_start, va->va_end); 1421 vmap_debug_free_range(va->va_start, va->va_end);
1410 free_unmap_vmap_area(va); 1422 free_unmap_vmap_area(va);
@@ -1615,13 +1627,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1627 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1616 return NULL; 1628 return NULL;
1617 1629
1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, 1630 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1619 gfp_mask, caller); 1631 start, end, node, gfp_mask, caller);
1620 1632
1621 if (!area) 1633 if (!area)
1622 return NULL; 1634 return NULL;
1623 1635
1624 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1636 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1637 if (!addr)
1638 return NULL;
1639
1640 /*
1641 * In this function, the newly allocated vm_struct was not added
1642 * to vmlist in __get_vm_area_node(), so it is added here.
1643 */
1644 insert_vmalloc_vmlist(area);
1625 1645
1626 /* 1646 /*
1627 * A ref_count = 3 is needed because the vm_struct and vmap_area 1647 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2153,6 +2173,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2153 return NULL; 2173 return NULL;
2154 } 2174 }
2155 2175
2176 /*
2177 * If the allocated address space is passed to a hypercall
2178 * before being used then we cannot rely on a page fault to
2179 * trigger an update of the page tables. So sync all the page
2180 * tables here.
2181 */
2182 vmalloc_sync_all();
2183
2156 return area; 2184 return area;
2157} 2185}
2158EXPORT_SYMBOL_GPL(alloc_vm_area); 2186EXPORT_SYMBOL_GPL(alloc_vm_area);
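Taken together, the VM_UNLIST changes above implement a publish-after-init discipline: the vm_struct is fully set up first (setup_vmalloc_vm(), then the pages and nr_pages fields in __vmalloc_area_node()) and only afterwards linked onto the globally visible vmlist, while remove_vm_area() skips the list removal for areas that were never published. The standalone C sketch below shows the same discipline for a hypothetical mutex-protected list; all names are invented.

/* Toy version of the publish-after-init pattern behind VM_UNLIST:
 * set every field of the object first, and only then link it onto the
 * globally visible list under the lock.  All names are invented.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct area {
	void *addr;
	size_t nr_pages;
	struct area *next;
};

static struct area *area_list;				/* like vmlist */
static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;

/* phase 1: allocate and fully initialize, nothing global touched yet */
static struct area *setup_area(size_t nr_pages)
{
	struct area *a = malloc(sizeof(*a));

	if (!a)
		return NULL;
	a->nr_pages = nr_pages;
	a->addr = calloc(nr_pages, 4096);		/* "pages" ready */
	a->next = NULL;
	return a;
}

/* phase 2: publish only once every field above is valid */
static void publish_area(struct area *a)
{
	pthread_mutex_lock(&area_lock);
	a->next = area_list;
	area_list = a;
	pthread_mutex_unlock(&area_lock);
}

int main(void)
{
	struct area *a = setup_area(4);

	if (!a || !a->addr)
		return 1;
	publish_area(a);
	printf("published area with %zu pages\n", area_list->nr_pages);
	return 0;
}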
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d036e59d302..b55699cd906 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness;
99
100 int order; 98 int order;
101 99
102 /* 100 /*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 171 struct scan_control *sc, enum lru_list lru)
174{ 172{
175 if (!scanning_global_lru(sc)) 173 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru));
177 176
178 return zone_page_state(zone, NR_LRU_BASE + lru); 177 return zone_page_state(zone, NR_LRU_BASE + lru);
179} 178}
@@ -250,49 +249,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
250 unsigned long long delta; 249 unsigned long long delta;
251 unsigned long total_scan; 250 unsigned long total_scan;
252 unsigned long max_pass; 251 unsigned long max_pass;
252 int shrink_ret = 0;
253 long nr;
254 long new_nr;
255 long batch_size = shrinker->batch ? shrinker->batch
256 : SHRINK_BATCH;
257
258 /*
259 * copy the current shrinker scan count into a local variable
260 * and zero it so that other concurrent shrinker invocations
261 * don't also do this scanning work.
262 */
263 do {
264 nr = shrinker->nr;
265 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
253 266
267 total_scan = nr;
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 268 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 269 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 270 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 271 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 272 total_scan += delta;
259 if (shrinker->nr < 0) { 273 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 274 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 275 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 276 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 277 total_scan = max_pass;
264 } 278 }
265 279
266 /* 280 /*
281 * We need to avoid excessive windup on filesystem shrinkers
282 * due to large numbers of GFP_NOFS allocations causing the
283 * shrinkers to return -1 all the time. This results in a large
284 * nr being built up, so when a shrink that can do some work
285 * comes along, it empties the entire cache due to nr >>>
286 * max_pass. This is bad for sustaining a working set in
287 * memory.
288 *
289 * Hence only allow the shrinker to scan the entire cache when
290 * a large delta change is calculated directly.
291 */
292 if (delta < max_pass / 4)
293 total_scan = min(total_scan, max_pass / 2);
294
295 /*
267 * Avoid risking looping forever due to too large nr value: 296 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 297 * never try to free more than twice the estimate number of
269 * freeable entries. 298 * freeable entries.
270 */ 299 */
271 if (shrinker->nr > max_pass * 2) 300 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 301 total_scan = max_pass * 2;
273 302
274 total_scan = shrinker->nr; 303 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 304 nr_pages_scanned, lru_pages,
305 max_pass, delta, total_scan);
276 306
277 while (total_scan >= SHRINK_BATCH) { 307 while (total_scan >= batch_size) {
278 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 308 int nr_before;
281 309
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 310 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
283 shrink_ret = do_shrinker_shrink(shrinker, shrink, 311 shrink_ret = do_shrinker_shrink(shrinker, shrink,
284 this_scan); 312 batch_size);
285 if (shrink_ret == -1) 313 if (shrink_ret == -1)
286 break; 314 break;
287 if (shrink_ret < nr_before) 315 if (shrink_ret < nr_before)
288 ret += nr_before - shrink_ret; 316 ret += nr_before - shrink_ret;
289 count_vm_events(SLABS_SCANNED, this_scan); 317 count_vm_events(SLABS_SCANNED, batch_size);
290 total_scan -= this_scan; 318 total_scan -= batch_size;
291 319
292 cond_resched(); 320 cond_resched();
293 } 321 }
294 322
295 shrinker->nr += total_scan; 323 /*
324 * move the unused scan count back into the shrinker in a
325 * manner that handles concurrent updates. If we exhausted the
326 * scan, there is no need to do an update.
327 */
328 do {
329 nr = shrinker->nr;
330 new_nr = total_scan + nr;
331 if (total_scan <= 0)
332 break;
333 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
334
335 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 336 }
297 up_read(&shrinker_rwsem); 337 up_read(&shrinker_rwsem);
298out: 338out:
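The new shrink_slab() accounting claims the shrinker's pending count with a cmpxchg loop (read nr, atomically swap in zero), works on a private total_scan, and finally adds any unused budget back with a second cmpxchg loop, so concurrent callers neither lose nor double-count work. The standalone C11 sketch below reproduces that claim-and-return idiom with <stdatomic.h>; the counter name and the numbers are illustrative.

/* Toy version of the claim-and-return counter idiom in shrink_slab():
 * atomically take the whole pending count for yourself, consume part
 * of it, and atomically add the leftover back.  Names are illustrative.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long pending;			/* like shrinker->nr */

static long claim_all(void)
{
	long nr = atomic_load(&pending);

	/* like: do { nr = shrinker->nr; } while (cmpxchg(...) != nr); */
	while (!atomic_compare_exchange_weak(&pending, &nr, 0))
		;
	return nr;
}

static void give_back(long leftover)
{
	if (leftover <= 0)
		return;
	/* like the closing cmpxchg loop: add leftover to current value */
	long nr = atomic_load(&pending);
	while (!atomic_compare_exchange_weak(&pending, &nr, nr + leftover))
		;
}

int main(void)
{
	atomic_store(&pending, 100);

	long total_scan = claim_all() + 30;	/* claimed work + new delta */
	long batch = 32;			/* like shrinker->batch */

	while (total_scan >= batch) {		/* do the actual scanning */
		printf("scan a batch of %ld\n", batch);
		total_scan -= batch;
	}
	give_back(total_scan);			/* return the unused budget */
	printf("leftover returned: %ld\n", atomic_load(&pending));
	return 0;
}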
@@ -1729,6 +1769,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1729 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1769 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1730} 1770}
1731 1771
1772static int vmscan_swappiness(struct scan_control *sc)
1773{
1774 if (scanning_global_lru(sc))
1775 return vm_swappiness;
1776 return mem_cgroup_swappiness(sc->mem_cgroup);
1777}
1778
1732/* 1779/*
1733 * Determine how aggressively the anon and file LRU lists should be 1780 * Determine how aggressively the anon and file LRU lists should be
1734 * scanned. The relative value of each set of LRU lists is determined 1781 * scanned. The relative value of each set of LRU lists is determined
@@ -1747,22 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1794 u64 fraction[2], denominator;
1748 enum lru_list l; 1795 enum lru_list l;
1749 int noswap = 0; 1796 int noswap = 0;
1750 int force_scan = 0; 1797 bool force_scan = false;
1798 unsigned long nr_force_scan[2];
1751 1799
1752 1800 /* kswapd does zone balancing and needs to scan this zone */
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1801 if (scanning_global_lru(sc) && current_is_kswapd())
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1802 force_scan = true;
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1803 /* memcg may have small limit and need to avoid priority drop */
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1804 if (!scanning_global_lru(sc))
1757 1805 force_scan = true;
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1766 1806
1767 /* If we have no swap space, do not bother scanning anon pages. */ 1807 /* If we have no swap space, do not bother scanning anon pages. */
1768 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1808 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1770,9 +1810,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1770 fraction[0] = 0; 1810 fraction[0] = 0;
1771 fraction[1] = 1; 1811 fraction[1] = 1;
1772 denominator = 1; 1812 denominator = 1;
1813 nr_force_scan[0] = 0;
1814 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1773 goto out; 1815 goto out;
1774 } 1816 }
1775 1817
1818 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1819 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1820 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1821 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1822
1776 if (scanning_global_lru(sc)) { 1823 if (scanning_global_lru(sc)) {
1777 free = zone_page_state(zone, NR_FREE_PAGES); 1824 free = zone_page_state(zone, NR_FREE_PAGES);
1778 /* If we have very few page cache pages, 1825 /* If we have very few page cache pages,
@@ -1781,6 +1828,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1781 fraction[0] = 1; 1828 fraction[0] = 1;
1782 fraction[1] = 0; 1829 fraction[1] = 0;
1783 denominator = 1; 1830 denominator = 1;
1831 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1832 nr_force_scan[1] = 0;
1784 goto out; 1833 goto out;
1785 } 1834 }
1786 } 1835 }
@@ -1789,8 +1838,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1789 * With swappiness at 100, anonymous and file have the same priority. 1838 * With swappiness at 100, anonymous and file have the same priority.
1790 * This scanning priority is essentially the inverse of IO cost. 1839 * This scanning priority is essentially the inverse of IO cost.
1791 */ 1840 */
1792 anon_prio = sc->swappiness; 1841 anon_prio = vmscan_swappiness(sc);
1793 file_prio = 200 - sc->swappiness; 1842 file_prio = 200 - vmscan_swappiness(sc);
1794 1843
1795 /* 1844 /*
1796 * OK, so we have swap space and a fair amount of page cache 1845 * OK, so we have swap space and a fair amount of page cache
@@ -1829,6 +1878,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1829 fraction[0] = ap; 1878 fraction[0] = ap;
1830 fraction[1] = fp; 1879 fraction[1] = fp;
1831 denominator = ap + fp + 1; 1880 denominator = ap + fp + 1;
1881 if (force_scan) {
1882 unsigned long scan = SWAP_CLUSTER_MAX;
1883 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1884 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1885 }
1832out: 1886out:
1833 for_each_evictable_lru(l) { 1887 for_each_evictable_lru(l) {
1834 int file = is_file_lru(l); 1888 int file = is_file_lru(l);
@@ -1849,12 +1903,8 @@ out:
1849 * memcg, priority drop can cause big latency. So, it's better 1903 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above. 1904 * to scan small amount. See may_noscan above.
1851 */ 1905 */
1852 if (!scan && force_scan) { 1906 if (!scan && force_scan)
1853 if (file) 1907 scan = nr_force_scan[file];
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan; 1908 nr[l] = scan;
1859 } 1909 }
1860} 1910}
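get_scan_count() above expresses the anon/file balance as two weights, fraction[0] = ap and fraction[1] = fp over denominator = ap + fp + 1, and the new nr_force_scan[] values are simply SWAP_CLUSTER_MAX split in the same proportion via div64_u64(). The small standalone C sketch below evaluates that split for one made-up pair of weights.

/* Check how the forced-scan amounts are split: SWAP_CLUSTER_MAX is
 * divided between anon and file in the same ap:fp proportion used for
 * the ordinary scan targets.  The weights below are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32ULL

int main(void)
{
	uint64_t ap = 75, fp = 300;		/* illustrative weights */
	uint64_t denominator = ap + fp + 1;	/* as in get_scan_count() */

	/* like div64_u64(scan * ap, denominator) and the fp counterpart */
	uint64_t force_anon = SWAP_CLUSTER_MAX * ap / denominator;
	uint64_t force_file = SWAP_CLUSTER_MAX * fp / denominator;

	printf("force-scan anon=%llu file=%llu\n",
	       (unsigned long long)force_anon,
	       (unsigned long long)force_file);
	return 0;
}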
@@ -2179,7 +2229,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2179 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2229 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2180 .may_unmap = 1, 2230 .may_unmap = 1,
2181 .may_swap = 1, 2231 .may_swap = 1,
2182 .swappiness = vm_swappiness,
2183 .order = order, 2232 .order = order,
2184 .mem_cgroup = NULL, 2233 .mem_cgroup = NULL,
2185 .nodemask = nodemask, 2234 .nodemask = nodemask,
@@ -2203,7 +2252,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2203 2252
2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2253unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2205 gfp_t gfp_mask, bool noswap, 2254 gfp_t gfp_mask, bool noswap,
2206 unsigned int swappiness,
2207 struct zone *zone, 2255 struct zone *zone,
2208 unsigned long *nr_scanned) 2256 unsigned long *nr_scanned)
2209{ 2257{
@@ -2213,7 +2261,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2213 .may_writepage = !laptop_mode, 2261 .may_writepage = !laptop_mode,
2214 .may_unmap = 1, 2262 .may_unmap = 1,
2215 .may_swap = !noswap, 2263 .may_swap = !noswap,
2216 .swappiness = swappiness,
2217 .order = 0, 2264 .order = 0,
2218 .mem_cgroup = mem, 2265 .mem_cgroup = mem,
2219 }; 2266 };
@@ -2242,8 +2289,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2242 2289
2243unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2290unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2244 gfp_t gfp_mask, 2291 gfp_t gfp_mask,
2245 bool noswap, 2292 bool noswap)
2246 unsigned int swappiness)
2247{ 2293{
2248 struct zonelist *zonelist; 2294 struct zonelist *zonelist;
2249 unsigned long nr_reclaimed; 2295 unsigned long nr_reclaimed;
@@ -2253,7 +2299,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2253 .may_unmap = 1, 2299 .may_unmap = 1,
2254 .may_swap = !noswap, 2300 .may_swap = !noswap,
2255 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2301 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2256 .swappiness = swappiness,
2257 .order = 0, 2302 .order = 0,
2258 .mem_cgroup = mem_cont, 2303 .mem_cgroup = mem_cont,
2259 .nodemask = NULL, /* we don't care the placement */ 2304 .nodemask = NULL, /* we don't care the placement */
@@ -2404,7 +2449,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2404 * we want to put equal scanning pressure on each zone. 2449 * we want to put equal scanning pressure on each zone.
2405 */ 2450 */
2406 .nr_to_reclaim = ULONG_MAX, 2451 .nr_to_reclaim = ULONG_MAX,
2407 .swappiness = vm_swappiness,
2408 .order = order, 2452 .order = order,
2409 .mem_cgroup = NULL, 2453 .mem_cgroup = NULL,
2410 }; 2454 };
@@ -2453,6 +2497,9 @@ loop_again:
2453 high_wmark_pages(zone), 0, 0)) { 2497 high_wmark_pages(zone), 0, 0)) {
2454 end_zone = i; 2498 end_zone = i;
2455 break; 2499 break;
2500 } else {
2501 /* If balanced, clear the congested flag */
2502 zone_clear_flag(zone, ZONE_CONGESTED);
2456 } 2503 }
2457 } 2504 }
2458 if (i < 0) 2505 if (i < 0)
@@ -2874,7 +2921,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2874 .may_writepage = 1, 2921 .may_writepage = 1,
2875 .nr_to_reclaim = nr_to_reclaim, 2922 .nr_to_reclaim = nr_to_reclaim,
2876 .hibernation_mode = 1, 2923 .hibernation_mode = 1,
2877 .swappiness = vm_swappiness,
2878 .order = 0, 2924 .order = 0,
2879 }; 2925 };
2880 struct shrink_control shrink = { 2926 struct shrink_control shrink = {
@@ -3061,7 +3107,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3061 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3107 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3062 SWAP_CLUSTER_MAX), 3108 SWAP_CLUSTER_MAX),
3063 .gfp_mask = gfp_mask, 3109 .gfp_mask = gfp_mask,
3064 .swappiness = vm_swappiness,
3065 .order = order, 3110 .order = order,
3066 }; 3111 };
3067 struct shrink_control shrink = { 3112 struct shrink_control shrink = {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..d52b13d28e8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = {
788 788
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 789#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 792
793 793
794#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS