aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile3
-rw-r--r--mm/backing-dev.c69
-rw-r--r--mm/filemap.c109
-rw-r--r--mm/hugetlb.c25
-rw-r--r--mm/memory.c1
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mmap.c26
-rw-r--r--mm/oom_kill.c1
-rw-r--r--mm/page-writeback.c17
-rw-r--r--mm/page_alloc.c60
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c41
-rw-r--r--mm/shmem.c84
-rw-r--r--mm/shmem_acl.c2
-rw-r--r--mm/slab.c13
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/truncate.c4
-rw-r--r--mm/vmalloc.c54
-rw-r--r--mm/vmscan.c75
-rw-r--r--mm/vmstat.c2
21 files changed, 421 insertions, 174 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 12b3a4eee88d..f3c077eb0b8e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 $(mmu-y)
14 15
15ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) 16ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
16obj-y += bounce.o 17obj-y += bounce.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 000000000000..f50a2811f9dc
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
1
2#include <linux/wait.h>
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/sched.h>
6#include <linux/module.h>
7
8static wait_queue_head_t congestion_wqh[2] = {
9 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
10 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
11 };
12
13
14void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
15{
16 enum bdi_state bit;
17 wait_queue_head_t *wqh = &congestion_wqh[rw];
18
19 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
20 clear_bit(bit, &bdi->state);
21 smp_mb__after_clear_bit();
22 if (waitqueue_active(wqh))
23 wake_up(wqh);
24}
25EXPORT_SYMBOL(clear_bdi_congested);
26
27void set_bdi_congested(struct backing_dev_info *bdi, int rw)
28{
29 enum bdi_state bit;
30
31 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
32 set_bit(bit, &bdi->state);
33}
34EXPORT_SYMBOL(set_bdi_congested);
35
36/**
37 * congestion_wait - wait for a backing_dev to become uncongested
38 * @rw: READ or WRITE
39 * @timeout: timeout in jiffies
40 *
41 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
42 * write congestion. If no backing_devs are congested then just wait for the
43 * next write to be completed.
44 */
45long congestion_wait(int rw, long timeout)
46{
47 long ret;
48 DEFINE_WAIT(wait);
49 wait_queue_head_t *wqh = &congestion_wqh[rw];
50
51 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
52 ret = io_schedule_timeout(timeout);
53 finish_wait(wqh, &wait);
54 return ret;
55}
56EXPORT_SYMBOL(congestion_wait);
57
58/**
59 * congestion_end - wake up sleepers on a congested backing_dev_info
60 * @rw: READ or WRITE
61 */
62void congestion_end(int rw)
63{
64 wait_queue_head_t *wqh = &congestion_wqh[rw];
65
66 if (waitqueue_active(wqh))
67 wake_up(wqh);
68}
69EXPORT_SYMBOL(congestion_end);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3464b681f844..7b84dc814347 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
75 * ->mmap_sem 75 * ->mmap_sem
76 * ->lock_page (access_process_vm) 76 * ->lock_page (access_process_vm)
77 * 77 *
78 * ->mmap_sem 78 * ->i_mutex (generic_file_buffered_write)
79 * ->i_mutex (msync) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * ->i_mutex 81 * ->i_mutex
82 * ->i_alloc_sem (various) 82 * ->i_alloc_sem (various)
@@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
467} 467}
468 468
469#ifdef CONFIG_NUMA 469#ifdef CONFIG_NUMA
470struct page *page_cache_alloc(struct address_space *x) 470struct page *__page_cache_alloc(gfp_t gfp)
471{ 471{
472 if (cpuset_do_page_mem_spread()) { 472 if (cpuset_do_page_mem_spread()) {
473 int n = cpuset_mem_spread_node(); 473 int n = cpuset_mem_spread_node();
474 return alloc_pages_node(n, mapping_gfp_mask(x), 0); 474 return alloc_pages_node(n, gfp, 0);
475 } 475 }
476 return alloc_pages(mapping_gfp_mask(x), 0); 476 return alloc_pages(gfp, 0);
477} 477}
478EXPORT_SYMBOL(page_cache_alloc); 478EXPORT_SYMBOL(__page_cache_alloc);
479
480struct page *page_cache_alloc_cold(struct address_space *x)
481{
482 if (cpuset_do_page_mem_spread()) {
483 int n = cpuset_mem_spread_node();
484 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
485 }
486 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
487}
488EXPORT_SYMBOL(page_cache_alloc_cold);
489#endif 479#endif
490 480
491static int __sleep_on_page_lock(void *word) 481static int __sleep_on_page_lock(void *word)
@@ -826,7 +816,6 @@ struct page *
826grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 816grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
827{ 817{
828 struct page *page = find_get_page(mapping, index); 818 struct page *page = find_get_page(mapping, index);
829 gfp_t gfp_mask;
830 819
831 if (page) { 820 if (page) {
832 if (!TestSetPageLocked(page)) 821 if (!TestSetPageLocked(page))
@@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
834 page_cache_release(page); 823 page_cache_release(page);
835 return NULL; 824 return NULL;
836 } 825 }
837 gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; 826 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
838 page = alloc_pages(gfp_mask, 0); 827 if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
839 if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
840 page_cache_release(page); 828 page_cache_release(page);
841 page = NULL; 829 page = NULL;
842 } 830 }
@@ -1884,11 +1872,10 @@ repeat:
1884 * if suid or (sgid and xgrp) 1872 * if suid or (sgid and xgrp)
1885 * remove privs 1873 * remove privs
1886 */ 1874 */
1887int remove_suid(struct dentry *dentry) 1875int should_remove_suid(struct dentry *dentry)
1888{ 1876{
1889 mode_t mode = dentry->d_inode->i_mode; 1877 mode_t mode = dentry->d_inode->i_mode;
1890 int kill = 0; 1878 int kill = 0;
1891 int result = 0;
1892 1879
1893 /* suid always must be killed */ 1880 /* suid always must be killed */
1894 if (unlikely(mode & S_ISUID)) 1881 if (unlikely(mode & S_ISUID))
@@ -1901,13 +1888,28 @@ int remove_suid(struct dentry *dentry)
1901 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1888 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1902 kill |= ATTR_KILL_SGID; 1889 kill |= ATTR_KILL_SGID;
1903 1890
1904 if (unlikely(kill && !capable(CAP_FSETID))) { 1891 if (unlikely(kill && !capable(CAP_FSETID)))
1905 struct iattr newattrs; 1892 return kill;
1906 1893
1907 newattrs.ia_valid = ATTR_FORCE | kill; 1894 return 0;
1908 result = notify_change(dentry, &newattrs); 1895}
1909 } 1896
1910 return result; 1897int __remove_suid(struct dentry *dentry, int kill)
1898{
1899 struct iattr newattrs;
1900
1901 newattrs.ia_valid = ATTR_FORCE | kill;
1902 return notify_change(dentry, &newattrs);
1903}
1904
1905int remove_suid(struct dentry *dentry)
1906{
1907 int kill = should_remove_suid(dentry);
1908
1909 if (unlikely(kill))
1910 return __remove_suid(dentry, kill);
1911
1912 return 0;
1911} 1913}
1912EXPORT_SYMBOL(remove_suid); 1914EXPORT_SYMBOL(remove_suid);
1913 1915
@@ -2222,7 +2224,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2222 unsigned long nr_segs, loff_t *ppos) 2224 unsigned long nr_segs, loff_t *ppos)
2223{ 2225{
2224 struct file *file = iocb->ki_filp; 2226 struct file *file = iocb->ki_filp;
2225 const struct address_space * mapping = file->f_mapping; 2227 struct address_space * mapping = file->f_mapping;
2226 size_t ocount; /* original count */ 2228 size_t ocount; /* original count */
2227 size_t count; /* after file limit checks */ 2229 size_t count; /* after file limit checks */
2228 struct inode *inode = mapping->host; 2230 struct inode *inode = mapping->host;
@@ -2275,8 +2277,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2275 2277
2276 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2278 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2277 if (unlikely(file->f_flags & O_DIRECT)) { 2279 if (unlikely(file->f_flags & O_DIRECT)) {
2278 written = generic_file_direct_write(iocb, iov, 2280 loff_t endbyte;
2279 &nr_segs, pos, ppos, count, ocount); 2281 ssize_t written_buffered;
2282
2283 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2284 ppos, count, ocount);
2280 if (written < 0 || written == count) 2285 if (written < 0 || written == count)
2281 goto out; 2286 goto out;
2282 /* 2287 /*
@@ -2285,10 +2290,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2285 */ 2290 */
2286 pos += written; 2291 pos += written;
2287 count -= written; 2292 count -= written;
2288 } 2293 written_buffered = generic_file_buffered_write(iocb, iov,
2294 nr_segs, pos, ppos, count,
2295 written);
2296 /*
2297 * If generic_file_buffered_write() returned a synchronous error
2298 * then we want to return the number of bytes which were
2299 * direct-written, or the error code if that was zero. Note
2300 * that this differs from normal direct-io semantics, which
2301 * will return -EFOO even if some bytes were written.
2302 */
2303 if (written_buffered < 0) {
2304 err = written_buffered;
2305 goto out;
2306 }
2289 2307
2290 written = generic_file_buffered_write(iocb, iov, nr_segs, 2308 /*
2291 pos, ppos, count, written); 2309 * We need to ensure that the page cache pages are written to
2310 * disk and invalidated to preserve the expected O_DIRECT
2311 * semantics.
2312 */
2313 endbyte = pos + written_buffered - written - 1;
2314 err = do_sync_file_range(file, pos, endbyte,
2315 SYNC_FILE_RANGE_WAIT_BEFORE|
2316 SYNC_FILE_RANGE_WRITE|
2317 SYNC_FILE_RANGE_WAIT_AFTER);
2318 if (err == 0) {
2319 written = written_buffered;
2320 invalidate_mapping_pages(mapping,
2321 pos >> PAGE_CACHE_SHIFT,
2322 endbyte >> PAGE_CACHE_SHIFT);
2323 } else {
2324 /*
2325 * We don't know how much we wrote, so just return
2326 * the number of bytes which were direct-written
2327 */
2328 }
2329 } else {
2330 written = generic_file_buffered_write(iocb, iov, nr_segs,
2331 pos, ppos, count, written);
2332 }
2292out: 2333out:
2293 current->backing_dev_info = NULL; 2334 current->backing_dev_info = NULL;
2294 return written ? written : err; 2335 return written ? written : err;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1d709ff528e1..a088f593a807 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -356,8 +356,8 @@ nomem:
356 return -ENOMEM; 356 return -ENOMEM;
357} 357}
358 358
359void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 359void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
360 unsigned long end) 360 unsigned long end)
361{ 361{
362 struct mm_struct *mm = vma->vm_mm; 362 struct mm_struct *mm = vma->vm_mm;
363 unsigned long address; 363 unsigned long address;
@@ -398,6 +398,24 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
398 } 398 }
399} 399}
400 400
401void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
402 unsigned long end)
403{
404 /*
405 * It is undesirable to test vma->vm_file as it should be non-null
406 * for a valid hugetlb area. However, vm_file will be NULL in the error
407 * cleanup path of do_mmap_pgoff. When the hugetlbfs ->mmap method fails,
408 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
409 * to clean up. Since no pte has actually been set up, it is safe to
411 */
412 if (vma->vm_file) {
413 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
414 __unmap_hugepage_range(vma, start, end);
415 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
416 }
417}
418
401static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 419static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
402 unsigned long address, pte_t *ptep, pte_t pte) 420 unsigned long address, pte_t *ptep, pte_t pte)
403{ 421{
@@ -460,6 +478,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
460retry: 478retry:
461 page = find_lock_page(mapping, idx); 479 page = find_lock_page(mapping, idx);
462 if (!page) { 480 if (!page) {
481 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
482 if (idx >= size)
483 goto out;
463 if (hugetlb_get_quota(mapping)) 484 if (hugetlb_get_quota(mapping))
464 goto out; 485 goto out;
465 page = alloc_huge_page(vma, address); 486 page = alloc_huge_page(vma, address);
diff --git a/mm/memory.c b/mm/memory.c
index b5a4aadd961a..156861fcac43 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1452,6 +1452,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1452 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1452 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1453 memset(kaddr, 0, PAGE_SIZE); 1453 memset(kaddr, 0, PAGE_SIZE);
1454 kunmap_atomic(kaddr, KM_USER0); 1454 kunmap_atomic(kaddr, KM_USER0);
1455 flush_dcache_page(dst);
1455 return; 1456 return;
1456 1457
1457 } 1458 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 25788b1b7fcf..617fb31086ee 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -727,7 +727,7 @@ int do_migrate_pages(struct mm_struct *mm,
727 return -ENOSYS; 727 return -ENOSYS;
728} 728}
729 729
730static struct page *new_vma_page(struct page *page, unsigned long private) 730static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
731{ 731{
732 return NULL; 732 return NULL;
733} 733}
diff --git a/mm/migrate.c b/mm/migrate.c
index ba2453f9483d..b4979d423d2b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -952,7 +952,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
952 goto out; 952 goto out;
953 953
954 pm[i].node = node; 954 pm[i].node = node;
955 } 955 } else
956 pm[i].node = 0; /* anything to not match MAX_NUMNODES */
956 } 957 }
957 /* End marker */ 958 /* End marker */
958 pm[nr_pages].node = MAX_NUMNODES; 959 pm[nr_pages].node = MAX_NUMNODES;
diff --git a/mm/mmap.c b/mm/mmap.c
index eea8eefd51a8..7b40abd7cba2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -900,17 +900,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
900 int accountable = 1; 900 int accountable = 1;
901 unsigned long charged = 0, reqprot = prot; 901 unsigned long charged = 0, reqprot = prot;
902 902
903 if (file) {
904 if (is_file_hugepages(file))
905 accountable = 0;
906
907 if (!file->f_op || !file->f_op->mmap)
908 return -ENODEV;
909
910 if ((prot & PROT_EXEC) &&
911 (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
912 return -EPERM;
913 }
914 /* 903 /*
915 * Does the application expect PROT_READ to imply PROT_EXEC? 904 * Does the application expect PROT_READ to imply PROT_EXEC?
916 * 905 *
@@ -1000,6 +989,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1000 case MAP_PRIVATE: 989 case MAP_PRIVATE:
1001 if (!(file->f_mode & FMODE_READ)) 990 if (!(file->f_mode & FMODE_READ))
1002 return -EACCES; 991 return -EACCES;
992 if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
993 if (vm_flags & VM_EXEC)
994 return -EPERM;
995 vm_flags &= ~VM_MAYEXEC;
996 }
997 if (is_file_hugepages(file))
998 accountable = 0;
999
1000 if (!file->f_op || !file->f_op->mmap)
1001 return -ENODEV;
1003 break; 1002 break;
1004 1003
1005 default: 1004 default:
@@ -1380,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1380 * Check if the given range is hugepage aligned, and 1379 * Check if the given range is hugepage aligned, and
1381 * can be made suitable for hugepages. 1380 * can be made suitable for hugepages.
1382 */ 1381 */
1383 ret = prepare_hugepage_range(addr, len); 1382 ret = prepare_hugepage_range(addr, len, pgoff);
1384 } else { 1383 } else {
1385 /* 1384 /*
1386 * Ensure that a normal request is not falling in a 1385 * Ensure that a normal request is not falling in a
@@ -1881,6 +1880,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1881 if ((addr + len) > TASK_SIZE || (addr + len) < addr) 1880 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
1882 return -EINVAL; 1881 return -EINVAL;
1883 1882
1883 if (is_hugepage_only_range(mm, addr, len))
1884 return -EINVAL;
1885
1884 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 1886 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1885 1887
1886 error = arch_mmap_check(addr, len, flags); 1888 error = arch_mmap_check(addr, len, flags);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 20f41b082e16..2e3ce3a928b9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -15,6 +15,7 @@
15 * kernel subsystems and hints as to where to find out what things do. 15 * kernel subsystems and hints as to where to find out what things do.
16 */ 16 */
17 17
18#include <linux/oom.h>
18#include <linux/mm.h> 19#include <linux/mm.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a0f339057449..8d9b19f239c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -222,7 +222,7 @@ static void balance_dirty_pages(struct address_space *mapping)
222 if (pages_written >= write_chunk) 222 if (pages_written >= write_chunk)
223 break; /* We've done our duty */ 223 break; /* We've done our duty */
224 } 224 }
225 blk_congestion_wait(WRITE, HZ/10); 225 congestion_wait(WRITE, HZ/10);
226 } 226 }
227 227
228 if (nr_reclaimable + global_page_state(NR_WRITEBACK) 228 if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -314,7 +314,7 @@ void throttle_vm_writeout(void)
314 if (global_page_state(NR_UNSTABLE_NFS) + 314 if (global_page_state(NR_UNSTABLE_NFS) +
315 global_page_state(NR_WRITEBACK) <= dirty_thresh) 315 global_page_state(NR_WRITEBACK) <= dirty_thresh)
316 break; 316 break;
317 blk_congestion_wait(WRITE, HZ/10); 317 congestion_wait(WRITE, HZ/10);
318 } 318 }
319} 319}
320 320
@@ -351,7 +351,7 @@ static void background_writeout(unsigned long _min_pages)
351 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 351 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
352 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 352 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
353 /* Wrote less than expected */ 353 /* Wrote less than expected */
354 blk_congestion_wait(WRITE, HZ/10); 354 congestion_wait(WRITE, HZ/10);
355 if (!wbc.encountered_congestion) 355 if (!wbc.encountered_congestion)
356 break; 356 break;
357 } 357 }
@@ -422,7 +422,7 @@ static void wb_kupdate(unsigned long arg)
422 writeback_inodes(&wbc); 422 writeback_inodes(&wbc);
423 if (wbc.nr_to_write > 0) { 423 if (wbc.nr_to_write > 0) {
424 if (wbc.encountered_congestion) 424 if (wbc.encountered_congestion)
425 blk_congestion_wait(WRITE, HZ/10); 425 congestion_wait(WRITE, HZ/10);
426 else 426 else
427 break; /* All the old data is written */ 427 break; /* All the old data is written */
428 } 428 }
@@ -956,15 +956,6 @@ int test_set_page_writeback(struct page *page)
956EXPORT_SYMBOL(test_set_page_writeback); 956EXPORT_SYMBOL(test_set_page_writeback);
957 957
958/* 958/*
959 * Wakes up tasks that are being throttled due to writeback congestion
960 */
961void writeback_congestion_end(void)
962{
963 blk_congestion_end(WRITE);
964}
965EXPORT_SYMBOL(writeback_congestion_end);
966
967/*
968 * Return true if any of the pages in the mapping are marked with the 959 * Return true if any of the pages in the mapping are marked with the
969 * passed tag. 960 * passed tag.
970 */ 961 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8c003e7b3d5..bf2f6cff1d6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,7 @@
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h> 40#include <linux/sort.h>
41#include <linux/pfn.h> 41#include <linux/pfn.h>
42#include <linux/backing-dev.h>
42 43
43#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
44#include <asm/div64.h> 45#include <asm/div64.h>
@@ -495,17 +496,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
495 int i; 496 int i;
496 int reserved = 0; 497 int reserved = 0;
497 498
498 arch_free_page(page, order);
499 if (!PageHighMem(page))
500 debug_check_no_locks_freed(page_address(page),
501 PAGE_SIZE<<order);
502
503 for (i = 0 ; i < (1 << order) ; ++i) 499 for (i = 0 ; i < (1 << order) ; ++i)
504 reserved += free_pages_check(page + i); 500 reserved += free_pages_check(page + i);
505 if (reserved) 501 if (reserved)
506 return; 502 return;
507 503
504 if (!PageHighMem(page))
505 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
506 arch_free_page(page, order);
508 kernel_map_pages(page, 1 << order, 0); 507 kernel_map_pages(page, 1 << order, 0);
508
509 local_irq_save(flags); 509 local_irq_save(flags);
510 __count_vm_events(PGFREE, 1 << order); 510 __count_vm_events(PGFREE, 1 << order);
511 free_one_page(page_zone(page), page, order); 511 free_one_page(page_zone(page), page, order);
@@ -781,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
781 struct per_cpu_pages *pcp; 781 struct per_cpu_pages *pcp;
782 unsigned long flags; 782 unsigned long flags;
783 783
784 arch_free_page(page, 0);
785
786 if (PageAnon(page)) 784 if (PageAnon(page))
787 page->mapping = NULL; 785 page->mapping = NULL;
788 if (free_pages_check(page)) 786 if (free_pages_check(page))
789 return; 787 return;
790 788
789 if (!PageHighMem(page))
790 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
791 arch_free_page(page, 0);
791 kernel_map_pages(page, 1, 0); 792 kernel_map_pages(page, 1, 0);
792 793
793 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 794 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
@@ -852,7 +853,7 @@ again:
852 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 853 pcp = &zone_pcp(zone, cpu)->pcp[cold];
853 local_irq_save(flags); 854 local_irq_save(flags);
854 if (!pcp->count) { 855 if (!pcp->count) {
855 pcp->count += rmqueue_bulk(zone, 0, 856 pcp->count = rmqueue_bulk(zone, 0,
856 pcp->batch, &pcp->list); 857 pcp->batch, &pcp->list);
857 if (unlikely(!pcp->count)) 858 if (unlikely(!pcp->count))
858 goto failed; 859 goto failed;
@@ -1050,7 +1051,7 @@ nofail_alloc:
1050 if (page) 1051 if (page)
1051 goto got_pg; 1052 goto got_pg;
1052 if (gfp_mask & __GFP_NOFAIL) { 1053 if (gfp_mask & __GFP_NOFAIL) {
1053 blk_congestion_wait(WRITE, HZ/50); 1054 congestion_wait(WRITE, HZ/50);
1054 goto nofail_alloc; 1055 goto nofail_alloc;
1055 } 1056 }
1056 } 1057 }
@@ -1113,7 +1114,7 @@ rebalance:
1113 do_retry = 1; 1114 do_retry = 1;
1114 } 1115 }
1115 if (do_retry) { 1116 if (do_retry) {
1116 blk_congestion_wait(WRITE, HZ/50); 1117 congestion_wait(WRITE, HZ/50);
1117 goto rebalance; 1118 goto rebalance;
1118 } 1119 }
1119 1120
@@ -1688,6 +1689,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1688 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1689 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1689 if (!early_pfn_valid(pfn)) 1690 if (!early_pfn_valid(pfn))
1690 continue; 1691 continue;
1692 if (!early_pfn_in_nid(pfn, nid))
1693 continue;
1691 page = pfn_to_page(pfn); 1694 page = pfn_to_page(pfn);
1692 set_page_links(page, zone, nid, pfn); 1695 set_page_links(page, zone, nid, pfn);
1693 init_page_count(page); 1696 init_page_count(page);
@@ -2258,7 +2261,7 @@ unsigned long __init __absent_pages_in_range(int nid,
2258 2261
2259 /* Account for ranges past physical memory on this node */ 2262 /* Account for ranges past physical memory on this node */
2260 if (range_end_pfn > prev_end_pfn) 2263 if (range_end_pfn > prev_end_pfn)
2261 hole_pages = range_end_pfn - 2264 hole_pages += range_end_pfn -
2262 max(range_start_pfn, prev_end_pfn); 2265 max(range_start_pfn, prev_end_pfn);
2263 2266
2264 return hole_pages; 2267 return hole_pages;
@@ -2294,19 +2297,6 @@ unsigned long __init zone_absent_pages_in_node(int nid,
2294 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2297 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2295} 2298}
2296 2299
2297/* Return the zone index a PFN is in */
2298int memmap_zone_idx(struct page *lmem_map)
2299{
2300 int i;
2301 unsigned long phys_addr = virt_to_phys(lmem_map);
2302 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2303
2304 for (i = 0; i < MAX_NR_ZONES; i++)
2305 if (pfn < arch_zone_highest_possible_pfn[i])
2306 break;
2307
2308 return i;
2309}
2310#else 2300#else
2311static inline unsigned long zone_spanned_pages_in_node(int nid, 2301static inline unsigned long zone_spanned_pages_in_node(int nid,
2312 unsigned long zone_type, 2302 unsigned long zone_type,
@@ -2325,10 +2315,6 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
2325 return zholes_size[zone_type]; 2315 return zholes_size[zone_type];
2326} 2316}
2327 2317
2328static inline int memmap_zone_idx(struct page *lmem_map)
2329{
2330 return MAX_NR_ZONES;
2331}
2332#endif 2318#endif
2333 2319
2334static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2320static void __init calculate_node_totalpages(struct pglist_data *pgdat,
@@ -2421,7 +2407,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2421 zone->zone_pgdat = pgdat; 2407 zone->zone_pgdat = pgdat;
2422 zone->free_pages = 0; 2408 zone->free_pages = 0;
2423 2409
2424 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 2410 zone->prev_priority = DEF_PRIORITY;
2425 2411
2426 zone_pcp_init(zone); 2412 zone_pcp_init(zone);
2427 INIT_LIST_HEAD(&zone->active_list); 2413 INIT_LIST_HEAD(&zone->active_list);
@@ -3136,3 +3122,19 @@ unsigned long page_to_pfn(struct page *page)
3136EXPORT_SYMBOL(pfn_to_page); 3122EXPORT_SYMBOL(pfn_to_page);
3137EXPORT_SYMBOL(page_to_pfn); 3123EXPORT_SYMBOL(page_to_pfn);
3138#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3124#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3125
3126#if MAX_NUMNODES > 1
3127/*
3128 * Find the highest possible node id.
3129 */
3130int highest_possible_node_id(void)
3131{
3132 unsigned int node;
3133 unsigned int highest = 0;
3134
3135 for_each_node_mask(node, node_possible_map)
3136 highest = node;
3137 return highest;
3138}
3139EXPORT_SYMBOL(highest_possible_node_id);
3140#endif
diff --git a/mm/readahead.c b/mm/readahead.c
index 1ba736ac0367..23cb61a01c6e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -173,6 +173,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
173 173
174 if (mapping->a_ops->readpages) { 174 if (mapping->a_ops->readpages) {
175 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 175 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
176 /* Clean up the remaining pages */
177 put_pages_list(pages);
176 goto out; 178 goto out;
177 } 179 }
178 180
diff --git a/mm/rmap.c b/mm/rmap.c
index e2155d791d99..d8a842a586db 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,27 +21,21 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem (vmtruncate_range)
25 * 25 * mm->mmap_sem
26 * When a page fault occurs in writing from user to file, down_read 26 * page->flags PG_locked (lock_page)
27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within 27 * mapping->i_mmap_lock
28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never 28 * anon_vma->lock
29 * taken together; in truncation, i_mutex is taken outermost. 29 * mm->page_table_lock or pte_lock
30 * 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * mm->mmap_sem 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * page->flags PG_locked (lock_page) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->i_mmap_lock 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * anon_vma->lock 34 * inode_lock (in set_page_dirty's __mark_inode_dirty)
35 * mm->page_table_lock or pte_lock 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * in arch-dependent flush_dcache_mmap_lock,
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * within inode_lock in __sync_single_inode)
39 * mapping->private_lock (in __set_page_dirty_buffers)
40 * inode_lock (in set_page_dirty's __mark_inode_dirty)
41 * sb_lock (within inode_lock in fs/fs-writeback.c)
42 * mapping->tree_lock (widely used, in set_page_dirty,
43 * in arch-dependent flush_dcache_mmap_lock,
44 * within inode_lock in __sync_single_inode)
45 */ 39 */
46 40
47#include <linux/mm.h> 41#include <linux/mm.h>
@@ -576,15 +570,14 @@ void page_add_file_rmap(struct page *page)
576void page_remove_rmap(struct page *page) 570void page_remove_rmap(struct page *page)
577{ 571{
578 if (atomic_add_negative(-1, &page->_mapcount)) { 572 if (atomic_add_negative(-1, &page->_mapcount)) {
579#ifdef CONFIG_DEBUG_VM
580 if (unlikely(page_mapcount(page) < 0)) { 573 if (unlikely(page_mapcount(page) < 0)) {
581 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 574 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
582 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 575 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
583 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 576 printk (KERN_EMERG " page->count = %x\n", page_count(page));
584 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 577 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
578 BUG();
585 } 579 }
586#endif 580
587 BUG_ON(page_mapcount(page) < 0);
588 /* 581 /*
589 * It would be tidy to reset the PageAnon mapping here, 582 * It would be tidy to reset the PageAnon mapping here,
590 * but that might overwrite a racing page_add_anon_rmap 583 * but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/shmem.c b/mm/shmem.c
index bb8ca7ef7094..4959535fc14c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -48,6 +48,7 @@
48#include <linux/ctype.h> 48#include <linux/ctype.h>
49#include <linux/migrate.h> 49#include <linux/migrate.h>
50#include <linux/highmem.h> 50#include <linux/highmem.h>
51#include <linux/backing-dev.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -1131,7 +1132,7 @@ repeat:
1131 page_cache_release(swappage); 1132 page_cache_release(swappage);
1132 if (error == -ENOMEM) { 1133 if (error == -ENOMEM) {
1133 /* let kswapd refresh zone for GFP_ATOMICs */ 1134 /* let kswapd refresh zone for GFP_ATOMICs */
1134 blk_congestion_wait(WRITE, HZ/50); 1135 congestion_wait(WRITE, HZ/50);
1135 } 1136 }
1136 goto repeat; 1137 goto repeat;
1137 } 1138 }
@@ -1362,6 +1363,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1362 inode->i_mapping->a_ops = &shmem_aops; 1363 inode->i_mapping->a_ops = &shmem_aops;
1363 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1364 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1364 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1365 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1366 inode->i_generation = get_seconds();
1365 info = SHMEM_I(inode); 1367 info = SHMEM_I(inode);
1366 memset(info, 0, (char *)inode - (char *)info); 1368 memset(info, 0, (char *)inode - (char *)info);
1367 spin_lock_init(&info->lock); 1369 spin_lock_init(&info->lock);
@@ -1956,6 +1958,85 @@ static struct xattr_handler *shmem_xattr_handlers[] = {
1956}; 1958};
1957#endif 1959#endif
1958 1960
1961static struct dentry *shmem_get_parent(struct dentry *child)
1962{
1963 return ERR_PTR(-ESTALE);
1964}
1965
1966static int shmem_match(struct inode *ino, void *vfh)
1967{
1968 __u32 *fh = vfh;
1969 __u64 inum = fh[2];
1970 inum = (inum << 32) | fh[1];
1971 return ino->i_ino == inum && fh[0] == ino->i_generation;
1972}
1973
1974static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
1975{
1976 struct dentry *de = NULL;
1977 struct inode *inode;
1978 __u32 *fh = vfh;
1979 __u64 inum = fh[2];
1980 inum = (inum << 32) | fh[1];
1981
1982 inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
1983 if (inode) {
1984 de = d_find_alias(inode);
1985 iput(inode);
1986 }
1987
1988 return de? de: ERR_PTR(-ESTALE);
1989}
1990
1991static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
1992 int len, int type,
1993 int (*acceptable)(void *context, struct dentry *de),
1994 void *context)
1995{
1996 if (len < 3)
1997 return ERR_PTR(-ESTALE);
1998
1999 return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
2000 context);
2001}
2002
2003static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2004 int connectable)
2005{
2006 struct inode *inode = dentry->d_inode;
2007
2008 if (*len < 3)
2009 return 255;
2010
2011 if (hlist_unhashed(&inode->i_hash)) {
2012 /* Unfortunately insert_inode_hash is not idempotent,
2013 * so as we hash inodes here rather than at creation
2014 * time, we need a lock to ensure we only try
2015 * to do it once
2016 */
2017 static DEFINE_SPINLOCK(lock);
2018 spin_lock(&lock);
2019 if (hlist_unhashed(&inode->i_hash))
2020 __insert_inode_hash(inode,
2021 inode->i_ino + inode->i_generation);
2022 spin_unlock(&lock);
2023 }
2024
2025 fh[0] = inode->i_generation;
2026 fh[1] = inode->i_ino;
2027 fh[2] = ((__u64)inode->i_ino) >> 32;
2028
2029 *len = 3;
2030 return 1;
2031}
2032
2033static struct export_operations shmem_export_ops = {
2034 .get_parent = shmem_get_parent,
2035 .get_dentry = shmem_get_dentry,
2036 .encode_fh = shmem_encode_fh,
2037 .decode_fh = shmem_decode_fh,
2038};
2039
1959static int shmem_parse_options(char *options, int *mode, uid_t *uid, 2040static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1960 gid_t *gid, unsigned long *blocks, unsigned long *inodes, 2041 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1961 int *policy, nodemask_t *policy_nodes) 2042 int *policy, nodemask_t *policy_nodes)
@@ -2128,6 +2209,7 @@ static int shmem_fill_super(struct super_block *sb,
2128 &inodes, &policy, &policy_nodes)) 2209 &inodes, &policy, &policy_nodes))
2129 return -EINVAL; 2210 return -EINVAL;
2130 } 2211 }
2212 sb->s_export_op = &shmem_export_ops;
2131#else 2213#else
2132 sb->s_flags |= MS_NOUSER; 2214 sb->s_flags |= MS_NOUSER;
2133#endif 2215#endif
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index c946bf468718..f5664c5b9eb1 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -35,7 +35,7 @@ shmem_get_acl(struct inode *inode, int type)
35} 35}
36 36
37/** 37/**
38 * shmem_get_acl - generic_acl_operations->setacl() operation 38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */ 39 */
40static void 40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) 41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
diff --git a/mm/slab.c b/mm/slab.c
index 266449d604bd..3c4a7e34eddc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -883,7 +883,7 @@ static void init_reap_node(int cpu)
883 if (node == MAX_NUMNODES) 883 if (node == MAX_NUMNODES)
884 node = first_node(node_online_map); 884 node = first_node(node_online_map);
885 885
886 __get_cpu_var(reap_node) = node; 886 per_cpu(reap_node, cpu) = node;
887} 887}
888 888
889static void next_reap_node(void) 889static void next_reap_node(void)
@@ -3152,12 +3152,15 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3152 struct zone **z; 3152 struct zone **z;
3153 void *obj = NULL; 3153 void *obj = NULL;
3154 3154
3155 for (z = zonelist->zones; *z && !obj; z++) 3155 for (z = zonelist->zones; *z && !obj; z++) {
3156 int nid = zone_to_nid(*z);
3157
3156 if (zone_idx(*z) <= ZONE_NORMAL && 3158 if (zone_idx(*z) <= ZONE_NORMAL &&
3157 cpuset_zone_allowed(*z, flags)) 3159 cpuset_zone_allowed(*z, flags) &&
3160 cache->nodelists[nid])
3158 obj = __cache_alloc_node(cache, 3161 obj = __cache_alloc_node(cache,
3159 flags | __GFP_THISNODE, 3162 flags | __GFP_THISNODE, nid);
3160 zone_to_nid(*z)); 3163 }
3161 return obj; 3164 return obj;
3162} 3165}
3163 3166
diff --git a/mm/sparse.c b/mm/sparse.c
index 86c52ab80878..b3c82ba30012 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -211,7 +211,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
211 struct page *page, *ret; 211 struct page *page, *ret;
212 unsigned long memmap_size = sizeof(struct page) * nr_pages; 212 unsigned long memmap_size = sizeof(struct page) * nr_pages;
213 213
214 page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); 214 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
215 if (page) 215 if (page)
216 goto got_map_page; 216 goto got_map_page;
217 217
diff --git a/mm/truncate.c b/mm/truncate.c
index f4edbc179d14..e07b1e682c38 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -96,7 +96,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
96 return 0; 96 return 0;
97 97
98 ret = remove_mapping(mapping, page); 98 ret = remove_mapping(mapping, page);
99 ClearPageUptodate(page);
100 99
101 return ret; 100 return ret;
102} 101}
@@ -302,7 +301,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
302 if (page->mapping != mapping) 301 if (page->mapping != mapping)
303 return 0; 302 return 0;
304 303
305 if (PagePrivate(page) && !try_to_release_page(page, 0)) 304 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
306 return 0; 305 return 0;
307 306
308 write_lock_irq(&mapping->tree_lock); 307 write_lock_irq(&mapping->tree_lock);
@@ -396,6 +395,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
396 pagevec_release(&pvec); 395 pagevec_release(&pvec);
397 cond_resched(); 396 cond_resched();
398 } 397 }
398 WARN_ON_ONCE(ret);
399 return ret; 399 return ret;
400} 400}
401EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 401EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 750ab6ed13fc..86897ee792d6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
160 return err; 160 return err;
161} 161}
162 162
163struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 163static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
164 unsigned long start, unsigned long end, int node) 164 unsigned long start, unsigned long end,
165 int node, gfp_t gfp_mask)
165{ 166{
166 struct vm_struct **p, *tmp, *area; 167 struct vm_struct **p, *tmp, *area;
167 unsigned long align = 1; 168 unsigned long align = 1;
168 unsigned long addr; 169 unsigned long addr;
169 170
171 BUG_ON(in_interrupt());
170 if (flags & VM_IOREMAP) { 172 if (flags & VM_IOREMAP) {
171 int bit = fls(size); 173 int bit = fls(size);
172 174
@@ -179,16 +181,13 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
179 } 181 }
180 addr = ALIGN(start, align); 182 addr = ALIGN(start, align);
181 size = PAGE_ALIGN(size); 183 size = PAGE_ALIGN(size);
184 if (unlikely(!size))
185 return NULL;
182 186
183 area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); 187 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
184 if (unlikely(!area)) 188 if (unlikely(!area))
185 return NULL; 189 return NULL;
186 190
187 if (unlikely(!size)) {
188 kfree (area);
189 return NULL;
190 }
191
192 /* 191 /*
193 * We always allocate a guard page. 192 * We always allocate a guard page.
194 */ 193 */
@@ -236,7 +235,7 @@ out:
236struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 235struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
237 unsigned long start, unsigned long end) 236 unsigned long start, unsigned long end)
238{ 237{
239 return __get_vm_area_node(size, flags, start, end, -1); 238 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
240} 239}
241 240
242/** 241/**
@@ -253,9 +252,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
253 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 252 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
254} 253}
255 254
256struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) 255struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
256 int node, gfp_t gfp_mask)
257{ 257{
258 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); 258 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
259 gfp_mask);
259} 260}
260 261
261/* Caller must hold vmlist_lock */ 262/* Caller must hold vmlist_lock */
@@ -428,8 +429,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
428 if (array_size > PAGE_SIZE) { 429 if (array_size > PAGE_SIZE) {
429 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); 430 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
430 area->flags |= VM_VPAGES; 431 area->flags |= VM_VPAGES;
431 } else 432 } else {
432 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); 433 pages = kmalloc_node(array_size,
434 (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
435 node);
436 }
433 area->pages = pages; 437 area->pages = pages;
434 if (!area->pages) { 438 if (!area->pages) {
435 remove_vm_area(area->addr); 439 remove_vm_area(area->addr);
@@ -484,7 +488,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
484 if (!size || (size >> PAGE_SHIFT) > num_physpages) 488 if (!size || (size >> PAGE_SHIFT) > num_physpages)
485 return NULL; 489 return NULL;
486 490
487 area = get_vm_area_node(size, VM_ALLOC, node); 491 area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
488 if (!area) 492 if (!area)
489 return NULL; 493 return NULL;
490 494
@@ -525,11 +529,12 @@ void *vmalloc_user(unsigned long size)
525 void *ret; 529 void *ret;
526 530
527 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 531 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
528 write_lock(&vmlist_lock); 532 if (ret) {
529 area = __find_vm_area(ret); 533 write_lock(&vmlist_lock);
530 area->flags |= VM_USERMAP; 534 area = __find_vm_area(ret);
531 write_unlock(&vmlist_lock); 535 area->flags |= VM_USERMAP;
532 536 write_unlock(&vmlist_lock);
537 }
533 return ret; 538 return ret;
534} 539}
535EXPORT_SYMBOL(vmalloc_user); 540EXPORT_SYMBOL(vmalloc_user);
@@ -598,11 +603,12 @@ void *vmalloc_32_user(unsigned long size)
598 void *ret; 603 void *ret;
599 604
600 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 605 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
601 write_lock(&vmlist_lock); 606 if (ret) {
602 area = __find_vm_area(ret); 607 write_lock(&vmlist_lock);
603 area->flags |= VM_USERMAP; 608 area = __find_vm_area(ret);
604 write_unlock(&vmlist_lock); 609 area->flags |= VM_USERMAP;
605 610 write_unlock(&vmlist_lock);
611 }
606 return ret; 612 return ret;
607} 613}
608EXPORT_SYMBOL(vmalloc_32_user); 614EXPORT_SYMBOL(vmalloc_32_user);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eca70310adb2..518540a4a2a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -378,6 +378,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
378 return PAGE_CLEAN; 378 return PAGE_CLEAN;
379} 379}
380 380
381/*
382 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
383 * someone else has a ref on the page, abort and return 0. If it was
384 * successfully detached, return 1. Assumes the caller has a single ref on
385 * this page.
386 */
381int remove_mapping(struct address_space *mapping, struct page *page) 387int remove_mapping(struct address_space *mapping, struct page *page)
382{ 388{
383 BUG_ON(!PageLocked(page)); 389 BUG_ON(!PageLocked(page));
@@ -717,6 +723,20 @@ done:
717 return nr_reclaimed; 723 return nr_reclaimed;
718} 724}
719 725
726/*
727 * We are about to scan this zone at a certain priority level. If that priority
728 * level is smaller (ie: more urgent) than the previous priority, then note
729 * that priority level within the zone. This is done so that when the next
730 * process comes in to scan this zone, it will immediately start out at this
731 * priority level rather than having to build up its own scanning priority.
732 * Here, this priority affects only the reclaim-mapped threshold.
733 */
734static inline void note_zone_scanning_priority(struct zone *zone, int priority)
735{
736 if (priority < zone->prev_priority)
737 zone->prev_priority = priority;
738}
739
720static inline int zone_is_near_oom(struct zone *zone) 740static inline int zone_is_near_oom(struct zone *zone)
721{ 741{
722 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; 742 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
@@ -740,7 +760,7 @@ static inline int zone_is_near_oom(struct zone *zone)
740 * But we had to alter page->flags anyway. 760 * But we had to alter page->flags anyway.
741 */ 761 */
742static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 762static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
743 struct scan_control *sc) 763 struct scan_control *sc, int priority)
744{ 764{
745 unsigned long pgmoved; 765 unsigned long pgmoved;
746 int pgdeactivate = 0; 766 int pgdeactivate = 0;
@@ -764,7 +784,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
764 * `distress' is a measure of how much trouble we're having 784 * `distress' is a measure of how much trouble we're having
765 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 785 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
766 */ 786 */
767 distress = 100 >> zone->prev_priority; 787 distress = 100 >> min(zone->prev_priority, priority);
768 788
769 /* 789 /*
770 * The point of this algorithm is to decide when to start 790 * The point of this algorithm is to decide when to start
@@ -916,7 +936,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
916 nr_to_scan = min(nr_active, 936 nr_to_scan = min(nr_active,
917 (unsigned long)sc->swap_cluster_max); 937 (unsigned long)sc->swap_cluster_max);
918 nr_active -= nr_to_scan; 938 nr_active -= nr_to_scan;
919 shrink_active_list(nr_to_scan, zone, sc); 939 shrink_active_list(nr_to_scan, zone, sc, priority);
920 } 940 }
921 941
922 if (nr_inactive) { 942 if (nr_inactive) {
@@ -966,9 +986,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
966 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 986 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
967 continue; 987 continue;
968 988
969 zone->temp_priority = priority; 989 note_zone_scanning_priority(zone, priority);
970 if (zone->prev_priority > priority)
971 zone->prev_priority = priority;
972 990
973 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 991 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
974 continue; /* Let kswapd poll it */ 992 continue; /* Let kswapd poll it */
@@ -1018,7 +1036,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1018 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1036 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1019 continue; 1037 continue;
1020 1038
1021 zone->temp_priority = DEF_PRIORITY;
1022 lru_pages += zone->nr_active + zone->nr_inactive; 1039 lru_pages += zone->nr_active + zone->nr_inactive;
1023 } 1040 }
1024 1041
@@ -1053,19 +1070,28 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1053 1070
1054 /* Take a nap, wait for some writeback to complete */ 1071 /* Take a nap, wait for some writeback to complete */
1055 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1072 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1056 blk_congestion_wait(WRITE, HZ/10); 1073 congestion_wait(WRITE, HZ/10);
1057 } 1074 }
1058 /* top priority shrink_caches still had more to do? don't OOM, then */ 1075 /* top priority shrink_caches still had more to do? don't OOM, then */
1059 if (!sc.all_unreclaimable) 1076 if (!sc.all_unreclaimable)
1060 ret = 1; 1077 ret = 1;
1061out: 1078out:
1079 /*
1080 * Now that we've scanned all the zones at this priority level, note
1081 * that level within the zone so that the next thread which performs
1082 * scanning of this zone will immediately start out at this priority
1083 * level. This affects only the decision whether or not to bring
1084 * mapped pages onto the inactive list.
1085 */
1086 if (priority < 0)
1087 priority = 0;
1062 for (i = 0; zones[i] != 0; i++) { 1088 for (i = 0; zones[i] != 0; i++) {
1063 struct zone *zone = zones[i]; 1089 struct zone *zone = zones[i];
1064 1090
1065 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1091 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1066 continue; 1092 continue;
1067 1093
1068 zone->prev_priority = zone->temp_priority; 1094 zone->prev_priority = priority;
1069 } 1095 }
1070 return ret; 1096 return ret;
1071} 1097}
@@ -1105,6 +1131,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1105 .swap_cluster_max = SWAP_CLUSTER_MAX, 1131 .swap_cluster_max = SWAP_CLUSTER_MAX,
1106 .swappiness = vm_swappiness, 1132 .swappiness = vm_swappiness,
1107 }; 1133 };
1134 /*
1135 * temp_priority is used to remember the scanning priority at which
1136 * this zone was successfully refilled to free_pages == pages_high.
1137 */
1138 int temp_priority[MAX_NR_ZONES];
1108 1139
1109loop_again: 1140loop_again:
1110 total_scanned = 0; 1141 total_scanned = 0;
@@ -1112,11 +1143,8 @@ loop_again:
1112 sc.may_writepage = !laptop_mode; 1143 sc.may_writepage = !laptop_mode;
1113 count_vm_event(PAGEOUTRUN); 1144 count_vm_event(PAGEOUTRUN);
1114 1145
1115 for (i = 0; i < pgdat->nr_zones; i++) { 1146 for (i = 0; i < pgdat->nr_zones; i++)
1116 struct zone *zone = pgdat->node_zones + i; 1147 temp_priority[i] = DEF_PRIORITY;
1117
1118 zone->temp_priority = DEF_PRIORITY;
1119 }
1120 1148
1121 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1149 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1122 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1150 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -1177,10 +1205,9 @@ scan:
1177 if (!zone_watermark_ok(zone, order, zone->pages_high, 1205 if (!zone_watermark_ok(zone, order, zone->pages_high,
1178 end_zone, 0)) 1206 end_zone, 0))
1179 all_zones_ok = 0; 1207 all_zones_ok = 0;
1180 zone->temp_priority = priority; 1208 temp_priority[i] = priority;
1181 if (zone->prev_priority > priority)
1182 zone->prev_priority = priority;
1183 sc.nr_scanned = 0; 1209 sc.nr_scanned = 0;
1210 note_zone_scanning_priority(zone, priority);
1184 nr_reclaimed += shrink_zone(priority, zone, &sc); 1211 nr_reclaimed += shrink_zone(priority, zone, &sc);
1185 reclaim_state->reclaimed_slab = 0; 1212 reclaim_state->reclaimed_slab = 0;
1186 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1213 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1208,7 +1235,7 @@ scan:
1208 * another pass across the zones. 1235 * another pass across the zones.
1209 */ 1236 */
1210 if (total_scanned && priority < DEF_PRIORITY - 2) 1237 if (total_scanned && priority < DEF_PRIORITY - 2)
1211 blk_congestion_wait(WRITE, HZ/10); 1238 congestion_wait(WRITE, HZ/10);
1212 1239
1213 /* 1240 /*
1214 * We do this so kswapd doesn't build up large priorities for 1241 * We do this so kswapd doesn't build up large priorities for
@@ -1220,10 +1247,15 @@ scan:
1220 break; 1247 break;
1221 } 1248 }
1222out: 1249out:
1250 /*
1251 * Note within each zone the priority level at which this zone was
1252 * brought into a happy state. So that the next thread which scans this
1253 * zone will start out at that priority level.
1254 */
1223 for (i = 0; i < pgdat->nr_zones; i++) { 1255 for (i = 0; i < pgdat->nr_zones; i++) {
1224 struct zone *zone = pgdat->node_zones + i; 1256 struct zone *zone = pgdat->node_zones + i;
1225 1257
1226 zone->prev_priority = zone->temp_priority; 1258 zone->prev_priority = temp_priority[i];
1227 } 1259 }
1228 if (!all_zones_ok) { 1260 if (!all_zones_ok) {
1229 cond_resched(); 1261 cond_resched();
@@ -1352,7 +1384,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1352 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1384 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1353 zone->nr_scan_active = 0; 1385 zone->nr_scan_active = 0;
1354 nr_to_scan = min(nr_pages, zone->nr_active); 1386 nr_to_scan = min(nr_pages, zone->nr_active);
1355 shrink_active_list(nr_to_scan, zone, sc); 1387 shrink_active_list(nr_to_scan, zone, sc, prio);
1356 } 1388 }
1357 } 1389 }
1358 1390
@@ -1452,7 +1484,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1452 goto out; 1484 goto out;
1453 1485
1454 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1486 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1455 blk_congestion_wait(WRITE, HZ / 10); 1487 congestion_wait(WRITE, HZ / 10);
1456 } 1488 }
1457 1489
1458 lru_pages = 0; 1490 lru_pages = 0;
@@ -1608,6 +1640,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1608 */ 1640 */
1609 priority = ZONE_RECLAIM_PRIORITY; 1641 priority = ZONE_RECLAIM_PRIORITY;
1610 do { 1642 do {
1643 note_zone_scanning_priority(zone, priority);
1611 nr_reclaimed += shrink_zone(priority, zone, &sc); 1644 nr_reclaimed += shrink_zone(priority, zone, &sc);
1612 priority--; 1645 priority--;
1613 } while (priority >= 0 && nr_reclaimed < nr_pages); 1646 } while (priority >= 0 && nr_reclaimed < nr_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 45b124e012f5..8614e8f6743b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
587 seq_printf(m, 587 seq_printf(m,
588 "\n all_unreclaimable: %u" 588 "\n all_unreclaimable: %u"
589 "\n prev_priority: %i" 589 "\n prev_priority: %i"
590 "\n temp_priority: %i"
591 "\n start_pfn: %lu", 590 "\n start_pfn: %lu",
592 zone->all_unreclaimable, 591 zone->all_unreclaimable,
593 zone->prev_priority, 592 zone->prev_priority,
594 zone->temp_priority,
595 zone->zone_start_pfn); 593 zone->zone_start_pfn);
596 spin_unlock_irqrestore(&zone->lock, flags); 594 spin_unlock_irqrestore(&zone->lock, flags);
597 seq_putc(m, '\n'); 595 seq_putc(m, '\n');