author		Dave Jones <davej@redhat.com>	2006-12-12 17:41:41 -0500
committer	Dave Jones <davej@redhat.com>	2006-12-12 17:41:41 -0500
commit		c4366889dda8110247be59ca41fddb82951a8c26 (patch)
tree		705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /mm
parent		db2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff)
parent		e1036502e5263851259d147771226161e5ccc85a (diff)

Merge ../linus

Conflicts:
	drivers/cpufreq/cpufreq.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |   3
-rw-r--r--  mm/allocpercpu.c     |   9
-rw-r--r--  mm/backing-dev.c     |  69
-rw-r--r--  mm/bootmem.c         |   6
-rw-r--r--  mm/fadvise.c         |   2
-rw-r--r--  mm/filemap.c         | 122
-rw-r--r--  mm/filemap_xip.c     |   2
-rw-r--r--  mm/fremap.c          |   2
-rw-r--r--  mm/hugetlb.c         |  25
-rw-r--r--  mm/memory.c          |  36
-rw-r--r--  mm/memory_hotplug.c  |   1
-rw-r--r--  mm/mempolicy.c       |  12
-rw-r--r--  mm/migrate.c         |  22
-rw-r--r--  mm/mlock.c           |   2
-rw-r--r--  mm/mmap.c            |  19
-rw-r--r--  mm/mmzone.c          |   5
-rw-r--r--  mm/nommu.c           |  30
-rw-r--r--  mm/oom_kill.c        |  42
-rw-r--r--  mm/page-writeback.c  | 106
-rw-r--r--  mm/page_alloc.c      | 409
-rw-r--r--  mm/page_io.c         |  45
-rw-r--r--  mm/pdflush.c         |   1
-rw-r--r--  mm/readahead.c       |  14
-rw-r--r--  mm/rmap.c            |  36
-rw-r--r--  mm/shmem.c           | 112
-rw-r--r--  mm/slab.c            | 389
-rw-r--r--  mm/sparse.c          |  25
-rw-r--r--  mm/swap.c            |  10
-rw-r--r--  mm/swapfile.c        |  96
-rw-r--r--  mm/thrash.c          | 116
-rw-r--r--  mm/tiny-shmem.c      |   4
-rw-r--r--  mm/truncate.c        |   5
-rw-r--r--  mm/vmalloc.c         |  54
-rw-r--r--  mm/vmscan.c          |  88
-rw-r--r--  mm/vmstat.c          |  24
35 files changed, 1325 insertions(+), 618 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 12b3a4eee8..f3c077eb0b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   $(mmu-y)
 
 ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y += bounce.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea5..b2486cf887 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
 void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
+
+	kfree(pdata->ptrs[cpu]);
+	pdata->ptrs[cpu] = NULL;
 }
 EXPORT_SYMBOL_GPL(percpu_depopulate);
 
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
  */
 void percpu_free(void *__pdata)
 {
+	if (unlikely(!__pdata))
+		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
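Note (not part of the diff): with the NULL handling folded into the allocpercpu helpers above, a caller can now free unconditionally, mirroring kfree(NULL) semantics. A minimal sketch, where pcpu_stats is a hypothetical per-cpu pointer:

	percpu_free(dev->pcpu_stats);	/* pcpu_stats: hypothetical; safe even if the allocation failed and left this NULL */
	dev->pcpu_stats = NULL;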
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 0000000000..f50a2811f9
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static wait_queue_head_t congestion_wqh[2] = {
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+	};
+
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	clear_bit(bit, &bdi->state);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	set_bit(bit, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int rw, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * congestion_end - wake up sleepers on a congested backing_dev_info
+ * @rw: READ or WRITE
+ */
+void congestion_end(int rw)
+{
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(congestion_end);
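Aside (not part of the patch): this new file gives writers a bdi-based throttle to replace blk_congestion_wait(). A minimal sketch of a flusher-style loop using it; write_some_pages() is a hypothetical helper, bdi_write_congested() is the existing backing-dev.h test:

	while (nr_to_write > 0) {
		nr_to_write -= write_some_pages();	/* write_some_pages(): hypothetical */
		if (bdi_write_congested(bdi))
			congestion_wait(WRITE, HZ/10);
	}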
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb4..00a96970b2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
-
 static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
+	/* on nodes without memory - bootmem_map is NULL */
+	if (!bdata->node_bootmem_map)
+		return NULL;
+
 	end_pfn = bdata->node_low_pfn;
 	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121..0df4c899e9 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index 3464b681f8..8332c77b1b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
75 * ->mmap_sem 75 * ->mmap_sem
76 * ->lock_page (access_process_vm) 76 * ->lock_page (access_process_vm)
77 * 77 *
78 * ->mmap_sem 78 * ->i_mutex (generic_file_buffered_write)
79 * ->i_mutex (msync) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * ->i_mutex 81 * ->i_mutex
82 * ->i_alloc_sem (various) 82 * ->i_alloc_sem (various)
@@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
467} 467}
468 468
469#ifdef CONFIG_NUMA 469#ifdef CONFIG_NUMA
470struct page *page_cache_alloc(struct address_space *x) 470struct page *__page_cache_alloc(gfp_t gfp)
471{ 471{
472 if (cpuset_do_page_mem_spread()) { 472 if (cpuset_do_page_mem_spread()) {
473 int n = cpuset_mem_spread_node(); 473 int n = cpuset_mem_spread_node();
474 return alloc_pages_node(n, mapping_gfp_mask(x), 0); 474 return alloc_pages_node(n, gfp, 0);
475 } 475 }
476 return alloc_pages(mapping_gfp_mask(x), 0); 476 return alloc_pages(gfp, 0);
477} 477}
478EXPORT_SYMBOL(page_cache_alloc); 478EXPORT_SYMBOL(__page_cache_alloc);
479
480struct page *page_cache_alloc_cold(struct address_space *x)
481{
482 if (cpuset_do_page_mem_spread()) {
483 int n = cpuset_mem_spread_node();
484 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
485 }
486 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
487}
488EXPORT_SYMBOL(page_cache_alloc_cold);
489#endif 479#endif
490 480
491static int __sleep_on_page_lock(void *word) 481static int __sleep_on_page_lock(void *word)
@@ -826,7 +816,6 @@ struct page *
826grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 816grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
827{ 817{
828 struct page *page = find_get_page(mapping, index); 818 struct page *page = find_get_page(mapping, index);
829 gfp_t gfp_mask;
830 819
831 if (page) { 820 if (page) {
832 if (!TestSetPageLocked(page)) 821 if (!TestSetPageLocked(page))
@@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
834 page_cache_release(page); 823 page_cache_release(page);
835 return NULL; 824 return NULL;
836 } 825 }
837 gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; 826 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
838 page = alloc_pages(gfp_mask, 0); 827 if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
839 if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
840 page_cache_release(page); 828 page_cache_release(page);
841 page = NULL; 829 page = NULL;
842 } 830 }
@@ -1193,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1193 if (pos < size) { 1181 if (pos < size) {
1194 retval = generic_file_direct_IO(READ, iocb, 1182 retval = generic_file_direct_IO(READ, iocb,
1195 iov, pos, nr_segs); 1183 iov, pos, nr_segs);
1196 if (retval > 0 && !is_sync_kiocb(iocb))
1197 retval = -EIOCBQUEUED;
1198 if (retval > 0) 1184 if (retval > 0)
1199 *ppos = pos + retval; 1185 *ppos = pos + retval;
1200 } 1186 }
@@ -1457,7 +1443,6 @@ no_cached_page:
1457 * effect. 1443 * effect.
1458 */ 1444 */
1459 error = page_cache_read(file, pgoff); 1445 error = page_cache_read(file, pgoff);
1460 grab_swap_token();
1461 1446
1462 /* 1447 /*
1463 * The page we want has now been added to the page cache. 1448 * The page we want has now been added to the page cache.
@@ -1884,11 +1869,10 @@ repeat:
1884 * if suid or (sgid and xgrp) 1869 * if suid or (sgid and xgrp)
1885 * remove privs 1870 * remove privs
1886 */ 1871 */
1887int remove_suid(struct dentry *dentry) 1872int should_remove_suid(struct dentry *dentry)
1888{ 1873{
1889 mode_t mode = dentry->d_inode->i_mode; 1874 mode_t mode = dentry->d_inode->i_mode;
1890 int kill = 0; 1875 int kill = 0;
1891 int result = 0;
1892 1876
1893 /* suid always must be killed */ 1877 /* suid always must be killed */
1894 if (unlikely(mode & S_ISUID)) 1878 if (unlikely(mode & S_ISUID))
@@ -1901,13 +1885,29 @@ int remove_suid(struct dentry *dentry)
1901 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1885 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1902 kill |= ATTR_KILL_SGID; 1886 kill |= ATTR_KILL_SGID;
1903 1887
1904 if (unlikely(kill && !capable(CAP_FSETID))) { 1888 if (unlikely(kill && !capable(CAP_FSETID)))
1905 struct iattr newattrs; 1889 return kill;
1906 1890
1907 newattrs.ia_valid = ATTR_FORCE | kill; 1891 return 0;
1908 result = notify_change(dentry, &newattrs); 1892}
1909 } 1893EXPORT_SYMBOL(should_remove_suid);
1910 return result; 1894
1895int __remove_suid(struct dentry *dentry, int kill)
1896{
1897 struct iattr newattrs;
1898
1899 newattrs.ia_valid = ATTR_FORCE | kill;
1900 return notify_change(dentry, &newattrs);
1901}
1902
1903int remove_suid(struct dentry *dentry)
1904{
1905 int kill = should_remove_suid(dentry);
1906
1907 if (unlikely(kill))
1908 return __remove_suid(dentry, kill);
1909
1910 return 0;
1911} 1911}
1912EXPORT_SYMBOL(remove_suid); 1912EXPORT_SYMBOL(remove_suid);
1913 1913
@@ -2045,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2045 * Sync the fs metadata but not the minor inode changes and 2045 * Sync the fs metadata but not the minor inode changes and
2046 * of course not the data as we did direct DMA for the IO. 2046 * of course not the data as we did direct DMA for the IO.
2047 * i_mutex is held, which protects generic_osync_inode() from 2047 * i_mutex is held, which protects generic_osync_inode() from
2048 * livelocking. 2048 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2049 */ 2049 */
2050 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2050 if ((written >= 0 || written == -EIOCBQUEUED) &&
2051 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2051 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2052 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2052 if (err < 0) 2053 if (err < 0)
2053 written = err; 2054 written = err;
2054 } 2055 }
2055 if (written == count && !is_sync_kiocb(iocb))
2056 written = -EIOCBQUEUED;
2057 return written; 2056 return written;
2058} 2057}
2059EXPORT_SYMBOL(generic_file_direct_write); 2058EXPORT_SYMBOL(generic_file_direct_write);
@@ -2222,7 +2221,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2222 unsigned long nr_segs, loff_t *ppos) 2221 unsigned long nr_segs, loff_t *ppos)
2223{ 2222{
2224 struct file *file = iocb->ki_filp; 2223 struct file *file = iocb->ki_filp;
2225 const struct address_space * mapping = file->f_mapping; 2224 struct address_space * mapping = file->f_mapping;
2226 size_t ocount; /* original count */ 2225 size_t ocount; /* original count */
2227 size_t count; /* after file limit checks */ 2226 size_t count; /* after file limit checks */
2228 struct inode *inode = mapping->host; 2227 struct inode *inode = mapping->host;
@@ -2267,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2267 if (count == 0) 2266 if (count == 0)
2268 goto out; 2267 goto out;
2269 2268
2270 err = remove_suid(file->f_dentry); 2269 err = remove_suid(file->f_path.dentry);
2271 if (err) 2270 if (err)
2272 goto out; 2271 goto out;
2273 2272
@@ -2275,8 +2274,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2275 2274
2276 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2275 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2277 if (unlikely(file->f_flags & O_DIRECT)) { 2276 if (unlikely(file->f_flags & O_DIRECT)) {
2278 written = generic_file_direct_write(iocb, iov, 2277 loff_t endbyte;
2279 &nr_segs, pos, ppos, count, ocount); 2278 ssize_t written_buffered;
2279
2280 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2281 ppos, count, ocount);
2280 if (written < 0 || written == count) 2282 if (written < 0 || written == count)
2281 goto out; 2283 goto out;
2282 /* 2284 /*
@@ -2285,10 +2287,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2285 */ 2287 */
2286 pos += written; 2288 pos += written;
2287 count -= written; 2289 count -= written;
2288 } 2290 written_buffered = generic_file_buffered_write(iocb, iov,
2291 nr_segs, pos, ppos, count,
2292 written);
2293 /*
2294 * If generic_file_buffered_write() retuned a synchronous error
2295 * then we want to return the number of bytes which were
2296 * direct-written, or the error code if that was zero. Note
2297 * that this differs from normal direct-io semantics, which
2298 * will return -EFOO even if some bytes were written.
2299 */
2300 if (written_buffered < 0) {
2301 err = written_buffered;
2302 goto out;
2303 }
2289 2304
2290 written = generic_file_buffered_write(iocb, iov, nr_segs, 2305 /*
2291 pos, ppos, count, written); 2306 * We need to ensure that the page cache pages are written to
2307 * disk and invalidated to preserve the expected O_DIRECT
2308 * semantics.
2309 */
2310 endbyte = pos + written_buffered - written - 1;
2311 err = do_sync_file_range(file, pos, endbyte,
2312 SYNC_FILE_RANGE_WAIT_BEFORE|
2313 SYNC_FILE_RANGE_WRITE|
2314 SYNC_FILE_RANGE_WAIT_AFTER);
2315 if (err == 0) {
2316 written = written_buffered;
2317 invalidate_mapping_pages(mapping,
2318 pos >> PAGE_CACHE_SHIFT,
2319 endbyte >> PAGE_CACHE_SHIFT);
2320 } else {
2321 /*
2322 * We don't know how much we wrote, so just return
2323 * the number of bytes which were direct-written
2324 */
2325 }
2326 } else {
2327 written = generic_file_buffered_write(iocb, iov, nr_segs,
2328 pos, ppos, count, written);
2329 }
2292out: 2330out:
2293 current->backing_dev_info = NULL; 2331 current->backing_dev_info = NULL;
2294 return written ? written : err; 2332 return written ? written : err;
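Aside (not part of the patch): with page_cache_alloc()/page_cache_alloc_cold() collapsed into the single gfp-taking __page_cache_alloc() earlier in this file, allocating a cold page-cache page now looks roughly like this, where mapping is whatever address_space the caller holds:

	/* sketch only: equivalent of the removed page_cache_alloc_cold() */
	struct page *page = __page_cache_alloc(mapping_gfp_mask(mapping) | __GFP_COLD);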
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9b..8d667617f5 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_dentry);
+	ret = remove_suid(filp->f_path.dentry);
 	if (ret)
 		goto out_backing;
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d24..b77a002c33 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int err = -ENOMEM;
 	pte_t *pte;
-	pte_t pte_val;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
-	pte_val = *pte;
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2dbec90dc3..0ccc7f2302 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
109 if (nid == MAX_NUMNODES) 109 if (nid == MAX_NUMNODES)
110 nid = first_node(node_online_map); 110 nid = first_node(node_online_map);
111 if (page) { 111 if (page) {
112 page[1].lru.next = (void *)free_huge_page; /* dtor */ 112 set_compound_page_dtor(page, free_huge_page);
113 spin_lock(&hugetlb_lock); 113 spin_lock(&hugetlb_lock);
114 nr_huge_pages++; 114 nr_huge_pages++;
115 nr_huge_pages_node[page_to_nid(page)]++; 115 nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
344 entry = *src_pte; 344 entry = *src_pte;
345 ptepage = pte_page(entry); 345 ptepage = pte_page(entry);
346 get_page(ptepage); 346 get_page(ptepage);
347 add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
348 set_huge_pte_at(dst, addr, dst_pte, entry); 347 set_huge_pte_at(dst, addr, dst_pte, entry);
349 } 348 }
350 spin_unlock(&src->page_table_lock); 349 spin_unlock(&src->page_table_lock);
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
365 pte_t pte; 364 pte_t pte;
366 struct page *page; 365 struct page *page;
367 struct page *tmp; 366 struct page *tmp;
367 /*
368 * A page gathering list, protected by per file i_mmap_lock. The
369 * lock is used to avoid list corruption from multiple unmapping
370 * of the same page since we are using page->lru.
371 */
368 LIST_HEAD(page_list); 372 LIST_HEAD(page_list);
369 373
370 WARN_ON(!is_vm_hugetlb_page(vma)); 374 WARN_ON(!is_vm_hugetlb_page(vma));
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
372 BUG_ON(end & ~HPAGE_MASK); 376 BUG_ON(end & ~HPAGE_MASK);
373 377
374 spin_lock(&mm->page_table_lock); 378 spin_lock(&mm->page_table_lock);
375
376 /* Update high watermark before we lower rss */
377 update_hiwater_rss(mm);
378
379 for (address = start; address < end; address += HPAGE_SIZE) { 379 for (address = start; address < end; address += HPAGE_SIZE) {
380 ptep = huge_pte_offset(mm, address); 380 ptep = huge_pte_offset(mm, address);
381 if (!ptep) 381 if (!ptep)
382 continue; 382 continue;
383 383
384 if (huge_pmd_unshare(mm, &address, ptep))
385 continue;
386
384 pte = huge_ptep_get_and_clear(mm, address, ptep); 387 pte = huge_ptep_get_and_clear(mm, address, ptep);
385 if (pte_none(pte)) 388 if (pte_none(pte))
386 continue; 389 continue;
387 390
388 page = pte_page(pte); 391 page = pte_page(pte);
389 list_add(&page->lru, &page_list); 392 list_add(&page->lru, &page_list);
390 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
391 } 393 }
392
393 spin_unlock(&mm->page_table_lock); 394 spin_unlock(&mm->page_table_lock);
394 flush_tlb_range(vma, start, end); 395 flush_tlb_range(vma, start, end);
395 list_for_each_entry_safe(page, tmp, &page_list, lru) { 396 list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -478,6 +479,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
478retry: 479retry:
479 page = find_lock_page(mapping, idx); 480 page = find_lock_page(mapping, idx);
480 if (!page) { 481 if (!page) {
482 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
483 if (idx >= size)
484 goto out;
481 if (hugetlb_get_quota(mapping)) 485 if (hugetlb_get_quota(mapping))
482 goto out; 486 goto out;
483 page = alloc_huge_page(vma, address); 487 page = alloc_huge_page(vma, address);
@@ -512,7 +516,6 @@ retry:
512 if (!pte_none(*ptep)) 516 if (!pte_none(*ptep))
513 goto backout; 517 goto backout;
514 518
515 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
516 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 519 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
517 && (vma->vm_flags & VM_SHARED))); 520 && (vma->vm_flags & VM_SHARED)));
518 set_huge_pte_at(mm, address, ptep, new_pte); 521 set_huge_pte_at(mm, address, ptep, new_pte);
@@ -650,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
650 BUG_ON(address >= end); 653 BUG_ON(address >= end);
651 flush_cache_range(vma, address, end); 654 flush_cache_range(vma, address, end);
652 655
656 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
653 spin_lock(&mm->page_table_lock); 657 spin_lock(&mm->page_table_lock);
654 for (; address < end; address += HPAGE_SIZE) { 658 for (; address < end; address += HPAGE_SIZE) {
655 ptep = huge_pte_offset(mm, address); 659 ptep = huge_pte_offset(mm, address);
656 if (!ptep) 660 if (!ptep)
657 continue; 661 continue;
662 if (huge_pmd_unshare(mm, &address, ptep))
663 continue;
658 if (!pte_none(*ptep)) { 664 if (!pte_none(*ptep)) {
659 pte = huge_ptep_get_and_clear(mm, address, ptep); 665 pte = huge_ptep_get_and_clear(mm, address, ptep);
660 pte = pte_mkhuge(pte_modify(pte, newprot)); 666 pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -663,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
663 } 669 }
664 } 670 }
665 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
672 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
666 673
667 flush_tlb_range(vma, start, end); 674 flush_tlb_range(vma, start, end);
668} 675}
diff --git a/mm/memory.c b/mm/memory.c
index b5a4aadd96..bf6100236e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1110{ 1110{
1111 pte_t *pte; 1111 pte_t *pte;
1112 spinlock_t *ptl; 1112 spinlock_t *ptl;
1113 int err = 0;
1113 1114
1114 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1115 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1115 if (!pte) 1116 if (!pte)
1116 return -ENOMEM; 1117 return -EAGAIN;
1117 arch_enter_lazy_mmu_mode(); 1118 arch_enter_lazy_mmu_mode();
1118 do { 1119 do {
1119 struct page *page = ZERO_PAGE(addr); 1120 struct page *page = ZERO_PAGE(addr);
1120 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); 1121 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1122
1123 if (unlikely(!pte_none(*pte))) {
1124 err = -EEXIST;
1125 pte++;
1126 break;
1127 }
1121 page_cache_get(page); 1128 page_cache_get(page);
1122 page_add_file_rmap(page); 1129 page_add_file_rmap(page);
1123 inc_mm_counter(mm, file_rss); 1130 inc_mm_counter(mm, file_rss);
1124 BUG_ON(!pte_none(*pte));
1125 set_pte_at(mm, addr, pte, zero_pte); 1131 set_pte_at(mm, addr, pte, zero_pte);
1126 } while (pte++, addr += PAGE_SIZE, addr != end); 1132 } while (pte++, addr += PAGE_SIZE, addr != end);
1127 arch_leave_lazy_mmu_mode(); 1133 arch_leave_lazy_mmu_mode();
1128 pte_unmap_unlock(pte - 1, ptl); 1134 pte_unmap_unlock(pte - 1, ptl);
1129 return 0; 1135 return err;
1130} 1136}
1131 1137
1132static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, 1138static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1134{ 1140{
1135 pmd_t *pmd; 1141 pmd_t *pmd;
1136 unsigned long next; 1142 unsigned long next;
1143 int err;
1137 1144
1138 pmd = pmd_alloc(mm, pud, addr); 1145 pmd = pmd_alloc(mm, pud, addr);
1139 if (!pmd) 1146 if (!pmd)
1140 return -ENOMEM; 1147 return -EAGAIN;
1141 do { 1148 do {
1142 next = pmd_addr_end(addr, end); 1149 next = pmd_addr_end(addr, end);
1143 if (zeromap_pte_range(mm, pmd, addr, next, prot)) 1150 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1144 return -ENOMEM; 1151 if (err)
1152 break;
1145 } while (pmd++, addr = next, addr != end); 1153 } while (pmd++, addr = next, addr != end);
1146 return 0; 1154 return err;
1147} 1155}
1148 1156
1149static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1157static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1151{ 1159{
1152 pud_t *pud; 1160 pud_t *pud;
1153 unsigned long next; 1161 unsigned long next;
1162 int err;
1154 1163
1155 pud = pud_alloc(mm, pgd, addr); 1164 pud = pud_alloc(mm, pgd, addr);
1156 if (!pud) 1165 if (!pud)
1157 return -ENOMEM; 1166 return -EAGAIN;
1158 do { 1167 do {
1159 next = pud_addr_end(addr, end); 1168 next = pud_addr_end(addr, end);
1160 if (zeromap_pmd_range(mm, pud, addr, next, prot)) 1169 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1161 return -ENOMEM; 1170 if (err)
1171 break;
1162 } while (pud++, addr = next, addr != end); 1172 } while (pud++, addr = next, addr != end);
1163 return 0; 1173 return err;
1164} 1174}
1165 1175
1166int zeromap_page_range(struct vm_area_struct *vma, 1176int zeromap_page_range(struct vm_area_struct *vma,
@@ -1452,6 +1462,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1452 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1462 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1453 memset(kaddr, 0, PAGE_SIZE); 1463 memset(kaddr, 0, PAGE_SIZE);
1454 kunmap_atomic(kaddr, KM_USER0); 1464 kunmap_atomic(kaddr, KM_USER0);
1465 flush_dcache_page(dst);
1455 return; 1466 return;
1456 1467
1457 } 1468 }
@@ -1901,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1901 1912
1902 return 0; 1913 return 0;
1903} 1914}
1904EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
1905 1915
1906/** 1916/**
1907 * swapin_readahead - swap in pages in hope we need them soon 1917 * swapin_readahead - swap in pages in hope we need them soon
@@ -1990,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1990 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2000 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
1991 page = lookup_swap_cache(entry); 2001 page = lookup_swap_cache(entry);
1992 if (!page) { 2002 if (!page) {
2003 grab_swap_token(); /* Contend for token _before_ read-in */
1993 swapin_readahead(entry, address, vma); 2004 swapin_readahead(entry, address, vma);
1994 page = read_swap_cache_async(entry, vma, address); 2005 page = read_swap_cache_async(entry, vma, address);
1995 if (!page) { 2006 if (!page) {
@@ -2007,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2007 /* Had to read the page from swap area: Major fault */ 2018 /* Had to read the page from swap area: Major fault */
2008 ret = VM_FAULT_MAJOR; 2019 ret = VM_FAULT_MAJOR;
2009 count_vm_event(PGMAJFAULT); 2020 count_vm_event(PGMAJFAULT);
2010 grab_swap_token();
2011 } 2021 }
2012 2022
2013 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2023 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662e..0c055a090f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 		return ret;
 	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086..da94639465 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
-		unsigned int nid;
+		int nid;
 
 		if (!pte_present(*pte))
 			continue;
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  * Display pages allocated per node and memory policy via /proc.
  */
 
-static const char *policy_types[] = { "default", "prefer", "bind",
-	"interleave" };
+static const char * const policy_types[] =
+	{ "default", "prefer", "bind", "interleave" };
 
 /*
  * Convert a mempolicy into a string.
@@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/migrate.c b/mm/migrate.c
index ba2453f948..e9b161bde9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
-	struct page **radix_pointer;
+	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
-			*radix_pointer != page) {
+			(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
-	get_page(newpage);
+	get_page(newpage);	/* add cache reference */
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 #endif
 
-	*radix_pointer = newpage;
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
 	__put_page(page);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
@@ -952,7 +957,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 				goto out;
 
 			pm[i].node = node;
-		}
+		} else
+			pm[i].node = 0; /* anything to not match MAX_NUMNODES */
 	}
 	/* End marker */
 	pm[nr_pages].node = MAX_NUMNODES;
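Aside (not part of the patch): a minimal sketch of the radix-tree slot pattern the hunks above switch to; index, old_page and new_page are placeholders. The slot is looked up once under tree_lock, verified, then replaced in place instead of a delete + insert:

	void **slot;

	write_lock_irq(&mapping->tree_lock);
	slot = radix_tree_lookup_slot(&mapping->page_tree, index);
	if (radix_tree_deref_slot(slot) == old_page)	/* index/old_page/new_page: placeholders */
		radix_tree_replace_slot(slot, new_page);
	write_unlock_irq(&mapping->tree_lock);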
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573a..3446b7ef73 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
 		ret = make_pages_present(start, end);
 	}
 
-	vma->vm_mm->locked_vm -= pages;
+	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 497e502dfd..9717337293 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
 
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
 		struct address_space *mapping = file->f_mapping;
 
 		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_dec(&file->f_dentry->d_inode->i_writecount);
+			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 		if (vma->vm_flags & VM_SHARED)
 			mapping->i_mmap_writable++;
 
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	 *  mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
 	if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			return -EAGAIN;
 	}
 
-	inode = file ? file->f_dentry->d_inode : NULL;
+	inode = file ? file->f_path.dentry->d_inode : NULL;
 
 	if (file) {
 		switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		 * Check if the given range is hugepage aligned, and
 		 * can be made suitable for hugepages.
 		 */
-		ret = prepare_hugepage_range(addr, len);
+		ret = prepare_hugepage_range(addr, len, pgoff);
 	} else {
 		/*
 		 * Ensure that a normal request is not falling in a
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -1880,6 +1880,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 		return -EINVAL;
 
+	if (is_hugepage_only_range(mm, addr, len))
+		return -EINVAL;
+
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
 	error = arch_mmap_check(addr, len, flags);
@@ -2054,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			vma_start < new_vma->vm_end)
 		*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_copy(vma_policy(vma));
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c981..eb5838634f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
 	int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bdde9508f..23fb033e59 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file,
497 (flags & MAP_TYPE) != MAP_SHARED) 497 (flags & MAP_TYPE) != MAP_SHARED)
498 return -EINVAL; 498 return -EINVAL;
499 499
500 if (PAGE_ALIGN(len) == 0) 500 if (!len)
501 return addr;
502
503 if (len > TASK_SIZE)
504 return -EINVAL; 501 return -EINVAL;
505 502
503 /* Careful about overflows.. */
504 len = PAGE_ALIGN(len);
505 if (!len || len > TASK_SIZE)
506 return -ENOMEM;
507
506 /* offset overflow? */ 508 /* offset overflow? */
507 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 509 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
508 return -EINVAL; 510 return -EOVERFLOW;
509 511
510 if (file) { 512 if (file) {
511 /* validate file mapping requests */ 513 /* validate file mapping requests */
@@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file,
521 */ 523 */
522 mapping = file->f_mapping; 524 mapping = file->f_mapping;
523 if (!mapping) 525 if (!mapping)
524 mapping = file->f_dentry->d_inode->i_mapping; 526 mapping = file->f_path.dentry->d_inode->i_mapping;
525 527
526 capabilities = 0; 528 capabilities = 0;
527 if (mapping && mapping->backing_dev_info) 529 if (mapping && mapping->backing_dev_info)
@@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file,
530 if (!capabilities) { 532 if (!capabilities) {
531 /* no explicit capabilities set, so assume some 533 /* no explicit capabilities set, so assume some
532 * defaults */ 534 * defaults */
533 switch (file->f_dentry->d_inode->i_mode & S_IFMT) { 535 switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
534 case S_IFREG: 536 case S_IFREG:
535 case S_IFBLK: 537 case S_IFBLK:
536 capabilities = BDI_CAP_MAP_COPY; 538 capabilities = BDI_CAP_MAP_COPY;
@@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file,
561 !(file->f_mode & FMODE_WRITE)) 563 !(file->f_mode & FMODE_WRITE))
562 return -EACCES; 564 return -EACCES;
563 565
564 if (IS_APPEND(file->f_dentry->d_inode) && 566 if (IS_APPEND(file->f_path.dentry->d_inode) &&
565 (file->f_mode & FMODE_WRITE)) 567 (file->f_mode & FMODE_WRITE))
566 return -EACCES; 568 return -EACCES;
567 569
568 if (locks_verify_locked(file->f_dentry->d_inode)) 570 if (locks_verify_locked(file->f_path.dentry->d_inode))
569 return -EAGAIN; 571 return -EAGAIN;
570 572
571 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 573 if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file,
596 598
597 /* handle executable mappings and implied executable 599 /* handle executable mappings and implied executable
598 * mappings */ 600 * mappings */
599 if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { 601 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
600 if (prot & PROT_EXEC) 602 if (prot & PROT_EXEC)
601 return -EPERM; 603 return -EPERM;
602 } 604 }
@@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
806 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 808 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
807 809
808 /* we're going to need to record the mapping if it works */ 810 /* we're going to need to record the mapping if it works */
809 vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); 811 vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
810 if (!vml) 812 if (!vml)
811 goto error_getting_vml; 813 goto error_getting_vml;
812 memset(vml, 0, sizeof(*vml));
813 814
814 down_write(&nommu_vma_sem); 815 down_write(&nommu_vma_sem);
815 816
@@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
832 continue; 833 continue;
833 834
834 /* search for overlapping mappings on the same file */ 835 /* search for overlapping mappings on the same file */
835 if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) 836 if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
836 continue; 837 continue;
837 838
838 if (vma->vm_pgoff >= pgoff + pglen) 839 if (vma->vm_pgoff >= pgoff + pglen)
@@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
885 } 886 }
886 887
887 /* we're going to need a VMA struct as well */ 888 /* we're going to need a VMA struct as well */
888 vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); 889 vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
889 if (!vma) 890 if (!vma)
890 goto error_getting_vma; 891 goto error_getting_vma;
891 892
892 memset(vma, 0, sizeof(*vma));
893 INIT_LIST_HEAD(&vma->anon_vma_node); 893 INIT_LIST_HEAD(&vma->anon_vma_node);
894 atomic_set(&vma->vm_usage, 1); 894 atomic_set(&vma->vm_usage, 1);
895 if (file) 895 if (file)
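Aside (not part of the patch): the kmalloc()+memset() pairs removed above collapse into single zeroed allocations; the general pattern is simply:

	vml = kzalloc(sizeof(*vml), GFP_KERNEL);	/* allocate and zero in one call */
	if (!vml)
		goto error_getting_vml;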
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 20f41b082e..223d9ccb7d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -15,6 +15,7 @@
15 * kernel subsystems and hints as to where to find out what things do. 15 * kernel subsystems and hints as to where to find out what things do.
16 */ 16 */
17 17
18#include <linux/oom.h>
18#include <linux/mm.h> 19#include <linux/mm.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
@@ -263,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
263 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 264 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
264 * set. 265 * set.
265 */ 266 */
266static void __oom_kill_task(struct task_struct *p, const char *message) 267static void __oom_kill_task(struct task_struct *p, int verbose)
267{ 268{
268 if (is_init(p)) { 269 if (is_init(p)) {
269 WARN_ON(1); 270 WARN_ON(1);
@@ -277,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
277 return; 278 return;
278 } 279 }
279 280
280 if (message) { 281 if (verbose)
281 printk(KERN_ERR "%s: Killed process %d (%s).\n", 282 printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
282 message, p->pid, p->comm);
283 }
284 283
285 /* 284 /*
286 * We give our sacrificial lamb high priority and access to 285 * We give our sacrificial lamb high priority and access to
@@ -293,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
293 force_sig(SIGKILL, p); 292 force_sig(SIGKILL, p);
294} 293}
295 294
296static int oom_kill_task(struct task_struct *p, const char *message) 295static int oom_kill_task(struct task_struct *p)
297{ 296{
298 struct mm_struct *mm; 297 struct mm_struct *mm;
299 struct task_struct *g, *q; 298 struct task_struct *g, *q;
@@ -312,15 +311,25 @@ static int oom_kill_task(struct task_struct *p, const char *message)
312 if (mm == NULL) 311 if (mm == NULL)
313 return 1; 312 return 1;
314 313
315 __oom_kill_task(p, message); 314 /*
315 * Don't kill the process if any threads are set to OOM_DISABLE
316 */
317 do_each_thread(g, q) {
318 if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
319 return 1;
320 } while_each_thread(g, q);
321
322 __oom_kill_task(p, 1);
323
316 /* 324 /*
317 * kill all processes that share the ->mm (i.e. all threads), 325 * kill all processes that share the ->mm (i.e. all threads),
318 * but are in a different thread group 326 * but are in a different thread group. Don't let them have access
327 * to memory reserves though, otherwise we might deplete all memory.
319 */ 328 */
320 do_each_thread(g, q) 329 do_each_thread(g, q) {
321 if (q->mm == mm && q->tgid != p->tgid) 330 if (q->mm == mm && q->tgid != p->tgid)
322 __oom_kill_task(q, message); 331 force_sig(SIGKILL, p);
323 while_each_thread(g, q); 332 } while_each_thread(g, q);
324 333
325 return 0; 334 return 0;
326} 335}
@@ -336,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
336 * its children or threads, just set TIF_MEMDIE so it can die quickly 345 * its children or threads, just set TIF_MEMDIE so it can die quickly
337 */ 346 */
338 if (p->flags & PF_EXITING) { 347 if (p->flags & PF_EXITING) {
339 __oom_kill_task(p, NULL); 348 __oom_kill_task(p, 0);
340 return 0; 349 return 0;
341 } 350 }
342 351
343 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" 352 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
344 " and children.\n", p->pid, p->comm, points); 353 message, p->pid, p->comm, points);
354
345 /* Try to kill a child first */ 355 /* Try to kill a child first */
346 list_for_each(tsk, &p->children) { 356 list_for_each(tsk, &p->children) {
347 c = list_entry(tsk, struct task_struct, sibling); 357 c = list_entry(tsk, struct task_struct, sibling);
348 if (c->mm == p->mm) 358 if (c->mm == p->mm)
349 continue; 359 continue;
350 if (!oom_kill_task(c, message)) 360 if (!oom_kill_task(c))
351 return 0; 361 return 0;
352 } 362 }
353 return oom_kill_task(p, message); 363 return oom_kill_task(p);
354} 364}
355 365
356static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 366static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a0f3390574..237107c1b0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/task_io_accounting_ops.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
25#include <linux/mpage.h> 26#include <linux/mpage.h>
26#include <linux/rmap.h> 27#include <linux/rmap.h>
@@ -222,7 +223,7 @@ static void balance_dirty_pages(struct address_space *mapping)
222 if (pages_written >= write_chunk) 223 if (pages_written >= write_chunk)
223 break; /* We've done our duty */ 224 break; /* We've done our duty */
224 } 225 }
225 blk_congestion_wait(WRITE, HZ/10); 226 congestion_wait(WRITE, HZ/10);
226 } 227 }
227 228
228 if (nr_reclaimable + global_page_state(NR_WRITEBACK) 229 if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -314,7 +315,7 @@ void throttle_vm_writeout(void)
314 if (global_page_state(NR_UNSTABLE_NFS) + 315 if (global_page_state(NR_UNSTABLE_NFS) +
315 global_page_state(NR_WRITEBACK) <= dirty_thresh) 316 global_page_state(NR_WRITEBACK) <= dirty_thresh)
316 break; 317 break;
317 blk_congestion_wait(WRITE, HZ/10); 318 congestion_wait(WRITE, HZ/10);
318 } 319 }
319} 320}
320 321
@@ -351,7 +352,7 @@ static void background_writeout(unsigned long _min_pages)
351 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 352 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
352 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 353 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
353 /* Wrote less than expected */ 354 /* Wrote less than expected */
354 blk_congestion_wait(WRITE, HZ/10); 355 congestion_wait(WRITE, HZ/10);
355 if (!wbc.encountered_congestion) 356 if (!wbc.encountered_congestion)
356 break; 357 break;
357 } 358 }
@@ -422,7 +423,7 @@ static void wb_kupdate(unsigned long arg)
422 writeback_inodes(&wbc); 423 writeback_inodes(&wbc);
423 if (wbc.nr_to_write > 0) { 424 if (wbc.nr_to_write > 0) {
424 if (wbc.encountered_congestion) 425 if (wbc.encountered_congestion)
425 blk_congestion_wait(WRITE, HZ/10); 426 congestion_wait(WRITE, HZ/10);
426 else 427 else
427 break; /* All the old data is written */ 428 break; /* All the old data is written */
428 } 429 }
@@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page)
761 struct address_space *mapping = page_mapping(page); 762 struct address_space *mapping = page_mapping(page);
762 struct address_space *mapping2; 763 struct address_space *mapping2;
763 764
764 if (mapping) { 765 if (!mapping)
765 write_lock_irq(&mapping->tree_lock); 766 return 1;
766 mapping2 = page_mapping(page); 767
767 if (mapping2) { /* Race with truncate? */ 768 write_lock_irq(&mapping->tree_lock);
768 BUG_ON(mapping2 != mapping); 769 mapping2 = page_mapping(page);
769 if (mapping_cap_account_dirty(mapping)) 770 if (mapping2) { /* Race with truncate? */
770 __inc_zone_page_state(page, 771 BUG_ON(mapping2 != mapping);
771 NR_FILE_DIRTY); 772 if (mapping_cap_account_dirty(mapping)) {
772 radix_tree_tag_set(&mapping->page_tree, 773 __inc_zone_page_state(page, NR_FILE_DIRTY);
773 page_index(page), PAGECACHE_TAG_DIRTY); 774 task_io_account_write(PAGE_CACHE_SIZE);
774 }
775 write_unlock_irq(&mapping->tree_lock);
776 if (mapping->host) {
777 /* !PageAnon && !swapper_space */
778 __mark_inode_dirty(mapping->host,
779 I_DIRTY_PAGES);
780 } 775 }
776 radix_tree_tag_set(&mapping->page_tree,
777 page_index(page), PAGECACHE_TAG_DIRTY);
778 }
779 write_unlock_irq(&mapping->tree_lock);
780 if (mapping->host) {
781 /* !PageAnon && !swapper_space */
782 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
781 } 783 }
782 return 1; 784 return 1;
783 } 785 }
@@ -851,27 +853,26 @@ int test_clear_page_dirty(struct page *page)
851 struct address_space *mapping = page_mapping(page); 853 struct address_space *mapping = page_mapping(page);
852 unsigned long flags; 854 unsigned long flags;
853 855
854 if (mapping) { 856 if (!mapping)
855 write_lock_irqsave(&mapping->tree_lock, flags); 857 return TestClearPageDirty(page);
856 if (TestClearPageDirty(page)) { 858
857 radix_tree_tag_clear(&mapping->page_tree, 859 write_lock_irqsave(&mapping->tree_lock, flags);
858 page_index(page), 860 if (TestClearPageDirty(page)) {
859 PAGECACHE_TAG_DIRTY); 861 radix_tree_tag_clear(&mapping->page_tree,
860 write_unlock_irqrestore(&mapping->tree_lock, flags); 862 page_index(page), PAGECACHE_TAG_DIRTY);
861 /*
862 * We can continue to use `mapping' here because the
863 * page is locked, which pins the address_space
864 */
865 if (mapping_cap_account_dirty(mapping)) {
866 page_mkclean(page);
867 dec_zone_page_state(page, NR_FILE_DIRTY);
868 }
869 return 1;
870 }
871 write_unlock_irqrestore(&mapping->tree_lock, flags); 863 write_unlock_irqrestore(&mapping->tree_lock, flags);
872 return 0; 864 /*
865 * We can continue to use `mapping' here because the
866 * page is locked, which pins the address_space
867 */
868 if (mapping_cap_account_dirty(mapping)) {
869 page_mkclean(page);
870 dec_zone_page_state(page, NR_FILE_DIRTY);
871 }
872 return 1;
873 } 873 }
874 return TestClearPageDirty(page); 874 write_unlock_irqrestore(&mapping->tree_lock, flags);
875 return 0;
875} 876}
876EXPORT_SYMBOL(test_clear_page_dirty); 877EXPORT_SYMBOL(test_clear_page_dirty);
877 878
@@ -893,17 +894,17 @@ int clear_page_dirty_for_io(struct page *page)
893{ 894{
894 struct address_space *mapping = page_mapping(page); 895 struct address_space *mapping = page_mapping(page);
895 896
896 if (mapping) { 897 if (!mapping)
897 if (TestClearPageDirty(page)) { 898 return TestClearPageDirty(page);
898 if (mapping_cap_account_dirty(mapping)) { 899
899 page_mkclean(page); 900 if (TestClearPageDirty(page)) {
900 dec_zone_page_state(page, NR_FILE_DIRTY); 901 if (mapping_cap_account_dirty(mapping)) {
901 } 902 page_mkclean(page);
902 return 1; 903 dec_zone_page_state(page, NR_FILE_DIRTY);
903 } 904 }
904 return 0; 905 return 1;
905 } 906 }
906 return TestClearPageDirty(page); 907 return 0;
907} 908}
908EXPORT_SYMBOL(clear_page_dirty_for_io); 909EXPORT_SYMBOL(clear_page_dirty_for_io);
909 910
@@ -956,15 +957,6 @@ int test_set_page_writeback(struct page *page)
956EXPORT_SYMBOL(test_set_page_writeback); 957EXPORT_SYMBOL(test_set_page_writeback);
957 958
958/* 959/*
959 * Wakes up tasks that are being throttled due to writeback congestion
960 */
961void writeback_congestion_end(void)
962{
963 blk_congestion_end(WRITE);
964}
965EXPORT_SYMBOL(writeback_congestion_end);
966
967/*
968 * Return true if any of the pages in the mapping are marged with the 960 * Return true if any of the pages in the mapping are marged with the
969 * passed tag. 961 * passed tag.
970 */ 962 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40db96a655..e6b17b2989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,8 @@
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h> 40#include <linux/sort.h>
41#include <linux/pfn.h> 41#include <linux/pfn.h>
42#include <linux/backing-dev.h>
43#include <linux/fault-inject.h>
42 44
43#include <asm/tlbflush.h> 45#include <asm/tlbflush.h>
44#include <asm/div64.h> 46#include <asm/div64.h>
@@ -82,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
82 84
83EXPORT_SYMBOL(totalram_pages); 85EXPORT_SYMBOL(totalram_pages);
84 86
85/* 87static char * const zone_names[MAX_NR_ZONES] = {
86 * Used by page_zone() to look up the address of the struct zone whose
87 * id is encoded in the upper bits of page->flags
88 */
89struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
90EXPORT_SYMBOL(zone_table);
91
92static char *zone_names[MAX_NR_ZONES] = {
93 "DMA", 88 "DMA",
94#ifdef CONFIG_ZONE_DMA32 89#ifdef CONFIG_ZONE_DMA32
95 "DMA32", 90 "DMA32",
@@ -236,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
236 int i; 231 int i;
237 int nr_pages = 1 << order; 232 int nr_pages = 1 << order;
238 233
239 page[1].lru.next = (void *)free_compound_page; /* set dtor */ 234 set_compound_page_dtor(page, free_compound_page);
240 page[1].lru.prev = (void *)order; 235 page[1].lru.prev = (void *)order;
241 for (i = 0; i < nr_pages; i++) { 236 for (i = 0; i < nr_pages; i++) {
242 struct page *p = page + i; 237 struct page *p = page + i;
@@ -485,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
485 spin_lock(&zone->lock); 480 spin_lock(&zone->lock);
486 zone->all_unreclaimable = 0; 481 zone->all_unreclaimable = 0;
487 zone->pages_scanned = 0; 482 zone->pages_scanned = 0;
488 __free_one_page(page, zone ,order); 483 __free_one_page(page, zone, order);
489 spin_unlock(&zone->lock); 484 spin_unlock(&zone->lock);
490} 485}
491 486
@@ -604,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
604 1 << PG_checked | 1 << PG_mappedtodisk); 599 1 << PG_checked | 1 << PG_mappedtodisk);
605 set_page_private(page, 0); 600 set_page_private(page, 0);
606 set_page_refcounted(page); 601 set_page_refcounted(page);
602
603 arch_alloc_page(page, order);
607 kernel_map_pages(page, 1 << order, 1); 604 kernel_map_pages(page, 1 << order, 1);
608 605
609 if (gfp_flags & __GFP_ZERO) 606 if (gfp_flags & __GFP_ZERO)
@@ -689,9 +686,15 @@ void drain_node_pages(int nodeid)
689 686
690 pcp = &pset->pcp[i]; 687 pcp = &pset->pcp[i];
691 if (pcp->count) { 688 if (pcp->count) {
689 int to_drain;
690
692 local_irq_save(flags); 691 local_irq_save(flags);
693 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 692 if (pcp->count >= pcp->batch)
694 pcp->count = 0; 693 to_drain = pcp->batch;
694 else
695 to_drain = pcp->count;
696 free_pages_bulk(zone, to_drain, &pcp->list, 0);
697 pcp->count -= to_drain;
695 local_irq_restore(flags); 698 local_irq_restore(flags);
696 } 699 }
697 } 700 }
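
With the change above, drain_node_pages() no longer frees every cached page in one go while interrupts are off; each pass frees at most pcp->batch pages and decrements pcp->count by exactly the amount drained, so the work done under local_irq_save() stays bounded. The clamping logic on its own, as a plain C model of the per-cpu structure:

/* Model of the bounded drain: free at most 'batch' entries per
 * interrupts-off window so each pass stays short. */
struct pcp_model {
        int count;      /* pages currently cached on this cpu */
        int batch;      /* preferred bulk-transfer size */
};

/* Same decision as the patch: drain min(count, batch). */
static int pages_to_drain(const struct pcp_model *pcp)
{
        return pcp->count >= pcp->batch ? pcp->batch : pcp->count;
}

static void drain_once(struct pcp_model *pcp)
{
        int to_drain = pages_to_drain(pcp);

        /* free_pages_bulk(zone, to_drain, &pcp->list, 0) runs here */
        pcp->count -= to_drain;
}
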
@@ -699,7 +702,6 @@ void drain_node_pages(int nodeid)
699} 702}
700#endif 703#endif
701 704
702#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
703static void __drain_pages(unsigned int cpu) 705static void __drain_pages(unsigned int cpu)
704{ 706{
705 unsigned long flags; 707 unsigned long flags;
@@ -721,7 +723,6 @@ static void __drain_pages(unsigned int cpu)
721 } 723 }
722 } 724 }
723} 725}
724#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
725 726
726#ifdef CONFIG_PM 727#ifdef CONFIG_PM
727 728
@@ -852,7 +853,7 @@ again:
852 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 853 pcp = &zone_pcp(zone, cpu)->pcp[cold];
853 local_irq_save(flags); 854 local_irq_save(flags);
854 if (!pcp->count) { 855 if (!pcp->count) {
855 pcp->count += rmqueue_bulk(zone, 0, 856 pcp->count = rmqueue_bulk(zone, 0,
856 pcp->batch, &pcp->list); 857 pcp->batch, &pcp->list);
857 if (unlikely(!pcp->count)) 858 if (unlikely(!pcp->count))
858 goto failed; 859 goto failed;
@@ -892,6 +893,91 @@ failed:
892#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 893#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
893#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 894#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
894 895
896#ifdef CONFIG_FAIL_PAGE_ALLOC
897
898static struct fail_page_alloc_attr {
899 struct fault_attr attr;
900
901 u32 ignore_gfp_highmem;
902 u32 ignore_gfp_wait;
903
904#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
905
906 struct dentry *ignore_gfp_highmem_file;
907 struct dentry *ignore_gfp_wait_file;
908
909#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
910
911} fail_page_alloc = {
912 .attr = FAULT_ATTR_INITIALIZER,
913 .ignore_gfp_wait = 1,
914 .ignore_gfp_highmem = 1,
915};
916
917static int __init setup_fail_page_alloc(char *str)
918{
919 return setup_fault_attr(&fail_page_alloc.attr, str);
920}
921__setup("fail_page_alloc=", setup_fail_page_alloc);
922
923static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
924{
925 if (gfp_mask & __GFP_NOFAIL)
926 return 0;
927 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
928 return 0;
929 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
930 return 0;
931
932 return should_fail(&fail_page_alloc.attr, 1 << order);
933}
934
935#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
936
937static int __init fail_page_alloc_debugfs(void)
938{
939 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
940 struct dentry *dir;
941 int err;
942
943 err = init_fault_attr_dentries(&fail_page_alloc.attr,
944 "fail_page_alloc");
945 if (err)
946 return err;
947 dir = fail_page_alloc.attr.dentries.dir;
948
949 fail_page_alloc.ignore_gfp_wait_file =
950 debugfs_create_bool("ignore-gfp-wait", mode, dir,
951 &fail_page_alloc.ignore_gfp_wait);
952
953 fail_page_alloc.ignore_gfp_highmem_file =
954 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
955 &fail_page_alloc.ignore_gfp_highmem);
956
957 if (!fail_page_alloc.ignore_gfp_wait_file ||
958 !fail_page_alloc.ignore_gfp_highmem_file) {
959 err = -ENOMEM;
960 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
961 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
962 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
963 }
964
965 return err;
966}
967
968late_initcall(fail_page_alloc_debugfs);
969
970#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
971
972#else /* CONFIG_FAIL_PAGE_ALLOC */
973
974static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
975{
976 return 0;
977}
978
979#endif /* CONFIG_FAIL_PAGE_ALLOC */
980
895/* 981/*
896 * Return 1 if free pages are above 'mark'. This takes into account the order 982 * Return 1 if free pages are above 'mark'. This takes into account the order
897 * of the allocation. 983 * of the allocation.
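
The new CONFIG_FAIL_PAGE_ALLOC block wires the page allocator into the generic fault-injection framework: the fail_page_alloc= boot parameter feeds setup_fault_attr(), the two debugfs booleans let __GFP_HIGHMEM and __GFP_WAIT allocations be exempted, and should_fail_alloc_page() always exempts __GFP_NOFAIL before asking should_fail() for a verdict. A rough userspace model of that kind of gate is sketched below; the attribute fields (probability, interval, times) are stand-ins, not the real struct fault_attr:

#include <stdlib.h>

/* Hypothetical stand-in for struct fault_attr. */
struct fail_attr_model {
        unsigned int probability;       /* percent of eligible calls to fail */
        unsigned int interval;          /* only consider every Nth call */
        long times;                     /* stop after this many failures, <0 = unlimited */
        unsigned long calls;            /* bookkeeping */
};

/* Decide whether this call should be forced to fail. */
static int should_fail_model(struct fail_attr_model *f)
{
        f->calls++;
        if (f->interval > 1 && (f->calls % f->interval))
                return 0;
        if (f->times == 0)
                return 0;
        if ((unsigned int)(rand() % 100) >= f->probability)
                return 0;
        if (f->times > 0)
                f->times--;
        return 1;                       /* inject a failure */
}

/* Gate in the spirit of should_fail_alloc_page(): never fail
 * requests that are not allowed to fail. */
static int should_fail_alloc_model(struct fail_attr_model *f, int must_succeed)
{
        if (must_succeed)
                return 0;
        return should_fail_model(f);
}
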
@@ -924,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
924 return 1; 1010 return 1;
925} 1011}
926 1012
1013#ifdef CONFIG_NUMA
1014/*
1015 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1016 * skip over zones that are not allowed by the cpuset, or that have
1017 * been recently (in last second) found to be nearly full. See further
1018 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1019 * that have to skip over a lot of full or unallowed zones.
1020 *
1021 * If the zonelist cache is present in the passed in zonelist, then
1022 * returns a pointer to the allowed node mask (either the current
1023 * task's mems_allowed, or node_online_map.)
1024 *
1025 * If the zonelist cache is not available for this zonelist, does
1026 * nothing and returns NULL.
1027 *
1028 * If the fullzones BITMAP in the zonelist cache is stale (more than
1029 * a second since last zap'd) then we zap it out (clear its bits.)
1030 *
1031 * We hold off even calling zlc_setup, until after we've checked the
1032 * first zone in the zonelist, on the theory that most allocations will
1033 * be satisfied from that first zone, so best to examine that zone as
1034 * quickly as we can.
1035 */
1036static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1037{
1038 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1039 nodemask_t *allowednodes; /* zonelist_cache approximation */
1040
1041 zlc = zonelist->zlcache_ptr;
1042 if (!zlc)
1043 return NULL;
1044
1045 if (jiffies - zlc->last_full_zap > 1 * HZ) {
1046 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1047 zlc->last_full_zap = jiffies;
1048 }
1049
1050 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1051 &cpuset_current_mems_allowed :
1052 &node_online_map;
1053 return allowednodes;
1054}
1055
1056/*
1057 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1058 * if it is worth looking at further for free memory:
1059 * 1) Check that the zone isn't thought to be full (doesn't have its
1060 * bit set in the zonelist_cache fullzones BITMAP).
1061 * 2) Check that the zone's node (obtained from the zonelist_cache
1062 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1063 * Return true (non-zero) if zone is worth looking at further, or
1064 * else return false (zero) if it is not.
1065 *
1066 * This check -ignores- the distinction between various watermarks,
1067 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1068 * found to be full for any variation of these watermarks, it will
1069 * be considered full for up to one second by all requests, unless
1070 * we are so low on memory on all allowed nodes that we are forced
1071 * into the second scan of the zonelist.
1072 *
1073 * In the second scan we ignore this zonelist cache and exactly
1074 * apply the watermarks to all zones, even if it is slower to do so.
1075 * We are low on memory in the second scan, and should leave no stone
1076 * unturned looking for a free page.
1077 */
1078static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1079 nodemask_t *allowednodes)
1080{
1081 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1082 int i; /* index of *z in zonelist zones */
1083 int n; /* node that zone *z is on */
1084
1085 zlc = zonelist->zlcache_ptr;
1086 if (!zlc)
1087 return 1;
1088
1089 i = z - zonelist->zones;
1090 n = zlc->z_to_n[i];
1091
1092 /* This zone is worth trying if it is allowed but not full */
1093 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1094}
1095
927/* 1096/*
928 * get_page_from_freeliest goes through the zonelist trying to allocate 1097 * Given 'z' scanning a zonelist, set the corresponding bit in
1098 * zlc->fullzones, so that subsequent attempts to allocate a page
1099 * from that zone don't waste time re-examining it.
1100 */
1101static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1102{
1103 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1104 int i; /* index of *z in zonelist zones */
1105
1106 zlc = zonelist->zlcache_ptr;
1107 if (!zlc)
1108 return;
1109
1110 i = z - zonelist->zones;
1111
1112 set_bit(i, zlc->fullzones);
1113}
1114
1115#else /* CONFIG_NUMA */
1116
1117static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1118{
1119 return NULL;
1120}
1121
1122static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1123 nodemask_t *allowednodes)
1124{
1125 return 1;
1126}
1127
1128static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1129{
1130}
1131#endif /* CONFIG_NUMA */
1132
1133/*
1134 * get_page_from_freelist goes through the zonelist trying to allocate
929 * a page. 1135 * a page.
930 */ 1136 */
931static struct page * 1137static struct page *
932get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1138get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
933 struct zonelist *zonelist, int alloc_flags) 1139 struct zonelist *zonelist, int alloc_flags)
934{ 1140{
935 struct zone **z = zonelist->zones; 1141 struct zone **z;
936 struct page *page = NULL; 1142 struct page *page = NULL;
937 int classzone_idx = zone_idx(*z); 1143 int classzone_idx = zone_idx(zonelist->zones[0]);
938 struct zone *zone; 1144 struct zone *zone;
1145 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1146 int zlc_active = 0; /* set if using zonelist_cache */
1147 int did_zlc_setup = 0; /* just call zlc_setup() one time */
939 1148
1149zonelist_scan:
940 /* 1150 /*
941 * Go through the zonelist once, looking for a zone with enough free. 1151 * Scan zonelist, looking for a zone with enough free.
942 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1152 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
943 */ 1153 */
1154 z = zonelist->zones;
1155
944 do { 1156 do {
1157 if (NUMA_BUILD && zlc_active &&
1158 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1159 continue;
945 zone = *z; 1160 zone = *z;
946 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1161 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
947 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1162 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
948 break; 1163 break;
949 if ((alloc_flags & ALLOC_CPUSET) && 1164 if ((alloc_flags & ALLOC_CPUSET) &&
950 !cpuset_zone_allowed(zone, gfp_mask)) 1165 !cpuset_zone_allowed(zone, gfp_mask))
951 continue; 1166 goto try_next_zone;
952 1167
953 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1168 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
954 unsigned long mark; 1169 unsigned long mark;
@@ -958,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
958 mark = zone->pages_low; 1173 mark = zone->pages_low;
959 else 1174 else
960 mark = zone->pages_high; 1175 mark = zone->pages_high;
961 if (!zone_watermark_ok(zone , order, mark, 1176 if (!zone_watermark_ok(zone, order, mark,
962 classzone_idx, alloc_flags)) 1177 classzone_idx, alloc_flags)) {
963 if (!zone_reclaim_mode || 1178 if (!zone_reclaim_mode ||
964 !zone_reclaim(zone, gfp_mask, order)) 1179 !zone_reclaim(zone, gfp_mask, order))
965 continue; 1180 goto this_zone_full;
1181 }
966 } 1182 }
967 1183
968 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1184 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
969 if (page) { 1185 if (page)
970 break; 1186 break;
1187this_zone_full:
1188 if (NUMA_BUILD)
1189 zlc_mark_zone_full(zonelist, z);
1190try_next_zone:
1191 if (NUMA_BUILD && !did_zlc_setup) {
1192 /* we do zlc_setup after the first zone is tried */
1193 allowednodes = zlc_setup(zonelist, alloc_flags);
1194 zlc_active = 1;
1195 did_zlc_setup = 1;
971 } 1196 }
972 } while (*(++z) != NULL); 1197 } while (*(++z) != NULL);
1198
1199 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1200 /* Disable zlc cache for second zonelist scan */
1201 zlc_active = 0;
1202 goto zonelist_scan;
1203 }
973 return page; 1204 return page;
974} 1205}
975 1206
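
The comment blocks in this hunk describe the three pieces of the new zonelist cache: a bitmap of zones recently found full, a zone-to-node table for cheap node/cpuset checks, and a timestamp so the bitmap is zapped roughly once a second; get_page_from_freelist() consults it on the first scan and disables it for the desperate second scan. A compact userspace model of those pieces (illustrative only; the real structures are declared in mmzone.h):

#include <stdbool.h>
#include <string.h>
#include <time.h>

#define MAX_ZONES 64            /* stand-in for MAX_ZONES_PER_ZONELIST */

struct zlc_model {
        bool full[MAX_ZONES];   /* zone i was recently found full */
        int z_to_n[MAX_ZONES];  /* node that zone i belongs to */
        time_t last_zap;        /* when 'full' was last cleared */
};

/* Forget stale "full" information roughly once a second. */
static void zlc_maybe_zap(struct zlc_model *zlc)
{
        time_t now = time(NULL);

        if (now - zlc->last_zap >= 1) {
                memset(zlc->full, 0, sizeof(zlc->full));
                zlc->last_zap = now;
        }
}

/* Cheap pre-check before the expensive watermark test. */
static bool zone_worth_trying(const struct zlc_model *zlc, int zone,
                              const bool *node_allowed)
{
        return node_allowed[zlc->z_to_n[zone]] && !zlc->full[zone];
}

/* Remember that a zone came up empty so later scans skip it quickly. */
static void mark_zone_full(struct zlc_model *zlc, int zone)
{
        zlc->full[zone] = true;
}
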
@@ -991,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
991 1222
992 might_sleep_if(wait); 1223 might_sleep_if(wait);
993 1224
1225 if (should_fail_alloc_page(gfp_mask, order))
1226 return NULL;
1227
994restart: 1228restart:
995 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1229 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
996 1230
@@ -1004,9 +1238,19 @@ restart:
1004 if (page) 1238 if (page)
1005 goto got_pg; 1239 goto got_pg;
1006 1240
1007 do { 1241 /*
1242 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1243 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1244 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1245 * using a larger set of nodes after it has established that the
1246 * allowed per node queues are empty and that nodes are
1247 * over allocated.
1248 */
1249 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1250 goto nopage;
1251
1252 for (z = zonelist->zones; *z; z++)
1008 wakeup_kswapd(*z, order); 1253 wakeup_kswapd(*z, order);
1009 } while (*(++z));
1010 1254
1011 /* 1255 /*
1012 * OK, we're below the kswapd watermark and have kicked background 1256 * OK, we're below the kswapd watermark and have kicked background
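
The early "goto nopage" above depends on GFP_THISNODE being a composite mask (the comment spells it out: __GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN together), so the test has to confirm that every bit of the mask is present rather than any one of them. The idiom in isolation, with hypothetical flag values standing in for the real gfp bits:

/* Hypothetical bit values, for illustration only. */
#define F_THISNODE      0x01u
#define F_NORETRY       0x02u
#define F_NOWARN        0x04u
#define F_COMPOSITE     (F_THISNODE | F_NORETRY | F_NOWARN)

/* True only when every bit of the composite mask is set in flags. */
static int has_all_composite_bits(unsigned int flags)
{
        return (flags & F_COMPOSITE) == F_COMPOSITE;
}
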
@@ -1040,6 +1284,7 @@ restart:
1040 1284
1041 /* This allocation should allow future memory freeing. */ 1285 /* This allocation should allow future memory freeing. */
1042 1286
1287rebalance:
1043 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1288 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1044 && !in_interrupt()) { 1289 && !in_interrupt()) {
1045 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1290 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1050,7 +1295,7 @@ nofail_alloc:
1050 if (page) 1295 if (page)
1051 goto got_pg; 1296 goto got_pg;
1052 if (gfp_mask & __GFP_NOFAIL) { 1297 if (gfp_mask & __GFP_NOFAIL) {
1053 blk_congestion_wait(WRITE, HZ/50); 1298 congestion_wait(WRITE, HZ/50);
1054 goto nofail_alloc; 1299 goto nofail_alloc;
1055 } 1300 }
1056 } 1301 }
@@ -1061,7 +1306,6 @@ nofail_alloc:
1061 if (!wait) 1306 if (!wait)
1062 goto nopage; 1307 goto nopage;
1063 1308
1064rebalance:
1065 cond_resched(); 1309 cond_resched();
1066 1310
1067 /* We now go into synchronous reclaim */ 1311 /* We now go into synchronous reclaim */
@@ -1113,7 +1357,7 @@ rebalance:
1113 do_retry = 1; 1357 do_retry = 1;
1114 } 1358 }
1115 if (do_retry) { 1359 if (do_retry) {
1116 blk_congestion_wait(WRITE, HZ/50); 1360 congestion_wait(WRITE, HZ/50);
1117 goto rebalance; 1361 goto rebalance;
1118 } 1362 }
1119 1363
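
Both retry paths above swap blk_congestion_wait() for congestion_wait() but keep the same shape: when a retryable allocation fails, sleep briefly (HZ/50, on the order of tens of milliseconds) so writeback can make progress, then try again. A userspace sketch of that backoff loop, with malloc() standing in for the real allocation attempt:

#include <stdlib.h>
#include <time.h>

/* Sleep roughly 20 ms, standing in for congestion_wait(WRITE, HZ/50). */
static void backoff(void)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 20 * 1000 * 1000 };

        nanosleep(&ts, NULL);
}

/* Retry loop in the spirit of the nofail_alloc / do_retry paths:
 * on failure, wait a little, then try again. */
static void *alloc_with_retry(size_t size, int max_tries)
{
        while (max_tries-- > 0) {
                void *p = malloc(size); /* stand-in for the real attempt */

                if (p)
                        return p;
                backoff();
        }
        return NULL;
}
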
@@ -1261,7 +1505,7 @@ unsigned int nr_free_pagecache_pages(void)
1261static inline void show_node(struct zone *zone) 1505static inline void show_node(struct zone *zone)
1262{ 1506{
1263 if (NUMA_BUILD) 1507 if (NUMA_BUILD)
1264 printk("Node %ld ", zone_to_nid(zone)); 1508 printk("Node %d ", zone_to_nid(zone));
1265} 1509}
1266 1510
1267void si_meminfo(struct sysinfo *val) 1511void si_meminfo(struct sysinfo *val)
@@ -1541,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1541 } 1785 }
1542} 1786}
1543 1787
1788/* Construct the zonelist performance cache - see further mmzone.h */
1789static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1790{
1791 int i;
1792
1793 for (i = 0; i < MAX_NR_ZONES; i++) {
1794 struct zonelist *zonelist;
1795 struct zonelist_cache *zlc;
1796 struct zone **z;
1797
1798 zonelist = pgdat->node_zonelists + i;
1799 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1800 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1801 for (z = zonelist->zones; *z; z++)
1802 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1803 }
1804}
1805
1544#else /* CONFIG_NUMA */ 1806#else /* CONFIG_NUMA */
1545 1807
1546static void __meminit build_zonelists(pg_data_t *pgdat) 1808static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1578,14 +1840,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1578 } 1840 }
1579} 1841}
1580 1842
1843/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1844static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1845{
1846 int i;
1847
1848 for (i = 0; i < MAX_NR_ZONES; i++)
1849 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1850}
1851
1581#endif /* CONFIG_NUMA */ 1852#endif /* CONFIG_NUMA */
1582 1853
1583/* return values int ....just for stop_machine_run() */ 1854/* return values int ....just for stop_machine_run() */
1584static int __meminit __build_all_zonelists(void *dummy) 1855static int __meminit __build_all_zonelists(void *dummy)
1585{ 1856{
1586 int nid; 1857 int nid;
1587 for_each_online_node(nid) 1858
1859 for_each_online_node(nid) {
1588 build_zonelists(NODE_DATA(nid)); 1860 build_zonelists(NODE_DATA(nid));
1861 build_zonelist_cache(NODE_DATA(nid));
1862 }
1589 return 0; 1863 return 0;
1590} 1864}
1591 1865
@@ -1688,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1688 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1962 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1689 if (!early_pfn_valid(pfn)) 1963 if (!early_pfn_valid(pfn))
1690 continue; 1964 continue;
1965 if (!early_pfn_in_nid(pfn, nid))
1966 continue;
1691 page = pfn_to_page(pfn); 1967 page = pfn_to_page(pfn);
1692 set_page_links(page, zone, nid, pfn); 1968 set_page_links(page, zone, nid, pfn);
1693 init_page_count(page); 1969 init_page_count(page);
@@ -1712,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1712 } 1988 }
1713} 1989}
1714 1990
1715#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1716void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1717 unsigned long pfn, unsigned long size)
1718{
1719 unsigned long snum = pfn_to_section_nr(pfn);
1720 unsigned long end = pfn_to_section_nr(pfn + size);
1721
1722 if (FLAGS_HAS_NODE)
1723 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1724 else
1725 for (; snum <= end; snum++)
1726 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1727}
1728
1729#ifndef __HAVE_ARCH_MEMMAP_INIT 1991#ifndef __HAVE_ARCH_MEMMAP_INIT
1730#define memmap_init(size, nid, zone, start_pfn) \ 1992#define memmap_init(size, nid, zone, start_pfn) \
1731 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1993 memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1878,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1878 int ret = NOTIFY_OK; 2140 int ret = NOTIFY_OK;
1879 2141
1880 switch (action) { 2142 switch (action) {
1881 case CPU_UP_PREPARE: 2143 case CPU_UP_PREPARE:
1882 if (process_zones(cpu)) 2144 if (process_zones(cpu))
1883 ret = NOTIFY_BAD; 2145 ret = NOTIFY_BAD;
1884 break; 2146 break;
1885 case CPU_UP_CANCELED: 2147 case CPU_UP_CANCELED:
1886 case CPU_DEAD: 2148 case CPU_DEAD:
1887 free_zone_pagesets(cpu); 2149 free_zone_pagesets(cpu);
1888 break; 2150 break;
1889 default: 2151 default:
1890 break; 2152 break;
1891 } 2153 }
1892 return ret; 2154 return ret;
1893} 2155}
@@ -2258,7 +2520,7 @@ unsigned long __init __absent_pages_in_range(int nid,
2258 2520
2259 /* Account for ranges past physical memory on this node */ 2521 /* Account for ranges past physical memory on this node */
2260 if (range_end_pfn > prev_end_pfn) 2522 if (range_end_pfn > prev_end_pfn)
2261 hole_pages = range_end_pfn - 2523 hole_pages += range_end_pfn -
2262 max(range_start_pfn, prev_end_pfn); 2524 max(range_start_pfn, prev_end_pfn);
2263 2525
2264 return hole_pages; 2526 return hole_pages;
@@ -2404,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2404 zone->zone_pgdat = pgdat; 2666 zone->zone_pgdat = pgdat;
2405 zone->free_pages = 0; 2667 zone->free_pages = 0;
2406 2668
2407 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 2669 zone->prev_priority = DEF_PRIORITY;
2408 2670
2409 zone_pcp_init(zone); 2671 zone_pcp_init(zone);
2410 INIT_LIST_HEAD(&zone->active_list); 2672 INIT_LIST_HEAD(&zone->active_list);
@@ -2418,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2418 if (!size) 2680 if (!size)
2419 continue; 2681 continue;
2420 2682
2421 zonetable_add(zone, nid, j, zone_start_pfn, size);
2422 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 2683 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2423 BUG_ON(ret); 2684 BUG_ON(ret);
2424 zone_start_pfn += size; 2685 zone_start_pfn += size;
@@ -2609,6 +2870,9 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
2609{ 2870{
2610 int i; 2871 int i;
2611 2872
2873 /* Regions in the early_node_map can be in any order */
2874 sort_node_map();
2875
2612 /* Assuming a sorted map, the first range found has the starting pfn */ 2876 /* Assuming a sorted map, the first range found has the starting pfn */
2613 for_each_active_range_index_in_nid(i, nid) 2877 for_each_active_range_index_in_nid(i, nid)
2614 return early_node_map[i].start_pfn; 2878 return early_node_map[i].start_pfn;
@@ -2677,9 +2941,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2677 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 2941 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2678 } 2942 }
2679 2943
2680 /* Regions in the early_node_map can be in any order */
2681 sort_node_map();
2682
2683 /* Print out the zone ranges */ 2944 /* Print out the zone ranges */
2684 printk("Zone PFN ranges:\n"); 2945 printk("Zone PFN ranges:\n");
2685 for (i = 0; i < MAX_NR_ZONES; i++) 2946 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2733,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size)
2733 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2994 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2734} 2995}
2735 2996
2736#ifdef CONFIG_HOTPLUG_CPU
2737static int page_alloc_cpu_notify(struct notifier_block *self, 2997static int page_alloc_cpu_notify(struct notifier_block *self,
2738 unsigned long action, void *hcpu) 2998 unsigned long action, void *hcpu)
2739{ 2999{
@@ -2748,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
2748 } 3008 }
2749 return NOTIFY_OK; 3009 return NOTIFY_OK;
2750} 3010}
2751#endif /* CONFIG_HOTPLUG_CPU */
2752 3011
2753void __init page_alloc_init(void) 3012void __init page_alloc_init(void)
2754{ 3013{
@@ -3052,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3052 /* allow the kernel cmdline to have a say */ 3311 /* allow the kernel cmdline to have a say */
3053 if (!numentries) { 3312 if (!numentries) {
3054 /* round applicable memory size up to nearest megabyte */ 3313 /* round applicable memory size up to nearest megabyte */
3055 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 3314 numentries = nr_kernel_pages;
3056 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3315 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3057 numentries >>= 20 - PAGE_SHIFT; 3316 numentries >>= 20 - PAGE_SHIFT;
3058 numentries <<= 20 - PAGE_SHIFT; 3317 numentries <<= 20 - PAGE_SHIFT;
@@ -3074,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3074 if (numentries > max) 3333 if (numentries > max)
3075 numentries = max; 3334 numentries = max;
3076 3335
3077 log2qty = long_log2(numentries); 3336 log2qty = ilog2(numentries);
3078 3337
3079 do { 3338 do {
3080 size = bucketsize << log2qty; 3339 size = bucketsize << log2qty;
@@ -3096,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3096 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3355 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3097 tablename, 3356 tablename,
3098 (1U << log2qty), 3357 (1U << log2qty),
3099 long_log2(size) - PAGE_SHIFT, 3358 ilog2(size) - PAGE_SHIFT,
3100 size); 3359 size);
3101 3360
3102 if (_hash_shift) 3361 if (_hash_shift)
@@ -3119,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page)
3119EXPORT_SYMBOL(pfn_to_page); 3378EXPORT_SYMBOL(pfn_to_page);
3120EXPORT_SYMBOL(page_to_pfn); 3379EXPORT_SYMBOL(page_to_pfn);
3121#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3380#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3381
3382#if MAX_NUMNODES > 1
3383/*
3384 * Find the highest possible node id.
3385 */
3386int highest_possible_node_id(void)
3387{
3388 unsigned int node;
3389 unsigned int highest = 0;
3390
3391 for_each_node_mask(node, node_possible_map)
3392 highest = node;
3393 return highest;
3394}
3395EXPORT_SYMBOL(highest_possible_node_id);
3396#endif
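
highest_possible_node_id() just walks node_possible_map and remembers the last node it sees; a caller can then size per-node arrays as highest + 1. The same scan expressed over a plain bitmask, as a standalone model (it returns -1 for an empty mask, which the kernel version does not need to handle):

/* Return the highest set bit index in 'mask', or -1 if no bit is set. */
static int highest_set_bit(unsigned long mask)
{
        int bit, highest = -1;

        for (bit = 0; bit < (int)(8 * sizeof(mask)); bit++)
                if (mask & (1UL << bit))
                        highest = bit;
        return highest;
}
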
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8..dbffec0d78 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
147out: 147out:
148 return ret; 148 return ret;
149} 149}
150
151#ifdef CONFIG_SOFTWARE_SUSPEND
152/*
153 * A scruffy utility function to read or write an arbitrary swap page
154 * and wait on the I/O. The caller must have a ref on the page.
155 *
156 * We use end_swap_bio_read() even for writes, because it happens to do what
157 * we want.
158 */
159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
161{
162 struct bio *bio;
163 int ret = 0;
164 int bio_rw;
165
166 lock_page(page);
167
168 bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
169 if (bio == NULL) {
170 unlock_page(page);
171 ret = -ENOMEM;
172 goto out;
173 }
174
175 bio_rw = rw;
176 if (!bio_chain)
177 bio_rw |= (1 << BIO_RW_SYNC);
178 if (bio_chain)
179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
191out:
192 return ret;
193}
194#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb..8ce0900dc9 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> // Prototypes pdflush_operation() 21#include <linux/writeback.h> // Prototypes pdflush_operation()
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/freezer.h>
24 25
25 26
26/* 27/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 1ba736ac03..0f539e8e82 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/pagevec.h> 17#include <linux/pagevec.h>
17 18
18void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 19void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
148 if (!pagevec_add(&lru_pvec, page)) 149 if (!pagevec_add(&lru_pvec, page))
149 __pagevec_lru_add(&lru_pvec); 150 __pagevec_lru_add(&lru_pvec);
150 if (ret) { 151 if (ret) {
151 while (!list_empty(pages)) { 152 put_pages_list(pages);
152 struct page *victim;
153
154 victim = list_to_page(pages);
155 list_del(&victim->lru);
156 page_cache_release(victim);
157 }
158 break; 153 break;
159 } 154 }
155 task_io_account_read(PAGE_CACHE_SIZE);
160 } 156 }
161 pagevec_lru_add(&lru_pvec); 157 pagevec_lru_add(&lru_pvec);
162 return ret; 158 return ret;
@@ -173,6 +169,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
173 169
174 if (mapping->a_ops->readpages) { 170 if (mapping->a_ops->readpages) {
175 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 171 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
172 /* Clean up the remaining pages */
173 put_pages_list(pages);
176 goto out; 174 goto out;
177 } 175 }
178 176
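
read_cache_pages() and read_pages() now hand leftover pages to put_pages_list() instead of open-coding the walk shown in the removed hunk (list_del() each victim, then page_cache_release() it). The same unlink-and-release loop over a toy singly linked list, as a model of what that helper has to do:

#include <stdlib.h>

/* Toy list node; the kernel threads struct page->lru instead. */
struct page_node {
        struct page_node *next;
        void *data;
};

/* Release every entry and leave the list empty, mirroring the
 * removed open-coded cleanup loop. */
static void put_pages_list_model(struct page_node **head)
{
        while (*head) {
                struct page_node *victim = *head;

                *head = victim->next;   /* list_del() */
                free(victim->data);     /* page_cache_release() */
                free(victim);
        }
}
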
@@ -454,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
454 * 452 *
455 * Note that @filp is purely used for passing on to the ->readpage[s]() 453 * Note that @filp is purely used for passing on to the ->readpage[s]()
456 * handler: it may refer to a different file from @mapping (so we may not use 454 * handler: it may refer to a different file from @mapping (so we may not use
457 * @filp->f_mapping or @filp->f_dentry->d_inode here). 455 * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
458 * Also, @ra may not be equal to &@filp->f_ra. 456 * Also, @ra may not be equal to &@filp->f_ra.
459 * 457 *
460 */ 458 */
diff --git a/mm/rmap.c b/mm/rmap.c
index a9136d8b75..d8a842a586 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,27 +21,21 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem (vmtruncate_range)
25 * 25 * mm->mmap_sem
26 * When a page fault occurs in writing from user to file, down_read 26 * page->flags PG_locked (lock_page)
27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within 27 * mapping->i_mmap_lock
28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never 28 * anon_vma->lock
29 * taken together; in truncation, i_mutex is taken outermost. 29 * mm->page_table_lock or pte_lock
30 * 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * mm->mmap_sem 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * page->flags PG_locked (lock_page) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->i_mmap_lock 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * anon_vma->lock 34 * inode_lock (in set_page_dirty's __mark_inode_dirty)
35 * mm->page_table_lock or pte_lock 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * in arch-dependent flush_dcache_mmap_lock,
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * within inode_lock in __sync_single_inode)
39 * mapping->private_lock (in __set_page_dirty_buffers)
40 * inode_lock (in set_page_dirty's __mark_inode_dirty)
41 * sb_lock (within inode_lock in fs/fs-writeback.c)
42 * mapping->tree_lock (widely used, in set_page_dirty,
43 * in arch-dependent flush_dcache_mmap_lock,
44 * within inode_lock in __sync_single_inode)
45 */ 39 */
46 40
47#include <linux/mm.h> 41#include <linux/mm.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index bb8ca7ef70..4bb28d218e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -48,6 +48,7 @@
48#include <linux/ctype.h> 48#include <linux/ctype.h>
49#include <linux/migrate.h> 49#include <linux/migrate.h>
50#include <linux/highmem.h> 50#include <linux/highmem.h>
51#include <linux/backing-dev.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -176,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
176 177
177static struct super_operations shmem_ops; 178static struct super_operations shmem_ops;
178static const struct address_space_operations shmem_aops; 179static const struct address_space_operations shmem_aops;
179static struct file_operations shmem_file_operations; 180static const struct file_operations shmem_file_operations;
180static struct inode_operations shmem_inode_operations; 181static struct inode_operations shmem_inode_operations;
181static struct inode_operations shmem_dir_inode_operations; 182static struct inode_operations shmem_dir_inode_operations;
182static struct inode_operations shmem_special_inode_operations; 183static struct inode_operations shmem_special_inode_operations;
@@ -1131,7 +1132,7 @@ repeat:
1131 page_cache_release(swappage); 1132 page_cache_release(swappage);
1132 if (error == -ENOMEM) { 1133 if (error == -ENOMEM) {
1133 /* let kswapd refresh zone for GFP_ATOMICs */ 1134 /* let kswapd refresh zone for GFP_ATOMICs */
1134 blk_congestion_wait(WRITE, HZ/50); 1135 congestion_wait(WRITE, HZ/50);
1135 } 1136 }
1136 goto repeat; 1137 goto repeat;
1137 } 1138 }
@@ -1224,7 +1225,7 @@ failed:
1224 1225
1225struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) 1226struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1226{ 1227{
1227 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1228 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1228 struct page *page = NULL; 1229 struct page *page = NULL;
1229 unsigned long idx; 1230 unsigned long idx;
1230 int error; 1231 int error;
@@ -1247,7 +1248,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1247 unsigned long addr, unsigned long len, 1248 unsigned long addr, unsigned long len,
1248 pgprot_t prot, unsigned long pgoff, int nonblock) 1249 pgprot_t prot, unsigned long pgoff, int nonblock)
1249{ 1250{
1250 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1251 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1251 struct mm_struct *mm = vma->vm_mm; 1252 struct mm_struct *mm = vma->vm_mm;
1252 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; 1253 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1253 unsigned long size; 1254 unsigned long size;
@@ -1292,14 +1293,14 @@ static int shmem_populate(struct vm_area_struct *vma,
1292#ifdef CONFIG_NUMA 1293#ifdef CONFIG_NUMA
1293int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1294int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1294{ 1295{
1295 struct inode *i = vma->vm_file->f_dentry->d_inode; 1296 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1296 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1297 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1297} 1298}
1298 1299
1299struct mempolicy * 1300struct mempolicy *
1300shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1301shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1301{ 1302{
1302 struct inode *i = vma->vm_file->f_dentry->d_inode; 1303 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1303 unsigned long idx; 1304 unsigned long idx;
1304 1305
1305 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1306 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1309,7 +1310,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1309 1310
1310int shmem_lock(struct file *file, int lock, struct user_struct *user) 1311int shmem_lock(struct file *file, int lock, struct user_struct *user)
1311{ 1312{
1312 struct inode *inode = file->f_dentry->d_inode; 1313 struct inode *inode = file->f_path.dentry->d_inode;
1313 struct shmem_inode_info *info = SHMEM_I(inode); 1314 struct shmem_inode_info *info = SHMEM_I(inode);
1314 int retval = -ENOMEM; 1315 int retval = -ENOMEM;
1315 1316
@@ -1362,6 +1363,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1362 inode->i_mapping->a_ops = &shmem_aops; 1363 inode->i_mapping->a_ops = &shmem_aops;
1363 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1364 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1364 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1365 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1366 inode->i_generation = get_seconds();
1365 info = SHMEM_I(inode); 1367 info = SHMEM_I(inode);
1366 memset(info, 0, (char *)inode - (char *)info); 1368 memset(info, 0, (char *)inode - (char *)info);
1367 spin_lock_init(&info->lock); 1369 spin_lock_init(&info->lock);
@@ -1420,7 +1422,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
1420static ssize_t 1422static ssize_t
1421shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 1423shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1422{ 1424{
1423 struct inode *inode = file->f_dentry->d_inode; 1425 struct inode *inode = file->f_path.dentry->d_inode;
1424 loff_t pos; 1426 loff_t pos;
1425 unsigned long written; 1427 unsigned long written;
1426 ssize_t err; 1428 ssize_t err;
@@ -1440,7 +1442,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1440 if (err || !count) 1442 if (err || !count)
1441 goto out; 1443 goto out;
1442 1444
1443 err = remove_suid(file->f_dentry); 1445 err = remove_suid(file->f_path.dentry);
1444 if (err) 1446 if (err)
1445 goto out; 1447 goto out;
1446 1448
@@ -1522,7 +1524,7 @@ out:
1522 1524
1523static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1525static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1524{ 1526{
1525 struct inode *inode = filp->f_dentry->d_inode; 1527 struct inode *inode = filp->f_path.dentry->d_inode;
1526 struct address_space *mapping = inode->i_mapping; 1528 struct address_space *mapping = inode->i_mapping;
1527 unsigned long index, offset; 1529 unsigned long index, offset;
1528 1530
@@ -1941,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
1941 return security_inode_setsecurity(inode, name, value, size, flags); 1943 return security_inode_setsecurity(inode, name, value, size, flags);
1942} 1944}
1943 1945
1944struct xattr_handler shmem_xattr_security_handler = { 1946static struct xattr_handler shmem_xattr_security_handler = {
1945 .prefix = XATTR_SECURITY_PREFIX, 1947 .prefix = XATTR_SECURITY_PREFIX,
1946 .list = shmem_xattr_security_list, 1948 .list = shmem_xattr_security_list,
1947 .get = shmem_xattr_security_get, 1949 .get = shmem_xattr_security_get,
@@ -1956,6 +1958,85 @@ static struct xattr_handler *shmem_xattr_handlers[] = {
1956}; 1958};
1957#endif 1959#endif
1958 1960
1961static struct dentry *shmem_get_parent(struct dentry *child)
1962{
1963 return ERR_PTR(-ESTALE);
1964}
1965
1966static int shmem_match(struct inode *ino, void *vfh)
1967{
1968 __u32 *fh = vfh;
1969 __u64 inum = fh[2];
1970 inum = (inum << 32) | fh[1];
1971 return ino->i_ino == inum && fh[0] == ino->i_generation;
1972}
1973
1974static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
1975{
1976 struct dentry *de = NULL;
1977 struct inode *inode;
1978 __u32 *fh = vfh;
1979 __u64 inum = fh[2];
1980 inum = (inum << 32) | fh[1];
1981
1982 inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
1983 if (inode) {
1984 de = d_find_alias(inode);
1985 iput(inode);
1986 }
1987
1988 return de? de: ERR_PTR(-ESTALE);
1989}
1990
1991static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
1992 int len, int type,
1993 int (*acceptable)(void *context, struct dentry *de),
1994 void *context)
1995{
1996 if (len < 3)
1997 return ERR_PTR(-ESTALE);
1998
1999 return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
2000 context);
2001}
2002
2003static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2004 int connectable)
2005{
2006 struct inode *inode = dentry->d_inode;
2007
2008 if (*len < 3)
2009 return 255;
2010
2011 if (hlist_unhashed(&inode->i_hash)) {
2012 /* Unfortunately insert_inode_hash is not idempotent,
2013 * so as we hash inodes here rather than at creation
2014 * time, we need a lock to ensure we only try
2015 * to do it once
2016 */
2017 static DEFINE_SPINLOCK(lock);
2018 spin_lock(&lock);
2019 if (hlist_unhashed(&inode->i_hash))
2020 __insert_inode_hash(inode,
2021 inode->i_ino + inode->i_generation);
2022 spin_unlock(&lock);
2023 }
2024
2025 fh[0] = inode->i_generation;
2026 fh[1] = inode->i_ino;
2027 fh[2] = ((__u64)inode->i_ino) >> 32;
2028
2029 *len = 3;
2030 return 1;
2031}
2032
2033static struct export_operations shmem_export_ops = {
2034 .get_parent = shmem_get_parent,
2035 .get_dentry = shmem_get_dentry,
2036 .encode_fh = shmem_encode_fh,
2037 .decode_fh = shmem_decode_fh,
2038};
2039
1959static int shmem_parse_options(char *options, int *mode, uid_t *uid, 2040static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1960 gid_t *gid, unsigned long *blocks, unsigned long *inodes, 2041 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1961 int *policy, nodemask_t *policy_nodes) 2042 int *policy, nodemask_t *policy_nodes)
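
The new export_operations let tmpfs hand out three-word NFS file handles: shmem_encode_fh() stores the inode's generation in fh[0] and the low and high halves of the 64-bit inode number in fh[1] and fh[2], and shmem_match() reverses that packing when ilookup5() rechecks a candidate inode. The pack/unpack arithmetic on its own, as a standalone check:

#include <stdint.h>

/* Pack generation + 64-bit inode number into three 32-bit words,
 * following the layout shmem_encode_fh() uses. */
static void pack_fh(uint32_t fh[3], uint32_t generation, uint64_t ino)
{
        fh[0] = generation;
        fh[1] = (uint32_t)ino;          /* low 32 bits */
        fh[2] = (uint32_t)(ino >> 32);  /* high 32 bits */
}

/* Rebuild the inode number and compare, as shmem_match() does. */
static int fh_matches(const uint32_t fh[3], uint32_t generation, uint64_t ino)
{
        uint64_t inum = ((uint64_t)fh[2] << 32) | fh[1];

        return inum == ino && fh[0] == generation;
}
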
@@ -2128,6 +2209,7 @@ static int shmem_fill_super(struct super_block *sb,
2128 &inodes, &policy, &policy_nodes)) 2209 &inodes, &policy, &policy_nodes))
2129 return -EINVAL; 2210 return -EINVAL;
2130 } 2211 }
2212 sb->s_export_op = &shmem_export_ops;
2131#else 2213#else
2132 sb->s_flags |= MS_NOUSER; 2214 sb->s_flags |= MS_NOUSER;
2133#endif 2215#endif
@@ -2181,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep;
2181static struct inode *shmem_alloc_inode(struct super_block *sb) 2263static struct inode *shmem_alloc_inode(struct super_block *sb)
2182{ 2264{
2183 struct shmem_inode_info *p; 2265 struct shmem_inode_info *p;
2184 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); 2266 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2185 if (!p) 2267 if (!p)
2186 return NULL; 2268 return NULL;
2187 return &p->vfs_inode; 2269 return &p->vfs_inode;
@@ -2237,7 +2319,7 @@ static const struct address_space_operations shmem_aops = {
2237 .migratepage = migrate_page, 2319 .migratepage = migrate_page,
2238}; 2320};
2239 2321
2240static struct file_operations shmem_file_operations = { 2322static const struct file_operations shmem_file_operations = {
2241 .mmap = shmem_mmap, 2323 .mmap = shmem_mmap,
2242#ifdef CONFIG_TMPFS 2324#ifdef CONFIG_TMPFS
2243 .llseek = generic_file_llseek, 2325 .llseek = generic_file_llseek,
@@ -2411,8 +2493,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2411 d_instantiate(dentry, inode); 2493 d_instantiate(dentry, inode);
2412 inode->i_size = size; 2494 inode->i_size = size;
2413 inode->i_nlink = 0; /* It is unlinked */ 2495 inode->i_nlink = 0; /* It is unlinked */
2414 file->f_vfsmnt = mntget(shm_mnt); 2496 file->f_path.mnt = mntget(shm_mnt);
2415 file->f_dentry = dentry; 2497 file->f_path.dentry = dentry;
2416 file->f_mapping = inode->i_mapping; 2498 file->f_mapping = inode->i_mapping;
2417 file->f_op = &shmem_file_operations; 2499 file->f_op = &shmem_file_operations;
2418 file->f_mode = FMODE_WRITE | FMODE_READ; 2500 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 266449d604..2c655532f5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,13 @@
103#include <linux/module.h> 103#include <linux/module.h>
104#include <linux/rcupdate.h> 104#include <linux/rcupdate.h>
105#include <linux/string.h> 105#include <linux/string.h>
106#include <linux/uaccess.h>
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/fault-inject.h>
109#include <linux/rtmutex.h> 111#include <linux/rtmutex.h>
110 112
111#include <asm/uaccess.h>
112#include <asm/cacheflush.h> 113#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 114#include <asm/tlbflush.h>
114#include <asm/page.h> 115#include <asm/page.h>
@@ -313,7 +314,7 @@ static int drain_freelist(struct kmem_cache *cache,
313static void free_block(struct kmem_cache *cachep, void **objpp, int len, 314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
314 int node); 315 int node);
315static int enable_cpucache(struct kmem_cache *cachep); 316static int enable_cpucache(struct kmem_cache *cachep);
316static void cache_reap(void *unused); 317static void cache_reap(struct work_struct *unused);
317 318
318/* 319/*
319 * This function must be completely optimized away if a constant is passed to 320 * This function must be completely optimized away if a constant is passed to
@@ -730,7 +731,10 @@ static inline void init_lock_keys(void)
730} 731}
731#endif 732#endif
732 733
733/* Guard access to the cache-chain. */ 734/*
735 * 1. Guard access to the cache-chain.
736 * 2. Protect sanity of cpu_online_map against cpu hotplug events
737 */
734static DEFINE_MUTEX(cache_chain_mutex); 738static DEFINE_MUTEX(cache_chain_mutex);
735static struct list_head cache_chain; 739static struct list_head cache_chain;
736 740
@@ -753,7 +757,7 @@ int slab_is_available(void)
753 return g_cpucache_up == FULL; 757 return g_cpucache_up == FULL;
754} 758}
755 759
756static DEFINE_PER_CPU(struct work_struct, reap_work); 760static DEFINE_PER_CPU(struct delayed_work, reap_work);
757 761
758static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 762static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
759{ 763{
@@ -866,6 +870,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
866 dump_stack(); 870 dump_stack();
867} 871}
868 872
873/*
874 * By default on NUMA we use alien caches to stage the freeing of
875 * objects allocated from other nodes. This causes massive memory
876 * inefficiencies when using fake NUMA setup to split memory into a
877 * large number of small nodes, so it can be disabled on the command
878 * line
879 */
880
881static int use_alien_caches __read_mostly = 1;
882static int __init noaliencache_setup(char *s)
883{
884 use_alien_caches = 0;
885 return 1;
886}
887__setup("noaliencache", noaliencache_setup);
888
869#ifdef CONFIG_NUMA 889#ifdef CONFIG_NUMA
870/* 890/*
871 * Special reaping functions for NUMA systems called from cache_reap(). 891 * Special reaping functions for NUMA systems called from cache_reap().
@@ -883,7 +903,7 @@ static void init_reap_node(int cpu)
883 if (node == MAX_NUMNODES) 903 if (node == MAX_NUMNODES)
884 node = first_node(node_online_map); 904 node = first_node(node_online_map);
885 905
886 __get_cpu_var(reap_node) = node; 906 per_cpu(reap_node, cpu) = node;
887} 907}
888 908
889static void next_reap_node(void) 909static void next_reap_node(void)
@@ -916,17 +936,18 @@ static void next_reap_node(void)
916 */ 936 */
917static void __devinit start_cpu_timer(int cpu) 937static void __devinit start_cpu_timer(int cpu)
918{ 938{
919 struct work_struct *reap_work = &per_cpu(reap_work, cpu); 939 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
920 940
921 /* 941 /*
922 * When this gets called from do_initcalls via cpucache_init(), 942 * When this gets called from do_initcalls via cpucache_init(),
923 * init_workqueues() has already run, so keventd will be setup 943 * init_workqueues() has already run, so keventd will be setup
924 * at that time. 944 * at that time.
925 */ 945 */
926 if (keventd_up() && reap_work->func == NULL) { 946 if (keventd_up() && reap_work->work.func == NULL) {
927 init_reap_node(cpu); 947 init_reap_node(cpu);
928 INIT_WORK(reap_work, cache_reap, NULL); 948 INIT_DELAYED_WORK(reap_work, cache_reap);
929 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 949 schedule_delayed_work_on(cpu, reap_work,
950 __round_jiffies_relative(HZ, cpu));
930 } 951 }
931} 952}
932 953
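
start_cpu_timer() now uses a delayed_work and schedules it with __round_jiffies_relative(HZ, cpu), so the per-cpu reap timers land on rounded jiffy boundaries with a small per-cpu skew instead of firing at arbitrary ticks, which lets the timer code batch wakeups. A rough model of that rounding; HZ_MODEL and the cpu * 3 skew are illustrative values, not the kernel's exact arithmetic:

/* Round a relative delay up to the next whole "second", offset by a
 * small per-cpu skew so the cpus do not all fire on the same tick. */
#define HZ_MODEL 250

static unsigned long round_delay(unsigned long delay, int cpu)
{
        unsigned long skew = (unsigned long)cpu * 3;

        delay += skew;
        delay += HZ_MODEL - 1;
        delay -= delay % HZ_MODEL;      /* next multiple of HZ_MODEL */
        return delay - skew;
}
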
@@ -996,7 +1017,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
996 return NULL; 1017 return NULL;
997} 1018}
998 1019
999static inline void *__cache_alloc_node(struct kmem_cache *cachep, 1020static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1000 gfp_t flags, int nodeid) 1021 gfp_t flags, int nodeid)
1001{ 1022{
1002 return NULL; 1023 return NULL;
@@ -1004,7 +1025,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1004 1025
1005#else /* CONFIG_NUMA */ 1026#else /* CONFIG_NUMA */
1006 1027
1007static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1028static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1008static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1029static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1009 1030
1010static struct array_cache **alloc_alien_cache(int node, int limit) 1031static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1135,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1114 * Make sure we are not freeing a object from another node to the array 1135 * Make sure we are not freeing a object from another node to the array
1115 * cache on this cpu. 1136 * cache on this cpu.
1116 */ 1137 */
1117 if (likely(slabp->nodeid == node)) 1138 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1118 return 0; 1139 return 0;
1119 1140
1120 l3 = cachep->nodelists[node]; 1141 l3 = cachep->nodelists[node];
@@ -1192,7 +1213,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1192 list_for_each_entry(cachep, &cache_chain, next) { 1213 list_for_each_entry(cachep, &cache_chain, next) {
1193 struct array_cache *nc; 1214 struct array_cache *nc;
1194 struct array_cache *shared; 1215 struct array_cache *shared;
1195 struct array_cache **alien; 1216 struct array_cache **alien = NULL;
1196 1217
1197 nc = alloc_arraycache(node, cachep->limit, 1218 nc = alloc_arraycache(node, cachep->limit,
1198 cachep->batchcount); 1219 cachep->batchcount);
@@ -1204,9 +1225,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1204 if (!shared) 1225 if (!shared)
1205 goto bad; 1226 goto bad;
1206 1227
1207 alien = alloc_alien_cache(node, cachep->limit); 1228 if (use_alien_caches) {
1208 if (!alien) 1229 alien = alloc_alien_cache(node, cachep->limit);
1209 goto bad; 1230 if (!alien)
1231 goto bad;
1232 }
1210 cachep->array[cpu] = nc; 1233 cachep->array[cpu] = nc;
1211 l3 = cachep->nodelists[node]; 1234 l3 = cachep->nodelists[node];
1212 BUG_ON(!l3); 1235 BUG_ON(!l3);
@@ -1230,12 +1253,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1230 kfree(shared); 1253 kfree(shared);
1231 free_alien_cache(alien); 1254 free_alien_cache(alien);
1232 } 1255 }
1233 mutex_unlock(&cache_chain_mutex);
1234 break; 1256 break;
1235 case CPU_ONLINE: 1257 case CPU_ONLINE:
1258 mutex_unlock(&cache_chain_mutex);
1236 start_cpu_timer(cpu); 1259 start_cpu_timer(cpu);
1237 break; 1260 break;
1238#ifdef CONFIG_HOTPLUG_CPU 1261#ifdef CONFIG_HOTPLUG_CPU
1262 case CPU_DOWN_PREPARE:
1263 mutex_lock(&cache_chain_mutex);
1264 break;
1265 case CPU_DOWN_FAILED:
1266 mutex_unlock(&cache_chain_mutex);
1267 break;
1239 case CPU_DEAD: 1268 case CPU_DEAD:
1240 /* 1269 /*
1241 * Even if all the cpus of a node are down, we don't free the 1270 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1275,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1246 * gets destroyed at kmem_cache_destroy(). 1275 * gets destroyed at kmem_cache_destroy().
1247 */ 1276 */
1248 /* fall thru */ 1277 /* fall thru */
1278#endif
1249 case CPU_UP_CANCELED: 1279 case CPU_UP_CANCELED:
1250 mutex_lock(&cache_chain_mutex);
1251 list_for_each_entry(cachep, &cache_chain, next) { 1280 list_for_each_entry(cachep, &cache_chain, next) {
1252 struct array_cache *nc; 1281 struct array_cache *nc;
1253 struct array_cache *shared; 1282 struct array_cache *shared;
@@ -1308,11 +1337,9 @@ free_array_cache:
1308 } 1337 }
1309 mutex_unlock(&cache_chain_mutex); 1338 mutex_unlock(&cache_chain_mutex);
1310 break; 1339 break;
1311#endif
1312 } 1340 }
1313 return NOTIFY_OK; 1341 return NOTIFY_OK;
1314bad: 1342bad:
1315 mutex_unlock(&cache_chain_mutex);
1316 return NOTIFY_BAD; 1343 return NOTIFY_BAD;
1317} 1344}
1318 1345
@@ -1580,12 +1607,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1580 flags |= __GFP_COMP; 1607 flags |= __GFP_COMP;
1581#endif 1608#endif
1582 1609
1583 /* 1610 flags |= cachep->gfpflags;
1584 * Under NUMA we want memory on the indicated node. We will handle
1585 * the needed fallback ourselves since we want to serve from our
1586 * per node object lists first for other nodes.
1587 */
1588 flags |= cachep->gfpflags | GFP_THISNODE;
1589 1611
1590 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1612 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1591 if (!page) 1613 if (!page)
@@ -2098,15 +2120,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2098 } 2120 }
2099 2121
2100 /* 2122 /*
2101 * Prevent CPUs from coming and going. 2123 * We use cache_chain_mutex to ensure a consistent view of
2102 * lock_cpu_hotplug() nests outside cache_chain_mutex 2124 * cpu_online_map as well. Please see cpuup_callback
2103 */ 2125 */
2104 lock_cpu_hotplug();
2105
2106 mutex_lock(&cache_chain_mutex); 2126 mutex_lock(&cache_chain_mutex);
2107 2127
2108 list_for_each_entry(pc, &cache_chain, next) { 2128 list_for_each_entry(pc, &cache_chain, next) {
2109 mm_segment_t old_fs = get_fs();
2110 char tmp; 2129 char tmp;
2111 int res; 2130 int res;
2112 2131
@@ -2115,9 +2134,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2115 * destroy its slab cache and no-one else reuses the vmalloc 2134 * destroy its slab cache and no-one else reuses the vmalloc
2116 * area of the module. Print a warning. 2135 * area of the module. Print a warning.
2117 */ 2136 */
2118 set_fs(KERNEL_DS); 2137 res = probe_kernel_address(pc->name, tmp);
2119 res = __get_user(tmp, pc->name);
2120 set_fs(old_fs);
2121 if (res) { 2138 if (res) {
2122 printk("SLAB: cache with size %d has lost its name\n", 2139 printk("SLAB: cache with size %d has lost its name\n",
2123 pc->buffer_size); 2140 pc->buffer_size);
@@ -2197,25 +2214,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2197 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2214 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2198 ralign = BYTES_PER_WORD; 2215 ralign = BYTES_PER_WORD;
2199 2216
2200 /* 2) arch mandated alignment: disables debug if necessary */ 2217 /* 2) arch mandated alignment */
2201 if (ralign < ARCH_SLAB_MINALIGN) { 2218 if (ralign < ARCH_SLAB_MINALIGN) {
2202 ralign = ARCH_SLAB_MINALIGN; 2219 ralign = ARCH_SLAB_MINALIGN;
2203 if (ralign > BYTES_PER_WORD)
2204 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2205 } 2220 }
2206 /* 3) caller mandated alignment: disables debug if necessary */ 2221 /* 3) caller mandated alignment */
2207 if (ralign < align) { 2222 if (ralign < align) {
2208 ralign = align; 2223 ralign = align;
2209 if (ralign > BYTES_PER_WORD)
2210 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2211 } 2224 }
2225 /* disable debug if necessary */
2226 if (ralign > BYTES_PER_WORD)
2227 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2212 /* 2228 /*
2213 * 4) Store it. 2229 * 4) Store it.
2214 */ 2230 */
2215 align = ralign; 2231 align = ralign;
2216 2232
2217 /* Get cache's description obj. */ 2233 /* Get cache's description obj. */
2218 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); 2234 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2219 if (!cachep) 2235 if (!cachep)
2220 goto oops; 2236 goto oops;
2221 2237
@@ -2326,7 +2342,6 @@ oops:
2326 panic("kmem_cache_create(): failed to create slab `%s'\n", 2342 panic("kmem_cache_create(): failed to create slab `%s'\n",
2327 name); 2343 name);
2328 mutex_unlock(&cache_chain_mutex); 2344 mutex_unlock(&cache_chain_mutex);
2329 unlock_cpu_hotplug();
2330 return cachep; 2345 return cachep;
2331} 2346}
2332EXPORT_SYMBOL(kmem_cache_create); 2347EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2459,7 @@ out:
2444 return nr_freed; 2459 return nr_freed;
2445} 2460}
2446 2461
2462/* Called with cache_chain_mutex held to protect against cpu hotplug */
2447static int __cache_shrink(struct kmem_cache *cachep) 2463static int __cache_shrink(struct kmem_cache *cachep)
2448{ 2464{
2449 int ret = 0, i = 0; 2465 int ret = 0, i = 0;
@@ -2474,9 +2490,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
2474 */ 2490 */
2475int kmem_cache_shrink(struct kmem_cache *cachep) 2491int kmem_cache_shrink(struct kmem_cache *cachep)
2476{ 2492{
2493 int ret;
2477 BUG_ON(!cachep || in_interrupt()); 2494 BUG_ON(!cachep || in_interrupt());
2478 2495
2479 return __cache_shrink(cachep); 2496 mutex_lock(&cache_chain_mutex);
2497 ret = __cache_shrink(cachep);
2498 mutex_unlock(&cache_chain_mutex);
2499 return ret;
2480} 2500}
2481EXPORT_SYMBOL(kmem_cache_shrink); 2501EXPORT_SYMBOL(kmem_cache_shrink);
2482 2502
@@ -2500,23 +2520,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2500{ 2520{
2501 BUG_ON(!cachep || in_interrupt()); 2521 BUG_ON(!cachep || in_interrupt());
2502 2522
2503 /* Don't let CPUs to come and go */
2504 lock_cpu_hotplug();
2505
2506 /* Find the cache in the chain of caches. */ 2523 /* Find the cache in the chain of caches. */
2507 mutex_lock(&cache_chain_mutex); 2524 mutex_lock(&cache_chain_mutex);
2508 /* 2525 /*
2509 * the chain is never empty, cache_cache is never destroyed 2526 * the chain is never empty, cache_cache is never destroyed
2510 */ 2527 */
2511 list_del(&cachep->next); 2528 list_del(&cachep->next);
2512 mutex_unlock(&cache_chain_mutex);
2513
2514 if (__cache_shrink(cachep)) { 2529 if (__cache_shrink(cachep)) {
2515 slab_error(cachep, "Can't free all objects"); 2530 slab_error(cachep, "Can't free all objects");
2516 mutex_lock(&cache_chain_mutex);
2517 list_add(&cachep->next, &cache_chain); 2531 list_add(&cachep->next, &cache_chain);
2518 mutex_unlock(&cache_chain_mutex); 2532 mutex_unlock(&cache_chain_mutex);
2519 unlock_cpu_hotplug();
2520 return; 2533 return;
2521 } 2534 }
2522 2535
@@ -2524,7 +2537,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2524 synchronize_rcu(); 2537 synchronize_rcu();
2525 2538
2526 __kmem_cache_destroy(cachep); 2539 __kmem_cache_destroy(cachep);
2527 unlock_cpu_hotplug(); 2540 mutex_unlock(&cache_chain_mutex);
2528} 2541}
2529EXPORT_SYMBOL(kmem_cache_destroy); 2542EXPORT_SYMBOL(kmem_cache_destroy);
2530 2543
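[Annotation] The hunks above drop lock_cpu_hotplug()/unlock_cpu_hotplug() and instead rely on cache_chain_mutex, which the CPU hotplug notifier (cpuup_callback) already takes, so kmem_cache_create/shrink/destroy see a stable cpu_online_map while they hold the mutex. As a rough illustration of that locking pattern only, not the slab code itself, with invented names (my_lock, my_cpu_data, my_shrink):

	/* Sketch: one subsystem mutex taken both in the hotplug notifier and
	 * in the regular entry points, replacing lock_cpu_hotplug(). */
	#include <linux/cpu.h>
	#include <linux/mutex.h>
	#include <linux/notifier.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	static DEFINE_MUTEX(my_lock);
	static void *my_cpu_data[NR_CPUS];

	static int my_cpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
	{
		long cpu = (long)hcpu;

		mutex_lock(&my_lock);
		if (action == CPU_UP_PREPARE && !my_cpu_data[cpu])
			my_cpu_data[cpu] = kmalloc(128, GFP_KERNEL);
		if (action == CPU_DEAD) {
			kfree(my_cpu_data[cpu]);
			my_cpu_data[cpu] = NULL;
		}
		mutex_unlock(&my_lock);
		return NOTIFY_OK;
	}

	static void my_shrink(void)
	{
		int cpu;

		mutex_lock(&my_lock);
		/* Safe: the notifier above needs my_lock to add or remove a CPU,
		 * so cpu_online_map cannot change under us here. */
		for_each_online_cpu(cpu)
			if (my_cpu_data[cpu])
				memset(my_cpu_data[cpu], 0, 128);
		mutex_unlock(&my_lock);
	}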
@@ -2548,7 +2561,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2548 if (OFF_SLAB(cachep)) { 2561 if (OFF_SLAB(cachep)) {
2549 /* Slab management obj is off-slab. */ 2562 /* Slab management obj is off-slab. */
2550 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2563 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2551 local_flags, nodeid); 2564 local_flags & ~GFP_THISNODE, nodeid);
2552 if (!slabp) 2565 if (!slabp)
2553 return NULL; 2566 return NULL;
2554 } else { 2567 } else {
@@ -2618,7 +2631,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2618 2631
2619static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2632static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2620{ 2633{
2621 if (flags & SLAB_DMA) 2634 if (flags & GFP_DMA)
2622 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2635 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2623 else 2636 else
2624 BUG_ON(cachep->gfpflags & GFP_DMA); 2637 BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2702,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2689 * Grow (by 1) the number of slabs within a cache. This is called by 2702 * Grow (by 1) the number of slabs within a cache. This is called by
2690 * kmem_cache_alloc() when there are no active objs left in a cache. 2703 * kmem_cache_alloc() when there are no active objs left in a cache.
2691 */ 2704 */
2692static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2705static int cache_grow(struct kmem_cache *cachep,
2706 gfp_t flags, int nodeid, void *objp)
2693{ 2707{
2694 struct slab *slabp; 2708 struct slab *slabp;
2695 void *objp;
2696 size_t offset; 2709 size_t offset;
2697 gfp_t local_flags; 2710 gfp_t local_flags;
2698 unsigned long ctor_flags; 2711 unsigned long ctor_flags;
@@ -2702,12 +2715,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2702 * Be lazy and only check for valid flags here, keeping it out of the 2715 * Be lazy and only check for valid flags here, keeping it out of the
2703 * critical path in kmem_cache_alloc(). 2716 * critical path in kmem_cache_alloc().
2704 */ 2717 */
2705 BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); 2718 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2706 if (flags & SLAB_NO_GROW) 2719 if (flags & __GFP_NO_GROW)
2707 return 0; 2720 return 0;
2708 2721
2709 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2722 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2710 local_flags = (flags & SLAB_LEVEL_MASK); 2723 local_flags = (flags & GFP_LEVEL_MASK);
2711 if (!(local_flags & __GFP_WAIT)) 2724 if (!(local_flags & __GFP_WAIT))
2712 /* 2725 /*
2713 * Not allowed to sleep. Need to tell a constructor about 2726 * Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2757,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2744 * Get mem for the objs. Attempt to allocate a physical page from 2757 * Get mem for the objs. Attempt to allocate a physical page from
2745 * 'nodeid'. 2758 * 'nodeid'.
2746 */ 2759 */
2747 objp = kmem_getpages(cachep, flags, nodeid); 2760 if (!objp)
2761 objp = kmem_getpages(cachep, flags, nodeid);
2748 if (!objp) 2762 if (!objp)
2749 goto failed; 2763 goto failed;
2750 2764
2751 /* Get slab management. */ 2765 /* Get slab management. */
2752 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); 2766 slabp = alloc_slabmgmt(cachep, objp, offset,
2767 local_flags & ~GFP_THISNODE, nodeid);
2753 if (!slabp) 2768 if (!slabp)
2754 goto opps1; 2769 goto opps1;
2755 2770
@@ -2987,7 +3002,7 @@ alloc_done:
2987 3002
2988 if (unlikely(!ac->avail)) { 3003 if (unlikely(!ac->avail)) {
2989 int x; 3004 int x;
2990 x = cache_grow(cachep, flags, node); 3005 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
2991 3006
2992 /* cache_grow can reenable interrupts, then ac could change. */ 3007 /* cache_grow can reenable interrupts, then ac could change. */
2993 ac = cpu_cache_get(cachep); 3008 ac = cpu_cache_get(cachep);
@@ -3063,18 +3078,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3063 3078
3064 cachep->ctor(objp, cachep, ctor_flags); 3079 cachep->ctor(objp, cachep, ctor_flags);
3065 } 3080 }
3081#if ARCH_SLAB_MINALIGN
3082 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3083 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3084 objp, ARCH_SLAB_MINALIGN);
3085 }
3086#endif
3066 return objp; 3087 return objp;
3067} 3088}
3068#else 3089#else
3069#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3090#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3070#endif 3091#endif
3071 3092
3093#ifdef CONFIG_FAILSLAB
3094
3095static struct failslab_attr {
3096
3097 struct fault_attr attr;
3098
3099 u32 ignore_gfp_wait;
3100#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3101 struct dentry *ignore_gfp_wait_file;
3102#endif
3103
3104} failslab = {
3105 .attr = FAULT_ATTR_INITIALIZER,
3106 .ignore_gfp_wait = 1,
3107};
3108
3109static int __init setup_failslab(char *str)
3110{
3111 return setup_fault_attr(&failslab.attr, str);
3112}
3113__setup("failslab=", setup_failslab);
3114
3115static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3116{
3117 if (cachep == &cache_cache)
3118 return 0;
3119 if (flags & __GFP_NOFAIL)
3120 return 0;
3121 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3122 return 0;
3123
3124 return should_fail(&failslab.attr, obj_size(cachep));
3125}
3126
3127#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3128
3129static int __init failslab_debugfs(void)
3130{
3131 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3132 struct dentry *dir;
3133 int err;
3134
3135 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3136 if (err)
3137 return err;
3138 dir = failslab.attr.dentries.dir;
3139
3140 failslab.ignore_gfp_wait_file =
3141 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3142 &failslab.ignore_gfp_wait);
3143
3144 if (!failslab.ignore_gfp_wait_file) {
3145 err = -ENOMEM;
3146 debugfs_remove(failslab.ignore_gfp_wait_file);
3147 cleanup_fault_attr_dentries(&failslab.attr);
3148 }
3149
3150 return err;
3151}
3152
3153late_initcall(failslab_debugfs);
3154
3155#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3156
3157#else /* CONFIG_FAILSLAB */
3158
3159static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3160{
3161 return 0;
3162}
3163
3164#endif /* CONFIG_FAILSLAB */
3165
3072static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3166static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3073{ 3167{
3074 void *objp; 3168 void *objp;
3075 struct array_cache *ac; 3169 struct array_cache *ac;
3076 3170
3077 check_irq_off(); 3171 check_irq_off();
3172
3173 if (should_failslab(cachep, flags))
3174 return NULL;
3175
3078 ac = cpu_cache_get(cachep); 3176 ac = cpu_cache_get(cachep);
3079 if (likely(ac->avail)) { 3177 if (likely(ac->avail)) {
3080 STATS_INC_ALLOCHIT(cachep); 3178 STATS_INC_ALLOCHIT(cachep);
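[Annotation] The new CONFIG_FAILSLAB block above gates ____cache_alloc() with should_failslab(), so slab allocations can be made to fail on demand; the failslab= boot parameter and the ignore-gfp-wait debugfs file configure the underlying fault_attr. A self-contained userspace model of such a gate, assuming the usual interval/probability/times semantics of the fault-injection framework (the real attributes live in lib/fault-inject.c):

	/* Userspace model of a should_fail()-style gate; not the kernel code. */
	#include <stdio.h>
	#include <stdlib.h>

	struct fault_attr {
		unsigned long probability;	/* percent of eligible calls to fail */
		unsigned long interval;		/* only every Nth call is eligible */
		long times;			/* stop after this many, -1 = unlimited */
		unsigned long count;		/* calls seen so far */
	};

	static int should_fail(struct fault_attr *attr)
	{
		attr->count++;
		if (attr->times == 0)
			return 0;
		if (attr->interval > 1 && attr->count % attr->interval)
			return 0;
		if ((unsigned long)(rand() % 100) >= attr->probability)
			return 0;
		if (attr->times > 0)
			attr->times--;
		return 1;	/* inject a failure */
	}

	int main(void)
	{
		struct fault_attr failslab = { .probability = 10, .interval = 1, .times = 50 };
		int i, failed = 0;

		for (i = 0; i < 1000; i++)
			failed += should_fail(&failslab);
		printf("injected %d failures in 1000 allocations\n", failed);
		return 0;
	}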
@@ -3105,10 +3203,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3105 objp = ____cache_alloc(cachep, flags); 3203 objp = ____cache_alloc(cachep, flags);
3106 /* 3204 /*
3107 * We may just have run out of memory on the local node. 3205 * We may just have run out of memory on the local node.
3108 * __cache_alloc_node() knows how to locate memory on other nodes 3206 * ____cache_alloc_node() knows how to locate memory on other nodes
3109 */ 3207 */
3110 if (NUMA_BUILD && !objp) 3208 if (NUMA_BUILD && !objp)
3111 objp = __cache_alloc_node(cachep, flags, numa_node_id()); 3209 objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3112 local_irq_restore(save_flags); 3210 local_irq_restore(save_flags);
3113 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3211 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3114 caller); 3212 caller);
@@ -3135,15 +3233,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3135 else if (current->mempolicy) 3233 else if (current->mempolicy)
3136 nid_alloc = slab_node(current->mempolicy); 3234 nid_alloc = slab_node(current->mempolicy);
3137 if (nid_alloc != nid_here) 3235 if (nid_alloc != nid_here)
3138 return __cache_alloc_node(cachep, flags, nid_alloc); 3236 return ____cache_alloc_node(cachep, flags, nid_alloc);
3139 return NULL; 3237 return NULL;
3140} 3238}
3141 3239
3142/* 3240/*
3143 * Fallback function if there was no memory available and no objects on a 3241 * Fallback function if there was no memory available and no objects on a
3144 * certain node and we are allowed to fall back. We mimick the behavior of 3242 * certain node and fall back is permitted. First we scan all the
3145 * the page allocator. We fall back according to a zonelist determined by 3243 * available nodelists for available objects. If that fails then we
3146 * the policy layer while obeying cpuset constraints. 3244 * perform an allocation without specifying a node. This allows the page
3245 * allocator to do its reclaim / fallback magic. We then insert the
3246 * slab into the proper nodelist and then allocate from it.
3147 */ 3247 */
3148void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3248void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3149{ 3249{
@@ -3151,20 +3251,59 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3151 ->node_zonelists[gfp_zone(flags)]; 3251 ->node_zonelists[gfp_zone(flags)];
3152 struct zone **z; 3252 struct zone **z;
3153 void *obj = NULL; 3253 void *obj = NULL;
3254 int nid;
3154 3255
3155 for (z = zonelist->zones; *z && !obj; z++) 3256retry:
3156 if (zone_idx(*z) <= ZONE_NORMAL && 3257 /*
3157 cpuset_zone_allowed(*z, flags)) 3258 * Look through allowed nodes for objects available
3158 obj = __cache_alloc_node(cache, 3259 * from existing per node queues.
3159 flags | __GFP_THISNODE, 3260 */
3160 zone_to_nid(*z)); 3261 for (z = zonelist->zones; *z && !obj; z++) {
3262 nid = zone_to_nid(*z);
3263
3264 if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) &&
3265 cache->nodelists[nid] &&
3266 cache->nodelists[nid]->free_objects)
3267 obj = ____cache_alloc_node(cache,
3268 flags | GFP_THISNODE, nid);
3269 }
3270
3271 if (!obj) {
3272 /*
3273 * This allocation will be performed within the constraints
3274 * of the current cpuset / memory policy requirements.
3275 * We may trigger various forms of reclaim on the allowed
3276 * set and go into memory reserves if necessary.
3277 */
3278 obj = kmem_getpages(cache, flags, -1);
3279 if (obj) {
3280 /*
3281 * Insert into the appropriate per node queues
3282 */
3283 nid = page_to_nid(virt_to_page(obj));
3284 if (cache_grow(cache, flags, nid, obj)) {
3285 obj = ____cache_alloc_node(cache,
3286 flags | GFP_THISNODE, nid);
3287 if (!obj)
3288 /*
3289 * Another processor may allocate the
3290 * objects in the slab since we are
3291 * not holding any locks.
3292 */
3293 goto retry;
3294 } else {
3295 kmem_freepages(cache, obj);
3296 obj = NULL;
3297 }
3298 }
3299 }
3161 return obj; 3300 return obj;
3162} 3301}
3163 3302
3164/* 3303/*
3165 * A interface to enable slab creation on nodeid 3304 * A interface to enable slab creation on nodeid
3166 */ 3305 */
3167static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3306static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3168 int nodeid) 3307 int nodeid)
3169{ 3308{
3170 struct list_head *entry; 3309 struct list_head *entry;
@@ -3213,7 +3352,7 @@ retry:
3213 3352
3214must_grow: 3353must_grow:
3215 spin_unlock(&l3->list_lock); 3354 spin_unlock(&l3->list_lock);
3216 x = cache_grow(cachep, flags, nodeid); 3355 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3217 if (x) 3356 if (x)
3218 goto retry; 3357 goto retry;
3219 3358
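[Annotation] Several hunks above make node placement explicit: cache_grow() now receives GFP_THISNODE when refilling a specific node's lists, and fallback_alloc() only runs when the caller allows other nodes. From a caller's point of view the distinction is the __GFP_THISNODE bit; a minimal, hypothetical usage sketch (my_cache, struct foo and alloc_foo_on are invented names):

	#include <linux/gfp.h>
	#include <linux/slab.h>

	struct foo { int payload[16]; };

	static struct foo *alloc_foo_on(struct kmem_cache *my_cache, int nid)
	{
		struct foo *f;

		/* Hard requirement: the object must sit on node nid.  If that
		 * node has no free objects or pages this should return NULL
		 * rather than silently falling back to another node. */
		f = kmem_cache_alloc_node(my_cache, GFP_KERNEL | __GFP_THISNODE, nid);
		if (f)
			return f;

		/* Preference only: try nid first, but accept memory from any
		 * allowed node (this is where fallback_alloc() can kick in). */
		return kmem_cache_alloc_node(my_cache, GFP_KERNEL, nid);
	}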
@@ -3431,35 +3570,59 @@ out:
3431 * @flags: See kmalloc(). 3570 * @flags: See kmalloc().
3432 * @nodeid: node number of the target node. 3571 * @nodeid: node number of the target node.
3433 * 3572 *
3434 * Identical to kmem_cache_alloc, except that this function is slow 3573 * Identical to kmem_cache_alloc but it will allocate memory on the given
3435 * and can sleep. And it will allocate memory on the given node, which 3574 * node, which can improve the performance for cpu bound structures.
3436 * can improve the performance for cpu bound structures. 3575 *
3437 * New and improved: it will now make sure that the object gets 3576 * Fallback to other node is possible if __GFP_THISNODE is not set.
3438 * put on the correct node list so that there is no false sharing.
3439 */ 3577 */
3440void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3578static __always_inline void *
3579__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3580 int nodeid, void *caller)
3441{ 3581{
3442 unsigned long save_flags; 3582 unsigned long save_flags;
3443 void *ptr; 3583 void *ptr = NULL;
3444 3584
3445 cache_alloc_debugcheck_before(cachep, flags); 3585 cache_alloc_debugcheck_before(cachep, flags);
3446 local_irq_save(save_flags); 3586 local_irq_save(save_flags);
3447 3587
3448 if (nodeid == -1 || nodeid == numa_node_id() || 3588 if (unlikely(nodeid == -1))
3449 !cachep->nodelists[nodeid]) 3589 nodeid = numa_node_id();
3450 ptr = ____cache_alloc(cachep, flags);
3451 else
3452 ptr = __cache_alloc_node(cachep, flags, nodeid);
3453 local_irq_restore(save_flags);
3454 3590
3455 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, 3591 if (likely(cachep->nodelists[nodeid])) {
3456 __builtin_return_address(0)); 3592 if (nodeid == numa_node_id()) {
3593 /*
3594 * Use the locally cached objects if possible.
3595 * However ____cache_alloc does not allow fallback
3596 * to other nodes. It may fail while we still have
3597 * objects on other nodes available.
3598 */
3599 ptr = ____cache_alloc(cachep, flags);
3600 }
3601 if (!ptr) {
3602 /* ___cache_alloc_node can fall back to other nodes */
3603 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3604 }
3605 } else {
3606 /* Node not bootstrapped yet */
3607 if (!(flags & __GFP_THISNODE))
3608 ptr = fallback_alloc(cachep, flags);
3609 }
3610
3611 local_irq_restore(save_flags);
3612 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3457 3613
3458 return ptr; 3614 return ptr;
3459} 3615}
3616
3617void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3618{
3619 return __cache_alloc_node(cachep, flags, nodeid,
3620 __builtin_return_address(0));
3621}
3460EXPORT_SYMBOL(kmem_cache_alloc_node); 3622EXPORT_SYMBOL(kmem_cache_alloc_node);
3461 3623
3462void *__kmalloc_node(size_t size, gfp_t flags, int node) 3624static __always_inline void *
3625__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3463{ 3626{
3464 struct kmem_cache *cachep; 3627 struct kmem_cache *cachep;
3465 3628
@@ -3468,8 +3631,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3468 return NULL; 3631 return NULL;
3469 return kmem_cache_alloc_node(cachep, flags, node); 3632 return kmem_cache_alloc_node(cachep, flags, node);
3470} 3633}
3634
3635#ifdef CONFIG_DEBUG_SLAB
3636void *__kmalloc_node(size_t size, gfp_t flags, int node)
3637{
3638 return __do_kmalloc_node(size, flags, node,
3639 __builtin_return_address(0));
3640}
3471EXPORT_SYMBOL(__kmalloc_node); 3641EXPORT_SYMBOL(__kmalloc_node);
3472#endif 3642
3643void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3644 int node, void *caller)
3645{
3646 return __do_kmalloc_node(size, flags, node, caller);
3647}
3648EXPORT_SYMBOL(__kmalloc_node_track_caller);
3649#else
3650void *__kmalloc_node(size_t size, gfp_t flags, int node)
3651{
3652 return __do_kmalloc_node(size, flags, node, NULL);
3653}
3654EXPORT_SYMBOL(__kmalloc_node);
3655#endif /* CONFIG_DEBUG_SLAB */
3656#endif /* CONFIG_NUMA */
3473 3657
3474/** 3658/**
3475 * __do_kmalloc - allocate memory 3659 * __do_kmalloc - allocate memory
@@ -3580,13 +3764,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3580 int node; 3764 int node;
3581 struct kmem_list3 *l3; 3765 struct kmem_list3 *l3;
3582 struct array_cache *new_shared; 3766 struct array_cache *new_shared;
3583 struct array_cache **new_alien; 3767 struct array_cache **new_alien = NULL;
3584 3768
3585 for_each_online_node(node) { 3769 for_each_online_node(node) {
3586 3770
3587 new_alien = alloc_alien_cache(node, cachep->limit); 3771 if (use_alien_caches) {
3588 if (!new_alien) 3772 new_alien = alloc_alien_cache(node, cachep->limit);
3589 goto fail; 3773 if (!new_alien)
3774 goto fail;
3775 }
3590 3776
3591 new_shared = alloc_arraycache(node, 3777 new_shared = alloc_arraycache(node,
3592 cachep->shared*cachep->batchcount, 3778 cachep->shared*cachep->batchcount,
@@ -3812,7 +3998,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3812 * If we cannot acquire the cache chain mutex then just give up - we'll try 3998 * If we cannot acquire the cache chain mutex then just give up - we'll try
3813 * again on the next iteration. 3999 * again on the next iteration.
3814 */ 4000 */
3815static void cache_reap(void *unused) 4001static void cache_reap(struct work_struct *unused)
3816{ 4002{
3817 struct kmem_cache *searchp; 4003 struct kmem_cache *searchp;
3818 struct kmem_list3 *l3; 4004 struct kmem_list3 *l3;
@@ -3821,7 +4007,7 @@ static void cache_reap(void *unused)
3821 if (!mutex_trylock(&cache_chain_mutex)) { 4007 if (!mutex_trylock(&cache_chain_mutex)) {
3822 /* Give up. Setup the next iteration. */ 4008 /* Give up. Setup the next iteration. */
3823 schedule_delayed_work(&__get_cpu_var(reap_work), 4009 schedule_delayed_work(&__get_cpu_var(reap_work),
3824 REAPTIMEOUT_CPUC); 4010 round_jiffies_relative(REAPTIMEOUT_CPUC));
3825 return; 4011 return;
3826 } 4012 }
3827 4013
@@ -3867,7 +4053,8 @@ next:
3867 next_reap_node(); 4053 next_reap_node();
3868 refresh_cpu_vm_stats(smp_processor_id()); 4054 refresh_cpu_vm_stats(smp_processor_id());
3869 /* Set up the next iteration */ 4055 /* Set up the next iteration */
3870 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 4056 schedule_delayed_work(&__get_cpu_var(reap_work),
4057 round_jiffies_relative(REAPTIMEOUT_CPUC));
3871} 4058}
3872 4059
3873#ifdef CONFIG_PROC_FS 4060#ifdef CONFIG_PROC_FS
@@ -4035,7 +4222,7 @@ static int s_show(struct seq_file *m, void *p)
4035 * + further values on SMP and with statistics enabled 4222 * + further values on SMP and with statistics enabled
4036 */ 4223 */
4037 4224
4038struct seq_operations slabinfo_op = { 4225const struct seq_operations slabinfo_op = {
4039 .start = s_start, 4226 .start = s_start,
4040 .next = s_next, 4227 .next = s_next,
4041 .stop = s_stop, 4228 .stop = s_stop,
@@ -4233,7 +4420,7 @@ static int leaks_show(struct seq_file *m, void *p)
4233 return 0; 4420 return 0;
4234} 4421}
4235 4422
4236struct seq_operations slabstats_op = { 4423const struct seq_operations slabstats_op = {
4237 .start = leaks_start, 4424 .start = leaks_start,
4238 .next = s_next, 4425 .next = s_next,
4239 .stop = s_stop, 4426 .stop = s_stop,
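[Annotation] The __do_kmalloc_node() split above exists so that, under CONFIG_DEBUG_SLAB, the debug code can record the real call site: the exported wrappers pass __builtin_return_address(0), or an explicit caller for __kmalloc_node_track_caller(), down to one common helper. A generic userspace illustration of that caller-tracking wrapper pattern, with invented names:

	/* Userspace illustration of the track_caller pattern; not kernel API. */
	#include <stdio.h>
	#include <stdlib.h>

	static void *do_alloc(size_t size, void *caller)
	{
		void *p = malloc(size);

		/* A debug build can record who asked for the memory. */
		fprintf(stderr, "alloc %zu bytes for caller %p -> %p\n", size, caller, p);
		return p;
	}

	/* Normal entry point: the caller is whoever invoked my_alloc(). */
	void *my_alloc(size_t size)
	{
		return do_alloc(size, __builtin_return_address(0));
	}

	/* Wrapper entry point: lets another wrapper forward *its* caller, so
	 * the recorded site is not the intermediate wrapper itself. */
	void *my_alloc_track_caller(size_t size, void *caller)
	{
		return do_alloc(size, caller);
	}

	int main(void)
	{
		free(my_alloc(32));
		free(my_alloc_track_caller(64, __builtin_return_address(0)));
		return 0;
	}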
diff --git a/mm/sparse.c b/mm/sparse.c
index 86c52ab808..ac26eb0d73 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24#endif 24#endif
25EXPORT_SYMBOL(mem_section); 25EXPORT_SYMBOL(mem_section);
26 26
27#ifdef NODE_NOT_IN_PAGE_FLAGS
28/*
29 * If we did not store the node number in the page then we have to
30 * do a lookup in the section_to_node_table in order to find which
31 * node the page belongs to.
32 */
33#if MAX_NUMNODES <= 256
34static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
35#else
36static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
37#endif
38
39int page_to_nid(struct page *page)
40{
41 return section_to_node_table[page_to_section(page)];
42}
43EXPORT_SYMBOL(page_to_nid);
44#endif
45
27#ifdef CONFIG_SPARSEMEM_EXTREME 46#ifdef CONFIG_SPARSEMEM_EXTREME
28static struct mem_section *sparse_index_alloc(int nid) 47static struct mem_section *sparse_index_alloc(int nid)
29{ 48{
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
49 struct mem_section *section; 68 struct mem_section *section;
50 int ret = 0; 69 int ret = 0;
51 70
71#ifdef NODE_NOT_IN_PAGE_FLAGS
72 section_to_node_table[section_nr] = nid;
73#endif
74
52 if (mem_section[root]) 75 if (mem_section[root])
53 return -EEXIST; 76 return -EEXIST;
54 77
@@ -211,7 +234,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
211 struct page *page, *ret; 234 struct page *page, *ret;
212 unsigned long memmap_size = sizeof(struct page) * nr_pages; 235 unsigned long memmap_size = sizeof(struct page) * nr_pages;
213 236
214 page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); 237 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
215 if (page) 238 if (page)
216 goto got_map_page; 239 goto got_map_page;
217 240
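[Annotation] The new NODE_NOT_IN_PAGE_FLAGS path above stores each memory section's node in a compact side table, sized u8 or u16 depending on MAX_NUMNODES, and page_to_nid() becomes a table lookup keyed by the page's section number. A small userspace model of that lookup, with made-up sizes standing in for NR_MEM_SECTIONS and MAX_NUMNODES:

	/* Userspace model of the section -> node side table; sizes are invented. */
	#include <stdint.h>
	#include <stdio.h>

	#define MAX_NUMNODES    64              /* assumption for the model */
	#define NR_MEM_SECTIONS 1024            /* assumption for the model */

	#if MAX_NUMNODES <= 256
	static uint8_t section_to_node_table[NR_MEM_SECTIONS];
	#else
	static uint16_t section_to_node_table[NR_MEM_SECTIONS];
	#endif

	/* Registration side: done once per section when it is initialised. */
	static void note_section_node(unsigned long section_nr, int nid)
	{
		section_to_node_table[section_nr] = nid;
	}

	/* Lookup side: what page_to_nid() reduces to when the node number
	 * no longer fits into page->flags. */
	static int section_to_nid(unsigned long section_nr)
	{
		return section_to_node_table[section_nr];
	}

	int main(void)
	{
		note_section_node(17, 3);
		printf("section 17 lives on node %d\n", section_to_nid(17));
		return 0;
	}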
diff --git a/mm/swap.c b/mm/swap.c
index 2e0e871f54..2ed7be3979 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
57{ 57{
58 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
59 if (put_page_testzero(page)) { 59 if (put_page_testzero(page)) {
60 void (*dtor)(struct page *page); 60 compound_page_dtor *dtor;
61 61
62 dtor = (void (*)(struct page *))page[1].lru.next; 62 dtor = get_compound_page_dtor(page);
63 (*dtor)(page); 63 (*dtor)(page);
64 } 64 }
65} 65}
@@ -216,7 +216,7 @@ void lru_add_drain(void)
216} 216}
217 217
218#ifdef CONFIG_NUMA 218#ifdef CONFIG_NUMA
219static void lru_add_drain_per_cpu(void *dummy) 219static void lru_add_drain_per_cpu(struct work_struct *dummy)
220{ 220{
221 lru_add_drain(); 221 lru_add_drain();
222} 222}
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy)
226 */ 226 */
227int lru_add_drain_all(void) 227int lru_add_drain_all(void)
228{ 228{
229 return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); 229 return schedule_on_each_cpu(lru_add_drain_per_cpu);
230} 230}
231 231
232#else 232#else
@@ -514,5 +514,7 @@ void __init swap_setup(void)
514 * Right now other parts of the system means that we 514 * Right now other parts of the system means that we
515 * _really_ don't want to cluster much more 515 * _really_ don't want to cluster much more
516 */ 516 */
517#ifdef CONFIG_HOTPLUG_CPU
517 hotcpu_notifier(cpu_swap_callback, 0); 518 hotcpu_notifier(cpu_swap_callback, 0);
519#endif
518} 520}
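[Annotation] lru_add_drain_per_cpu() above, like cache_reap() earlier in this patch, changes its prototype because the workqueue API now hands handlers a struct work_struct * instead of an opaque void *; per-invocation data is recovered with container_of() rather than passed as an argument. A minimal kernel-style sketch of the new convention, with invented names (my_ctx, my_work_fn, kick_work):

	#include <linux/kernel.h>
	#include <linux/workqueue.h>

	struct my_ctx {
		int value;
		struct work_struct work;
	};

	static void my_work_fn(struct work_struct *work)
	{
		/* The context is found by embedding, not via a void * argument. */
		struct my_ctx *ctx = container_of(work, struct my_ctx, work);

		printk(KERN_DEBUG "my_work_fn: value=%d\n", ctx->value);
	}

	static struct my_ctx ctx = { .value = 42 };

	static void kick_work(void)
	{
		INIT_WORK(&ctx.work, my_work_fn);
		schedule_work(&ctx.work);
	}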
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def63f2..b9fc0e5de6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry)
427 427
428#ifdef CONFIG_SOFTWARE_SUSPEND 428#ifdef CONFIG_SOFTWARE_SUSPEND
429/* 429/*
430 * Find the swap type that corresponds to given device (if any) 430 * Find the swap type that corresponds to given device (if any).
431 * 431 *
432 * This is needed for software suspend and is done in such a way that inode 432 * @offset - number of the PAGE_SIZE-sized block of the device, starting
433 * aliasing is allowed. 433 * from 0, in which the swap header is expected to be located.
434 *
435 * This is needed for the suspend to disk (aka swsusp).
434 */ 436 */
435int swap_type_of(dev_t device) 437int swap_type_of(dev_t device, sector_t offset)
436{ 438{
439 struct block_device *bdev = NULL;
437 int i; 440 int i;
438 441
442 if (device)
443 bdev = bdget(device);
444
439 spin_lock(&swap_lock); 445 spin_lock(&swap_lock);
440 for (i = 0; i < nr_swapfiles; i++) { 446 for (i = 0; i < nr_swapfiles; i++) {
441 struct inode *inode; 447 struct swap_info_struct *sis = swap_info + i;
442 448
443 if (!(swap_info[i].flags & SWP_WRITEOK)) 449 if (!(sis->flags & SWP_WRITEOK))
444 continue; 450 continue;
445 451
446 if (!device) { 452 if (!bdev) {
447 spin_unlock(&swap_lock); 453 spin_unlock(&swap_lock);
448 return i; 454 return i;
449 } 455 }
450 inode = swap_info[i].swap_file->f_dentry->d_inode; 456 if (bdev == sis->bdev) {
451 if (S_ISBLK(inode->i_mode) && 457 struct swap_extent *se;
452 device == MKDEV(imajor(inode), iminor(inode))) { 458
453 spin_unlock(&swap_lock); 459 se = list_entry(sis->extent_list.next,
454 return i; 460 struct swap_extent, list);
461 if (se->start_block == offset) {
462 spin_unlock(&swap_lock);
463 bdput(bdev);
464 return i;
465 }
455 } 466 }
456 } 467 }
457 spin_unlock(&swap_lock); 468 spin_unlock(&swap_lock);
469 if (bdev)
470 bdput(bdev);
471
458 return -ENODEV; 472 return -ENODEV;
459} 473}
460 474
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
931 } 945 }
932} 946}
933 947
948#ifdef CONFIG_SOFTWARE_SUSPEND
949/*
950 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
951 * corresponding to given index in swap_info (swap type).
952 */
953sector_t swapdev_block(int swap_type, pgoff_t offset)
954{
955 struct swap_info_struct *sis;
956
957 if (swap_type >= nr_swapfiles)
958 return 0;
959
960 sis = swap_info + swap_type;
961 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
962}
963#endif /* CONFIG_SOFTWARE_SUSPEND */
964
934/* 965/*
935 * Free all of a swapdev's extent information 966 * Free all of a swapdev's extent information
936 */ 967 */
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1274 1305
1275 mutex_lock(&swapon_mutex); 1306 mutex_lock(&swapon_mutex);
1276 1307
1308 if (!l)
1309 return SEQ_START_TOKEN;
1310
1277 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1311 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1278 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1312 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1279 continue; 1313 continue;
1280 if (!l--) 1314 if (!--l)
1281 return ptr; 1315 return ptr;
1282 } 1316 }
1283 1317
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1286 1320
1287static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1321static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1288{ 1322{
1289 struct swap_info_struct *ptr = v; 1323 struct swap_info_struct *ptr;
1290 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1324 struct swap_info_struct *endptr = swap_info + nr_swapfiles;
1291 1325
1292 for (++ptr; ptr < endptr; ptr++) { 1326 if (v == SEQ_START_TOKEN)
1327 ptr = swap_info;
1328 else {
1329 ptr = v;
1330 ptr++;
1331 }
1332
1333 for (; ptr < endptr; ptr++) {
1293 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1334 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1294 continue; 1335 continue;
1295 ++*pos; 1336 ++*pos;
@@ -1310,14 +1351,16 @@ static int swap_show(struct seq_file *swap, void *v)
1310 struct file *file; 1351 struct file *file;
1311 int len; 1352 int len;
1312 1353
1313 if (v == swap_info) 1354 if (ptr == SEQ_START_TOKEN) {
1314 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1355 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1356 return 0;
1357 }
1315 1358
1316 file = ptr->swap_file; 1359 file = ptr->swap_file;
1317 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 1360 len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
1318 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1361 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1319 len < 40 ? 40 - len : 1, " ", 1362 len < 40 ? 40 - len : 1, " ",
1320 S_ISBLK(file->f_dentry->d_inode->i_mode) ? 1363 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1321 "partition" : "file\t", 1364 "partition" : "file\t",
1322 ptr->pages << (PAGE_SHIFT - 10), 1365 ptr->pages << (PAGE_SHIFT - 10),
1323 ptr->inuse_pages << (PAGE_SHIFT - 10), 1366 ptr->inuse_pages << (PAGE_SHIFT - 10),
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v)
1325 return 0; 1368 return 0;
1326} 1369}
1327 1370
1328static struct seq_operations swaps_op = { 1371static const struct seq_operations swaps_op = {
1329 .start = swap_start, 1372 .start = swap_start,
1330 .next = swap_next, 1373 .next = swap_next,
1331 .stop = swap_stop, 1374 .stop = swap_stop,
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file)
1337 return seq_open(file, &swaps_op); 1380 return seq_open(file, &swaps_op);
1338} 1381}
1339 1382
1340static struct file_operations proc_swaps_operations = { 1383static const struct file_operations proc_swaps_operations = {
1341 .open = swaps_open, 1384 .open = swaps_open,
1342 .read = seq_read, 1385 .read = seq_read,
1343 .llseek = seq_lseek, 1386 .llseek = seq_lseek,
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1540 error = -EINVAL; 1583 error = -EINVAL;
1541 if (!maxpages) 1584 if (!maxpages)
1542 goto bad_swap; 1585 goto bad_swap;
1586 if (swapfilesize && maxpages > swapfilesize) {
1587 printk(KERN_WARNING
1588 "Swap area shorter than signature indicates\n");
1589 goto bad_swap;
1590 }
1543 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1591 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1544 goto bad_swap; 1592 goto bad_swap;
1545 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1593 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1567 goto bad_swap; 1615 goto bad_swap;
1568 } 1616 }
1569 1617
1570 if (swapfilesize && maxpages > swapfilesize) {
1571 printk(KERN_WARNING
1572 "Swap area shorter than signature indicates\n");
1573 error = -EINVAL;
1574 goto bad_swap;
1575 }
1576 if (nr_good_pages) { 1618 if (nr_good_pages) {
1577 p->swap_map[0] = SWAP_MAP_BAD; 1619 p->swap_map[0] = SWAP_MAP_BAD;
1578 p->max = maxpages; 1620 p->max = maxpages;
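[Annotation] The /proc/swaps iterator above is reworked around SEQ_START_TOKEN: swap_start() returns the token for position 0 so the header line is printed exactly once, swap_next() translates the token into the first real entry, and swap_show() checks for it before treating v as a swap_info_struct. A stripped-down seq_file iterator using the same pattern; the item array is invented and registration with procfs is omitted:

	#include <linux/kernel.h>
	#include <linux/seq_file.h>

	static const char *items[] = { "alpha", "beta", "gamma" };
	#define NITEMS ARRAY_SIZE(items)

	static void *demo_start(struct seq_file *m, loff_t *pos)
	{
		if (*pos == 0)
			return SEQ_START_TOKEN;         /* ask for the header first */
		return *pos <= NITEMS ? (void *)&items[*pos - 1] : NULL;
	}

	static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return *pos <= NITEMS ? (void *)&items[*pos - 1] : NULL;
	}

	static void demo_stop(struct seq_file *m, void *v)
	{
	}

	static int demo_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN) {
			seq_puts(m, "Name\n");          /* header printed once */
			return 0;
		}
		seq_printf(m, "%s\n", *(const char **)v);
		return 0;
	}

	static const struct seq_operations demo_op = {
		.start = demo_start,
		.next  = demo_next,
		.stop  = demo_stop,
		.show  = demo_show,
	};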
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2..9ef9071f99 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
10 */ 18 */
19
11#include <linux/jiffies.h> 20#include <linux/jiffies.h>
12#include <linux/mm.h> 21#include <linux/mm.h>
13#include <linux/sched.h> 22#include <linux/sched.h>
14#include <linux/swap.h> 23#include <linux/swap.h>
15 24
16static DEFINE_SPINLOCK(swap_token_lock); 25static DEFINE_SPINLOCK(swap_token_lock);
17static unsigned long swap_token_timeout; 26struct mm_struct *swap_token_mm;
18static unsigned long swap_token_check; 27static unsigned int global_faults;
19struct mm_struct * swap_token_mm = &init_mm;
20
21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
22#define SWAP_TOKEN_TIMEOUT (300 * HZ)
23/*
24 * Currently disabled; Needs further code to work at HZ * 300.
25 */
26unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
27
28/*
29 * Take the token away if the process had no page faults
30 * in the last interval, or if it has held the token for
31 * too long.
32 */
33#define SWAP_TOKEN_ENOUGH_RSS 1
34#define SWAP_TOKEN_TIMED_OUT 2
35static int should_release_swap_token(struct mm_struct *mm)
36{
37 int ret = 0;
38 if (!mm->recent_pagein)
39 ret = SWAP_TOKEN_ENOUGH_RSS;
40 else if (time_after(jiffies, swap_token_timeout))
41 ret = SWAP_TOKEN_TIMED_OUT;
42 mm->recent_pagein = 0;
43 return ret;
44}
45 28
46/*
47 * Try to grab the swapout protection token. We only try to
48 * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
49 * SMP lock contention and to check that the process that held
50 * the token before is no longer thrashing.
51 */
52void grab_swap_token(void) 29void grab_swap_token(void)
53{ 30{
54 struct mm_struct *mm; 31 int current_interval;
55 int reason;
56 32
57 /* We have the token. Let others know we still need it. */ 33 global_faults++;
58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1;
60 if (unlikely(!swap_token_default_timeout))
61 disable_swap_token();
62 return;
63 }
64
65 if (time_after(jiffies, swap_token_check)) {
66 34
67 if (!swap_token_default_timeout) { 35 current_interval = global_faults - current->mm->faultstamp;
68 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
69 return;
70 }
71
72 /* ... or if we recently held the token. */
73 if (time_before(jiffies, current->mm->swap_token_time))
74 return;
75 36
76 if (!spin_trylock(&swap_token_lock)) 37 if (!spin_trylock(&swap_token_lock))
77 return; 38 return;
78 39
79 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; 40 /* First come first served */
41 if (swap_token_mm == NULL) {
42 current->mm->token_priority = current->mm->token_priority + 2;
43 swap_token_mm = current->mm;
44 goto out;
45 }
80 46
81 mm = swap_token_mm; 47 if (current->mm != swap_token_mm) {
82 if ((reason = should_release_swap_token(mm))) { 48 if (current_interval < current->mm->last_interval)
83 unsigned long eligible = jiffies; 49 current->mm->token_priority++;
84 if (reason == SWAP_TOKEN_TIMED_OUT) { 50 else {
85 eligible += swap_token_default_timeout; 51 current->mm->token_priority--;
86 } 52 if (unlikely(current->mm->token_priority < 0))
87 mm->swap_token_time = eligible; 53 current->mm->token_priority = 0;
88 swap_token_timeout = jiffies + swap_token_default_timeout; 54 }
55 /* Check if we deserve the token */
56 if (current->mm->token_priority >
57 swap_token_mm->token_priority) {
58 current->mm->token_priority += 2;
89 swap_token_mm = current->mm; 59 swap_token_mm = current->mm;
90 } 60 }
91 spin_unlock(&swap_token_lock); 61 } else {
62 /* Token holder came in again! */
63 current->mm->token_priority += 2;
92 } 64 }
93 return; 65
66out:
67 current->mm->faultstamp = global_faults;
68 current->mm->last_interval = current_interval;
69 spin_unlock(&swap_token_lock);
70return;
94} 71}
95 72
96/* Called on process exit. */ 73/* Called on process exit. */
97void __put_swap_token(struct mm_struct *mm) 74void __put_swap_token(struct mm_struct *mm)
98{ 75{
99 spin_lock(&swap_token_lock); 76 spin_lock(&swap_token_lock);
100 if (likely(mm == swap_token_mm)) { 77 if (likely(mm == swap_token_mm))
101 mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; 78 swap_token_mm = NULL;
102 swap_token_mm = &init_mm;
103 swap_token_check = jiffies;
104 }
105 spin_unlock(&swap_token_lock); 79 spin_unlock(&swap_token_lock);
106} 80}
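[Annotation] The rewritten thrash.c replaces the jiffies-based timeout with the priority scheme described in its new header comment: each mm tracks the fault interval since its last attempt, gains priority when it contends sooner than before, loses it otherwise, and takes the token only when its priority beats the current holder's. A compact userspace model of that hand-off rule, ignoring locking and the real mm_struct fields:

	/* Userspace model of the token hand-off rule; structures are invented. */
	#include <stdio.h>

	struct mm_model {
		const char *name;
		int token_priority;
		unsigned int faultstamp;     /* global fault count at last attempt */
		unsigned int last_interval;  /* faults between the previous two attempts */
	};

	static unsigned int global_faults;
	static struct mm_model *token_holder;

	static void grab_token(struct mm_model *mm)
	{
		unsigned int interval;

		global_faults++;
		interval = global_faults - mm->faultstamp;

		if (!token_holder) {
			mm->token_priority += 2;            /* first come, first served */
			token_holder = mm;
		} else if (mm != token_holder) {
			if (interval < mm->last_interval)
				mm->token_priority++;       /* thrashing harder than before */
			else if (mm->token_priority > 0)
				mm->token_priority--;
			if (mm->token_priority > token_holder->token_priority) {
				mm->token_priority += 2;    /* boost the new holder */
				token_holder = mm;
			}
		} else {
			mm->token_priority += 2;            /* holder faulted again */
		}

		mm->faultstamp = global_faults;
		mm->last_interval = interval;
	}

	int main(void)
	{
		struct mm_model a = { "a", 0, 0, 0 }, b = { "b", 0, 0, 0 };

		grab_token(&a);
		grab_token(&b);
		grab_token(&b);
		printf("token holder: %s (prio a=%d, b=%d)\n",
		       token_holder->name, a.token_priority, b.token_priority);
		return 0;
	}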
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f15..c7f6e1914b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
79 d_instantiate(dentry, inode); 79 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 81
82 file->f_vfsmnt = mntget(shm_mnt); 82 file->f_path.mnt = mntget(shm_mnt);
83 file->f_dentry = dentry; 83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping; 84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations; 85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ; 86 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index 11ca480701..9bfb8e8538 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/buffer_head.h> /* grr. try_to_release_page, 17#include <linux/buffer_head.h> /* grr. try_to_release_page,
17 do_invalidatepage */ 18 do_invalidatepage */
18 19
@@ -69,7 +70,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
69 if (PagePrivate(page)) 70 if (PagePrivate(page))
70 do_invalidatepage(page, 0); 71 do_invalidatepage(page, 0);
71 72
72 clear_page_dirty(page); 73 if (test_clear_page_dirty(page))
74 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
73 ClearPageUptodate(page); 75 ClearPageUptodate(page);
74 ClearPageMappedToDisk(page); 76 ClearPageMappedToDisk(page);
75 remove_from_page_cache(page); 77 remove_from_page_cache(page);
@@ -96,7 +98,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
96 return 0; 98 return 0;
97 99
98 ret = remove_mapping(mapping, page); 100 ret = remove_mapping(mapping, page);
99 ClearPageUptodate(page);
100 101
101 return ret; 102 return ret;
102} 103}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 750ab6ed13..86897ee792 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
160 return err; 160 return err;
161} 161}
162 162
163struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 163static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
164 unsigned long start, unsigned long end, int node) 164 unsigned long start, unsigned long end,
165 int node, gfp_t gfp_mask)
165{ 166{
166 struct vm_struct **p, *tmp, *area; 167 struct vm_struct **p, *tmp, *area;
167 unsigned long align = 1; 168 unsigned long align = 1;
168 unsigned long addr; 169 unsigned long addr;
169 170
171 BUG_ON(in_interrupt());
170 if (flags & VM_IOREMAP) { 172 if (flags & VM_IOREMAP) {
171 int bit = fls(size); 173 int bit = fls(size);
172 174
@@ -179,16 +181,13 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
179 } 181 }
180 addr = ALIGN(start, align); 182 addr = ALIGN(start, align);
181 size = PAGE_ALIGN(size); 183 size = PAGE_ALIGN(size);
184 if (unlikely(!size))
185 return NULL;
182 186
183 area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); 187 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
184 if (unlikely(!area)) 188 if (unlikely(!area))
185 return NULL; 189 return NULL;
186 190
187 if (unlikely(!size)) {
188 kfree (area);
189 return NULL;
190 }
191
192 /* 191 /*
193 * We always allocate a guard page. 192 * We always allocate a guard page.
194 */ 193 */
@@ -236,7 +235,7 @@ out:
236struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 235struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
237 unsigned long start, unsigned long end) 236 unsigned long start, unsigned long end)
238{ 237{
239 return __get_vm_area_node(size, flags, start, end, -1); 238 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
240} 239}
241 240
242/** 241/**
@@ -253,9 +252,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
253 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 252 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
254} 253}
255 254
256struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) 255struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
256 int node, gfp_t gfp_mask)
257{ 257{
258 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); 258 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
259 gfp_mask);
259} 260}
260 261
261/* Caller must hold vmlist_lock */ 262/* Caller must hold vmlist_lock */
@@ -428,8 +429,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
428 if (array_size > PAGE_SIZE) { 429 if (array_size > PAGE_SIZE) {
429 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); 430 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
430 area->flags |= VM_VPAGES; 431 area->flags |= VM_VPAGES;
431 } else 432 } else {
432 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); 433 pages = kmalloc_node(array_size,
434 (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
435 node);
436 }
433 area->pages = pages; 437 area->pages = pages;
434 if (!area->pages) { 438 if (!area->pages) {
435 remove_vm_area(area->addr); 439 remove_vm_area(area->addr);
@@ -484,7 +488,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
484 if (!size || (size >> PAGE_SHIFT) > num_physpages) 488 if (!size || (size >> PAGE_SHIFT) > num_physpages)
485 return NULL; 489 return NULL;
486 490
487 area = get_vm_area_node(size, VM_ALLOC, node); 491 area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
488 if (!area) 492 if (!area)
489 return NULL; 493 return NULL;
490 494
@@ -525,11 +529,12 @@ void *vmalloc_user(unsigned long size)
525 void *ret; 529 void *ret;
526 530
527 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 531 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
528 write_lock(&vmlist_lock); 532 if (ret) {
529 area = __find_vm_area(ret); 533 write_lock(&vmlist_lock);
530 area->flags |= VM_USERMAP; 534 area = __find_vm_area(ret);
531 write_unlock(&vmlist_lock); 535 area->flags |= VM_USERMAP;
532 536 write_unlock(&vmlist_lock);
537 }
533 return ret; 538 return ret;
534} 539}
535EXPORT_SYMBOL(vmalloc_user); 540EXPORT_SYMBOL(vmalloc_user);
@@ -598,11 +603,12 @@ void *vmalloc_32_user(unsigned long size)
598 void *ret; 603 void *ret;
599 604
600 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 605 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
601 write_lock(&vmlist_lock); 606 if (ret) {
602 area = __find_vm_area(ret); 607 write_lock(&vmlist_lock);
603 area->flags |= VM_USERMAP; 608 area = __find_vm_area(ret);
604 write_unlock(&vmlist_lock); 609 area->flags |= VM_USERMAP;
605 610 write_unlock(&vmlist_lock);
611 }
606 return ret; 612 return ret;
607} 613}
608EXPORT_SYMBOL(vmalloc_32_user); 614EXPORT_SYMBOL(vmalloc_32_user);
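[Annotation] Two things change in vmalloc.c above: __get_vm_area_node() now receives the caller's gfp_mask so the vm_struct metadata is allocated with compatible flags, and vmalloc_user()/vmalloc_32_user() only touch the area flags when the allocation actually succeeded, avoiding a NULL dereference in __find_vm_area(). A sketch of the latter fix as it would read inside mm/vmalloc.c (the function name my_vmalloc_user is invented, and error handling around __find_vm_area() is still elided, as in the hunk):

	void *my_vmalloc_user(unsigned long size)
	{
		struct vm_struct *area;
		void *ret;

		ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				PAGE_KERNEL);
		if (!ret)
			return NULL;		/* nothing to flag; bail out early */

		write_lock(&vmlist_lock);
		area = __find_vm_area(ret);
		area->flags |= VM_USERMAP;
		write_unlock(&vmlist_lock);
		return ret;
	}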
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eca70310ad..093f5fe6dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include <asm/div64.h> 42#include <asm/div64.h>
@@ -378,6 +379,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
378 return PAGE_CLEAN; 379 return PAGE_CLEAN;
379} 380}
380 381
382/*
383 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
384 * someone else has a ref on the page, abort and return 0. If it was
385 * successfully detached, return 1. Assumes the caller has a single ref on
386 * this page.
387 */
381int remove_mapping(struct address_space *mapping, struct page *page) 388int remove_mapping(struct address_space *mapping, struct page *page)
382{ 389{
383 BUG_ON(!PageLocked(page)); 390 BUG_ON(!PageLocked(page));
@@ -717,6 +724,20 @@ done:
717 return nr_reclaimed; 724 return nr_reclaimed;
718} 725}
719 726
727/*
728 * We are about to scan this zone at a certain priority level. If that priority
729 * level is smaller (ie: more urgent) than the previous priority, then note
730 * that priority level within the zone. This is done so that when the next
731 * process comes in to scan this zone, it will immediately start out at this
732 * priority level rather than having to build up its own scanning priority.
733 * Here, this priority affects only the reclaim-mapped threshold.
734 */
735static inline void note_zone_scanning_priority(struct zone *zone, int priority)
736{
737 if (priority < zone->prev_priority)
738 zone->prev_priority = priority;
739}
740
720static inline int zone_is_near_oom(struct zone *zone) 741static inline int zone_is_near_oom(struct zone *zone)
721{ 742{
722 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; 743 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
@@ -740,7 +761,7 @@ static inline int zone_is_near_oom(struct zone *zone)
740 * But we had to alter page->flags anyway. 761 * But we had to alter page->flags anyway.
741 */ 762 */
742static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 763static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
743 struct scan_control *sc) 764 struct scan_control *sc, int priority)
744{ 765{
745 unsigned long pgmoved; 766 unsigned long pgmoved;
746 int pgdeactivate = 0; 767 int pgdeactivate = 0;
@@ -764,7 +785,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
764 * `distress' is a measure of how much trouble we're having 785 * `distress' is a measure of how much trouble we're having
765 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 786 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
766 */ 787 */
767 distress = 100 >> zone->prev_priority; 788 distress = 100 >> min(zone->prev_priority, priority);
768 789
769 /* 790 /*
770 * The point of this algorithm is to decide when to start 791 * The point of this algorithm is to decide when to start
@@ -916,7 +937,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
916 nr_to_scan = min(nr_active, 937 nr_to_scan = min(nr_active,
917 (unsigned long)sc->swap_cluster_max); 938 (unsigned long)sc->swap_cluster_max);
918 nr_active -= nr_to_scan; 939 nr_active -= nr_to_scan;
919 shrink_active_list(nr_to_scan, zone, sc); 940 shrink_active_list(nr_to_scan, zone, sc, priority);
920 } 941 }
921 942
922 if (nr_inactive) { 943 if (nr_inactive) {
@@ -966,9 +987,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
966 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 987 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
967 continue; 988 continue;
968 989
969 zone->temp_priority = priority; 990 note_zone_scanning_priority(zone, priority);
970 if (zone->prev_priority > priority)
971 zone->prev_priority = priority;
972 991
973 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 992 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
974 continue; /* Let kswapd poll it */ 993 continue; /* Let kswapd poll it */
@@ -1018,7 +1037,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1018 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1037 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1019 continue; 1038 continue;
1020 1039
1021 zone->temp_priority = DEF_PRIORITY;
1022 lru_pages += zone->nr_active + zone->nr_inactive; 1040 lru_pages += zone->nr_active + zone->nr_inactive;
1023 } 1041 }
1024 1042
@@ -1053,19 +1071,28 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1053 1071
1054 /* Take a nap, wait for some writeback to complete */ 1072 /* Take a nap, wait for some writeback to complete */
1055 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1073 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1056 blk_congestion_wait(WRITE, HZ/10); 1074 congestion_wait(WRITE, HZ/10);
1057 } 1075 }
1058 /* top priority shrink_caches still had more to do? don't OOM, then */ 1076 /* top priority shrink_caches still had more to do? don't OOM, then */
1059 if (!sc.all_unreclaimable) 1077 if (!sc.all_unreclaimable)
1060 ret = 1; 1078 ret = 1;
1061out: 1079out:
1080 /*
1081 * Now that we've scanned all the zones at this priority level, note
1082 * that level within the zone so that the next thread which performs
1083 * scanning of this zone will immediately start out at this priority
1084 * level. This affects only the decision whether or not to bring
1085 * mapped pages onto the inactive list.
1086 */
1087 if (priority < 0)
1088 priority = 0;
1062 for (i = 0; zones[i] != 0; i++) { 1089 for (i = 0; zones[i] != 0; i++) {
1063 struct zone *zone = zones[i]; 1090 struct zone *zone = zones[i];
1064 1091
1065 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1092 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1066 continue; 1093 continue;
1067 1094
1068 zone->prev_priority = zone->temp_priority; 1095 zone->prev_priority = priority;
1069 } 1096 }
1070 return ret; 1097 return ret;
1071} 1098}
@@ -1105,6 +1132,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1105 .swap_cluster_max = SWAP_CLUSTER_MAX, 1132 .swap_cluster_max = SWAP_CLUSTER_MAX,
1106 .swappiness = vm_swappiness, 1133 .swappiness = vm_swappiness,
1107 }; 1134 };
1135 /*
1136 * temp_priority is used to remember the scanning priority at which
1137 * this zone was successfully refilled to free_pages == pages_high.
1138 */
1139 int temp_priority[MAX_NR_ZONES];
1108 1140
1109loop_again: 1141loop_again:
1110 total_scanned = 0; 1142 total_scanned = 0;
@@ -1112,11 +1144,8 @@ loop_again:
1112 sc.may_writepage = !laptop_mode; 1144 sc.may_writepage = !laptop_mode;
1113 count_vm_event(PAGEOUTRUN); 1145 count_vm_event(PAGEOUTRUN);
1114 1146
1115 for (i = 0; i < pgdat->nr_zones; i++) { 1147 for (i = 0; i < pgdat->nr_zones; i++)
1116 struct zone *zone = pgdat->node_zones + i; 1148 temp_priority[i] = DEF_PRIORITY;
1117
1118 zone->temp_priority = DEF_PRIORITY;
1119 }
1120 1149
1121 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1150 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1122 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1151 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -1144,11 +1173,12 @@ loop_again:
1144 if (!zone_watermark_ok(zone, order, zone->pages_high, 1173 if (!zone_watermark_ok(zone, order, zone->pages_high,
1145 0, 0)) { 1174 0, 0)) {
1146 end_zone = i; 1175 end_zone = i;
1147 goto scan; 1176 break;
1148 } 1177 }
1149 } 1178 }
1150 goto out; 1179 if (i < 0)
1151scan: 1180 goto out;
1181
1152 for (i = 0; i <= end_zone; i++) { 1182 for (i = 0; i <= end_zone; i++) {
1153 struct zone *zone = pgdat->node_zones + i; 1183 struct zone *zone = pgdat->node_zones + i;
1154 1184
@@ -1177,10 +1207,9 @@ scan:
1177 if (!zone_watermark_ok(zone, order, zone->pages_high, 1207 if (!zone_watermark_ok(zone, order, zone->pages_high,
1178 end_zone, 0)) 1208 end_zone, 0))
1179 all_zones_ok = 0; 1209 all_zones_ok = 0;
1180 zone->temp_priority = priority; 1210 temp_priority[i] = priority;
1181 if (zone->prev_priority > priority)
1182 zone->prev_priority = priority;
1183 sc.nr_scanned = 0; 1211 sc.nr_scanned = 0;
1212 note_zone_scanning_priority(zone, priority);
1184 nr_reclaimed += shrink_zone(priority, zone, &sc); 1213 nr_reclaimed += shrink_zone(priority, zone, &sc);
1185 reclaim_state->reclaimed_slab = 0; 1214 reclaim_state->reclaimed_slab = 0;
1186 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1215 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1208,7 +1237,7 @@ scan:
1208 * another pass across the zones. 1237 * another pass across the zones.
1209 */ 1238 */
1210 if (total_scanned && priority < DEF_PRIORITY - 2) 1239 if (total_scanned && priority < DEF_PRIORITY - 2)
1211 blk_congestion_wait(WRITE, HZ/10); 1240 congestion_wait(WRITE, HZ/10);
1212 1241
1213 /* 1242 /*
1214 * We do this so kswapd doesn't build up large priorities for 1243 * We do this so kswapd doesn't build up large priorities for
@@ -1220,13 +1249,21 @@ scan:
1220 break; 1249 break;
1221 } 1250 }
1222out: 1251out:
1252 /*
1253 * Note within each zone the priority level at which this zone was
1254 * brought into a happy state. So that the next thread which scans this
1255 * zone will start out at that priority level.
1256 */
1223 for (i = 0; i < pgdat->nr_zones; i++) { 1257 for (i = 0; i < pgdat->nr_zones; i++) {
1224 struct zone *zone = pgdat->node_zones + i; 1258 struct zone *zone = pgdat->node_zones + i;
1225 1259
1226 zone->prev_priority = zone->temp_priority; 1260 zone->prev_priority = temp_priority[i];
1227 } 1261 }
1228 if (!all_zones_ok) { 1262 if (!all_zones_ok) {
1229 cond_resched(); 1263 cond_resched();
1264
1265 try_to_freeze();
1266
1230 goto loop_again; 1267 goto loop_again;
1231 } 1268 }
1232 1269
@@ -1352,7 +1389,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1352 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1389 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1353 zone->nr_scan_active = 0; 1390 zone->nr_scan_active = 0;
1354 nr_to_scan = min(nr_pages, zone->nr_active); 1391 nr_to_scan = min(nr_pages, zone->nr_active);
1355 shrink_active_list(nr_to_scan, zone, sc); 1392 shrink_active_list(nr_to_scan, zone, sc, prio);
1356 } 1393 }
1357 } 1394 }
1358 1395
@@ -1452,7 +1489,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1452 goto out; 1489 goto out;
1453 1490
1454 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1455 blk_congestion_wait(WRITE, HZ / 10); 1492 congestion_wait(WRITE, HZ / 10);
1456 } 1493 }
1457 1494
1458 lru_pages = 0; 1495 lru_pages = 0;
@@ -1476,7 +1513,6 @@ out:
1476} 1513}
1477#endif 1514#endif
1478 1515
1479#ifdef CONFIG_HOTPLUG_CPU
1480/* It's optimal to keep kswapds on the same CPUs as their memory, but 1516/* It's optimal to keep kswapds on the same CPUs as their memory, but
1481 not required for correctness. So if the last cpu in a node goes 1517 not required for correctness. So if the last cpu in a node goes
1482 away, we get changed to run anywhere: as the first one comes back, 1518 away, we get changed to run anywhere: as the first one comes back,
@@ -1497,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1497 } 1533 }
1498 return NOTIFY_OK; 1534 return NOTIFY_OK;
1499} 1535}
1500#endif /* CONFIG_HOTPLUG_CPU */
1501 1536
1502/* 1537/*
1503 * This kswapd start function will be called by init and node-hot-add. 1538 * This kswapd start function will be called by init and node-hot-add.
@@ -1608,6 +1643,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1608 */ 1643 */
1609 priority = ZONE_RECLAIM_PRIORITY; 1644 priority = ZONE_RECLAIM_PRIORITY;
1610 do { 1645 do {
1646 note_zone_scanning_priority(zone, priority);
1611 nr_reclaimed += shrink_zone(priority, zone, &sc); 1647 nr_reclaimed += shrink_zone(priority, zone, &sc);
1612 priority--; 1648 priority--;
1613 } while (priority >= 0 && nr_reclaimed < nr_pages); 1649 } while (priority >= 0 && nr_reclaimed < nr_pages);
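[Annotation] The vmscan changes above replace the racy zone->temp_priority with an on-stack temp_priority[] in balance_pgdat() and a helper, note_zone_scanning_priority(), that only ever lowers zone->prev_priority while scanning; prev_priority in turn feeds the "distress" term that decides how aggressively mapped pages are reclaimed. A userspace model of that interaction, with DEF_PRIORITY as in vmscan.c:

	#include <stdio.h>

	#define DEF_PRIORITY 12

	struct zone_model {
		int prev_priority;	/* lowest (most urgent) priority recently used */
	};

	static int min_int(int a, int b) { return a < b ? a : b; }

	/* Lower value = more urgent; only remember increases in urgency. */
	static void note_zone_scanning_priority(struct zone_model *zone, int priority)
	{
		if (priority < zone->prev_priority)
			zone->prev_priority = priority;
	}

	/* 0 = no trouble reclaiming, 100 = great trouble (see shrink_active_list). */
	static int distress(struct zone_model *zone, int priority)
	{
		return 100 >> min_int(zone->prev_priority, priority);
	}

	int main(void)
	{
		struct zone_model zone = { .prev_priority = DEF_PRIORITY };
		int priority;

		for (priority = DEF_PRIORITY; priority >= 0; priority--) {
			note_zone_scanning_priority(&zone, priority);
			printf("priority %2d -> distress %d\n",
			       priority, distress(&zone, priority));
		}
		return 0;
	}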
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 45b124e012..dc005a0c96 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
430 return 0; 430 return 0;
431} 431}
432 432
433struct seq_operations fragmentation_op = { 433const struct seq_operations fragmentation_op = {
434 .start = frag_start, 434 .start = frag_start,
435 .next = frag_next, 435 .next = frag_next,
436 .stop = frag_stop, 436 .stop = frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
452#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ 452#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
453 TEXT_FOR_HIGHMEM(xx) 453 TEXT_FOR_HIGHMEM(xx)
454 454
455static char *vmstat_text[] = { 455static const char * const vmstat_text[] = {
456 /* Zoned VM counters */ 456 /* Zoned VM counters */
457 "nr_anon_pages", 457 "nr_anon_pages",
458 "nr_mapped", 458 "nr_mapped",
@@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
587 seq_printf(m, 587 seq_printf(m,
588 "\n all_unreclaimable: %u" 588 "\n all_unreclaimable: %u"
589 "\n prev_priority: %i" 589 "\n prev_priority: %i"
590 "\n temp_priority: %i"
591 "\n start_pfn: %lu", 590 "\n start_pfn: %lu",
592 zone->all_unreclaimable, 591 zone->all_unreclaimable,
593 zone->prev_priority, 592 zone->prev_priority,
594 zone->temp_priority,
595 zone->zone_start_pfn); 593 zone->zone_start_pfn);
596 spin_unlock_irqrestore(&zone->lock, flags); 594 spin_unlock_irqrestore(&zone->lock, flags);
597 seq_putc(m, '\n'); 595 seq_putc(m, '\n');
@@ -599,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
599 return 0; 597 return 0;
600} 598}
601 599
602struct seq_operations zoneinfo_op = { 600const struct seq_operations zoneinfo_op = {
603 .start = frag_start, /* iterate over all zones. The same as in 601 .start = frag_start, /* iterate over all zones. The same as in
604 * fragmentation. */ 602 * fragmentation. */
605 .next = frag_next, 603 .next = frag_next,
@@ -662,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
662 m->private = NULL; 660 m->private = NULL;
663} 661}
664 662
665struct seq_operations vmstat_op = { 663const struct seq_operations vmstat_op = {
666 .start = vmstat_start, 664 .start = vmstat_start,
667 .next = vmstat_next, 665 .next = vmstat_next,
668 .stop = vmstat_stop, 666 .stop = vmstat_stop,
@@ -681,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
681 void *hcpu) 679 void *hcpu)
682{ 680{
683 switch (action) { 681 switch (action) {
684 case CPU_UP_PREPARE: 682 case CPU_UP_PREPARE:
685 case CPU_UP_CANCELED: 683 case CPU_UP_CANCELED:
686 case CPU_DEAD: 684 case CPU_DEAD:
687 refresh_zone_stat_thresholds(); 685 refresh_zone_stat_thresholds();
688 break; 686 break;
689 default: 687 default:
690 break; 688 break;
691 } 689 }
692 return NOTIFY_OK; 690 return NOTIFY_OK;
693} 691}
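[Annotation] A recurring cleanup in this patch (slabinfo_op, slabstats_op, swaps_op, proc_swaps_operations, fragmentation_op, zoneinfo_op, vmstat_op, vmstat_text) is constifying method tables and string arrays so they end up in read-only data. The two idioms, shown with invented illustrative types in plain C:

	#include <stdio.h>

	struct seq_ops_like {
		void (*start)(void);
		void (*stop)(void);
	};

	static void demo_start(void) { puts("start"); }
	static void demo_stop(void)  { puts("stop"); }

	/* The ops table itself is read-only: its function pointers cannot be
	 * overwritten at run time and the table can live in .rodata. */
	static const struct seq_ops_like demo_op = {
		.start = demo_start,
		.stop  = demo_stop,
	};

	/* "const char * const": both the array of pointers and the strings they
	 * point to are read-only, like the new vmstat_text[]. */
	static const char * const text[] = { "nr_anon_pages", "nr_mapped" };

	int main(void)
	{
		demo_op.start();
		printf("%s\n", text[0]);
		demo_op.stop();
		return 0;
	}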