author     Dave Jones <davej@redhat.com>  2006-12-12 17:41:41 -0500
committer  Dave Jones <davej@redhat.com>  2006-12-12 17:41:41 -0500
commit     c4366889dda8110247be59ca41fddb82951a8c26 (patch)
tree       705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /mm
parent     db2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff)
parent     e1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Conflicts:
drivers/cpufreq/cpufreq.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |   3
-rw-r--r--  mm/allocpercpu.c     |   9
-rw-r--r--  mm/backing-dev.c     |  69
-rw-r--r--  mm/bootmem.c         |   6
-rw-r--r--  mm/fadvise.c         |   2
-rw-r--r--  mm/filemap.c         | 122
-rw-r--r--  mm/filemap_xip.c     |   2
-rw-r--r--  mm/fremap.c          |   2
-rw-r--r--  mm/hugetlb.c         |  25
-rw-r--r--  mm/memory.c          |  36
-rw-r--r--  mm/memory_hotplug.c  |   1
-rw-r--r--  mm/mempolicy.c       |  12
-rw-r--r--  mm/migrate.c         |  22
-rw-r--r--  mm/mlock.c           |   2
-rw-r--r--  mm/mmap.c            |  19
-rw-r--r--  mm/mmzone.c          |   5
-rw-r--r--  mm/nommu.c           |  30
-rw-r--r--  mm/oom_kill.c        |  42
-rw-r--r--  mm/page-writeback.c  | 106
-rw-r--r--  mm/page_alloc.c      | 409
-rw-r--r--  mm/page_io.c         |  45
-rw-r--r--  mm/pdflush.c         |   1
-rw-r--r--  mm/readahead.c       |  14
-rw-r--r--  mm/rmap.c            |  36
-rw-r--r--  mm/shmem.c           | 112
-rw-r--r--  mm/slab.c            | 389
-rw-r--r--  mm/sparse.c          |  25
-rw-r--r--  mm/swap.c            |  10
-rw-r--r--  mm/swapfile.c        |  96
-rw-r--r--  mm/thrash.c          | 116
-rw-r--r--  mm/tiny-shmem.c      |   4
-rw-r--r--  mm/truncate.c        |   5
-rw-r--r--  mm/vmalloc.c         |  54
-rw-r--r--  mm/vmscan.c          |  88
-rw-r--r--  mm/vmstat.c          |  24
35 files changed, 1325 insertions, 618 deletions
diff --git a/mm/Makefile b/mm/Makefile index 12b3a4eee8..f3c077eb0b 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | $(mmu-y) | ||
14 | 15 | ||
15 | ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) | 16 | ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) |
16 | obj-y += bounce.o | 17 | obj-y += bounce.o |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index eaa9abeea5..b2486cf887 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c | |||
@@ -17,10 +17,9 @@ | |||
17 | void percpu_depopulate(void *__pdata, int cpu) | 17 | void percpu_depopulate(void *__pdata, int cpu) |
18 | { | 18 | { |
19 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 19 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
20 | if (pdata->ptrs[cpu]) { | 20 | |
21 | kfree(pdata->ptrs[cpu]); | 21 | kfree(pdata->ptrs[cpu]); |
22 | pdata->ptrs[cpu] = NULL; | 22 | pdata->ptrs[cpu] = NULL; |
23 | } | ||
24 | } | 23 | } |
25 | EXPORT_SYMBOL_GPL(percpu_depopulate); | 24 | EXPORT_SYMBOL_GPL(percpu_depopulate); |
26 | 25 | ||
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask); | |||
123 | */ | 122 | */ |
124 | void percpu_free(void *__pdata) | 123 | void percpu_free(void *__pdata) |
125 | { | 124 | { |
125 | if (unlikely(!__pdata)) | ||
126 | return; | ||
126 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); | 127 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); |
127 | kfree(__percpu_disguise(__pdata)); | 128 | kfree(__percpu_disguise(__pdata)); |
128 | } | 129 | } |
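Aside, not part of the commit: the percpu_free() hunk above makes the function ignore a NULL pointer, mirroring kfree() semantics. A kernel-style sketch of why callers benefit; every name below except percpu_free() is hypothetical:

        struct foo_ctx {                        /* hypothetical container */
                void *counters;                 /* per-CPU data; may be NULL if allocation failed */
                void *scratch;
        };

        static void foo_ctx_destroy(struct foo_ctx *ctx)
        {
                /* With NULL-tolerant percpu_free(), unwind paths need no guards. */
                percpu_free(ctx->counters);
                percpu_free(ctx->scratch);
        }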
diff --git a/mm/backing-dev.c b/mm/backing-dev.c new file mode 100644 index 0000000000..f50a2811f9 --- /dev/null +++ b/mm/backing-dev.c | |||
@@ -0,0 +1,69 @@ | |||
1 | |||
2 | #include <linux/wait.h> | ||
3 | #include <linux/backing-dev.h> | ||
4 | #include <linux/fs.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/module.h> | ||
7 | |||
8 | static wait_queue_head_t congestion_wqh[2] = { | ||
9 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | ||
10 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | ||
11 | }; | ||
12 | |||
13 | |||
14 | void clear_bdi_congested(struct backing_dev_info *bdi, int rw) | ||
15 | { | ||
16 | enum bdi_state bit; | ||
17 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | ||
18 | |||
19 | bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; | ||
20 | clear_bit(bit, &bdi->state); | ||
21 | smp_mb__after_clear_bit(); | ||
22 | if (waitqueue_active(wqh)) | ||
23 | wake_up(wqh); | ||
24 | } | ||
25 | EXPORT_SYMBOL(clear_bdi_congested); | ||
26 | |||
27 | void set_bdi_congested(struct backing_dev_info *bdi, int rw) | ||
28 | { | ||
29 | enum bdi_state bit; | ||
30 | |||
31 | bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; | ||
32 | set_bit(bit, &bdi->state); | ||
33 | } | ||
34 | EXPORT_SYMBOL(set_bdi_congested); | ||
35 | |||
36 | /** | ||
37 | * congestion_wait - wait for a backing_dev to become uncongested | ||
38 | * @rw: READ or WRITE | ||
39 | * @timeout: timeout in jiffies | ||
40 | * | ||
41 | * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit | ||
42 | * write congestion. If no backing_devs are congested then just wait for the | ||
43 | * next write to be completed. | ||
44 | */ | ||
45 | long congestion_wait(int rw, long timeout) | ||
46 | { | ||
47 | long ret; | ||
48 | DEFINE_WAIT(wait); | ||
49 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | ||
50 | |||
51 | prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); | ||
52 | ret = io_schedule_timeout(timeout); | ||
53 | finish_wait(wqh, &wait); | ||
54 | return ret; | ||
55 | } | ||
56 | EXPORT_SYMBOL(congestion_wait); | ||
57 | |||
58 | /** | ||
59 | * congestion_end - wake up sleepers on a congested backing_dev_info | ||
60 | * @rw: READ or WRITE | ||
61 | */ | ||
62 | void congestion_end(int rw) | ||
63 | { | ||
64 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | ||
65 | |||
66 | if (waitqueue_active(wqh)) | ||
67 | wake_up(wqh); | ||
68 | } | ||
69 | EXPORT_SYMBOL(congestion_end); | ||
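Aside, not part of the commit: congestion_wait(), added above, is how writers back off while block-device queues drain; the page-writeback.c hunks later in this diff call it as congestion_wait(WRITE, HZ/10). A sketch of that caller pattern, with hypothetical helpers:

        static void writeout_until_below_threshold(void)
        {
                while (over_dirty_threshold()) {        /* hypothetical predicate */
                        writeback_some_pages();         /* hypothetical work */
                        /* Sleep up to 100 ms, or less if a queue clears congestion sooner. */
                        congestion_wait(WRITE, HZ/10);
                }
        }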
diff --git a/mm/bootmem.c b/mm/bootmem.c index d53112fcb4..00a96970b2 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -27,8 +27,6 @@ unsigned long max_low_pfn; | |||
27 | unsigned long min_low_pfn; | 27 | unsigned long min_low_pfn; |
28 | unsigned long max_pfn; | 28 | unsigned long max_pfn; |
29 | 29 | ||
30 | EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ | ||
31 | |||
32 | static LIST_HEAD(bdata_list); | 30 | static LIST_HEAD(bdata_list); |
33 | #ifdef CONFIG_CRASH_DUMP | 31 | #ifdef CONFIG_CRASH_DUMP |
34 | /* | 32 | /* |
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
196 | if (limit && bdata->node_boot_start >= limit) | 194 | if (limit && bdata->node_boot_start >= limit) |
197 | return NULL; | 195 | return NULL; |
198 | 196 | ||
197 | /* on nodes without memory - bootmem_map is NULL */ | ||
198 | if (!bdata->node_bootmem_map) | ||
199 | return NULL; | ||
200 | |||
199 | end_pfn = bdata->node_low_pfn; | 201 | end_pfn = bdata->node_low_pfn; |
200 | limit = PFN_DOWN(limit); | 202 | limit = PFN_DOWN(limit); |
201 | if (limit && end_pfn > limit) | 203 | if (limit && end_pfn > limit) |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 168c78a121..0df4c899e9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
38 | if (!file) | 38 | if (!file) |
39 | return -EBADF; | 39 | return -EBADF; |
40 | 40 | ||
41 | if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { | 41 | if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { |
42 | ret = -ESPIPE; | 42 | ret = -ESPIPE; |
43 | goto out; | 43 | goto out; |
44 | } | 44 | } |
diff --git a/mm/filemap.c b/mm/filemap.c index 3464b681f8..8332c77b1b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
75 | * ->mmap_sem | 75 | * ->mmap_sem |
76 | * ->lock_page (access_process_vm) | 76 | * ->lock_page (access_process_vm) |
77 | * | 77 | * |
78 | * ->mmap_sem | 78 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->i_mutex (msync) | 79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 80 | * |
81 | * ->i_mutex | 81 | * ->i_mutex |
82 | * ->i_alloc_sem (various) | 82 | * ->i_alloc_sem (various) |
@@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
467 | } | 467 | } |
468 | 468 | ||
469 | #ifdef CONFIG_NUMA | 469 | #ifdef CONFIG_NUMA |
470 | struct page *page_cache_alloc(struct address_space *x) | 470 | struct page *__page_cache_alloc(gfp_t gfp) |
471 | { | 471 | { |
472 | if (cpuset_do_page_mem_spread()) { | 472 | if (cpuset_do_page_mem_spread()) { |
473 | int n = cpuset_mem_spread_node(); | 473 | int n = cpuset_mem_spread_node(); |
474 | return alloc_pages_node(n, mapping_gfp_mask(x), 0); | 474 | return alloc_pages_node(n, gfp, 0); |
475 | } | 475 | } |
476 | return alloc_pages(mapping_gfp_mask(x), 0); | 476 | return alloc_pages(gfp, 0); |
477 | } | 477 | } |
478 | EXPORT_SYMBOL(page_cache_alloc); | 478 | EXPORT_SYMBOL(__page_cache_alloc); |
479 | |||
480 | struct page *page_cache_alloc_cold(struct address_space *x) | ||
481 | { | ||
482 | if (cpuset_do_page_mem_spread()) { | ||
483 | int n = cpuset_mem_spread_node(); | ||
484 | return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
485 | } | ||
486 | return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
487 | } | ||
488 | EXPORT_SYMBOL(page_cache_alloc_cold); | ||
489 | #endif | 479 | #endif |
490 | 480 | ||
491 | static int __sleep_on_page_lock(void *word) | 481 | static int __sleep_on_page_lock(void *word) |
@@ -826,7 +816,6 @@ struct page * | |||
826 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | 816 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) |
827 | { | 817 | { |
828 | struct page *page = find_get_page(mapping, index); | 818 | struct page *page = find_get_page(mapping, index); |
829 | gfp_t gfp_mask; | ||
830 | 819 | ||
831 | if (page) { | 820 | if (page) { |
832 | if (!TestSetPageLocked(page)) | 821 | if (!TestSetPageLocked(page)) |
@@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | |||
834 | page_cache_release(page); | 823 | page_cache_release(page); |
835 | return NULL; | 824 | return NULL; |
836 | } | 825 | } |
837 | gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; | 826 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); |
838 | page = alloc_pages(gfp_mask, 0); | 827 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { |
839 | if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { | ||
840 | page_cache_release(page); | 828 | page_cache_release(page); |
841 | page = NULL; | 829 | page = NULL; |
842 | } | 830 | } |
@@ -1193,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1193 | if (pos < size) { | 1181 | if (pos < size) { |
1194 | retval = generic_file_direct_IO(READ, iocb, | 1182 | retval = generic_file_direct_IO(READ, iocb, |
1195 | iov, pos, nr_segs); | 1183 | iov, pos, nr_segs); |
1196 | if (retval > 0 && !is_sync_kiocb(iocb)) | ||
1197 | retval = -EIOCBQUEUED; | ||
1198 | if (retval > 0) | 1184 | if (retval > 0) |
1199 | *ppos = pos + retval; | 1185 | *ppos = pos + retval; |
1200 | } | 1186 | } |
@@ -1457,7 +1443,6 @@ no_cached_page: | |||
1457 | * effect. | 1443 | * effect. |
1458 | */ | 1444 | */ |
1459 | error = page_cache_read(file, pgoff); | 1445 | error = page_cache_read(file, pgoff); |
1460 | grab_swap_token(); | ||
1461 | 1446 | ||
1462 | /* | 1447 | /* |
1463 | * The page we want has now been added to the page cache. | 1448 | * The page we want has now been added to the page cache. |
@@ -1884,11 +1869,10 @@ repeat: | |||
1884 | * if suid or (sgid and xgrp) | 1869 | * if suid or (sgid and xgrp) |
1885 | * remove privs | 1870 | * remove privs |
1886 | */ | 1871 | */ |
1887 | int remove_suid(struct dentry *dentry) | 1872 | int should_remove_suid(struct dentry *dentry) |
1888 | { | 1873 | { |
1889 | mode_t mode = dentry->d_inode->i_mode; | 1874 | mode_t mode = dentry->d_inode->i_mode; |
1890 | int kill = 0; | 1875 | int kill = 0; |
1891 | int result = 0; | ||
1892 | 1876 | ||
1893 | /* suid always must be killed */ | 1877 | /* suid always must be killed */ |
1894 | if (unlikely(mode & S_ISUID)) | 1878 | if (unlikely(mode & S_ISUID)) |
@@ -1901,13 +1885,29 @@ int remove_suid(struct dentry *dentry) | |||
1901 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | 1885 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) |
1902 | kill |= ATTR_KILL_SGID; | 1886 | kill |= ATTR_KILL_SGID; |
1903 | 1887 | ||
1904 | if (unlikely(kill && !capable(CAP_FSETID))) { | 1888 | if (unlikely(kill && !capable(CAP_FSETID))) |
1905 | struct iattr newattrs; | 1889 | return kill; |
1906 | 1890 | ||
1907 | newattrs.ia_valid = ATTR_FORCE | kill; | 1891 | return 0; |
1908 | result = notify_change(dentry, &newattrs); | 1892 | } |
1909 | } | 1893 | EXPORT_SYMBOL(should_remove_suid); |
1910 | return result; | 1894 | |
1895 | int __remove_suid(struct dentry *dentry, int kill) | ||
1896 | { | ||
1897 | struct iattr newattrs; | ||
1898 | |||
1899 | newattrs.ia_valid = ATTR_FORCE | kill; | ||
1900 | return notify_change(dentry, &newattrs); | ||
1901 | } | ||
1902 | |||
1903 | int remove_suid(struct dentry *dentry) | ||
1904 | { | ||
1905 | int kill = should_remove_suid(dentry); | ||
1906 | |||
1907 | if (unlikely(kill)) | ||
1908 | return __remove_suid(dentry, kill); | ||
1909 | |||
1910 | return 0; | ||
1911 | } | 1911 | } |
1912 | EXPORT_SYMBOL(remove_suid); | 1912 | EXPORT_SYMBOL(remove_suid); |
1913 | 1913 | ||
@@ -2045,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2045 | * Sync the fs metadata but not the minor inode changes and | 2045 | * Sync the fs metadata but not the minor inode changes and |
2046 | * of course not the data as we did direct DMA for the IO. | 2046 | * of course not the data as we did direct DMA for the IO. |
2047 | * i_mutex is held, which protects generic_osync_inode() from | 2047 | * i_mutex is held, which protects generic_osync_inode() from |
2048 | * livelocking. | 2048 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
2049 | */ | 2049 | */ |
2050 | if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2050 | if ((written >= 0 || written == -EIOCBQUEUED) && |
2051 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
2051 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2052 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
2052 | if (err < 0) | 2053 | if (err < 0) |
2053 | written = err; | 2054 | written = err; |
2054 | } | 2055 | } |
2055 | if (written == count && !is_sync_kiocb(iocb)) | ||
2056 | written = -EIOCBQUEUED; | ||
2057 | return written; | 2056 | return written; |
2058 | } | 2057 | } |
2059 | EXPORT_SYMBOL(generic_file_direct_write); | 2058 | EXPORT_SYMBOL(generic_file_direct_write); |
@@ -2222,7 +2221,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2222 | unsigned long nr_segs, loff_t *ppos) | 2221 | unsigned long nr_segs, loff_t *ppos) |
2223 | { | 2222 | { |
2224 | struct file *file = iocb->ki_filp; | 2223 | struct file *file = iocb->ki_filp; |
2225 | const struct address_space * mapping = file->f_mapping; | 2224 | struct address_space * mapping = file->f_mapping; |
2226 | size_t ocount; /* original count */ | 2225 | size_t ocount; /* original count */ |
2227 | size_t count; /* after file limit checks */ | 2226 | size_t count; /* after file limit checks */ |
2228 | struct inode *inode = mapping->host; | 2227 | struct inode *inode = mapping->host; |
@@ -2267,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2267 | if (count == 0) | 2266 | if (count == 0) |
2268 | goto out; | 2267 | goto out; |
2269 | 2268 | ||
2270 | err = remove_suid(file->f_dentry); | 2269 | err = remove_suid(file->f_path.dentry); |
2271 | if (err) | 2270 | if (err) |
2272 | goto out; | 2271 | goto out; |
2273 | 2272 | ||
@@ -2275,8 +2274,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2275 | 2274 | ||
2276 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2275 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
2277 | if (unlikely(file->f_flags & O_DIRECT)) { | 2276 | if (unlikely(file->f_flags & O_DIRECT)) { |
2278 | written = generic_file_direct_write(iocb, iov, | 2277 | loff_t endbyte; |
2279 | &nr_segs, pos, ppos, count, ocount); | 2278 | ssize_t written_buffered; |
2279 | |||
2280 | written = generic_file_direct_write(iocb, iov, &nr_segs, pos, | ||
2281 | ppos, count, ocount); | ||
2280 | if (written < 0 || written == count) | 2282 | if (written < 0 || written == count) |
2281 | goto out; | 2283 | goto out; |
2282 | /* | 2284 | /* |
@@ -2285,10 +2287,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2285 | */ | 2287 | */ |
2286 | pos += written; | 2288 | pos += written; |
2287 | count -= written; | 2289 | count -= written; |
2288 | } | 2290 | written_buffered = generic_file_buffered_write(iocb, iov, |
2291 | nr_segs, pos, ppos, count, | ||
2292 | written); | ||
2293 | /* | ||
2294 | * If generic_file_buffered_write() retuned a synchronous error | ||
2295 | * then we want to return the number of bytes which were | ||
2296 | * direct-written, or the error code if that was zero. Note | ||
2297 | * that this differs from normal direct-io semantics, which | ||
2298 | * will return -EFOO even if some bytes were written. | ||
2299 | */ | ||
2300 | if (written_buffered < 0) { | ||
2301 | err = written_buffered; | ||
2302 | goto out; | ||
2303 | } | ||
2289 | 2304 | ||
2290 | written = generic_file_buffered_write(iocb, iov, nr_segs, | 2305 | /* |
2291 | pos, ppos, count, written); | 2306 | * We need to ensure that the page cache pages are written to |
2307 | * disk and invalidated to preserve the expected O_DIRECT | ||
2308 | * semantics. | ||
2309 | */ | ||
2310 | endbyte = pos + written_buffered - written - 1; | ||
2311 | err = do_sync_file_range(file, pos, endbyte, | ||
2312 | SYNC_FILE_RANGE_WAIT_BEFORE| | ||
2313 | SYNC_FILE_RANGE_WRITE| | ||
2314 | SYNC_FILE_RANGE_WAIT_AFTER); | ||
2315 | if (err == 0) { | ||
2316 | written = written_buffered; | ||
2317 | invalidate_mapping_pages(mapping, | ||
2318 | pos >> PAGE_CACHE_SHIFT, | ||
2319 | endbyte >> PAGE_CACHE_SHIFT); | ||
2320 | } else { | ||
2321 | /* | ||
2322 | * We don't know how much we wrote, so just return | ||
2323 | * the number of bytes which were direct-written | ||
2324 | */ | ||
2325 | } | ||
2326 | } else { | ||
2327 | written = generic_file_buffered_write(iocb, iov, nr_segs, | ||
2328 | pos, ppos, count, written); | ||
2329 | } | ||
2292 | out: | 2330 | out: |
2293 | current->backing_dev_info = NULL; | 2331 | current->backing_dev_info = NULL; |
2294 | return written ? written : err; | 2332 | return written ? written : err; |
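Aside, not part of the commit: the filemap.c hunks above split remove_suid() into should_remove_suid(), a side-effect-free check, and __remove_suid(), which applies the attribute change. A sketch of the calling pattern this enables; names other than those two functions are hypothetical:

        static int example_kill_suid(struct dentry *dentry)
        {
                int kill = should_remove_suid(dentry);  /* cheap check, no attribute change yet */

                if (!kill)
                        return 0;
                /* A filesystem could run this step inside its own transaction. */
                return __remove_suid(dentry, kill);
        }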
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b4fd0d7c9b..8d667617f5 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
379 | if (count == 0) | 379 | if (count == 0) |
380 | goto out_backing; | 380 | goto out_backing; |
381 | 381 | ||
382 | ret = remove_suid(filp->f_dentry); | 382 | ret = remove_suid(filp->f_path.dentry); |
383 | if (ret) | 383 | if (ret) |
384 | goto out_backing; | 384 | goto out_backing; |
385 | 385 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index 7a9d0f5d24..b77a002c33 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
101 | { | 101 | { |
102 | int err = -ENOMEM; | 102 | int err = -ENOMEM; |
103 | pte_t *pte; | 103 | pte_t *pte; |
104 | pte_t pte_val; | ||
105 | spinlock_t *ptl; | 104 | spinlock_t *ptl; |
106 | 105 | ||
107 | pte = get_locked_pte(mm, addr, &ptl); | 106 | pte = get_locked_pte(mm, addr, &ptl); |
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | } | 113 | } |
115 | 114 | ||
116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
117 | pte_val = *pte; | ||
118 | /* | 116 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | 117 | * We don't need to run update_mmu_cache() here because the "file pte" |
120 | * being installed by install_file_pte() is not a real pte - it's a | 118 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2dbec90dc3..0ccc7f2302 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) | |||
109 | if (nid == MAX_NUMNODES) | 109 | if (nid == MAX_NUMNODES) |
110 | nid = first_node(node_online_map); | 110 | nid = first_node(node_online_map); |
111 | if (page) { | 111 | if (page) { |
112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | 112 | set_compound_page_dtor(page, free_huge_page); |
113 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
114 | nr_huge_pages++; | 114 | nr_huge_pages++; |
115 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
344 | entry = *src_pte; | 344 | entry = *src_pte; |
345 | ptepage = pte_page(entry); | 345 | ptepage = pte_page(entry); |
346 | get_page(ptepage); | 346 | get_page(ptepage); |
347 | add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
348 | set_huge_pte_at(dst, addr, dst_pte, entry); | 347 | set_huge_pte_at(dst, addr, dst_pte, entry); |
349 | } | 348 | } |
350 | spin_unlock(&src->page_table_lock); | 349 | spin_unlock(&src->page_table_lock); |
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
365 | pte_t pte; | 364 | pte_t pte; |
366 | struct page *page; | 365 | struct page *page; |
367 | struct page *tmp; | 366 | struct page *tmp; |
367 | /* | ||
368 | * A page gathering list, protected by per file i_mmap_lock. The | ||
369 | * lock is used to avoid list corruption from multiple unmapping | ||
370 | * of the same page since we are using page->lru. | ||
371 | */ | ||
368 | LIST_HEAD(page_list); | 372 | LIST_HEAD(page_list); |
369 | 373 | ||
370 | WARN_ON(!is_vm_hugetlb_page(vma)); | 374 | WARN_ON(!is_vm_hugetlb_page(vma)); |
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
372 | BUG_ON(end & ~HPAGE_MASK); | 376 | BUG_ON(end & ~HPAGE_MASK); |
373 | 377 | ||
374 | spin_lock(&mm->page_table_lock); | 378 | spin_lock(&mm->page_table_lock); |
375 | |||
376 | /* Update high watermark before we lower rss */ | ||
377 | update_hiwater_rss(mm); | ||
378 | |||
379 | for (address = start; address < end; address += HPAGE_SIZE) { | 379 | for (address = start; address < end; address += HPAGE_SIZE) { |
380 | ptep = huge_pte_offset(mm, address); | 380 | ptep = huge_pte_offset(mm, address); |
381 | if (!ptep) | 381 | if (!ptep) |
382 | continue; | 382 | continue; |
383 | 383 | ||
384 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
385 | continue; | ||
386 | |||
384 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 387 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
385 | if (pte_none(pte)) | 388 | if (pte_none(pte)) |
386 | continue; | 389 | continue; |
387 | 390 | ||
388 | page = pte_page(pte); | 391 | page = pte_page(pte); |
389 | list_add(&page->lru, &page_list); | 392 | list_add(&page->lru, &page_list); |
390 | add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); | ||
391 | } | 393 | } |
392 | |||
393 | spin_unlock(&mm->page_table_lock); | 394 | spin_unlock(&mm->page_table_lock); |
394 | flush_tlb_range(vma, start, end); | 395 | flush_tlb_range(vma, start, end); |
395 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 396 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
@@ -478,6 +479,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
478 | retry: | 479 | retry: |
479 | page = find_lock_page(mapping, idx); | 480 | page = find_lock_page(mapping, idx); |
480 | if (!page) { | 481 | if (!page) { |
482 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | ||
483 | if (idx >= size) | ||
484 | goto out; | ||
481 | if (hugetlb_get_quota(mapping)) | 485 | if (hugetlb_get_quota(mapping)) |
482 | goto out; | 486 | goto out; |
483 | page = alloc_huge_page(vma, address); | 487 | page = alloc_huge_page(vma, address); |
@@ -512,7 +516,6 @@ retry: | |||
512 | if (!pte_none(*ptep)) | 516 | if (!pte_none(*ptep)) |
513 | goto backout; | 517 | goto backout; |
514 | 518 | ||
515 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
516 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 519 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
517 | && (vma->vm_flags & VM_SHARED))); | 520 | && (vma->vm_flags & VM_SHARED))); |
518 | set_huge_pte_at(mm, address, ptep, new_pte); | 521 | set_huge_pte_at(mm, address, ptep, new_pte); |
@@ -650,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
650 | BUG_ON(address >= end); | 653 | BUG_ON(address >= end); |
651 | flush_cache_range(vma, address, end); | 654 | flush_cache_range(vma, address, end); |
652 | 655 | ||
656 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
653 | spin_lock(&mm->page_table_lock); | 657 | spin_lock(&mm->page_table_lock); |
654 | for (; address < end; address += HPAGE_SIZE) { | 658 | for (; address < end; address += HPAGE_SIZE) { |
655 | ptep = huge_pte_offset(mm, address); | 659 | ptep = huge_pte_offset(mm, address); |
656 | if (!ptep) | 660 | if (!ptep) |
657 | continue; | 661 | continue; |
662 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
663 | continue; | ||
658 | if (!pte_none(*ptep)) { | 664 | if (!pte_none(*ptep)) { |
659 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 665 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
660 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 666 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
@@ -663,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
663 | } | 669 | } |
664 | } | 670 | } |
665 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
672 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
666 | 673 | ||
667 | flush_tlb_range(vma, start, end); | 674 | flush_tlb_range(vma, start, end); |
668 | } | 675 | } |
diff --git a/mm/memory.c b/mm/memory.c index b5a4aadd96..bf6100236e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1110 | { | 1110 | { |
1111 | pte_t *pte; | 1111 | pte_t *pte; |
1112 | spinlock_t *ptl; | 1112 | spinlock_t *ptl; |
1113 | int err = 0; | ||
1113 | 1114 | ||
1114 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1115 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1115 | if (!pte) | 1116 | if (!pte) |
1116 | return -ENOMEM; | 1117 | return -EAGAIN; |
1117 | arch_enter_lazy_mmu_mode(); | 1118 | arch_enter_lazy_mmu_mode(); |
1118 | do { | 1119 | do { |
1119 | struct page *page = ZERO_PAGE(addr); | 1120 | struct page *page = ZERO_PAGE(addr); |
1120 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | 1121 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); |
1122 | |||
1123 | if (unlikely(!pte_none(*pte))) { | ||
1124 | err = -EEXIST; | ||
1125 | pte++; | ||
1126 | break; | ||
1127 | } | ||
1121 | page_cache_get(page); | 1128 | page_cache_get(page); |
1122 | page_add_file_rmap(page); | 1129 | page_add_file_rmap(page); |
1123 | inc_mm_counter(mm, file_rss); | 1130 | inc_mm_counter(mm, file_rss); |
1124 | BUG_ON(!pte_none(*pte)); | ||
1125 | set_pte_at(mm, addr, pte, zero_pte); | 1131 | set_pte_at(mm, addr, pte, zero_pte); |
1126 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1132 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1127 | arch_leave_lazy_mmu_mode(); | 1133 | arch_leave_lazy_mmu_mode(); |
1128 | pte_unmap_unlock(pte - 1, ptl); | 1134 | pte_unmap_unlock(pte - 1, ptl); |
1129 | return 0; | 1135 | return err; |
1130 | } | 1136 | } |
1131 | 1137 | ||
1132 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | 1138 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, |
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1134 | { | 1140 | { |
1135 | pmd_t *pmd; | 1141 | pmd_t *pmd; |
1136 | unsigned long next; | 1142 | unsigned long next; |
1143 | int err; | ||
1137 | 1144 | ||
1138 | pmd = pmd_alloc(mm, pud, addr); | 1145 | pmd = pmd_alloc(mm, pud, addr); |
1139 | if (!pmd) | 1146 | if (!pmd) |
1140 | return -ENOMEM; | 1147 | return -EAGAIN; |
1141 | do { | 1148 | do { |
1142 | next = pmd_addr_end(addr, end); | 1149 | next = pmd_addr_end(addr, end); |
1143 | if (zeromap_pte_range(mm, pmd, addr, next, prot)) | 1150 | err = zeromap_pte_range(mm, pmd, addr, next, prot); |
1144 | return -ENOMEM; | 1151 | if (err) |
1152 | break; | ||
1145 | } while (pmd++, addr = next, addr != end); | 1153 | } while (pmd++, addr = next, addr != end); |
1146 | return 0; | 1154 | return err; |
1147 | } | 1155 | } |
1148 | 1156 | ||
1149 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1157 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, |
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
1151 | { | 1159 | { |
1152 | pud_t *pud; | 1160 | pud_t *pud; |
1153 | unsigned long next; | 1161 | unsigned long next; |
1162 | int err; | ||
1154 | 1163 | ||
1155 | pud = pud_alloc(mm, pgd, addr); | 1164 | pud = pud_alloc(mm, pgd, addr); |
1156 | if (!pud) | 1165 | if (!pud) |
1157 | return -ENOMEM; | 1166 | return -EAGAIN; |
1158 | do { | 1167 | do { |
1159 | next = pud_addr_end(addr, end); | 1168 | next = pud_addr_end(addr, end); |
1160 | if (zeromap_pmd_range(mm, pud, addr, next, prot)) | 1169 | err = zeromap_pmd_range(mm, pud, addr, next, prot); |
1161 | return -ENOMEM; | 1170 | if (err) |
1171 | break; | ||
1162 | } while (pud++, addr = next, addr != end); | 1172 | } while (pud++, addr = next, addr != end); |
1163 | return 0; | 1173 | return err; |
1164 | } | 1174 | } |
1165 | 1175 | ||
1166 | int zeromap_page_range(struct vm_area_struct *vma, | 1176 | int zeromap_page_range(struct vm_area_struct *vma, |
@@ -1452,6 +1462,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
1452 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | 1462 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) |
1453 | memset(kaddr, 0, PAGE_SIZE); | 1463 | memset(kaddr, 0, PAGE_SIZE); |
1454 | kunmap_atomic(kaddr, KM_USER0); | 1464 | kunmap_atomic(kaddr, KM_USER0); |
1465 | flush_dcache_page(dst); | ||
1455 | return; | 1466 | return; |
1456 | 1467 | ||
1457 | } | 1468 | } |
@@ -1901,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
1901 | 1912 | ||
1902 | return 0; | 1913 | return 0; |
1903 | } | 1914 | } |
1904 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ | ||
1905 | 1915 | ||
1906 | /** | 1916 | /** |
1907 | * swapin_readahead - swap in pages in hope we need them soon | 1917 | * swapin_readahead - swap in pages in hope we need them soon |
@@ -1990,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1990 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2000 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
1991 | page = lookup_swap_cache(entry); | 2001 | page = lookup_swap_cache(entry); |
1992 | if (!page) { | 2002 | if (!page) { |
2003 | grab_swap_token(); /* Contend for token _before_ read-in */ | ||
1993 | swapin_readahead(entry, address, vma); | 2004 | swapin_readahead(entry, address, vma); |
1994 | page = read_swap_cache_async(entry, vma, address); | 2005 | page = read_swap_cache_async(entry, vma, address); |
1995 | if (!page) { | 2006 | if (!page) { |
@@ -2007,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2007 | /* Had to read the page from swap area: Major fault */ | 2018 | /* Had to read the page from swap area: Major fault */ |
2008 | ret = VM_FAULT_MAJOR; | 2019 | ret = VM_FAULT_MAJOR; |
2009 | count_vm_event(PGMAJFAULT); | 2020 | count_vm_event(PGMAJFAULT); |
2010 | grab_swap_token(); | ||
2011 | } | 2021 | } |
2012 | 2022 | ||
2013 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2023 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd678a662e..0c055a090f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); |
75 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | ||
76 | return 0; | 75 | return 0; |
77 | } | 76 | } |
78 | 77 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 617fb31086..da94639465 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
141 | enum zone_type k; | 141 | enum zone_type k; |
142 | 142 | ||
143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
144 | max++; /* space for zlcache_ptr (see mmzone.h) */ | ||
144 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 145 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); |
145 | if (!zl) | 146 | if (!zl) |
146 | return NULL; | 147 | return NULL; |
148 | zl->zlcache_ptr = NULL; | ||
147 | num = 0; | 149 | num = 0; |
148 | /* First put in the highest zones from all nodes, then all the next | 150 | /* First put in the highest zones from all nodes, then all the next |
149 | lower zones etc. Avoid empty zones because the memory allocator | 151 | lower zones etc. Avoid empty zones because the memory allocator |
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
219 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 221 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
220 | do { | 222 | do { |
221 | struct page *page; | 223 | struct page *page; |
222 | unsigned int nid; | 224 | int nid; |
223 | 225 | ||
224 | if (!pte_present(*pte)) | 226 | if (!pte_present(*pte)) |
225 | continue; | 227 | continue; |
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
1324 | atomic_set(&new->refcnt, 1); | 1326 | atomic_set(&new->refcnt, 1); |
1325 | if (new->policy == MPOL_BIND) { | 1327 | if (new->policy == MPOL_BIND) { |
1326 | int sz = ksize(old->v.zonelist); | 1328 | int sz = ksize(old->v.zonelist); |
1327 | new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); | 1329 | new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); |
1328 | if (!new->v.zonelist) { | 1330 | if (!new->v.zonelist) { |
1329 | kmem_cache_free(policy_cache, new); | 1331 | kmem_cache_free(policy_cache, new); |
1330 | return ERR_PTR(-ENOMEM); | 1332 | return ERR_PTR(-ENOMEM); |
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | |||
1705 | * Display pages allocated per node and memory policy via /proc. | 1707 | * Display pages allocated per node and memory policy via /proc. |
1706 | */ | 1708 | */ |
1707 | 1709 | ||
1708 | static const char *policy_types[] = { "default", "prefer", "bind", | 1710 | static const char * const policy_types[] = |
1709 | "interleave" }; | 1711 | { "default", "prefer", "bind", "interleave" }; |
1710 | 1712 | ||
1711 | /* | 1713 | /* |
1712 | * Convert a mempolicy into a string. | 1714 | * Convert a mempolicy into a string. |
@@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1855 | 1857 | ||
1856 | if (file) { | 1858 | if (file) { |
1857 | seq_printf(m, " file="); | 1859 | seq_printf(m, " file="); |
1858 | seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); | 1860 | seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); |
1859 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1861 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1860 | seq_printf(m, " heap"); | 1862 | seq_printf(m, " heap"); |
1861 | } else if (vma->vm_start <= mm->start_stack && | 1863 | } else if (vma->vm_start <= mm->start_stack && |
diff --git a/mm/migrate.c b/mm/migrate.c index ba2453f948..e9b161bde9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -294,7 +294,7 @@ out: | |||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | 294 | static int migrate_page_move_mapping(struct address_space *mapping, |
295 | struct page *newpage, struct page *page) | 295 | struct page *newpage, struct page *page) |
296 | { | 296 | { |
297 | struct page **radix_pointer; | 297 | void **pslot; |
298 | 298 | ||
299 | if (!mapping) { | 299 | if (!mapping) { |
300 | /* Anonymous page */ | 300 | /* Anonymous page */ |
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
305 | 305 | ||
306 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
307 | 307 | ||
308 | radix_pointer = (struct page **)radix_tree_lookup_slot( | 308 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
309 | &mapping->page_tree, | 309 | page_index(page)); |
310 | page_index(page)); | ||
311 | 310 | ||
312 | if (page_count(page) != 2 + !!PagePrivate(page) || | 311 | if (page_count(page) != 2 + !!PagePrivate(page) || |
313 | *radix_pointer != page) { | 312 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
314 | write_unlock_irq(&mapping->tree_lock); | 313 | write_unlock_irq(&mapping->tree_lock); |
315 | return -EAGAIN; | 314 | return -EAGAIN; |
316 | } | 315 | } |
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
318 | /* | 317 | /* |
319 | * Now we know that no one else is looking at the page. | 318 | * Now we know that no one else is looking at the page. |
320 | */ | 319 | */ |
321 | get_page(newpage); | 320 | get_page(newpage); /* add cache reference */ |
322 | #ifdef CONFIG_SWAP | 321 | #ifdef CONFIG_SWAP |
323 | if (PageSwapCache(page)) { | 322 | if (PageSwapCache(page)) { |
324 | SetPageSwapCache(newpage); | 323 | SetPageSwapCache(newpage); |
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
326 | } | 325 | } |
327 | #endif | 326 | #endif |
328 | 327 | ||
329 | *radix_pointer = newpage; | 328 | radix_tree_replace_slot(pslot, newpage); |
329 | |||
330 | /* | ||
331 | * Drop cache reference from old page. | ||
332 | * We know this isn't the last reference. | ||
333 | */ | ||
330 | __put_page(page); | 334 | __put_page(page); |
335 | |||
331 | write_unlock_irq(&mapping->tree_lock); | 336 | write_unlock_irq(&mapping->tree_lock); |
332 | 337 | ||
333 | return 0; | 338 | return 0; |
@@ -952,7 +957,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
952 | goto out; | 957 | goto out; |
953 | 958 | ||
954 | pm[i].node = node; | 959 | pm[i].node = node; |
955 | } | 960 | } else |
961 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | ||
956 | } | 962 | } |
957 | /* End marker */ | 963 | /* End marker */ |
958 | pm[nr_pages].node = MAX_NUMNODES; | 964 | pm[nr_pages].node = MAX_NUMNODES; |
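Aside, not part of the commit: migrate_page_move_mapping() above now goes through the radix-tree slot helpers instead of dereferencing a raw struct page **. A sketch isolating that access pattern; the wrapper itself is hypothetical:

        static void replace_page_slot(struct address_space *mapping, pgoff_t index,
                                      struct page *old_page, struct page *new_page)
        {
                void **pslot;

                /* Caller must hold mapping->tree_lock for writing. */
                pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
                if (pslot && radix_tree_deref_slot(pslot) == (void *)old_page)
                        radix_tree_replace_slot(pslot, new_page);
        }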
diff --git a/mm/mlock.c b/mm/mlock.c index b90c59573a..3446b7ef73 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -65,7 +65,7 @@ success: | |||
65 | ret = make_pages_present(start, end); | 65 | ret = make_pages_present(start, end); |
66 | } | 66 | } |
67 | 67 | ||
68 | vma->vm_mm->locked_vm -= pages; | 68 | mm->locked_vm -= pages; |
69 | out: | 69 | out: |
70 | if (ret == -ENOMEM) | 70 | if (ret == -ENOMEM) |
71 | ret = -EAGAIN; | 71 | ret = -EAGAIN; |
diff --git a/mm/mmap.c b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
188 | struct file *file, struct address_space *mapping) | 188 | struct file *file, struct address_space *mapping) |
189 | { | 189 | { |
190 | if (vma->vm_flags & VM_DENYWRITE) | 190 | if (vma->vm_flags & VM_DENYWRITE) |
191 | atomic_inc(&file->f_dentry->d_inode->i_writecount); | 191 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
192 | if (vma->vm_flags & VM_SHARED) | 192 | if (vma->vm_flags & VM_SHARED) |
193 | mapping->i_mmap_writable--; | 193 | mapping->i_mmap_writable--; |
194 | 194 | ||
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma) | |||
399 | struct address_space *mapping = file->f_mapping; | 399 | struct address_space *mapping = file->f_mapping; |
400 | 400 | ||
401 | if (vma->vm_flags & VM_DENYWRITE) | 401 | if (vma->vm_flags & VM_DENYWRITE) |
402 | atomic_dec(&file->f_dentry->d_inode->i_writecount); | 402 | atomic_dec(&file->f_path.dentry->d_inode->i_writecount); |
403 | if (vma->vm_flags & VM_SHARED) | 403 | if (vma->vm_flags & VM_SHARED) |
404 | mapping->i_mmap_writable++; | 404 | mapping->i_mmap_writable++; |
405 | 405 | ||
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
907 | * mounted, in which case we dont add PROT_EXEC.) | 907 | * mounted, in which case we dont add PROT_EXEC.) |
908 | */ | 908 | */ |
909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | 909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
910 | if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) | 910 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
911 | prot |= PROT_EXEC; | 911 | prot |= PROT_EXEC; |
912 | 912 | ||
913 | if (!len) | 913 | if (!len) |
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
960 | return -EAGAIN; | 960 | return -EAGAIN; |
961 | } | 961 | } |
962 | 962 | ||
963 | inode = file ? file->f_dentry->d_inode : NULL; | 963 | inode = file ? file->f_path.dentry->d_inode : NULL; |
964 | 964 | ||
965 | if (file) { | 965 | if (file) { |
966 | switch (flags & MAP_TYPE) { | 966 | switch (flags & MAP_TYPE) { |
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
989 | case MAP_PRIVATE: | 989 | case MAP_PRIVATE: |
990 | if (!(file->f_mode & FMODE_READ)) | 990 | if (!(file->f_mode & FMODE_READ)) |
991 | return -EACCES; | 991 | return -EACCES; |
992 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 992 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
993 | if (vm_flags & VM_EXEC) | 993 | if (vm_flags & VM_EXEC) |
994 | return -EPERM; | 994 | return -EPERM; |
995 | vm_flags &= ~VM_MAYEXEC; | 995 | vm_flags &= ~VM_MAYEXEC; |
@@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1379 | * Check if the given range is hugepage aligned, and | 1379 | * Check if the given range is hugepage aligned, and |
1380 | * can be made suitable for hugepages. | 1380 | * can be made suitable for hugepages. |
1381 | */ | 1381 | */ |
1382 | ret = prepare_hugepage_range(addr, len); | 1382 | ret = prepare_hugepage_range(addr, len, pgoff); |
1383 | } else { | 1383 | } else { |
1384 | /* | 1384 | /* |
1385 | * Ensure that a normal request is not falling in a | 1385 | * Ensure that a normal request is not falling in a |
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1736 | if (mm->map_count >= sysctl_max_map_count) | 1736 | if (mm->map_count >= sysctl_max_map_count) |
1737 | return -ENOMEM; | 1737 | return -ENOMEM; |
1738 | 1738 | ||
1739 | new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1739 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1740 | if (!new) | 1740 | if (!new) |
1741 | return -ENOMEM; | 1741 | return -ENOMEM; |
1742 | 1742 | ||
@@ -1880,6 +1880,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1880 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | 1880 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) |
1881 | return -EINVAL; | 1881 | return -EINVAL; |
1882 | 1882 | ||
1883 | if (is_hugepage_only_range(mm, addr, len)) | ||
1884 | return -EINVAL; | ||
1885 | |||
1883 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 1886 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
1884 | 1887 | ||
1885 | error = arch_mmap_check(addr, len, flags); | 1888 | error = arch_mmap_check(addr, len, flags); |
@@ -2054,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2054 | vma_start < new_vma->vm_end) | 2057 | vma_start < new_vma->vm_end) |
2055 | *vmap = new_vma; | 2058 | *vmap = new_vma; |
2056 | } else { | 2059 | } else { |
2057 | new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 2060 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2058 | if (new_vma) { | 2061 | if (new_vma) { |
2059 | *new_vma = *vma; | 2062 | *new_vma = *vma; |
2060 | pol = mpol_copy(vma_policy(vma)); | 2063 | pol = mpol_copy(vma_policy(vma)); |
diff --git a/mm/mmzone.c b/mm/mmzone.c index febea1c981..eb5838634f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void) | |||
14 | return NODE_DATA(first_online_node); | 14 | return NODE_DATA(first_online_node); |
15 | } | 15 | } |
16 | 16 | ||
17 | EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ | ||
18 | |||
19 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | 17 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) |
20 | { | 18 | { |
21 | int nid = next_online_node(pgdat->node_id); | 19 | int nid = next_online_node(pgdat->node_id); |
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | |||
24 | return NULL; | 22 | return NULL; |
25 | return NODE_DATA(nid); | 23 | return NODE_DATA(nid); |
26 | } | 24 | } |
27 | EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ | ||
28 | |||
29 | 25 | ||
30 | /* | 26 | /* |
31 | * next_zone - helper magic for for_each_zone() | 27 | * next_zone - helper magic for for_each_zone() |
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone) | |||
45 | } | 41 | } |
46 | return zone; | 42 | return zone; |
47 | } | 43 | } |
48 | EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ | ||
49 | 44 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index 8bdde9508f..23fb033e59 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file, | |||
497 | (flags & MAP_TYPE) != MAP_SHARED) | 497 | (flags & MAP_TYPE) != MAP_SHARED) |
498 | return -EINVAL; | 498 | return -EINVAL; |
499 | 499 | ||
500 | if (PAGE_ALIGN(len) == 0) | 500 | if (!len) |
501 | return addr; | ||
502 | |||
503 | if (len > TASK_SIZE) | ||
504 | return -EINVAL; | 501 | return -EINVAL; |
505 | 502 | ||
503 | /* Careful about overflows.. */ | ||
504 | len = PAGE_ALIGN(len); | ||
505 | if (!len || len > TASK_SIZE) | ||
506 | return -ENOMEM; | ||
507 | |||
506 | /* offset overflow? */ | 508 | /* offset overflow? */ |
507 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 509 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
508 | return -EINVAL; | 510 | return -EOVERFLOW; |
509 | 511 | ||
510 | if (file) { | 512 | if (file) { |
511 | /* validate file mapping requests */ | 513 | /* validate file mapping requests */ |
@@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file, | |||
521 | */ | 523 | */ |
522 | mapping = file->f_mapping; | 524 | mapping = file->f_mapping; |
523 | if (!mapping) | 525 | if (!mapping) |
524 | mapping = file->f_dentry->d_inode->i_mapping; | 526 | mapping = file->f_path.dentry->d_inode->i_mapping; |
525 | 527 | ||
526 | capabilities = 0; | 528 | capabilities = 0; |
527 | if (mapping && mapping->backing_dev_info) | 529 | if (mapping && mapping->backing_dev_info) |
@@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file, | |||
530 | if (!capabilities) { | 532 | if (!capabilities) { |
531 | /* no explicit capabilities set, so assume some | 533 | /* no explicit capabilities set, so assume some |
532 | * defaults */ | 534 | * defaults */ |
533 | switch (file->f_dentry->d_inode->i_mode & S_IFMT) { | 535 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { |
534 | case S_IFREG: | 536 | case S_IFREG: |
535 | case S_IFBLK: | 537 | case S_IFBLK: |
536 | capabilities = BDI_CAP_MAP_COPY; | 538 | capabilities = BDI_CAP_MAP_COPY; |
@@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file, | |||
561 | !(file->f_mode & FMODE_WRITE)) | 563 | !(file->f_mode & FMODE_WRITE)) |
562 | return -EACCES; | 564 | return -EACCES; |
563 | 565 | ||
564 | if (IS_APPEND(file->f_dentry->d_inode) && | 566 | if (IS_APPEND(file->f_path.dentry->d_inode) && |
565 | (file->f_mode & FMODE_WRITE)) | 567 | (file->f_mode & FMODE_WRITE)) |
566 | return -EACCES; | 568 | return -EACCES; |
567 | 569 | ||
568 | if (locks_verify_locked(file->f_dentry->d_inode)) | 570 | if (locks_verify_locked(file->f_path.dentry->d_inode)) |
569 | return -EAGAIN; | 571 | return -EAGAIN; |
570 | 572 | ||
571 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 573 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
@@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file, | |||
596 | 598 | ||
597 | /* handle executable mappings and implied executable | 599 | /* handle executable mappings and implied executable |
598 | * mappings */ | 600 | * mappings */ |
599 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 601 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
600 | if (prot & PROT_EXEC) | 602 | if (prot & PROT_EXEC) |
601 | return -EPERM; | 603 | return -EPERM; |
602 | } | 604 | } |
@@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
806 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 808 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
807 | 809 | ||
808 | /* we're going to need to record the mapping if it works */ | 810 | /* we're going to need to record the mapping if it works */ |
809 | vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 811 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); |
810 | if (!vml) | 812 | if (!vml) |
811 | goto error_getting_vml; | 813 | goto error_getting_vml; |
812 | memset(vml, 0, sizeof(*vml)); | ||
813 | 814 | ||
814 | down_write(&nommu_vma_sem); | 815 | down_write(&nommu_vma_sem); |
815 | 816 | ||
@@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
832 | continue; | 833 | continue; |
833 | 834 | ||
834 | /* search for overlapping mappings on the same file */ | 835 | /* search for overlapping mappings on the same file */ |
835 | if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) | 836 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) |
836 | continue; | 837 | continue; |
837 | 838 | ||
838 | if (vma->vm_pgoff >= pgoff + pglen) | 839 | if (vma->vm_pgoff >= pgoff + pglen) |
@@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
885 | } | 886 | } |
886 | 887 | ||
887 | /* we're going to need a VMA struct as well */ | 888 | /* we're going to need a VMA struct as well */ |
888 | vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | 889 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); |
889 | if (!vma) | 890 | if (!vma) |
890 | goto error_getting_vma; | 891 | goto error_getting_vma; |
891 | 892 | ||
892 | memset(vma, 0, sizeof(*vma)); | ||
893 | INIT_LIST_HEAD(&vma->anon_vma_node); | 893 | INIT_LIST_HEAD(&vma->anon_vma_node); |
894 | atomic_set(&vma->vm_usage, 1); | 894 | atomic_set(&vma->vm_usage, 1); |
895 | if (file) | 895 | if (file) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 20f41b082e..223d9ccb7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -15,6 +15,7 @@ | |||
15 | * kernel subsystems and hints as to where to find out what things do. | 15 | * kernel subsystems and hints as to where to find out what things do. |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include <linux/oom.h> | ||
18 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
19 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
20 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
@@ -263,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
263 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 264 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
264 | * set. | 265 | * set. |
265 | */ | 266 | */ |
266 | static void __oom_kill_task(struct task_struct *p, const char *message) | 267 | static void __oom_kill_task(struct task_struct *p, int verbose) |
267 | { | 268 | { |
268 | if (is_init(p)) { | 269 | if (is_init(p)) { |
269 | WARN_ON(1); | 270 | WARN_ON(1); |
@@ -277,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
277 | return; | 278 | return; |
278 | } | 279 | } |
279 | 280 | ||
280 | if (message) { | 281 | if (verbose) |
281 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | 282 | printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); |
282 | message, p->pid, p->comm); | ||
283 | } | ||
284 | 283 | ||
285 | /* | 284 | /* |
286 | * We give our sacrificial lamb high priority and access to | 285 | * We give our sacrificial lamb high priority and access to |
@@ -293,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
293 | force_sig(SIGKILL, p); | 292 | force_sig(SIGKILL, p); |
294 | } | 293 | } |
295 | 294 | ||
296 | static int oom_kill_task(struct task_struct *p, const char *message) | 295 | static int oom_kill_task(struct task_struct *p) |
297 | { | 296 | { |
298 | struct mm_struct *mm; | 297 | struct mm_struct *mm; |
299 | struct task_struct *g, *q; | 298 | struct task_struct *g, *q; |
@@ -312,15 +311,25 @@ static int oom_kill_task(struct task_struct *p, const char *message) | |||
312 | if (mm == NULL) | 311 | if (mm == NULL) |
313 | return 1; | 312 | return 1; |
314 | 313 | ||
315 | __oom_kill_task(p, message); | 314 | /* |
315 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
316 | */ | ||
317 | do_each_thread(g, q) { | ||
318 | if (q->mm == mm && p->oomkilladj == OOM_DISABLE) | ||
319 | return 1; | ||
320 | } while_each_thread(g, q); | ||
321 | |||
322 | __oom_kill_task(p, 1); | ||
323 | |||
316 | /* | 324 | /* |
317 | * kill all processes that share the ->mm (i.e. all threads), | 325 | * kill all processes that share the ->mm (i.e. all threads), |
318 | * but are in a different thread group | 326 | * but are in a different thread group. Don't let them have access |
327 | * to memory reserves though, otherwise we might deplete all memory. | ||
319 | */ | 328 | */ |
320 | do_each_thread(g, q) | 329 | do_each_thread(g, q) { |
321 | if (q->mm == mm && q->tgid != p->tgid) | 330 | if (q->mm == mm && q->tgid != p->tgid) |
322 | __oom_kill_task(q, message); | 331 | force_sig(SIGKILL, p); |
323 | while_each_thread(g, q); | 332 | } while_each_thread(g, q); |
324 | 333 | ||
325 | return 0; | 334 | return 0; |
326 | } | 335 | } |
@@ -336,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
336 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 345 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
337 | */ | 346 | */ |
338 | if (p->flags & PF_EXITING) { | 347 | if (p->flags & PF_EXITING) { |
339 | __oom_kill_task(p, NULL); | 348 | __oom_kill_task(p, 0); |
340 | return 0; | 349 | return 0; |
341 | } | 350 | } |
342 | 351 | ||
343 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" | 352 | printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", |
344 | " and children.\n", p->pid, p->comm, points); | 353 | message, p->pid, p->comm, points); |
354 | |||
345 | /* Try to kill a child first */ | 355 | /* Try to kill a child first */ |
346 | list_for_each(tsk, &p->children) { | 356 | list_for_each(tsk, &p->children) { |
347 | c = list_entry(tsk, struct task_struct, sibling); | 357 | c = list_entry(tsk, struct task_struct, sibling); |
348 | if (c->mm == p->mm) | 358 | if (c->mm == p->mm) |
349 | continue; | 359 | continue; |
350 | if (!oom_kill_task(c, message)) | 360 | if (!oom_kill_task(c)) |
351 | return 0; | 361 | return 0; |
352 | } | 362 | } |
353 | return oom_kill_task(p, message); | 363 | return oom_kill_task(p); |
354 | } | 364 | } |
355 | 365 | ||
356 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 366 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
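The oom_kill_task()/__oom_kill_task() interface change above is easiest to read as a mapping: the old message string becomes an int verbose flag, the descriptive line moves up into oom_kill_process(), the new do_each_thread() scan vetoes the kill when OOM_DISABLE is involved, and the loop over other thread groups now sends a plain SIGKILL so that, per the new comment, those tasks do not get access to memory reserves. A condensed sketch of the call sites after the change (taken from the hunks above, not new behaviour):

	/* PF_EXITING fast path in oom_kill_process(): silent, like the old NULL message */
	__oom_kill_task(p, 0);

	/* normal path via oom_kill_task(): logs "Killed process <pid> (<comm>)" */
	__oom_kill_task(p, 1);

	/* the per-kill context line now lives in oom_kill_process() instead: */
	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
	       message, p->pid, p->comm, points);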
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a0f3390574..237107c1b0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
24 | #include <linux/task_io_accounting_ops.h> | ||
24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
25 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
26 | #include <linux/rmap.h> | 27 | #include <linux/rmap.h> |
@@ -222,7 +223,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
222 | if (pages_written >= write_chunk) | 223 | if (pages_written >= write_chunk) |
223 | break; /* We've done our duty */ | 224 | break; /* We've done our duty */ |
224 | } | 225 | } |
225 | blk_congestion_wait(WRITE, HZ/10); | 226 | congestion_wait(WRITE, HZ/10); |
226 | } | 227 | } |
227 | 228 | ||
228 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) | 229 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) |
@@ -314,7 +315,7 @@ void throttle_vm_writeout(void) | |||
314 | if (global_page_state(NR_UNSTABLE_NFS) + | 315 | if (global_page_state(NR_UNSTABLE_NFS) + |
315 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 316 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
316 | break; | 317 | break; |
317 | blk_congestion_wait(WRITE, HZ/10); | 318 | congestion_wait(WRITE, HZ/10); |
318 | } | 319 | } |
319 | } | 320 | } |
320 | 321 | ||
@@ -351,7 +352,7 @@ static void background_writeout(unsigned long _min_pages) | |||
351 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 352 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
352 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | 353 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { |
353 | /* Wrote less than expected */ | 354 | /* Wrote less than expected */ |
354 | blk_congestion_wait(WRITE, HZ/10); | 355 | congestion_wait(WRITE, HZ/10); |
355 | if (!wbc.encountered_congestion) | 356 | if (!wbc.encountered_congestion) |
356 | break; | 357 | break; |
357 | } | 358 | } |
@@ -422,7 +423,7 @@ static void wb_kupdate(unsigned long arg) | |||
422 | writeback_inodes(&wbc); | 423 | writeback_inodes(&wbc); |
423 | if (wbc.nr_to_write > 0) { | 424 | if (wbc.nr_to_write > 0) { |
424 | if (wbc.encountered_congestion) | 425 | if (wbc.encountered_congestion) |
425 | blk_congestion_wait(WRITE, HZ/10); | 426 | congestion_wait(WRITE, HZ/10); |
426 | else | 427 | else |
427 | break; /* All the old data is written */ | 428 | break; /* All the old data is written */ |
428 | } | 429 | } |
@@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
761 | struct address_space *mapping = page_mapping(page); | 762 | struct address_space *mapping = page_mapping(page); |
762 | struct address_space *mapping2; | 763 | struct address_space *mapping2; |
763 | 764 | ||
764 | if (mapping) { | 765 | if (!mapping) |
765 | write_lock_irq(&mapping->tree_lock); | 766 | return 1; |
766 | mapping2 = page_mapping(page); | 767 | |
767 | if (mapping2) { /* Race with truncate? */ | 768 | write_lock_irq(&mapping->tree_lock); |
768 | BUG_ON(mapping2 != mapping); | 769 | mapping2 = page_mapping(page); |
769 | if (mapping_cap_account_dirty(mapping)) | 770 | if (mapping2) { /* Race with truncate? */ |
770 | __inc_zone_page_state(page, | 771 | BUG_ON(mapping2 != mapping); |
771 | NR_FILE_DIRTY); | 772 | if (mapping_cap_account_dirty(mapping)) { |
772 | radix_tree_tag_set(&mapping->page_tree, | 773 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
773 | page_index(page), PAGECACHE_TAG_DIRTY); | 774 | task_io_account_write(PAGE_CACHE_SIZE); |
774 | } | ||
775 | write_unlock_irq(&mapping->tree_lock); | ||
776 | if (mapping->host) { | ||
777 | /* !PageAnon && !swapper_space */ | ||
778 | __mark_inode_dirty(mapping->host, | ||
779 | I_DIRTY_PAGES); | ||
780 | } | 775 | } |
776 | radix_tree_tag_set(&mapping->page_tree, | ||
777 | page_index(page), PAGECACHE_TAG_DIRTY); | ||
778 | } | ||
779 | write_unlock_irq(&mapping->tree_lock); | ||
780 | if (mapping->host) { | ||
781 | /* !PageAnon && !swapper_space */ | ||
782 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | ||
781 | } | 783 | } |
782 | return 1; | 784 | return 1; |
783 | } | 785 | } |
@@ -851,27 +853,26 @@ int test_clear_page_dirty(struct page *page) | |||
851 | struct address_space *mapping = page_mapping(page); | 853 | struct address_space *mapping = page_mapping(page); |
852 | unsigned long flags; | 854 | unsigned long flags; |
853 | 855 | ||
854 | if (mapping) { | 856 | if (!mapping) |
855 | write_lock_irqsave(&mapping->tree_lock, flags); | 857 | return TestClearPageDirty(page); |
856 | if (TestClearPageDirty(page)) { | 858 | |
857 | radix_tree_tag_clear(&mapping->page_tree, | 859 | write_lock_irqsave(&mapping->tree_lock, flags); |
858 | page_index(page), | 860 | if (TestClearPageDirty(page)) { |
859 | PAGECACHE_TAG_DIRTY); | 861 | radix_tree_tag_clear(&mapping->page_tree, |
860 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 862 | page_index(page), PAGECACHE_TAG_DIRTY); |
861 | /* | ||
862 | * We can continue to use `mapping' here because the | ||
863 | * page is locked, which pins the address_space | ||
864 | */ | ||
865 | if (mapping_cap_account_dirty(mapping)) { | ||
866 | page_mkclean(page); | ||
867 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
868 | } | ||
869 | return 1; | ||
870 | } | ||
871 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 863 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
872 | return 0; | 864 | /* |
865 | * We can continue to use `mapping' here because the | ||
866 | * page is locked, which pins the address_space | ||
867 | */ | ||
868 | if (mapping_cap_account_dirty(mapping)) { | ||
869 | page_mkclean(page); | ||
870 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
871 | } | ||
872 | return 1; | ||
873 | } | 873 | } |
874 | return TestClearPageDirty(page); | 874 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
875 | return 0; | ||
875 | } | 876 | } |
876 | EXPORT_SYMBOL(test_clear_page_dirty); | 877 | EXPORT_SYMBOL(test_clear_page_dirty); |
877 | 878 | ||
@@ -893,17 +894,17 @@ int clear_page_dirty_for_io(struct page *page) | |||
893 | { | 894 | { |
894 | struct address_space *mapping = page_mapping(page); | 895 | struct address_space *mapping = page_mapping(page); |
895 | 896 | ||
896 | if (mapping) { | 897 | if (!mapping) |
897 | if (TestClearPageDirty(page)) { | 898 | return TestClearPageDirty(page); |
898 | if (mapping_cap_account_dirty(mapping)) { | 899 | |
899 | page_mkclean(page); | 900 | if (TestClearPageDirty(page)) { |
900 | dec_zone_page_state(page, NR_FILE_DIRTY); | 901 | if (mapping_cap_account_dirty(mapping)) { |
901 | } | 902 | page_mkclean(page); |
902 | return 1; | 903 | dec_zone_page_state(page, NR_FILE_DIRTY); |
903 | } | 904 | } |
904 | return 0; | 905 | return 1; |
905 | } | 906 | } |
906 | return TestClearPageDirty(page); | 907 | return 0; |
907 | } | 908 | } |
908 | EXPORT_SYMBOL(clear_page_dirty_for_io); | 909 | EXPORT_SYMBOL(clear_page_dirty_for_io); |
909 | 910 | ||
@@ -956,15 +957,6 @@ int test_set_page_writeback(struct page *page) | |||
956 | EXPORT_SYMBOL(test_set_page_writeback); | 957 | EXPORT_SYMBOL(test_set_page_writeback); |
957 | 958 | ||
958 | /* | 959 | /* |
959 | * Wakes up tasks that are being throttled due to writeback congestion | ||
960 | */ | ||
961 | void writeback_congestion_end(void) | ||
962 | { | ||
963 | blk_congestion_end(WRITE); | ||
964 | } | ||
965 | EXPORT_SYMBOL(writeback_congestion_end); | ||
966 | |||
967 | /* | ||
968 | * Return true if any of the pages in the mapping are marged with the | 960 | * Return true if any of the pages in the mapping are marged with the |
969 | * passed tag. | 961 | * passed tag. |
970 | */ | 962 | */ |
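Throughout this file the only behavioural rename is blk_congestion_wait() -> congestion_wait() (same arguments; the thin writeback_congestion_end() wrapper is dropped outright), dirtying a page now also charges the task via task_io_account_write(PAGE_CACHE_SIZE), and the three dirty-tracking helpers are restructured to return early when there is no mapping. The throttle pattern that keeps reappearing in these hunks looks like this simplified sketch (condensed from background_writeout()/wb_kupdate() above):

	struct writeback_control wbc = { /* ... */ };

	for (;;) {
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		writeback_inodes(&wbc);
		if (wbc.nr_to_write <= 0)
			break;				/* wrote the full chunk */
		congestion_wait(WRITE, HZ/10);		/* back off ~1/10 s while queues drain */
		if (!wbc.encountered_congestion)
			break;				/* not congested, just out of work */
	}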
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40db96a655..e6b17b2989 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -39,6 +39,8 @@ | |||
39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
40 | #include <linux/sort.h> | 40 | #include <linux/sort.h> |
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | ||
43 | #include <linux/fault-inject.h> | ||
42 | 44 | ||
43 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
44 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
@@ -82,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | |||
82 | 84 | ||
83 | EXPORT_SYMBOL(totalram_pages); | 85 | EXPORT_SYMBOL(totalram_pages); |
84 | 86 | ||
85 | /* | 87 | static char * const zone_names[MAX_NR_ZONES] = { |
86 | * Used by page_zone() to look up the address of the struct zone whose | ||
87 | * id is encoded in the upper bits of page->flags | ||
88 | */ | ||
89 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | ||
90 | EXPORT_SYMBOL(zone_table); | ||
91 | |||
92 | static char *zone_names[MAX_NR_ZONES] = { | ||
93 | "DMA", | 88 | "DMA", |
94 | #ifdef CONFIG_ZONE_DMA32 | 89 | #ifdef CONFIG_ZONE_DMA32 |
95 | "DMA32", | 90 | "DMA32", |
@@ -236,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
236 | int i; | 231 | int i; |
237 | int nr_pages = 1 << order; | 232 | int nr_pages = 1 << order; |
238 | 233 | ||
239 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ | 234 | set_compound_page_dtor(page, free_compound_page); |
240 | page[1].lru.prev = (void *)order; | 235 | page[1].lru.prev = (void *)order; |
241 | for (i = 0; i < nr_pages; i++) { | 236 | for (i = 0; i < nr_pages; i++) { |
242 | struct page *p = page + i; | 237 | struct page *p = page + i; |
@@ -485,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) | |||
485 | spin_lock(&zone->lock); | 480 | spin_lock(&zone->lock); |
486 | zone->all_unreclaimable = 0; | 481 | zone->all_unreclaimable = 0; |
487 | zone->pages_scanned = 0; | 482 | zone->pages_scanned = 0; |
488 | __free_one_page(page, zone ,order); | 483 | __free_one_page(page, zone, order); |
489 | spin_unlock(&zone->lock); | 484 | spin_unlock(&zone->lock); |
490 | } | 485 | } |
491 | 486 | ||
@@ -604,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
604 | 1 << PG_checked | 1 << PG_mappedtodisk); | 599 | 1 << PG_checked | 1 << PG_mappedtodisk); |
605 | set_page_private(page, 0); | 600 | set_page_private(page, 0); |
606 | set_page_refcounted(page); | 601 | set_page_refcounted(page); |
602 | |||
603 | arch_alloc_page(page, order); | ||
607 | kernel_map_pages(page, 1 << order, 1); | 604 | kernel_map_pages(page, 1 << order, 1); |
608 | 605 | ||
609 | if (gfp_flags & __GFP_ZERO) | 606 | if (gfp_flags & __GFP_ZERO) |
@@ -689,9 +686,15 @@ void drain_node_pages(int nodeid) | |||
689 | 686 | ||
690 | pcp = &pset->pcp[i]; | 687 | pcp = &pset->pcp[i]; |
691 | if (pcp->count) { | 688 | if (pcp->count) { |
689 | int to_drain; | ||
690 | |||
692 | local_irq_save(flags); | 691 | local_irq_save(flags); |
693 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 692 | if (pcp->count >= pcp->batch) |
694 | pcp->count = 0; | 693 | to_drain = pcp->batch; |
694 | else | ||
695 | to_drain = pcp->count; | ||
696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
697 | pcp->count -= to_drain; | ||
695 | local_irq_restore(flags); | 698 | local_irq_restore(flags); |
696 | } | 699 | } |
697 | } | 700 | } |
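drain_node_pages() used to free the entire per-cpu list inside one local_irq_save() section; it now frees at most pcp->batch pages per pass, presumably to bound the time spent with interrupts disabled (an inference, the changelog is not shown here). The new to_drain computation is just a clamp:

	/* equivalent formulation of the hunk above */
	int to_drain = (pcp->count >= pcp->batch) ? pcp->batch : pcp->count;

	free_pages_bulk(zone, to_drain, &pcp->list, 0);
	pcp->count -= to_drain;		/* leftover pages stay queued for a later pass */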
@@ -699,7 +702,6 @@ void drain_node_pages(int nodeid) | |||
699 | } | 702 | } |
700 | #endif | 703 | #endif |
701 | 704 | ||
702 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
703 | static void __drain_pages(unsigned int cpu) | 705 | static void __drain_pages(unsigned int cpu) |
704 | { | 706 | { |
705 | unsigned long flags; | 707 | unsigned long flags; |
@@ -721,7 +723,6 @@ static void __drain_pages(unsigned int cpu) | |||
721 | } | 723 | } |
722 | } | 724 | } |
723 | } | 725 | } |
724 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
725 | 726 | ||
726 | #ifdef CONFIG_PM | 727 | #ifdef CONFIG_PM |
727 | 728 | ||
@@ -852,7 +853,7 @@ again: | |||
852 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 853 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
853 | local_irq_save(flags); | 854 | local_irq_save(flags); |
854 | if (!pcp->count) { | 855 | if (!pcp->count) { |
855 | pcp->count += rmqueue_bulk(zone, 0, | 856 | pcp->count = rmqueue_bulk(zone, 0, |
856 | pcp->batch, &pcp->list); | 857 | pcp->batch, &pcp->list); |
857 | if (unlikely(!pcp->count)) | 858 | if (unlikely(!pcp->count)) |
858 | goto failed; | 859 | goto failed; |
@@ -892,6 +893,91 @@ failed: | |||
892 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 893 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
893 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 894 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
894 | 895 | ||
896 | #ifdef CONFIG_FAIL_PAGE_ALLOC | ||
897 | |||
898 | static struct fail_page_alloc_attr { | ||
899 | struct fault_attr attr; | ||
900 | |||
901 | u32 ignore_gfp_highmem; | ||
902 | u32 ignore_gfp_wait; | ||
903 | |||
904 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
905 | |||
906 | struct dentry *ignore_gfp_highmem_file; | ||
907 | struct dentry *ignore_gfp_wait_file; | ||
908 | |||
909 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
910 | |||
911 | } fail_page_alloc = { | ||
912 | .attr = FAULT_ATTR_INITIALIZER, | ||
913 | .ignore_gfp_wait = 1, | ||
914 | .ignore_gfp_highmem = 1, | ||
915 | }; | ||
916 | |||
917 | static int __init setup_fail_page_alloc(char *str) | ||
918 | { | ||
919 | return setup_fault_attr(&fail_page_alloc.attr, str); | ||
920 | } | ||
921 | __setup("fail_page_alloc=", setup_fail_page_alloc); | ||
922 | |||
923 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
924 | { | ||
925 | if (gfp_mask & __GFP_NOFAIL) | ||
926 | return 0; | ||
927 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | ||
928 | return 0; | ||
929 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | ||
930 | return 0; | ||
931 | |||
932 | return should_fail(&fail_page_alloc.attr, 1 << order); | ||
933 | } | ||
934 | |||
935 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
936 | |||
937 | static int __init fail_page_alloc_debugfs(void) | ||
938 | { | ||
939 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
940 | struct dentry *dir; | ||
941 | int err; | ||
942 | |||
943 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
944 | "fail_page_alloc"); | ||
945 | if (err) | ||
946 | return err; | ||
947 | dir = fail_page_alloc.attr.dentries.dir; | ||
948 | |||
949 | fail_page_alloc.ignore_gfp_wait_file = | ||
950 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
951 | &fail_page_alloc.ignore_gfp_wait); | ||
952 | |||
953 | fail_page_alloc.ignore_gfp_highmem_file = | ||
954 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
955 | &fail_page_alloc.ignore_gfp_highmem); | ||
956 | |||
957 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
958 | !fail_page_alloc.ignore_gfp_highmem_file) { | ||
959 | err = -ENOMEM; | ||
960 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
961 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
962 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
963 | } | ||
964 | |||
965 | return err; | ||
966 | } | ||
967 | |||
968 | late_initcall(fail_page_alloc_debugfs); | ||
969 | |||
970 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
971 | |||
972 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | ||
973 | |||
974 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
975 | { | ||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | ||
980 | |||
895 | /* | 981 | /* |
896 | * Return 1 if free pages are above 'mark'. This takes into account the order | 982 | * Return 1 if free pages are above 'mark'. This takes into account the order |
897 | * of the allocation. | 983 | * of the allocation. |
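CONFIG_FAIL_PAGE_ALLOC hooks the generic fault-injection framework into the page allocator: should_fail_alloc_page() never fails __GFP_NOFAIL requests and, by default, also leaves __GFP_HIGHMEM and __GFP_WAIT allocations alone; those two defaults can be flipped at run time through the ignore-gfp-highmem / ignore-gfp-wait debugfs booleans created above. The sketch below shows how a test run might be configured from user space; the file names other than those two booleans, their units, and the debugfs mount point follow the generic fault_attr convention and are assumptions here, not something this hunk shows.

	#include <stdio.h>

	/* Hypothetical helper: write one fail_page_alloc knob under debugfs
	 * (requires CONFIG_FAULT_INJECTION_DEBUG_FS and a mounted debugfs). */
	static void set_knob(const char *file, const char *val)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/debug/fail_page_alloc/%s", file);
		f = fopen(path, "w");
		if (!f)
			return;		/* option not built in, or debugfs not mounted */
		fputs(val, f);
		fclose(f);
	}

	int main(void)
	{
		set_knob("probability", "10");		/* assumed to be a percentage */
		set_knob("interval", "1");		/* check every eligible call */
		set_knob("times", "-1");		/* assumed: no limit on injected failures */
		set_knob("ignore-gfp-wait", "0");	/* also hit __GFP_WAIT allocations */
		return 0;
	}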
@@ -924,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
924 | return 1; | 1010 | return 1; |
925 | } | 1011 | } |
926 | 1012 | ||
1013 | #ifdef CONFIG_NUMA | ||
1014 | /* | ||
1015 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | ||
1016 | * skip over zones that are not allowed by the cpuset, or that have | ||
1017 | * been recently (in last second) found to be nearly full. See further | ||
1018 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | ||
1019 | * that have to skip over alot of full or unallowed zones. | ||
1020 | * | ||
1021 | * If the zonelist cache is present in the passed in zonelist, then | ||
1022 | * returns a pointer to the allowed node mask (either the current | ||
1023 | * tasks mems_allowed, or node_online_map.) | ||
1024 | * | ||
1025 | * If the zonelist cache is not available for this zonelist, does | ||
1026 | * nothing and returns NULL. | ||
1027 | * | ||
1028 | * If the fullzones BITMAP in the zonelist cache is stale (more than | ||
1029 | * a second since last zap'd) then we zap it out (clear its bits.) | ||
1030 | * | ||
1031 | * We hold off even calling zlc_setup, until after we've checked the | ||
1032 | * first zone in the zonelist, on the theory that most allocations will | ||
1033 | * be satisfied from that first zone, so best to examine that zone as | ||
1034 | * quickly as we can. | ||
1035 | */ | ||
1036 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1037 | { | ||
1038 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1039 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | ||
1040 | |||
1041 | zlc = zonelist->zlcache_ptr; | ||
1042 | if (!zlc) | ||
1043 | return NULL; | ||
1044 | |||
1045 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | ||
1046 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1047 | zlc->last_full_zap = jiffies; | ||
1048 | } | ||
1049 | |||
1050 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | ||
1051 | &cpuset_current_mems_allowed : | ||
1052 | &node_online_map; | ||
1053 | return allowednodes; | ||
1054 | } | ||
1055 | |||
1056 | /* | ||
1057 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | ||
1058 | * if it is worth looking at further for free memory: | ||
1059 | * 1) Check that the zone isn't thought to be full (doesn't have its | ||
1060 | * bit set in the zonelist_cache fullzones BITMAP). | ||
1061 | * 2) Check that the zones node (obtained from the zonelist_cache | ||
1062 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | ||
1063 | * Return true (non-zero) if zone is worth looking at further, or | ||
1064 | * else return false (zero) if it is not. | ||
1065 | * | ||
1066 | * This check -ignores- the distinction between various watermarks, | ||
1067 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | ||
1068 | * found to be full for any variation of these watermarks, it will | ||
1069 | * be considered full for up to one second by all requests, unless | ||
1070 | * we are so low on memory on all allowed nodes that we are forced | ||
1071 | * into the second scan of the zonelist. | ||
1072 | * | ||
1073 | * In the second scan we ignore this zonelist cache and exactly | ||
1074 | * apply the watermarks to all zones, even it is slower to do so. | ||
1075 | * We are low on memory in the second scan, and should leave no stone | ||
1076 | * unturned looking for a free page. | ||
1077 | */ | ||
1078 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1079 | nodemask_t *allowednodes) | ||
1080 | { | ||
1081 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1082 | int i; /* index of *z in zonelist zones */ | ||
1083 | int n; /* node that zone *z is on */ | ||
1084 | |||
1085 | zlc = zonelist->zlcache_ptr; | ||
1086 | if (!zlc) | ||
1087 | return 1; | ||
1088 | |||
1089 | i = z - zonelist->zones; | ||
1090 | n = zlc->z_to_n[i]; | ||
1091 | |||
1092 | /* This zone is worth trying if it is allowed but not full */ | ||
1093 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | ||
1094 | } | ||
1095 | |||
927 | /* | 1096 | /* |
928 | * get_page_from_freeliest goes through the zonelist trying to allocate | 1097 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1098 | * zlc->fullzones, so that subsequent attempts to allocate a page | ||
1099 | * from that zone don't waste time re-examining it. | ||
1100 | */ | ||
1101 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1102 | { | ||
1103 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1104 | int i; /* index of *z in zonelist zones */ | ||
1105 | |||
1106 | zlc = zonelist->zlcache_ptr; | ||
1107 | if (!zlc) | ||
1108 | return; | ||
1109 | |||
1110 | i = z - zonelist->zones; | ||
1111 | |||
1112 | set_bit(i, zlc->fullzones); | ||
1113 | } | ||
1114 | |||
1115 | #else /* CONFIG_NUMA */ | ||
1116 | |||
1117 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1118 | { | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1123 | nodemask_t *allowednodes) | ||
1124 | { | ||
1125 | return 1; | ||
1126 | } | ||
1127 | |||
1128 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1129 | { | ||
1130 | } | ||
1131 | #endif /* CONFIG_NUMA */ | ||
1132 | |||
1133 | /* | ||
1134 | * get_page_from_freelist goes through the zonelist trying to allocate | ||
929 | * a page. | 1135 | * a page. |
930 | */ | 1136 | */ |
931 | static struct page * | 1137 | static struct page * |
932 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1138 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
933 | struct zonelist *zonelist, int alloc_flags) | 1139 | struct zonelist *zonelist, int alloc_flags) |
934 | { | 1140 | { |
935 | struct zone **z = zonelist->zones; | 1141 | struct zone **z; |
936 | struct page *page = NULL; | 1142 | struct page *page = NULL; |
937 | int classzone_idx = zone_idx(*z); | 1143 | int classzone_idx = zone_idx(zonelist->zones[0]); |
938 | struct zone *zone; | 1144 | struct zone *zone; |
1145 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | ||
1146 | int zlc_active = 0; /* set if using zonelist_cache */ | ||
1147 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | ||
939 | 1148 | ||
1149 | zonelist_scan: | ||
940 | /* | 1150 | /* |
941 | * Go through the zonelist once, looking for a zone with enough free. | 1151 | * Scan zonelist, looking for a zone with enough free. |
942 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1152 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
943 | */ | 1153 | */ |
1154 | z = zonelist->zones; | ||
1155 | |||
944 | do { | 1156 | do { |
1157 | if (NUMA_BUILD && zlc_active && | ||
1158 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1159 | continue; | ||
945 | zone = *z; | 1160 | zone = *z; |
946 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | 1161 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
947 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1162 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
948 | break; | 1163 | break; |
949 | if ((alloc_flags & ALLOC_CPUSET) && | 1164 | if ((alloc_flags & ALLOC_CPUSET) && |
950 | !cpuset_zone_allowed(zone, gfp_mask)) | 1165 | !cpuset_zone_allowed(zone, gfp_mask)) |
951 | continue; | 1166 | goto try_next_zone; |
952 | 1167 | ||
953 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1168 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
954 | unsigned long mark; | 1169 | unsigned long mark; |
@@ -958,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
958 | mark = zone->pages_low; | 1173 | mark = zone->pages_low; |
959 | else | 1174 | else |
960 | mark = zone->pages_high; | 1175 | mark = zone->pages_high; |
961 | if (!zone_watermark_ok(zone , order, mark, | 1176 | if (!zone_watermark_ok(zone, order, mark, |
962 | classzone_idx, alloc_flags)) | 1177 | classzone_idx, alloc_flags)) { |
963 | if (!zone_reclaim_mode || | 1178 | if (!zone_reclaim_mode || |
964 | !zone_reclaim(zone, gfp_mask, order)) | 1179 | !zone_reclaim(zone, gfp_mask, order)) |
965 | continue; | 1180 | goto this_zone_full; |
1181 | } | ||
966 | } | 1182 | } |
967 | 1183 | ||
968 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1184 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
969 | if (page) { | 1185 | if (page) |
970 | break; | 1186 | break; |
1187 | this_zone_full: | ||
1188 | if (NUMA_BUILD) | ||
1189 | zlc_mark_zone_full(zonelist, z); | ||
1190 | try_next_zone: | ||
1191 | if (NUMA_BUILD && !did_zlc_setup) { | ||
1192 | /* we do zlc_setup after the first zone is tried */ | ||
1193 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1194 | zlc_active = 1; | ||
1195 | did_zlc_setup = 1; | ||
971 | } | 1196 | } |
972 | } while (*(++z) != NULL); | 1197 | } while (*(++z) != NULL); |
1198 | |||
1199 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | ||
1200 | /* Disable zlc cache for second zonelist scan */ | ||
1201 | zlc_active = 0; | ||
1202 | goto zonelist_scan; | ||
1203 | } | ||
973 | return page; | 1204 | return page; |
974 | } | 1205 | } |
975 | 1206 | ||
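With the cache in place, get_page_from_freelist() becomes a two-pass scan: the first pass skips zones that zlc_zone_worth_trying() says were recently full or are on a disallowed node and records new failures with zlc_mark_zone_full(); only when that pass finds nothing on a NUMA build is the cache switched off and the zonelist walked again with full watermark checks. A stripped-down control-flow sketch of the loop above — try_zone() stands in for the watermark check plus buffered_rmqueue() and is not a real function, and the cpuset/__GFP_THISNODE checks are omitted:

	zlc_active = did_zlc_setup = 0;
restart:
	for (z = zonelist->zones; *z; z++) {
		if (zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes))
			continue;			/* seen full within the last second, or wrong node */
		page = try_zone(*z);			/* hypothetical: watermarks + buffered_rmqueue() */
		if (page)
			return page;
		zlc_mark_zone_full(zonelist, z);	/* remember the miss for up to ~1 s */
		if (!did_zlc_setup) {			/* set up lazily after the first zone is tried */
			allowednodes = zlc_setup(zonelist, alloc_flags);
			zlc_active = did_zlc_setup = 1;
		}
	}
	if (zlc_active) {				/* exhaustive second pass, cache disabled */
		zlc_active = 0;
		goto restart;
	}
	return NULL;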
@@ -991,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
991 | 1222 | ||
992 | might_sleep_if(wait); | 1223 | might_sleep_if(wait); |
993 | 1224 | ||
1225 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1226 | return NULL; | ||
1227 | |||
994 | restart: | 1228 | restart: |
995 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1229 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
996 | 1230 | ||
@@ -1004,9 +1238,19 @@ restart: | |||
1004 | if (page) | 1238 | if (page) |
1005 | goto got_pg; | 1239 | goto got_pg; |
1006 | 1240 | ||
1007 | do { | 1241 | /* |
1242 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | ||
1243 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | ||
1244 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | ||
1245 | * using a larger set of nodes after it has established that the | ||
1246 | * allowed per node queues are empty and that nodes are | ||
1247 | * over allocated. | ||
1248 | */ | ||
1249 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
1250 | goto nopage; | ||
1251 | |||
1252 | for (z = zonelist->zones; *z; z++) | ||
1008 | wakeup_kswapd(*z, order); | 1253 | wakeup_kswapd(*z, order); |
1009 | } while (*(++z)); | ||
1010 | 1254 | ||
1011 | /* | 1255 | /* |
1012 | * OK, we're below the kswapd watermark and have kicked background | 1256 | * OK, we're below the kswapd watermark and have kicked background |
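The GFP_THISNODE bail-out above relies on testing the full flag combination, not just any of its bits: per the new comment, GFP_THISNODE here means __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN, and only callers passing all three (such as slab's per-node allocations) skip reclaim. A tiny standalone illustration of why the equality test matters; the flag values are placeholders, not the kernel's real bit assignments:

	#include <stdio.h>

	#define X_THISNODE	0x1u		/* placeholder for __GFP_THISNODE */
	#define X_NORETRY	0x2u		/* placeholder for __GFP_NORETRY  */
	#define X_NOWARN	0x4u		/* placeholder for __GFP_NOWARN   */
	#define X_GFP_THISNODE	(X_THISNODE | X_NORETRY | X_NOWARN)

	int main(void)
	{
		unsigned int only_node_bit = X_THISNODE;	/* caller set just the node restriction */
		unsigned int full_combo    = X_GFP_THISNODE;

		printf("%d %d\n",
		       (only_node_bit & X_GFP_THISNODE) == X_GFP_THISNODE,	/* 0: still reclaims */
		       (full_combo    & X_GFP_THISNODE) == X_GFP_THISNODE);	/* 1: goes to nopage */
		return 0;
	}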
@@ -1040,6 +1284,7 @@ restart: | |||
1040 | 1284 | ||
1041 | /* This allocation should allow future memory freeing. */ | 1285 | /* This allocation should allow future memory freeing. */ |
1042 | 1286 | ||
1287 | rebalance: | ||
1043 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1288 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1044 | && !in_interrupt()) { | 1289 | && !in_interrupt()) { |
1045 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1290 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
@@ -1050,7 +1295,7 @@ nofail_alloc: | |||
1050 | if (page) | 1295 | if (page) |
1051 | goto got_pg; | 1296 | goto got_pg; |
1052 | if (gfp_mask & __GFP_NOFAIL) { | 1297 | if (gfp_mask & __GFP_NOFAIL) { |
1053 | blk_congestion_wait(WRITE, HZ/50); | 1298 | congestion_wait(WRITE, HZ/50); |
1054 | goto nofail_alloc; | 1299 | goto nofail_alloc; |
1055 | } | 1300 | } |
1056 | } | 1301 | } |
@@ -1061,7 +1306,6 @@ nofail_alloc: | |||
1061 | if (!wait) | 1306 | if (!wait) |
1062 | goto nopage; | 1307 | goto nopage; |
1063 | 1308 | ||
1064 | rebalance: | ||
1065 | cond_resched(); | 1309 | cond_resched(); |
1066 | 1310 | ||
1067 | /* We now go into synchronous reclaim */ | 1311 | /* We now go into synchronous reclaim */ |
@@ -1113,7 +1357,7 @@ rebalance: | |||
1113 | do_retry = 1; | 1357 | do_retry = 1; |
1114 | } | 1358 | } |
1115 | if (do_retry) { | 1359 | if (do_retry) { |
1116 | blk_congestion_wait(WRITE, HZ/50); | 1360 | congestion_wait(WRITE, HZ/50); |
1117 | goto rebalance; | 1361 | goto rebalance; |
1118 | } | 1362 | } |
1119 | 1363 | ||
@@ -1261,7 +1505,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
1261 | static inline void show_node(struct zone *zone) | 1505 | static inline void show_node(struct zone *zone) |
1262 | { | 1506 | { |
1263 | if (NUMA_BUILD) | 1507 | if (NUMA_BUILD) |
1264 | printk("Node %ld ", zone_to_nid(zone)); | 1508 | printk("Node %d ", zone_to_nid(zone)); |
1265 | } | 1509 | } |
1266 | 1510 | ||
1267 | void si_meminfo(struct sysinfo *val) | 1511 | void si_meminfo(struct sysinfo *val) |
@@ -1541,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1541 | } | 1785 | } |
1542 | } | 1786 | } |
1543 | 1787 | ||
1788 | /* Construct the zonelist performance cache - see further mmzone.h */ | ||
1789 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1790 | { | ||
1791 | int i; | ||
1792 | |||
1793 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1794 | struct zonelist *zonelist; | ||
1795 | struct zonelist_cache *zlc; | ||
1796 | struct zone **z; | ||
1797 | |||
1798 | zonelist = pgdat->node_zonelists + i; | ||
1799 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | ||
1800 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1801 | for (z = zonelist->zones; *z; z++) | ||
1802 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | ||
1803 | } | ||
1804 | } | ||
1805 | |||
1544 | #else /* CONFIG_NUMA */ | 1806 | #else /* CONFIG_NUMA */ |
1545 | 1807 | ||
1546 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1808 | static void __meminit build_zonelists(pg_data_t *pgdat) |
@@ -1578,14 +1840,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1578 | } | 1840 | } |
1579 | } | 1841 | } |
1580 | 1842 | ||
1843 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | ||
1844 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1845 | { | ||
1846 | int i; | ||
1847 | |||
1848 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1849 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
1850 | } | ||
1851 | |||
1581 | #endif /* CONFIG_NUMA */ | 1852 | #endif /* CONFIG_NUMA */ |
1582 | 1853 | ||
1583 | /* return values int ....just for stop_machine_run() */ | 1854 | /* return values int ....just for stop_machine_run() */ |
1584 | static int __meminit __build_all_zonelists(void *dummy) | 1855 | static int __meminit __build_all_zonelists(void *dummy) |
1585 | { | 1856 | { |
1586 | int nid; | 1857 | int nid; |
1587 | for_each_online_node(nid) | 1858 | |
1859 | for_each_online_node(nid) { | ||
1588 | build_zonelists(NODE_DATA(nid)); | 1860 | build_zonelists(NODE_DATA(nid)); |
1861 | build_zonelist_cache(NODE_DATA(nid)); | ||
1862 | } | ||
1589 | return 0; | 1863 | return 0; |
1590 | } | 1864 | } |
1591 | 1865 | ||
@@ -1688,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1688 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1962 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1689 | if (!early_pfn_valid(pfn)) | 1963 | if (!early_pfn_valid(pfn)) |
1690 | continue; | 1964 | continue; |
1965 | if (!early_pfn_in_nid(pfn, nid)) | ||
1966 | continue; | ||
1691 | page = pfn_to_page(pfn); | 1967 | page = pfn_to_page(pfn); |
1692 | set_page_links(page, zone, nid, pfn); | 1968 | set_page_links(page, zone, nid, pfn); |
1693 | init_page_count(page); | 1969 | init_page_count(page); |
@@ -1712,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1712 | } | 1988 | } |
1713 | } | 1989 | } |
1714 | 1990 | ||
1715 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | ||
1716 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | ||
1717 | unsigned long pfn, unsigned long size) | ||
1718 | { | ||
1719 | unsigned long snum = pfn_to_section_nr(pfn); | ||
1720 | unsigned long end = pfn_to_section_nr(pfn + size); | ||
1721 | |||
1722 | if (FLAGS_HAS_NODE) | ||
1723 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | ||
1724 | else | ||
1725 | for (; snum <= end; snum++) | ||
1726 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | ||
1727 | } | ||
1728 | |||
1729 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1991 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1730 | #define memmap_init(size, nid, zone, start_pfn) \ | 1992 | #define memmap_init(size, nid, zone, start_pfn) \ |
1731 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1993 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
@@ -1878,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1878 | int ret = NOTIFY_OK; | 2140 | int ret = NOTIFY_OK; |
1879 | 2141 | ||
1880 | switch (action) { | 2142 | switch (action) { |
1881 | case CPU_UP_PREPARE: | 2143 | case CPU_UP_PREPARE: |
1882 | if (process_zones(cpu)) | 2144 | if (process_zones(cpu)) |
1883 | ret = NOTIFY_BAD; | 2145 | ret = NOTIFY_BAD; |
1884 | break; | 2146 | break; |
1885 | case CPU_UP_CANCELED: | 2147 | case CPU_UP_CANCELED: |
1886 | case CPU_DEAD: | 2148 | case CPU_DEAD: |
1887 | free_zone_pagesets(cpu); | 2149 | free_zone_pagesets(cpu); |
1888 | break; | 2150 | break; |
1889 | default: | 2151 | default: |
1890 | break; | 2152 | break; |
1891 | } | 2153 | } |
1892 | return ret; | 2154 | return ret; |
1893 | } | 2155 | } |
@@ -2258,7 +2520,7 @@ unsigned long __init __absent_pages_in_range(int nid, | |||
2258 | 2520 | ||
2259 | /* Account for ranges past physical memory on this node */ | 2521 | /* Account for ranges past physical memory on this node */ |
2260 | if (range_end_pfn > prev_end_pfn) | 2522 | if (range_end_pfn > prev_end_pfn) |
2261 | hole_pages = range_end_pfn - | 2523 | hole_pages += range_end_pfn - |
2262 | max(range_start_pfn, prev_end_pfn); | 2524 | max(range_start_pfn, prev_end_pfn); |
2263 | 2525 | ||
2264 | return hole_pages; | 2526 | return hole_pages; |
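The one-character fix above (`=` becomes `+=`) matters when a node's memory is split across several ranges: the hole found past the last populated range has to be added to the holes already accumulated by the loop, not overwrite them. A tiny worked example with made-up PFNs:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical node covering PFNs [0, 100): memory at [0,30) and
		 * [50,70), so the loop has already counted 20 pages of hole. */
		unsigned long hole_pages = 20;
		unsigned long range_start_pfn = 0, range_end_pfn = 100, prev_end_pfn = 70;

		if (range_end_pfn > prev_end_pfn)
			hole_pages += range_end_pfn -
				(range_start_pfn > prev_end_pfn ? range_start_pfn : prev_end_pfn);

		printf("%lu\n", hole_pages);	/* 50; plain '=' would have reported 30 */
		return 0;
	}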
@@ -2404,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2404 | zone->zone_pgdat = pgdat; | 2666 | zone->zone_pgdat = pgdat; |
2405 | zone->free_pages = 0; | 2667 | zone->free_pages = 0; |
2406 | 2668 | ||
2407 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 2669 | zone->prev_priority = DEF_PRIORITY; |
2408 | 2670 | ||
2409 | zone_pcp_init(zone); | 2671 | zone_pcp_init(zone); |
2410 | INIT_LIST_HEAD(&zone->active_list); | 2672 | INIT_LIST_HEAD(&zone->active_list); |
@@ -2418,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2418 | if (!size) | 2680 | if (!size) |
2419 | continue; | 2681 | continue; |
2420 | 2682 | ||
2421 | zonetable_add(zone, nid, j, zone_start_pfn, size); | ||
2422 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2683 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2423 | BUG_ON(ret); | 2684 | BUG_ON(ret); |
2424 | zone_start_pfn += size; | 2685 | zone_start_pfn += size; |
@@ -2609,6 +2870,9 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) | |||
2609 | { | 2870 | { |
2610 | int i; | 2871 | int i; |
2611 | 2872 | ||
2873 | /* Regions in the early_node_map can be in any order */ | ||
2874 | sort_node_map(); | ||
2875 | |||
2612 | /* Assuming a sorted map, the first range found has the starting pfn */ | 2876 | /* Assuming a sorted map, the first range found has the starting pfn */ |
2613 | for_each_active_range_index_in_nid(i, nid) | 2877 | for_each_active_range_index_in_nid(i, nid) |
2614 | return early_node_map[i].start_pfn; | 2878 | return early_node_map[i].start_pfn; |
@@ -2677,9 +2941,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
2677 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 2941 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); |
2678 | } | 2942 | } |
2679 | 2943 | ||
2680 | /* Regions in the early_node_map can be in any order */ | ||
2681 | sort_node_map(); | ||
2682 | |||
2683 | /* Print out the zone ranges */ | 2944 | /* Print out the zone ranges */ |
2684 | printk("Zone PFN ranges:\n"); | 2945 | printk("Zone PFN ranges:\n"); |
2685 | for (i = 0; i < MAX_NR_ZONES; i++) | 2946 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -2733,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size) | |||
2733 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2994 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2734 | } | 2995 | } |
2735 | 2996 | ||
2736 | #ifdef CONFIG_HOTPLUG_CPU | ||
2737 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2997 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2738 | unsigned long action, void *hcpu) | 2998 | unsigned long action, void *hcpu) |
2739 | { | 2999 | { |
@@ -2748,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
2748 | } | 3008 | } |
2749 | return NOTIFY_OK; | 3009 | return NOTIFY_OK; |
2750 | } | 3010 | } |
2751 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2752 | 3011 | ||
2753 | void __init page_alloc_init(void) | 3012 | void __init page_alloc_init(void) |
2754 | { | 3013 | { |
@@ -3052,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3052 | /* allow the kernel cmdline to have a say */ | 3311 | /* allow the kernel cmdline to have a say */ |
3053 | if (!numentries) { | 3312 | if (!numentries) { |
3054 | /* round applicable memory size up to nearest megabyte */ | 3313 | /* round applicable memory size up to nearest megabyte */ |
3055 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 3314 | numentries = nr_kernel_pages; |
3056 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3315 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
3057 | numentries >>= 20 - PAGE_SHIFT; | 3316 | numentries >>= 20 - PAGE_SHIFT; |
3058 | numentries <<= 20 - PAGE_SHIFT; | 3317 | numentries <<= 20 - PAGE_SHIFT; |
@@ -3074,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3074 | if (numentries > max) | 3333 | if (numentries > max) |
3075 | numentries = max; | 3334 | numentries = max; |
3076 | 3335 | ||
3077 | log2qty = long_log2(numentries); | 3336 | log2qty = ilog2(numentries); |
3078 | 3337 | ||
3079 | do { | 3338 | do { |
3080 | size = bucketsize << log2qty; | 3339 | size = bucketsize << log2qty; |
@@ -3096,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3096 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3355 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
3097 | tablename, | 3356 | tablename, |
3098 | (1U << log2qty), | 3357 | (1U << log2qty), |
3099 | long_log2(size) - PAGE_SHIFT, | 3358 | ilog2(size) - PAGE_SHIFT, |
3100 | size); | 3359 | size); |
3101 | 3360 | ||
3102 | if (_hash_shift) | 3361 | if (_hash_shift) |
@@ -3119,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page) | |||
3119 | EXPORT_SYMBOL(pfn_to_page); | 3378 | EXPORT_SYMBOL(pfn_to_page); |
3120 | EXPORT_SYMBOL(page_to_pfn); | 3379 | EXPORT_SYMBOL(page_to_pfn); |
3121 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 3380 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3381 | |||
3382 | #if MAX_NUMNODES > 1 | ||
3383 | /* | ||
3384 | * Find the highest possible node id. | ||
3385 | */ | ||
3386 | int highest_possible_node_id(void) | ||
3387 | { | ||
3388 | unsigned int node; | ||
3389 | unsigned int highest = 0; | ||
3390 | |||
3391 | for_each_node_mask(node, node_possible_map) | ||
3392 | highest = node; | ||
3393 | return highest; | ||
3394 | } | ||
3395 | EXPORT_SYMBOL(highest_possible_node_id); | ||
3396 | #endif | ||
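highest_possible_node_id() (compiled only when MAX_NUMNODES > 1) simply walks node_possible_map and remembers the last node id it sees. A hypothetical caller sketch, sizing a per-node table from the real topology instead of MAX_NUMNODES — the structure and variable names here are illustrative, not part of this patch:

	int nr_ids = highest_possible_node_id() + 1;		/* node ids are 0-based */
	struct my_per_node_stats *tbl =
		kcalloc(nr_ids, sizeof(*tbl), GFP_KERNEL);	/* vs. MAX_NUMNODES entries */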
diff --git a/mm/page_io.c b/mm/page_io.c index d4840ecbf8..dbffec0d78 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page) | |||
147 | out: | 147 | out: |
148 | return ret; | 148 | return ret; |
149 | } | 149 | } |
150 | |||
151 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
152 | /* | ||
153 | * A scruffy utility function to read or write an arbitrary swap page | ||
154 | * and wait on the I/O. The caller must have a ref on the page. | ||
155 | * | ||
156 | * We use end_swap_bio_read() even for writes, because it happens to do what | ||
157 | * we want. | ||
158 | */ | ||
159 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, | ||
160 | struct bio **bio_chain) | ||
161 | { | ||
162 | struct bio *bio; | ||
163 | int ret = 0; | ||
164 | int bio_rw; | ||
165 | |||
166 | lock_page(page); | ||
167 | |||
168 | bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read); | ||
169 | if (bio == NULL) { | ||
170 | unlock_page(page); | ||
171 | ret = -ENOMEM; | ||
172 | goto out; | ||
173 | } | ||
174 | |||
175 | bio_rw = rw; | ||
176 | if (!bio_chain) | ||
177 | bio_rw |= (1 << BIO_RW_SYNC); | ||
178 | if (bio_chain) | ||
179 | bio_get(bio); | ||
180 | submit_bio(bio_rw, bio); | ||
181 | if (bio_chain == NULL) { | ||
182 | wait_on_page_locked(page); | ||
183 | |||
184 | if (!PageUptodate(page) || PageError(page)) | ||
185 | ret = -EIO; | ||
186 | } | ||
187 | if (bio_chain) { | ||
188 | bio->bi_private = *bio_chain; | ||
189 | *bio_chain = bio; | ||
190 | } | ||
191 | out: | ||
192 | return ret; | ||
193 | } | ||
194 | #endif | ||
diff --git a/mm/pdflush.c b/mm/pdflush.c index b02102feeb..8ce0900dc9 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> // Prototypes pdflush_operation() | 21 | #include <linux/writeback.h> // Prototypes pdflush_operation() |
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/freezer.h> | ||
24 | 25 | ||
25 | 26 | ||
26 | /* | 27 | /* |
diff --git a/mm/readahead.c b/mm/readahead.c index 1ba736ac03..0f539e8e82 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
17 | 18 | ||
18 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 19 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
@@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
148 | if (!pagevec_add(&lru_pvec, page)) | 149 | if (!pagevec_add(&lru_pvec, page)) |
149 | __pagevec_lru_add(&lru_pvec); | 150 | __pagevec_lru_add(&lru_pvec); |
150 | if (ret) { | 151 | if (ret) { |
151 | while (!list_empty(pages)) { | 152 | put_pages_list(pages); |
152 | struct page *victim; | ||
153 | |||
154 | victim = list_to_page(pages); | ||
155 | list_del(&victim->lru); | ||
156 | page_cache_release(victim); | ||
157 | } | ||
158 | break; | 153 | break; |
159 | } | 154 | } |
155 | task_io_account_read(PAGE_CACHE_SIZE); | ||
160 | } | 156 | } |
161 | pagevec_lru_add(&lru_pvec); | 157 | pagevec_lru_add(&lru_pvec); |
162 | return ret; | 158 | return ret; |
@@ -173,6 +169,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
173 | 169 | ||
174 | if (mapping->a_ops->readpages) { | 170 | if (mapping->a_ops->readpages) { |
175 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 171 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
172 | /* Clean up the remaining pages */ | ||
173 | put_pages_list(pages); | ||
176 | goto out; | 174 | goto out; |
177 | } | 175 | } |
178 | 176 | ||
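read_cache_pages() drops its open-coded cleanup loop in favour of put_pages_list(), read_pages() now also releases whatever ->readpages() left on the list, and each page successfully queued for read-ahead is charged to the task with task_io_account_read(PAGE_CACHE_SIZE). For this caller, put_pages_list(pages) is roughly equivalent to the loop that was removed above (a sketch of the effect, not the helper's actual body):

	while (!list_empty(pages)) {
		struct page *victim = list_entry(pages->prev, struct page, lru);

		list_del(&victim->lru);
		page_cache_release(victim);
	}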
@@ -454,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
454 | * | 452 | * |
455 | * Note that @filp is purely used for passing on to the ->readpage[s]() | 453 | * Note that @filp is purely used for passing on to the ->readpage[s]() |
456 | * handler: it may refer to a different file from @mapping (so we may not use | 454 | * handler: it may refer to a different file from @mapping (so we may not use |
457 | * @filp->f_mapping or @filp->f_dentry->d_inode here). | 455 | * @filp->f_mapping or @filp->f_path.dentry->d_inode here). |
458 | * Also, @ra may not be equal to &@filp->f_ra. | 456 | * Also, @ra may not be equal to &@filp->f_ra. |
459 | * | 457 | * |
460 | */ | 458 | */ |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -21,27 +21,21 @@ | |||
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem | 24 | * inode->i_alloc_sem (vmtruncate_range) |
25 | * | 25 | * mm->mmap_sem |
26 | * When a page fault occurs in writing from user to file, down_read | 26 | * page->flags PG_locked (lock_page) |
27 | * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within | 27 | * mapping->i_mmap_lock |
28 | * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never | 28 | * anon_vma->lock |
29 | * taken together; in truncation, i_mutex is taken outermost. | 29 | * mm->page_table_lock or pte_lock |
30 | * | 30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * mm->mmap_sem | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * page->flags PG_locked (lock_page) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->i_mmap_lock | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * anon_vma->lock | 34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * mm->page_table_lock or pte_lock | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * swap_lock (in swap_duplicate, swap_info_get) | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * mmlist_lock (in mmput, drain_mmlist and others) | 38 | * within inode_lock in __sync_single_inode) |
39 | * mapping->private_lock (in __set_page_dirty_buffers) | ||
40 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | ||
41 | * sb_lock (within inode_lock in fs/fs-writeback.c) | ||
42 | * mapping->tree_lock (widely used, in set_page_dirty, | ||
43 | * in arch-dependent flush_dcache_mmap_lock, | ||
44 | * within inode_lock in __sync_single_inode) | ||
45 | */ | 39 | */ |
46 | 40 | ||
47 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
diff --git a/mm/shmem.c b/mm/shmem.c index bb8ca7ef70..4bb28d218e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/ctype.h> | 48 | #include <linux/ctype.h> |
49 | #include <linux/migrate.h> | 49 | #include <linux/migrate.h> |
50 | #include <linux/highmem.h> | 50 | #include <linux/highmem.h> |
51 | #include <linux/backing-dev.h> | ||
51 | 52 | ||
52 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
53 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
@@ -176,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
176 | 177 | ||
177 | static struct super_operations shmem_ops; | 178 | static struct super_operations shmem_ops; |
178 | static const struct address_space_operations shmem_aops; | 179 | static const struct address_space_operations shmem_aops; |
179 | static struct file_operations shmem_file_operations; | 180 | static const struct file_operations shmem_file_operations; |
180 | static struct inode_operations shmem_inode_operations; | 181 | static struct inode_operations shmem_inode_operations; |
181 | static struct inode_operations shmem_dir_inode_operations; | 182 | static struct inode_operations shmem_dir_inode_operations; |
182 | static struct inode_operations shmem_special_inode_operations; | 183 | static struct inode_operations shmem_special_inode_operations; |
@@ -1131,7 +1132,7 @@ repeat: | |||
1131 | page_cache_release(swappage); | 1132 | page_cache_release(swappage); |
1132 | if (error == -ENOMEM) { | 1133 | if (error == -ENOMEM) { |
1133 | /* let kswapd refresh zone for GFP_ATOMICs */ | 1134 | /* let kswapd refresh zone for GFP_ATOMICs */ |
1134 | blk_congestion_wait(WRITE, HZ/50); | 1135 | congestion_wait(WRITE, HZ/50); |
1135 | } | 1136 | } |
1136 | goto repeat; | 1137 | goto repeat; |
1137 | } | 1138 | } |
@@ -1224,7 +1225,7 @@ failed: | |||
1224 | 1225 | ||
1225 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) | 1226 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) |
1226 | { | 1227 | { |
1227 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1228 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1228 | struct page *page = NULL; | 1229 | struct page *page = NULL; |
1229 | unsigned long idx; | 1230 | unsigned long idx; |
1230 | int error; | 1231 | int error; |
@@ -1247,7 +1248,7 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1247 | unsigned long addr, unsigned long len, | 1248 | unsigned long addr, unsigned long len, |
1248 | pgprot_t prot, unsigned long pgoff, int nonblock) | 1249 | pgprot_t prot, unsigned long pgoff, int nonblock) |
1249 | { | 1250 | { |
1250 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1251 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1251 | struct mm_struct *mm = vma->vm_mm; | 1252 | struct mm_struct *mm = vma->vm_mm; |
1252 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; | 1253 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; |
1253 | unsigned long size; | 1254 | unsigned long size; |
@@ -1292,14 +1293,14 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1292 | #ifdef CONFIG_NUMA | 1293 | #ifdef CONFIG_NUMA |
1293 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1294 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
1294 | { | 1295 | { |
1295 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1296 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1296 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1297 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
1297 | } | 1298 | } |
1298 | 1299 | ||
1299 | struct mempolicy * | 1300 | struct mempolicy * |
1300 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1301 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) |
1301 | { | 1302 | { |
1302 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1303 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1303 | unsigned long idx; | 1304 | unsigned long idx; |
1304 | 1305 | ||
1305 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1306 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
@@ -1309,7 +1310,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | |||
1309 | 1310 | ||
1310 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | 1311 | int shmem_lock(struct file *file, int lock, struct user_struct *user) |
1311 | { | 1312 | { |
1312 | struct inode *inode = file->f_dentry->d_inode; | 1313 | struct inode *inode = file->f_path.dentry->d_inode; |
1313 | struct shmem_inode_info *info = SHMEM_I(inode); | 1314 | struct shmem_inode_info *info = SHMEM_I(inode); |
1314 | int retval = -ENOMEM; | 1315 | int retval = -ENOMEM; |
1315 | 1316 | ||
@@ -1362,6 +1363,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1362 | inode->i_mapping->a_ops = &shmem_aops; | 1363 | inode->i_mapping->a_ops = &shmem_aops; |
1363 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1364 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
1364 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1365 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1366 | inode->i_generation = get_seconds(); | ||
1365 | info = SHMEM_I(inode); | 1367 | info = SHMEM_I(inode); |
1366 | memset(info, 0, (char *)inode - (char *)info); | 1368 | memset(info, 0, (char *)inode - (char *)info); |
1367 | spin_lock_init(&info->lock); | 1369 | spin_lock_init(&info->lock); |
@@ -1420,7 +1422,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig | |||
1420 | static ssize_t | 1422 | static ssize_t |
1421 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | 1423 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) |
1422 | { | 1424 | { |
1423 | struct inode *inode = file->f_dentry->d_inode; | 1425 | struct inode *inode = file->f_path.dentry->d_inode; |
1424 | loff_t pos; | 1426 | loff_t pos; |
1425 | unsigned long written; | 1427 | unsigned long written; |
1426 | ssize_t err; | 1428 | ssize_t err; |
@@ -1440,7 +1442,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t | |||
1440 | if (err || !count) | 1442 | if (err || !count) |
1441 | goto out; | 1443 | goto out; |
1442 | 1444 | ||
1443 | err = remove_suid(file->f_dentry); | 1445 | err = remove_suid(file->f_path.dentry); |
1444 | if (err) | 1446 | if (err) |
1445 | goto out; | 1447 | goto out; |
1446 | 1448 | ||
@@ -1522,7 +1524,7 @@ out: | |||
1522 | 1524 | ||
1523 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | 1525 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) |
1524 | { | 1526 | { |
1525 | struct inode *inode = filp->f_dentry->d_inode; | 1527 | struct inode *inode = filp->f_path.dentry->d_inode; |
1526 | struct address_space *mapping = inode->i_mapping; | 1528 | struct address_space *mapping = inode->i_mapping; |
1527 | unsigned long index, offset; | 1529 | unsigned long index, offset; |
1528 | 1530 | ||
@@ -1941,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name, | |||
1941 | return security_inode_setsecurity(inode, name, value, size, flags); | 1943 | return security_inode_setsecurity(inode, name, value, size, flags); |
1942 | } | 1944 | } |
1943 | 1945 | ||
1944 | struct xattr_handler shmem_xattr_security_handler = { | 1946 | static struct xattr_handler shmem_xattr_security_handler = { |
1945 | .prefix = XATTR_SECURITY_PREFIX, | 1947 | .prefix = XATTR_SECURITY_PREFIX, |
1946 | .list = shmem_xattr_security_list, | 1948 | .list = shmem_xattr_security_list, |
1947 | .get = shmem_xattr_security_get, | 1949 | .get = shmem_xattr_security_get, |
@@ -1956,6 +1958,85 @@ static struct xattr_handler *shmem_xattr_handlers[] = { | |||
1956 | }; | 1958 | }; |
1957 | #endif | 1959 | #endif |
1958 | 1960 | ||
1961 | static struct dentry *shmem_get_parent(struct dentry *child) | ||
1962 | { | ||
1963 | return ERR_PTR(-ESTALE); | ||
1964 | } | ||
1965 | |||
1966 | static int shmem_match(struct inode *ino, void *vfh) | ||
1967 | { | ||
1968 | __u32 *fh = vfh; | ||
1969 | __u64 inum = fh[2]; | ||
1970 | inum = (inum << 32) | fh[1]; | ||
1971 | return ino->i_ino == inum && fh[0] == ino->i_generation; | ||
1972 | } | ||
1973 | |||
1974 | static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh) | ||
1975 | { | ||
1976 | struct dentry *de = NULL; | ||
1977 | struct inode *inode; | ||
1978 | __u32 *fh = vfh; | ||
1979 | __u64 inum = fh[2]; | ||
1980 | inum = (inum << 32) | fh[1]; | ||
1981 | |||
1982 | inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh); | ||
1983 | if (inode) { | ||
1984 | de = d_find_alias(inode); | ||
1985 | iput(inode); | ||
1986 | } | ||
1987 | |||
1988 | return de? de: ERR_PTR(-ESTALE); | ||
1989 | } | ||
1990 | |||
1991 | static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh, | ||
1992 | int len, int type, | ||
1993 | int (*acceptable)(void *context, struct dentry *de), | ||
1994 | void *context) | ||
1995 | { | ||
1996 | if (len < 3) | ||
1997 | return ERR_PTR(-ESTALE); | ||
1998 | |||
1999 | return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable, | ||
2000 | context); | ||
2001 | } | ||
2002 | |||
2003 | static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, | ||
2004 | int connectable) | ||
2005 | { | ||
2006 | struct inode *inode = dentry->d_inode; | ||
2007 | |||
2008 | if (*len < 3) | ||
2009 | return 255; | ||
2010 | |||
2011 | if (hlist_unhashed(&inode->i_hash)) { | ||
2012 | /* Unfortunately insert_inode_hash is not idempotent, | ||
2013 | * so as we hash inodes here rather than at creation | ||
2014 | * time, we need a lock to ensure we only try | ||
2015 | * to do it once | ||
2016 | */ | ||
2017 | static DEFINE_SPINLOCK(lock); | ||
2018 | spin_lock(&lock); | ||
2019 | if (hlist_unhashed(&inode->i_hash)) | ||
2020 | __insert_inode_hash(inode, | ||
2021 | inode->i_ino + inode->i_generation); | ||
2022 | spin_unlock(&lock); | ||
2023 | } | ||
2024 | |||
2025 | fh[0] = inode->i_generation; | ||
2026 | fh[1] = inode->i_ino; | ||
2027 | fh[2] = ((__u64)inode->i_ino) >> 32; | ||
2028 | |||
2029 | *len = 3; | ||
2030 | return 1; | ||
2031 | } | ||
2032 | |||
2033 | static struct export_operations shmem_export_ops = { | ||
2034 | .get_parent = shmem_get_parent, | ||
2035 | .get_dentry = shmem_get_dentry, | ||
2036 | .encode_fh = shmem_encode_fh, | ||
2037 | .decode_fh = shmem_decode_fh, | ||
2038 | }; | ||
2039 | |||
1959 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, | 2040 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, |
1960 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, | 2041 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, |
1961 | int *policy, nodemask_t *policy_nodes) | 2042 | int *policy, nodemask_t *policy_nodes) |
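The export_operations hunk above packs the shmem inode identity into a three-word NFS file handle: the generation goes in fh[0] and the 64-bit inode number is split across fh[1]/fh[2], with shmem_match() recombining them on decode. Below is a stand-alone user-space sketch of that packing; struct fake_inode and the helper names are illustrative, not kernel code.

/*
 * Stand-alone sketch (not from the kernel tree) of the 3-word file
 * handle layout used by the new shmem export ops:
 *   fh[0] = inode generation
 *   fh[1] = low 32 bits of the inode number
 *   fh[2] = high 32 bits of the inode number
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fake_inode {
	uint64_t i_ino;
	uint32_t i_generation;
};

/* Pack an inode identity into a 3-word handle, as shmem_encode_fh does. */
static int encode_fh(const struct fake_inode *inode, uint32_t *fh, int *len)
{
	if (*len < 3)
		return 255;		/* handle buffer too small */
	fh[0] = inode->i_generation;
	fh[1] = (uint32_t)inode->i_ino;
	fh[2] = (uint32_t)(inode->i_ino >> 32);
	*len = 3;
	return 1;
}

/* Recombine the words and compare, as shmem_match does. */
static int match(const struct fake_inode *inode, const uint32_t *fh)
{
	uint64_t inum = ((uint64_t)fh[2] << 32) | fh[1];

	return inode->i_ino == inum && fh[0] == inode->i_generation;
}

int main(void)
{
	struct fake_inode ino = { .i_ino = 0x123456789abcULL, .i_generation = 7 };
	uint32_t fh[3];
	int len = 3;

	assert(encode_fh(&ino, fh, &len) == 1 && len == 3);
	assert(match(&ino, fh));
	printf("handle: gen=%u ino=0x%llx\n", fh[0],
	       (unsigned long long)(((uint64_t)fh[2] << 32) | fh[1]));
	return 0;
}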
@@ -2128,6 +2209,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
2128 | &inodes, &policy, &policy_nodes)) | 2209 | &inodes, &policy, &policy_nodes)) |
2129 | return -EINVAL; | 2210 | return -EINVAL; |
2130 | } | 2211 | } |
2212 | sb->s_export_op = &shmem_export_ops; | ||
2131 | #else | 2213 | #else |
2132 | sb->s_flags |= MS_NOUSER; | 2214 | sb->s_flags |= MS_NOUSER; |
2133 | #endif | 2215 | #endif |
@@ -2181,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2181 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2263 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2182 | { | 2264 | { |
2183 | struct shmem_inode_info *p; | 2265 | struct shmem_inode_info *p; |
2184 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); | 2266 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2185 | if (!p) | 2267 | if (!p) |
2186 | return NULL; | 2268 | return NULL; |
2187 | return &p->vfs_inode; | 2269 | return &p->vfs_inode; |
@@ -2237,7 +2319,7 @@ static const struct address_space_operations shmem_aops = { | |||
2237 | .migratepage = migrate_page, | 2319 | .migratepage = migrate_page, |
2238 | }; | 2320 | }; |
2239 | 2321 | ||
2240 | static struct file_operations shmem_file_operations = { | 2322 | static const struct file_operations shmem_file_operations = { |
2241 | .mmap = shmem_mmap, | 2323 | .mmap = shmem_mmap, |
2242 | #ifdef CONFIG_TMPFS | 2324 | #ifdef CONFIG_TMPFS |
2243 | .llseek = generic_file_llseek, | 2325 | .llseek = generic_file_llseek, |
@@ -2411,8 +2493,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2411 | d_instantiate(dentry, inode); | 2493 | d_instantiate(dentry, inode); |
2412 | inode->i_size = size; | 2494 | inode->i_size = size; |
2413 | inode->i_nlink = 0; /* It is unlinked */ | 2495 | inode->i_nlink = 0; /* It is unlinked */ |
2414 | file->f_vfsmnt = mntget(shm_mnt); | 2496 | file->f_path.mnt = mntget(shm_mnt); |
2415 | file->f_dentry = dentry; | 2497 | file->f_path.dentry = dentry; |
2416 | file->f_mapping = inode->i_mapping; | 2498 | file->f_mapping = inode->i_mapping; |
2417 | file->f_op = &shmem_file_operations; | 2499 | file->f_op = &shmem_file_operations; |
2418 | file->f_mode = FMODE_WRITE | FMODE_READ; | 2500 | file->f_mode = FMODE_WRITE | FMODE_READ; |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -103,12 +103,13 @@ | |||
103 | #include <linux/module.h> | 103 | #include <linux/module.h> |
104 | #include <linux/rcupdate.h> | 104 | #include <linux/rcupdate.h> |
105 | #include <linux/string.h> | 105 | #include <linux/string.h> |
106 | #include <linux/uaccess.h> | ||
106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
110 | #include <linux/fault-inject.h> | ||
109 | #include <linux/rtmutex.h> | 111 | #include <linux/rtmutex.h> |
110 | 112 | ||
111 | #include <asm/uaccess.h> | ||
112 | #include <asm/cacheflush.h> | 113 | #include <asm/cacheflush.h> |
113 | #include <asm/tlbflush.h> | 114 | #include <asm/tlbflush.h> |
114 | #include <asm/page.h> | 115 | #include <asm/page.h> |
@@ -313,7 +314,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
313 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
314 | int node); | 315 | int node); |
315 | static int enable_cpucache(struct kmem_cache *cachep); | 316 | static int enable_cpucache(struct kmem_cache *cachep); |
316 | static void cache_reap(void *unused); | 317 | static void cache_reap(struct work_struct *unused); |
317 | 318 | ||
318 | /* | 319 | /* |
319 | * This function must be completely optimized away if a constant is passed to | 320 | * This function must be completely optimized away if a constant is passed to |
@@ -730,7 +731,10 @@ static inline void init_lock_keys(void) | |||
730 | } | 731 | } |
731 | #endif | 732 | #endif |
732 | 733 | ||
733 | /* Guard access to the cache-chain. */ | 734 | /* |
735 | * 1. Guard access to the cache-chain. | ||
736 | * 2. Protect sanity of cpu_online_map against cpu hotplug events | ||
737 | */ | ||
734 | static DEFINE_MUTEX(cache_chain_mutex); | 738 | static DEFINE_MUTEX(cache_chain_mutex); |
735 | static struct list_head cache_chain; | 739 | static struct list_head cache_chain; |
736 | 740 | ||
@@ -753,7 +757,7 @@ int slab_is_available(void) | |||
753 | return g_cpucache_up == FULL; | 757 | return g_cpucache_up == FULL; |
754 | } | 758 | } |
755 | 759 | ||
756 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 760 | static DEFINE_PER_CPU(struct delayed_work, reap_work); |
757 | 761 | ||
758 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 762 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
759 | { | 763 | { |
@@ -866,6 +870,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
866 | dump_stack(); | 870 | dump_stack(); |
867 | } | 871 | } |
868 | 872 | ||
873 | /* | ||
874 | * By default on NUMA we use alien caches to stage the freeing of | ||
875 | * objects allocated from other nodes. This causes massive memory | ||
876 | * inefficiencies when using fake NUMA setup to split memory into a | ||
877 | * large number of small nodes, so it can be disabled on the command | ||
878 | * line | ||
879 | */ | ||
880 | |||
881 | static int use_alien_caches __read_mostly = 1; | ||
882 | static int __init noaliencache_setup(char *s) | ||
883 | { | ||
884 | use_alien_caches = 0; | ||
885 | return 1; | ||
886 | } | ||
887 | __setup("noaliencache", noaliencache_setup); | ||
888 | |||
869 | #ifdef CONFIG_NUMA | 889 | #ifdef CONFIG_NUMA |
870 | /* | 890 | /* |
871 | * Special reaping functions for NUMA systems called from cache_reap(). | 891 | * Special reaping functions for NUMA systems called from cache_reap(). |
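The hunk above introduces the use_alien_caches toggle and its noaliencache boot option; alien caches stage frees of objects that belong to another NUMA node and hand them back in batches. Here is a rough user-space model of that staging idea, with all names, node counts and batch sizes invented for illustration.

/*
 * User-space sketch (not kernel code) of the idea behind alien caches:
 * frees of objects whose home is a remote node are staged in a small
 * per-node array and handed back in batches, unless the feature is
 * switched off (the "noaliencache" case), in which case every free
 * goes straight back.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES	2
#define ALIEN_BATCH	4

static int use_alien_caches = 1;	/* 0 would model "noaliencache" */

struct alien_cache {
	int count;
	void *objs[ALIEN_BATCH];
};

static struct alien_cache alien[NR_NODES];

/* Pretend to return an object to its home node's free list. */
static void free_to_node(void *obj, int node)
{
	printf("node %d takes back %p\n", node, obj);
	free(obj);
}

/* Drain one staging array back to its home node. */
static void flush_alien(int node)
{
	int i;

	for (i = 0; i < alien[node].count; i++)
		free_to_node(alien[node].objs[i], node);
	alien[node].count = 0;
}

/* Free an object whose home is 'node'; stage it if alien caching is on. */
static void free_remote(void *obj, int node)
{
	if (!use_alien_caches) {
		free_to_node(obj, node);
		return;
	}
	if (alien[node].count == ALIEN_BATCH)
		flush_alien(node);
	alien[node].objs[alien[node].count++] = obj;
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)
		free_remote(malloc(32), i % NR_NODES);
	for (i = 0; i < NR_NODES; i++)
		flush_alien(i);
	return 0;
}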
@@ -883,7 +903,7 @@ static void init_reap_node(int cpu) | |||
883 | if (node == MAX_NUMNODES) | 903 | if (node == MAX_NUMNODES) |
884 | node = first_node(node_online_map); | 904 | node = first_node(node_online_map); |
885 | 905 | ||
886 | __get_cpu_var(reap_node) = node; | 906 | per_cpu(reap_node, cpu) = node; |
887 | } | 907 | } |
888 | 908 | ||
889 | static void next_reap_node(void) | 909 | static void next_reap_node(void) |
@@ -916,17 +936,18 @@ static void next_reap_node(void) | |||
916 | */ | 936 | */ |
917 | static void __devinit start_cpu_timer(int cpu) | 937 | static void __devinit start_cpu_timer(int cpu) |
918 | { | 938 | { |
919 | struct work_struct *reap_work = &per_cpu(reap_work, cpu); | 939 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); |
920 | 940 | ||
921 | /* | 941 | /* |
922 | * When this gets called from do_initcalls via cpucache_init(), | 942 | * When this gets called from do_initcalls via cpucache_init(), |
923 | * init_workqueues() has already run, so keventd will be setup | 943 | * init_workqueues() has already run, so keventd will be setup |
924 | * at that time. | 944 | * at that time. |
925 | */ | 945 | */ |
926 | if (keventd_up() && reap_work->func == NULL) { | 946 | if (keventd_up() && reap_work->work.func == NULL) { |
927 | init_reap_node(cpu); | 947 | init_reap_node(cpu); |
928 | INIT_WORK(reap_work, cache_reap, NULL); | 948 | INIT_DELAYED_WORK(reap_work, cache_reap); |
929 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | 949 | schedule_delayed_work_on(cpu, reap_work, |
950 | __round_jiffies_relative(HZ, cpu)); | ||
930 | } | 951 | } |
931 | } | 952 | } |
932 | 953 | ||
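start_cpu_timer() now converts to delayed_work and schedules the reaper with __round_jiffies_relative(), so the per-CPU timers expire on rounded boundaries and their wakeups can be batched. A simplified, stand-alone illustration of the rounding effect follows; the in-kernel helper also skews the expiry per CPU, which is omitted here, and HZ and "jiffies" are modelled as plain integers.

/*
 * Why rounding helps: if every periodic timer rounds its expiry to a
 * whole-second boundary, several of them fire in the same tick and the
 * CPU wakes up once instead of several times.
 */
#include <stdio.h>

#define HZ 250

/* Round an absolute expiry (in jiffies) up to the next whole second. */
static unsigned long round_to_second(unsigned long expiry)
{
	return ((expiry + HZ - 1) / HZ) * HZ;
}

/* Round a relative delay so the resulting absolute expiry is aligned. */
static unsigned long round_relative(unsigned long delay, unsigned long now)
{
	return round_to_second(now + delay) - now;
}

int main(void)
{
	unsigned long now = 12345;	/* arbitrary current jiffies value */
	unsigned long delays[] = { HZ + 3, HZ + 17, 2 * HZ - 5 };
	unsigned int i;

	for (i = 0; i < sizeof(delays) / sizeof(delays[0]); i++)
		printf("delay %lu -> %lu (expires at %lu)\n",
		       delays[i], round_relative(delays[i], now),
		       now + round_relative(delays[i], now));
	return 0;
}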
@@ -996,7 +1017,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep, | |||
996 | return NULL; | 1017 | return NULL; |
997 | } | 1018 | } |
998 | 1019 | ||
999 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | 1020 | static inline void *____cache_alloc_node(struct kmem_cache *cachep, |
1000 | gfp_t flags, int nodeid) | 1021 | gfp_t flags, int nodeid) |
1001 | { | 1022 | { |
1002 | return NULL; | 1023 | return NULL; |
@@ -1004,7 +1025,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep, | |||
1004 | 1025 | ||
1005 | #else /* CONFIG_NUMA */ | 1026 | #else /* CONFIG_NUMA */ |
1006 | 1027 | ||
1007 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1028 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
1008 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1029 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
1009 | 1030 | ||
1010 | static struct array_cache **alloc_alien_cache(int node, int limit) | 1031 | static struct array_cache **alloc_alien_cache(int node, int limit) |
@@ -1114,7 +1135,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1114 | * Make sure we are not freeing a object from another node to the array | 1135 | * Make sure we are not freeing a object from another node to the array |
1115 | * cache on this cpu. | 1136 | * cache on this cpu. |
1116 | */ | 1137 | */ |
1117 | if (likely(slabp->nodeid == node)) | 1138 | if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) |
1118 | return 0; | 1139 | return 0; |
1119 | 1140 | ||
1120 | l3 = cachep->nodelists[node]; | 1141 | l3 = cachep->nodelists[node]; |
@@ -1192,7 +1213,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1192 | list_for_each_entry(cachep, &cache_chain, next) { | 1213 | list_for_each_entry(cachep, &cache_chain, next) { |
1193 | struct array_cache *nc; | 1214 | struct array_cache *nc; |
1194 | struct array_cache *shared; | 1215 | struct array_cache *shared; |
1195 | struct array_cache **alien; | 1216 | struct array_cache **alien = NULL; |
1196 | 1217 | ||
1197 | nc = alloc_arraycache(node, cachep->limit, | 1218 | nc = alloc_arraycache(node, cachep->limit, |
1198 | cachep->batchcount); | 1219 | cachep->batchcount); |
@@ -1204,9 +1225,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1204 | if (!shared) | 1225 | if (!shared) |
1205 | goto bad; | 1226 | goto bad; |
1206 | 1227 | ||
1207 | alien = alloc_alien_cache(node, cachep->limit); | 1228 | if (use_alien_caches) { |
1208 | if (!alien) | 1229 | alien = alloc_alien_cache(node, cachep->limit); |
1209 | goto bad; | 1230 | if (!alien) |
1231 | goto bad; | ||
1232 | } | ||
1210 | cachep->array[cpu] = nc; | 1233 | cachep->array[cpu] = nc; |
1211 | l3 = cachep->nodelists[node]; | 1234 | l3 = cachep->nodelists[node]; |
1212 | BUG_ON(!l3); | 1235 | BUG_ON(!l3); |
@@ -1230,12 +1253,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1230 | kfree(shared); | 1253 | kfree(shared); |
1231 | free_alien_cache(alien); | 1254 | free_alien_cache(alien); |
1232 | } | 1255 | } |
1233 | mutex_unlock(&cache_chain_mutex); | ||
1234 | break; | 1256 | break; |
1235 | case CPU_ONLINE: | 1257 | case CPU_ONLINE: |
1258 | mutex_unlock(&cache_chain_mutex); | ||
1236 | start_cpu_timer(cpu); | 1259 | start_cpu_timer(cpu); |
1237 | break; | 1260 | break; |
1238 | #ifdef CONFIG_HOTPLUG_CPU | 1261 | #ifdef CONFIG_HOTPLUG_CPU |
1262 | case CPU_DOWN_PREPARE: | ||
1263 | mutex_lock(&cache_chain_mutex); | ||
1264 | break; | ||
1265 | case CPU_DOWN_FAILED: | ||
1266 | mutex_unlock(&cache_chain_mutex); | ||
1267 | break; | ||
1239 | case CPU_DEAD: | 1268 | case CPU_DEAD: |
1240 | /* | 1269 | /* |
1241 | * Even if all the cpus of a node are down, we don't free the | 1270 | * Even if all the cpus of a node are down, we don't free the |
@@ -1246,8 +1275,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1246 | * gets destroyed at kmem_cache_destroy(). | 1275 | * gets destroyed at kmem_cache_destroy(). |
1247 | */ | 1276 | */ |
1248 | /* fall thru */ | 1277 | /* fall thru */ |
1278 | #endif | ||
1249 | case CPU_UP_CANCELED: | 1279 | case CPU_UP_CANCELED: |
1250 | mutex_lock(&cache_chain_mutex); | ||
1251 | list_for_each_entry(cachep, &cache_chain, next) { | 1280 | list_for_each_entry(cachep, &cache_chain, next) { |
1252 | struct array_cache *nc; | 1281 | struct array_cache *nc; |
1253 | struct array_cache *shared; | 1282 | struct array_cache *shared; |
@@ -1308,11 +1337,9 @@ free_array_cache: | |||
1308 | } | 1337 | } |
1309 | mutex_unlock(&cache_chain_mutex); | 1338 | mutex_unlock(&cache_chain_mutex); |
1310 | break; | 1339 | break; |
1311 | #endif | ||
1312 | } | 1340 | } |
1313 | return NOTIFY_OK; | 1341 | return NOTIFY_OK; |
1314 | bad: | 1342 | bad: |
1315 | mutex_unlock(&cache_chain_mutex); | ||
1316 | return NOTIFY_BAD; | 1343 | return NOTIFY_BAD; |
1317 | } | 1344 | } |
1318 | 1345 | ||
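Taken together, the notifier changes above make cache_chain_mutex double as the hotplug guard: it is acquired when a CPU begins a transition (CPU_DOWN_PREPARE, and the existing CPU_UP_PREPARE path) and released when the transition completes or fails. The following is a small sketch of roughly that pairing, with the mutex reduced to a plain flag and the event names kept only for readability.

/*
 * User-space sketch of the notifier locking pattern: the "chain lock"
 * (cache_chain_mutex in the real code) is taken when a CPU starts
 * coming up or going down and released when the transition completes
 * or fails, so every callback in between sees a stable cache chain and
 * a stable cpu_online_map.
 */
#include <assert.h>
#include <stdio.h>

enum cpu_event {
	CPU_UP_PREPARE,
	CPU_ONLINE,
	CPU_UP_CANCELED,
	CPU_DOWN_PREPARE,
	CPU_DOWN_FAILED,
	CPU_DEAD,
};

static int chain_locked;	/* stand-in for cache_chain_mutex */

static void cpuup_callback(enum cpu_event event, int cpu)
{
	switch (event) {
	case CPU_UP_PREPARE:
	case CPU_DOWN_PREPARE:
		assert(!chain_locked);
		chain_locked = 1;
		printf("cpu %d: transition begins, chain locked\n", cpu);
		break;
	case CPU_ONLINE:
	case CPU_UP_CANCELED:
	case CPU_DOWN_FAILED:
	case CPU_DEAD:
		assert(chain_locked);
		printf("cpu %d: transition ends, chain unlocked\n", cpu);
		chain_locked = 0;
		break;
	}
}

int main(void)
{
	cpuup_callback(CPU_DOWN_PREPARE, 1);	/* lock held over the attempt */
	cpuup_callback(CPU_DOWN_FAILED, 1);	/* ...dropped when it fails   */

	cpuup_callback(CPU_UP_PREPARE, 2);
	cpuup_callback(CPU_ONLINE, 2);
	return 0;
}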
@@ -1580,12 +1607,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1580 | flags |= __GFP_COMP; | 1607 | flags |= __GFP_COMP; |
1581 | #endif | 1608 | #endif |
1582 | 1609 | ||
1583 | /* | 1610 | flags |= cachep->gfpflags; |
1584 | * Under NUMA we want memory on the indicated node. We will handle | ||
1585 | * the needed fallback ourselves since we want to serve from our | ||
1586 | * per node object lists first for other nodes. | ||
1587 | */ | ||
1588 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
1589 | 1611 | ||
1590 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1612 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1591 | if (!page) | 1613 | if (!page) |
@@ -2098,15 +2120,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2098 | } | 2120 | } |
2099 | 2121 | ||
2100 | /* | 2122 | /* |
2101 | * Prevent CPUs from coming and going. | 2123 | * We use cache_chain_mutex to ensure a consistent view of |
2102 | * lock_cpu_hotplug() nests outside cache_chain_mutex | 2124 | * cpu_online_map as well. Please see cpuup_callback |
2103 | */ | 2125 | */ |
2104 | lock_cpu_hotplug(); | ||
2105 | |||
2106 | mutex_lock(&cache_chain_mutex); | 2126 | mutex_lock(&cache_chain_mutex); |
2107 | 2127 | ||
2108 | list_for_each_entry(pc, &cache_chain, next) { | 2128 | list_for_each_entry(pc, &cache_chain, next) { |
2109 | mm_segment_t old_fs = get_fs(); | ||
2110 | char tmp; | 2129 | char tmp; |
2111 | int res; | 2130 | int res; |
2112 | 2131 | ||
@@ -2115,9 +2134,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2115 | * destroy its slab cache and no-one else reuses the vmalloc | 2134 | * destroy its slab cache and no-one else reuses the vmalloc |
2116 | * area of the module. Print a warning. | 2135 | * area of the module. Print a warning. |
2117 | */ | 2136 | */ |
2118 | set_fs(KERNEL_DS); | 2137 | res = probe_kernel_address(pc->name, tmp); |
2119 | res = __get_user(tmp, pc->name); | ||
2120 | set_fs(old_fs); | ||
2121 | if (res) { | 2138 | if (res) { |
2122 | printk("SLAB: cache with size %d has lost its name\n", | 2139 | printk("SLAB: cache with size %d has lost its name\n", |
2123 | pc->buffer_size); | 2140 | pc->buffer_size); |
@@ -2197,25 +2214,24 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2197 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2214 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) |
2198 | ralign = BYTES_PER_WORD; | 2215 | ralign = BYTES_PER_WORD; |
2199 | 2216 | ||
2200 | /* 2) arch mandated alignment: disables debug if necessary */ | 2217 | /* 2) arch mandated alignment */ |
2201 | if (ralign < ARCH_SLAB_MINALIGN) { | 2218 | if (ralign < ARCH_SLAB_MINALIGN) { |
2202 | ralign = ARCH_SLAB_MINALIGN; | 2219 | ralign = ARCH_SLAB_MINALIGN; |
2203 | if (ralign > BYTES_PER_WORD) | ||
2204 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2205 | } | 2220 | } |
2206 | /* 3) caller mandated alignment: disables debug if necessary */ | 2221 | /* 3) caller mandated alignment */ |
2207 | if (ralign < align) { | 2222 | if (ralign < align) { |
2208 | ralign = align; | 2223 | ralign = align; |
2209 | if (ralign > BYTES_PER_WORD) | ||
2210 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2211 | } | 2224 | } |
2225 | /* disable debug if necessary */ | ||
2226 | if (ralign > BYTES_PER_WORD) | ||
2227 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2212 | /* | 2228 | /* |
2213 | * 4) Store it. | 2229 | * 4) Store it. |
2214 | */ | 2230 | */ |
2215 | align = ralign; | 2231 | align = ralign; |
2216 | 2232 | ||
2217 | /* Get cache's description obj. */ | 2233 | /* Get cache's description obj. */ |
2218 | cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); | 2234 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); |
2219 | if (!cachep) | 2235 | if (!cachep) |
2220 | goto oops; | 2236 | goto oops; |
2221 | 2237 | ||
@@ -2326,7 +2342,6 @@ oops: | |||
2326 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2342 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2327 | name); | 2343 | name); |
2328 | mutex_unlock(&cache_chain_mutex); | 2344 | mutex_unlock(&cache_chain_mutex); |
2329 | unlock_cpu_hotplug(); | ||
2330 | return cachep; | 2345 | return cachep; |
2331 | } | 2346 | } |
2332 | EXPORT_SYMBOL(kmem_cache_create); | 2347 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2444,6 +2459,7 @@ out: | |||
2444 | return nr_freed; | 2459 | return nr_freed; |
2445 | } | 2460 | } |
2446 | 2461 | ||
2462 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | ||
2447 | static int __cache_shrink(struct kmem_cache *cachep) | 2463 | static int __cache_shrink(struct kmem_cache *cachep) |
2448 | { | 2464 | { |
2449 | int ret = 0, i = 0; | 2465 | int ret = 0, i = 0; |
@@ -2474,9 +2490,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2474 | */ | 2490 | */ |
2475 | int kmem_cache_shrink(struct kmem_cache *cachep) | 2491 | int kmem_cache_shrink(struct kmem_cache *cachep) |
2476 | { | 2492 | { |
2493 | int ret; | ||
2477 | BUG_ON(!cachep || in_interrupt()); | 2494 | BUG_ON(!cachep || in_interrupt()); |
2478 | 2495 | ||
2479 | return __cache_shrink(cachep); | 2496 | mutex_lock(&cache_chain_mutex); |
2497 | ret = __cache_shrink(cachep); | ||
2498 | mutex_unlock(&cache_chain_mutex); | ||
2499 | return ret; | ||
2480 | } | 2500 | } |
2481 | EXPORT_SYMBOL(kmem_cache_shrink); | 2501 | EXPORT_SYMBOL(kmem_cache_shrink); |
2482 | 2502 | ||
@@ -2500,23 +2520,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2500 | { | 2520 | { |
2501 | BUG_ON(!cachep || in_interrupt()); | 2521 | BUG_ON(!cachep || in_interrupt()); |
2502 | 2522 | ||
2503 | /* Don't let CPUs to come and go */ | ||
2504 | lock_cpu_hotplug(); | ||
2505 | |||
2506 | /* Find the cache in the chain of caches. */ | 2523 | /* Find the cache in the chain of caches. */ |
2507 | mutex_lock(&cache_chain_mutex); | 2524 | mutex_lock(&cache_chain_mutex); |
2508 | /* | 2525 | /* |
2509 | * the chain is never empty, cache_cache is never destroyed | 2526 | * the chain is never empty, cache_cache is never destroyed |
2510 | */ | 2527 | */ |
2511 | list_del(&cachep->next); | 2528 | list_del(&cachep->next); |
2512 | mutex_unlock(&cache_chain_mutex); | ||
2513 | |||
2514 | if (__cache_shrink(cachep)) { | 2529 | if (__cache_shrink(cachep)) { |
2515 | slab_error(cachep, "Can't free all objects"); | 2530 | slab_error(cachep, "Can't free all objects"); |
2516 | mutex_lock(&cache_chain_mutex); | ||
2517 | list_add(&cachep->next, &cache_chain); | 2531 | list_add(&cachep->next, &cache_chain); |
2518 | mutex_unlock(&cache_chain_mutex); | 2532 | mutex_unlock(&cache_chain_mutex); |
2519 | unlock_cpu_hotplug(); | ||
2520 | return; | 2533 | return; |
2521 | } | 2534 | } |
2522 | 2535 | ||
@@ -2524,7 +2537,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2524 | synchronize_rcu(); | 2537 | synchronize_rcu(); |
2525 | 2538 | ||
2526 | __kmem_cache_destroy(cachep); | 2539 | __kmem_cache_destroy(cachep); |
2527 | unlock_cpu_hotplug(); | 2540 | mutex_unlock(&cache_chain_mutex); |
2528 | } | 2541 | } |
2529 | EXPORT_SYMBOL(kmem_cache_destroy); | 2542 | EXPORT_SYMBOL(kmem_cache_destroy); |
2530 | 2543 | ||
@@ -2548,7 +2561,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2548 | if (OFF_SLAB(cachep)) { | 2561 | if (OFF_SLAB(cachep)) { |
2549 | /* Slab management obj is off-slab. */ | 2562 | /* Slab management obj is off-slab. */ |
2550 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2563 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
2551 | local_flags, nodeid); | 2564 | local_flags & ~GFP_THISNODE, nodeid); |
2552 | if (!slabp) | 2565 | if (!slabp) |
2553 | return NULL; | 2566 | return NULL; |
2554 | } else { | 2567 | } else { |
@@ -2618,7 +2631,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2618 | 2631 | ||
2619 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2632 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2620 | { | 2633 | { |
2621 | if (flags & SLAB_DMA) | 2634 | if (flags & GFP_DMA) |
2622 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2635 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
2623 | else | 2636 | else |
2624 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2637 | BUG_ON(cachep->gfpflags & GFP_DMA); |
@@ -2689,10 +2702,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2689 | * Grow (by 1) the number of slabs within a cache. This is called by | 2702 | * Grow (by 1) the number of slabs within a cache. This is called by |
2690 | * kmem_cache_alloc() when there are no active objs left in a cache. | 2703 | * kmem_cache_alloc() when there are no active objs left in a cache. |
2691 | */ | 2704 | */ |
2692 | static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2705 | static int cache_grow(struct kmem_cache *cachep, |
2706 | gfp_t flags, int nodeid, void *objp) | ||
2693 | { | 2707 | { |
2694 | struct slab *slabp; | 2708 | struct slab *slabp; |
2695 | void *objp; | ||
2696 | size_t offset; | 2709 | size_t offset; |
2697 | gfp_t local_flags; | 2710 | gfp_t local_flags; |
2698 | unsigned long ctor_flags; | 2711 | unsigned long ctor_flags; |
@@ -2702,12 +2715,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2702 | * Be lazy and only check for valid flags here, keeping it out of the | 2715 | * Be lazy and only check for valid flags here, keeping it out of the |
2703 | * critical path in kmem_cache_alloc(). | 2716 | * critical path in kmem_cache_alloc(). |
2704 | */ | 2717 | */ |
2705 | BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); | 2718 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); |
2706 | if (flags & SLAB_NO_GROW) | 2719 | if (flags & __GFP_NO_GROW) |
2707 | return 0; | 2720 | return 0; |
2708 | 2721 | ||
2709 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2722 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
2710 | local_flags = (flags & SLAB_LEVEL_MASK); | 2723 | local_flags = (flags & GFP_LEVEL_MASK); |
2711 | if (!(local_flags & __GFP_WAIT)) | 2724 | if (!(local_flags & __GFP_WAIT)) |
2712 | /* | 2725 | /* |
2713 | * Not allowed to sleep. Need to tell a constructor about | 2726 | * Not allowed to sleep. Need to tell a constructor about |
@@ -2744,12 +2757,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2744 | * Get mem for the objs. Attempt to allocate a physical page from | 2757 | * Get mem for the objs. Attempt to allocate a physical page from |
2745 | * 'nodeid'. | 2758 | * 'nodeid'. |
2746 | */ | 2759 | */ |
2747 | objp = kmem_getpages(cachep, flags, nodeid); | 2760 | if (!objp) |
2761 | objp = kmem_getpages(cachep, flags, nodeid); | ||
2748 | if (!objp) | 2762 | if (!objp) |
2749 | goto failed; | 2763 | goto failed; |
2750 | 2764 | ||
2751 | /* Get slab management. */ | 2765 | /* Get slab management. */ |
2752 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); | 2766 | slabp = alloc_slabmgmt(cachep, objp, offset, |
2767 | local_flags & ~GFP_THISNODE, nodeid); | ||
2753 | if (!slabp) | 2768 | if (!slabp) |
2754 | goto opps1; | 2769 | goto opps1; |
2755 | 2770 | ||
@@ -2987,7 +3002,7 @@ alloc_done: | |||
2987 | 3002 | ||
2988 | if (unlikely(!ac->avail)) { | 3003 | if (unlikely(!ac->avail)) { |
2989 | int x; | 3004 | int x; |
2990 | x = cache_grow(cachep, flags, node); | 3005 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
2991 | 3006 | ||
2992 | /* cache_grow can reenable interrupts, then ac could change. */ | 3007 | /* cache_grow can reenable interrupts, then ac could change. */ |
2993 | ac = cpu_cache_get(cachep); | 3008 | ac = cpu_cache_get(cachep); |
@@ -3063,18 +3078,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3063 | 3078 | ||
3064 | cachep->ctor(objp, cachep, ctor_flags); | 3079 | cachep->ctor(objp, cachep, ctor_flags); |
3065 | } | 3080 | } |
3081 | #if ARCH_SLAB_MINALIGN | ||
3082 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | ||
3083 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | ||
3084 | objp, ARCH_SLAB_MINALIGN); | ||
3085 | } | ||
3086 | #endif | ||
3066 | return objp; | 3087 | return objp; |
3067 | } | 3088 | } |
3068 | #else | 3089 | #else |
3069 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) | 3090 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) |
3070 | #endif | 3091 | #endif |
3071 | 3092 | ||
3093 | #ifdef CONFIG_FAILSLAB | ||
3094 | |||
3095 | static struct failslab_attr { | ||
3096 | |||
3097 | struct fault_attr attr; | ||
3098 | |||
3099 | u32 ignore_gfp_wait; | ||
3100 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3101 | struct dentry *ignore_gfp_wait_file; | ||
3102 | #endif | ||
3103 | |||
3104 | } failslab = { | ||
3105 | .attr = FAULT_ATTR_INITIALIZER, | ||
3106 | .ignore_gfp_wait = 1, | ||
3107 | }; | ||
3108 | |||
3109 | static int __init setup_failslab(char *str) | ||
3110 | { | ||
3111 | return setup_fault_attr(&failslab.attr, str); | ||
3112 | } | ||
3113 | __setup("failslab=", setup_failslab); | ||
3114 | |||
3115 | static int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3116 | { | ||
3117 | if (cachep == &cache_cache) | ||
3118 | return 0; | ||
3119 | if (flags & __GFP_NOFAIL) | ||
3120 | return 0; | ||
3121 | if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) | ||
3122 | return 0; | ||
3123 | |||
3124 | return should_fail(&failslab.attr, obj_size(cachep)); | ||
3125 | } | ||
3126 | |||
3127 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3128 | |||
3129 | static int __init failslab_debugfs(void) | ||
3130 | { | ||
3131 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
3132 | struct dentry *dir; | ||
3133 | int err; | ||
3134 | |||
3135 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
3136 | if (err) | ||
3137 | return err; | ||
3138 | dir = failslab.attr.dentries.dir; | ||
3139 | |||
3140 | failslab.ignore_gfp_wait_file = | ||
3141 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
3142 | &failslab.ignore_gfp_wait); | ||
3143 | |||
3144 | if (!failslab.ignore_gfp_wait_file) { | ||
3145 | err = -ENOMEM; | ||
3146 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
3147 | cleanup_fault_attr_dentries(&failslab.attr); | ||
3148 | } | ||
3149 | |||
3150 | return err; | ||
3151 | } | ||
3152 | |||
3153 | late_initcall(failslab_debugfs); | ||
3154 | |||
3155 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
3156 | |||
3157 | #else /* CONFIG_FAILSLAB */ | ||
3158 | |||
3159 | static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3160 | { | ||
3161 | return 0; | ||
3162 | } | ||
3163 | |||
3164 | #endif /* CONFIG_FAILSLAB */ | ||
3165 | |||
3072 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3166 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3073 | { | 3167 | { |
3074 | void *objp; | 3168 | void *objp; |
3075 | struct array_cache *ac; | 3169 | struct array_cache *ac; |
3076 | 3170 | ||
3077 | check_irq_off(); | 3171 | check_irq_off(); |
3172 | |||
3173 | if (should_failslab(cachep, flags)) | ||
3174 | return NULL; | ||
3175 | |||
3078 | ac = cpu_cache_get(cachep); | 3176 | ac = cpu_cache_get(cachep); |
3079 | if (likely(ac->avail)) { | 3177 | if (likely(ac->avail)) { |
3080 | STATS_INC_ALLOCHIT(cachep); | 3178 | STATS_INC_ALLOCHIT(cachep); |
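The CONFIG_FAILSLAB block above wires slab allocations into the new fault-injection framework: a failslab= boot parameter and optional debugfs knobs configure when should_failslab() forces an allocation to fail before any real work is done. Below is a toy user-space version of that hook; the attribute structure is far smaller than the real fault_attr and the names are illustrative.

/*
 * A small attribute structure decides whether a given allocation should
 * be forced to fail, and the allocator checks it up front.
 */
#include <stdio.h>
#include <stdlib.h>

struct fault_attr {
	unsigned int interval;	/* fail every Nth eligible call */
	unsigned int times;	/* how many failures are still allowed */
	unsigned int count;	/* calls seen so far */
};

static struct fault_attr failslab = {
	.interval = 3,
	.times = 2,
};

static int should_fail(struct fault_attr *attr)
{
	attr->count++;
	if (!attr->times)
		return 0;
	if (attr->count % attr->interval)
		return 0;
	attr->times--;
	return 1;
}

/* Allocation wrapper: bail out early if the injector says so. */
static void *checked_alloc(size_t size, int nofail)
{
	if (!nofail && should_fail(&failslab))
		return NULL;		/* injected failure */
	return malloc(size);
}

int main(void)
{
	int i;

	for (i = 0; i < 8; i++) {
		void *p = checked_alloc(64, 0);

		printf("alloc %d: %s\n", i, p ? "ok" : "injected failure");
		free(p);
	}
	return 0;
}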
@@ -3105,10 +3203,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
3105 | objp = ____cache_alloc(cachep, flags); | 3203 | objp = ____cache_alloc(cachep, flags); |
3106 | /* | 3204 | /* |
3107 | * We may just have run out of memory on the local node. | 3205 | * We may just have run out of memory on the local node. |
3108 | * __cache_alloc_node() knows how to locate memory on other nodes | 3206 | * ____cache_alloc_node() knows how to locate memory on other nodes |
3109 | */ | 3207 | */ |
3110 | if (NUMA_BUILD && !objp) | 3208 | if (NUMA_BUILD && !objp) |
3111 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | 3209 | objp = ____cache_alloc_node(cachep, flags, numa_node_id()); |
3112 | local_irq_restore(save_flags); | 3210 | local_irq_restore(save_flags); |
3113 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3211 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
3114 | caller); | 3212 | caller); |
@@ -3135,15 +3233,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3135 | else if (current->mempolicy) | 3233 | else if (current->mempolicy) |
3136 | nid_alloc = slab_node(current->mempolicy); | 3234 | nid_alloc = slab_node(current->mempolicy); |
3137 | if (nid_alloc != nid_here) | 3235 | if (nid_alloc != nid_here) |
3138 | return __cache_alloc_node(cachep, flags, nid_alloc); | 3236 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3139 | return NULL; | 3237 | return NULL; |
3140 | } | 3238 | } |
3141 | 3239 | ||
3142 | /* | 3240 | /* |
3143 | * Fallback function if there was no memory available and no objects on a | 3241 | * Fallback function if there was no memory available and no objects on a |
3144 | * certain node and we are allowed to fall back. We mimick the behavior of | 3242 | * certain node and fall back is permitted. First we scan all the |
3145 | * the page allocator. We fall back according to a zonelist determined by | 3243 | * available nodelists for available objects. If that fails then we |
3146 | * the policy layer while obeying cpuset constraints. | 3244 | * perform an allocation without specifying a node. This allows the page |
3245 | * allocator to do its reclaim / fallback magic. We then insert the | ||
3246 | * slab into the proper nodelist and then allocate from it. | ||
3147 | */ | 3247 | */ |
3148 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | 3248 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) |
3149 | { | 3249 | { |
@@ -3151,20 +3251,59 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3151 | ->node_zonelists[gfp_zone(flags)]; | 3251 | ->node_zonelists[gfp_zone(flags)]; |
3152 | struct zone **z; | 3252 | struct zone **z; |
3153 | void *obj = NULL; | 3253 | void *obj = NULL; |
3254 | int nid; | ||
3154 | 3255 | ||
3155 | for (z = zonelist->zones; *z && !obj; z++) | 3256 | retry: |
3156 | if (zone_idx(*z) <= ZONE_NORMAL && | 3257 | /* |
3157 | cpuset_zone_allowed(*z, flags)) | 3258 | * Look through allowed nodes for objects available |
3158 | obj = __cache_alloc_node(cache, | 3259 | * from existing per node queues. |
3159 | flags | __GFP_THISNODE, | 3260 | */ |
3160 | zone_to_nid(*z)); | 3261 | for (z = zonelist->zones; *z && !obj; z++) { |
3262 | nid = zone_to_nid(*z); | ||
3263 | |||
3264 | if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) && | ||
3265 | cache->nodelists[nid] && | ||
3266 | cache->nodelists[nid]->free_objects) | ||
3267 | obj = ____cache_alloc_node(cache, | ||
3268 | flags | GFP_THISNODE, nid); | ||
3269 | } | ||
3270 | |||
3271 | if (!obj) { | ||
3272 | /* | ||
3273 | * This allocation will be performed within the constraints | ||
3274 | * of the current cpuset / memory policy requirements. | ||
3275 | * We may trigger various forms of reclaim on the allowed | ||
3276 | * set and go into memory reserves if necessary. | ||
3277 | */ | ||
3278 | obj = kmem_getpages(cache, flags, -1); | ||
3279 | if (obj) { | ||
3280 | /* | ||
3281 | * Insert into the appropriate per node queues | ||
3282 | */ | ||
3283 | nid = page_to_nid(virt_to_page(obj)); | ||
3284 | if (cache_grow(cache, flags, nid, obj)) { | ||
3285 | obj = ____cache_alloc_node(cache, | ||
3286 | flags | GFP_THISNODE, nid); | ||
3287 | if (!obj) | ||
3288 | /* | ||
3289 | * Another processor may allocate the | ||
3290 | * objects in the slab since we are | ||
3291 | * not holding any locks. | ||
3292 | */ | ||
3293 | goto retry; | ||
3294 | } else { | ||
3295 | kmem_freepages(cache, obj); | ||
3296 | obj = NULL; | ||
3297 | } | ||
3298 | } | ||
3299 | } | ||
3161 | return obj; | 3300 | return obj; |
3162 | } | 3301 | } |
3163 | 3302 | ||
3164 | /* | 3303 | /* |
3165 | * A interface to enable slab creation on nodeid | 3304 | * A interface to enable slab creation on nodeid |
3166 | */ | 3305 | */ |
3167 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3306 | static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
3168 | int nodeid) | 3307 | int nodeid) |
3169 | { | 3308 | { |
3170 | struct list_head *entry; | 3309 | struct list_head *entry; |
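The rewritten fallback_alloc() above first scans the allowed nodes for already-free objects and only then asks the page allocator for fresh memory without naming a node, inserting the resulting slab on whichever node the memory landed. Here is a heavily simplified model of that ordering; the queue sizes, node count and page_to_nid_stub() are made up for illustration.

/*
 * Step 1: look through every node's queue for a free object.
 * Step 2: if none, get fresh memory from "anywhere".
 * Step 3: stock the queue of the node it landed on, allocate from it.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES	2
#define QUEUE_DEPTH	4

struct node_queue {
	int free_objects;
	void *objs[QUEUE_DEPTH];
};

static struct node_queue nodelists[NR_NODES];

static void *alloc_from_node(int nid)
{
	struct node_queue *q = &nodelists[nid];

	if (!q->free_objects)
		return NULL;
	return q->objs[--q->free_objects];
}

/* Model "which node did the page allocator give us memory on?" */
static int page_to_nid_stub(void *obj)
{
	return (int)(((uintptr_t)obj >> 6) % NR_NODES);
}

/* Stock a node's queue with one freshly allocated object. */
static void grow_node(int nid, void *obj)
{
	struct node_queue *q = &nodelists[nid];

	if (q->free_objects < QUEUE_DEPTH)
		q->objs[q->free_objects++] = obj;
}

static void *fallback_alloc(void)
{
	void *obj;
	int nid;

	/* 1) look through all nodes for an object that is already free */
	for (nid = 0; nid < NR_NODES; nid++) {
		obj = alloc_from_node(nid);
		if (obj)
			return obj;
	}

	/* 2) nothing queued anywhere: get fresh memory from "anywhere" */
	obj = malloc(64);
	if (!obj)
		return NULL;

	/* 3) insert it on the node it ended up on, then allocate from there */
	nid = page_to_nid_stub(obj);
	grow_node(nid, obj);
	return alloc_from_node(nid);
}

int main(void)
{
	void *p = fallback_alloc();

	printf("got object %p\n", p);
	free(p);
	return 0;
}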
@@ -3213,7 +3352,7 @@ retry: | |||
3213 | 3352 | ||
3214 | must_grow: | 3353 | must_grow: |
3215 | spin_unlock(&l3->list_lock); | 3354 | spin_unlock(&l3->list_lock); |
3216 | x = cache_grow(cachep, flags, nodeid); | 3355 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); |
3217 | if (x) | 3356 | if (x) |
3218 | goto retry; | 3357 | goto retry; |
3219 | 3358 | ||
@@ -3431,35 +3570,59 @@ out: | |||
3431 | * @flags: See kmalloc(). | 3570 | * @flags: See kmalloc(). |
3432 | * @nodeid: node number of the target node. | 3571 | * @nodeid: node number of the target node. |
3433 | * | 3572 | * |
3434 | * Identical to kmem_cache_alloc, except that this function is slow | 3573 | * Identical to kmem_cache_alloc but it will allocate memory on the given |
3435 | * and can sleep. And it will allocate memory on the given node, which | 3574 | * node, which can improve the performance for cpu bound structures. |
3436 | * can improve the performance for cpu bound structures. | 3575 | * |
3437 | * New and improved: it will now make sure that the object gets | 3576 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3438 | * put on the correct node list so that there is no false sharing. | ||
3439 | */ | 3577 | */ |
3440 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3578 | static __always_inline void * |
3579 | __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | ||
3580 | int nodeid, void *caller) | ||
3441 | { | 3581 | { |
3442 | unsigned long save_flags; | 3582 | unsigned long save_flags; |
3443 | void *ptr; | 3583 | void *ptr = NULL; |
3444 | 3584 | ||
3445 | cache_alloc_debugcheck_before(cachep, flags); | 3585 | cache_alloc_debugcheck_before(cachep, flags); |
3446 | local_irq_save(save_flags); | 3586 | local_irq_save(save_flags); |
3447 | 3587 | ||
3448 | if (nodeid == -1 || nodeid == numa_node_id() || | 3588 | if (unlikely(nodeid == -1)) |
3449 | !cachep->nodelists[nodeid]) | 3589 | nodeid = numa_node_id(); |
3450 | ptr = ____cache_alloc(cachep, flags); | ||
3451 | else | ||
3452 | ptr = __cache_alloc_node(cachep, flags, nodeid); | ||
3453 | local_irq_restore(save_flags); | ||
3454 | 3590 | ||
3455 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, | 3591 | if (likely(cachep->nodelists[nodeid])) { |
3456 | __builtin_return_address(0)); | 3592 | if (nodeid == numa_node_id()) { |
3593 | /* | ||
3594 | * Use the locally cached objects if possible. | ||
3595 | * However ____cache_alloc does not allow fallback | ||
3596 | * to other nodes. It may fail while we still have | ||
3597 | * objects on other nodes available. | ||
3598 | */ | ||
3599 | ptr = ____cache_alloc(cachep, flags); | ||
3600 | } | ||
3601 | if (!ptr) { | ||
3602 | /* ___cache_alloc_node can fall back to other nodes */ | ||
3603 | ptr = ____cache_alloc_node(cachep, flags, nodeid); | ||
3604 | } | ||
3605 | } else { | ||
3606 | /* Node not bootstrapped yet */ | ||
3607 | if (!(flags & __GFP_THISNODE)) | ||
3608 | ptr = fallback_alloc(cachep, flags); | ||
3609 | } | ||
3610 | |||
3611 | local_irq_restore(save_flags); | ||
3612 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | ||
3457 | 3613 | ||
3458 | return ptr; | 3614 | return ptr; |
3459 | } | 3615 | } |
3616 | |||
3617 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | ||
3618 | { | ||
3619 | return __cache_alloc_node(cachep, flags, nodeid, | ||
3620 | __builtin_return_address(0)); | ||
3621 | } | ||
3460 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3461 | 3623 | ||
3462 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3624 | static __always_inline void * |
3625 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | ||
3463 | { | 3626 | { |
3464 | struct kmem_cache *cachep; | 3627 | struct kmem_cache *cachep; |
3465 | 3628 | ||
@@ -3468,8 +3631,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3468 | return NULL; | 3631 | return NULL; |
3469 | return kmem_cache_alloc_node(cachep, flags, node); | 3632 | return kmem_cache_alloc_node(cachep, flags, node); |
3470 | } | 3633 | } |
3634 | |||
3635 | #ifdef CONFIG_DEBUG_SLAB | ||
3636 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3637 | { | ||
3638 | return __do_kmalloc_node(size, flags, node, | ||
3639 | __builtin_return_address(0)); | ||
3640 | } | ||
3471 | EXPORT_SYMBOL(__kmalloc_node); | 3641 | EXPORT_SYMBOL(__kmalloc_node); |
3472 | #endif | 3642 | |
3643 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | ||
3644 | int node, void *caller) | ||
3645 | { | ||
3646 | return __do_kmalloc_node(size, flags, node, caller); | ||
3647 | } | ||
3648 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | ||
3649 | #else | ||
3650 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3651 | { | ||
3652 | return __do_kmalloc_node(size, flags, node, NULL); | ||
3653 | } | ||
3654 | EXPORT_SYMBOL(__kmalloc_node); | ||
3655 | #endif /* CONFIG_DEBUG_SLAB */ | ||
3656 | #endif /* CONFIG_NUMA */ | ||
3473 | 3657 | ||
3474 | /** | 3658 | /** |
3475 | * __do_kmalloc - allocate memory | 3659 | * __do_kmalloc - allocate memory |
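The new __do_kmalloc_node()/__kmalloc_node_track_caller() split lets the CONFIG_DEBUG_SLAB build record the call site of each node-aware allocation via __builtin_return_address(0). A minimal user-space sketch of the same wrapper pattern follows; the function names are illustrative, not the kernel's.

/*
 * Every variant funnels into one worker that takes an optional caller
 * tag, so the debug entry point can attribute allocations to call sites
 * while the plain entry point pays nothing.
 */
#include <stdio.h>
#include <stdlib.h>

/* Common worker: all entry points end up here. */
static void *do_alloc(size_t size, void *caller)
{
	void *p = malloc(size);

	if (caller)
		printf("alloc of %zu bytes requested from %p -> %p\n",
		       size, caller, p);
	return p;
}

/* "Debug" entry point: tag the allocation with our caller's address. */
static __attribute__((noinline)) void *kmalloc_debug(size_t size)
{
	return do_alloc(size, __builtin_return_address(0));
}

/* "Non-debug" entry point: no tag, no bookkeeping. */
static __attribute__((noinline)) void *kmalloc_plain(size_t size)
{
	return do_alloc(size, NULL);
}

int main(void)
{
	void *a = kmalloc_debug(128);
	void *b = kmalloc_plain(128);

	free(a);
	free(b);
	return 0;
}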
@@ -3580,13 +3764,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3580 | int node; | 3764 | int node; |
3581 | struct kmem_list3 *l3; | 3765 | struct kmem_list3 *l3; |
3582 | struct array_cache *new_shared; | 3766 | struct array_cache *new_shared; |
3583 | struct array_cache **new_alien; | 3767 | struct array_cache **new_alien = NULL; |
3584 | 3768 | ||
3585 | for_each_online_node(node) { | 3769 | for_each_online_node(node) { |
3586 | 3770 | ||
3587 | new_alien = alloc_alien_cache(node, cachep->limit); | 3771 | if (use_alien_caches) { |
3588 | if (!new_alien) | 3772 | new_alien = alloc_alien_cache(node, cachep->limit); |
3589 | goto fail; | 3773 | if (!new_alien) |
3774 | goto fail; | ||
3775 | } | ||
3590 | 3776 | ||
3591 | new_shared = alloc_arraycache(node, | 3777 | new_shared = alloc_arraycache(node, |
3592 | cachep->shared*cachep->batchcount, | 3778 | cachep->shared*cachep->batchcount, |
@@ -3812,7 +3998,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | |||
3812 | * If we cannot acquire the cache chain mutex then just give up - we'll try | 3998 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
3813 | * again on the next iteration. | 3999 | * again on the next iteration. |
3814 | */ | 4000 | */ |
3815 | static void cache_reap(void *unused) | 4001 | static void cache_reap(struct work_struct *unused) |
3816 | { | 4002 | { |
3817 | struct kmem_cache *searchp; | 4003 | struct kmem_cache *searchp; |
3818 | struct kmem_list3 *l3; | 4004 | struct kmem_list3 *l3; |
@@ -3821,7 +4007,7 @@ static void cache_reap(void *unused) | |||
3821 | if (!mutex_trylock(&cache_chain_mutex)) { | 4007 | if (!mutex_trylock(&cache_chain_mutex)) { |
3822 | /* Give up. Setup the next iteration. */ | 4008 | /* Give up. Setup the next iteration. */ |
3823 | schedule_delayed_work(&__get_cpu_var(reap_work), | 4009 | schedule_delayed_work(&__get_cpu_var(reap_work), |
3824 | REAPTIMEOUT_CPUC); | 4010 | round_jiffies_relative(REAPTIMEOUT_CPUC)); |
3825 | return; | 4011 | return; |
3826 | } | 4012 | } |
3827 | 4013 | ||
@@ -3867,7 +4053,8 @@ next: | |||
3867 | next_reap_node(); | 4053 | next_reap_node(); |
3868 | refresh_cpu_vm_stats(smp_processor_id()); | 4054 | refresh_cpu_vm_stats(smp_processor_id()); |
3869 | /* Set up the next iteration */ | 4055 | /* Set up the next iteration */ |
3870 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 4056 | schedule_delayed_work(&__get_cpu_var(reap_work), |
4057 | round_jiffies_relative(REAPTIMEOUT_CPUC)); | ||
3871 | } | 4058 | } |
3872 | 4059 | ||
3873 | #ifdef CONFIG_PROC_FS | 4060 | #ifdef CONFIG_PROC_FS |
@@ -4035,7 +4222,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4035 | * + further values on SMP and with statistics enabled | 4222 | * + further values on SMP and with statistics enabled |
4036 | */ | 4223 | */ |
4037 | 4224 | ||
4038 | struct seq_operations slabinfo_op = { | 4225 | const struct seq_operations slabinfo_op = { |
4039 | .start = s_start, | 4226 | .start = s_start, |
4040 | .next = s_next, | 4227 | .next = s_next, |
4041 | .stop = s_stop, | 4228 | .stop = s_stop, |
@@ -4233,7 +4420,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4233 | return 0; | 4420 | return 0; |
4234 | } | 4421 | } |
4235 | 4422 | ||
4236 | struct seq_operations slabstats_op = { | 4423 | const struct seq_operations slabstats_op = { |
4237 | .start = leaks_start, | 4424 | .start = leaks_start, |
4238 | .next = s_next, | 4425 | .next = s_next, |
4239 | .stop = s_stop, | 4426 | .stop = s_stop, |
diff --git a/mm/sparse.c b/mm/sparse.c index 86c52ab808..ac26eb0d73 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] | |||
24 | #endif | 24 | #endif |
25 | EXPORT_SYMBOL(mem_section); | 25 | EXPORT_SYMBOL(mem_section); |
26 | 26 | ||
27 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
28 | /* | ||
29 | * If we did not store the node number in the page then we have to | ||
30 | * do a lookup in the section_to_node_table in order to find which | ||
31 | * node the page belongs to. | ||
32 | */ | ||
33 | #if MAX_NUMNODES <= 256 | ||
34 | static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
35 | #else | ||
36 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
37 | #endif | ||
38 | |||
39 | int page_to_nid(struct page *page) | ||
40 | { | ||
41 | return section_to_node_table[page_to_section(page)]; | ||
42 | } | ||
43 | EXPORT_SYMBOL(page_to_nid); | ||
44 | #endif | ||
45 | |||
27 | #ifdef CONFIG_SPARSEMEM_EXTREME | 46 | #ifdef CONFIG_SPARSEMEM_EXTREME |
28 | static struct mem_section *sparse_index_alloc(int nid) | 47 | static struct mem_section *sparse_index_alloc(int nid) |
29 | { | 48 | { |
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid) | |||
49 | struct mem_section *section; | 68 | struct mem_section *section; |
50 | int ret = 0; | 69 | int ret = 0; |
51 | 70 | ||
71 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
72 | section_to_node_table[section_nr] = nid; | ||
73 | #endif | ||
74 | |||
52 | if (mem_section[root]) | 75 | if (mem_section[root]) |
53 | return -EEXIST; | 76 | return -EEXIST; |
54 | 77 | ||
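The sparse.c hunks above add section_to_node_table for configurations where the node id no longer fits in page->flags: each memory section records its node at init time and page_to_nid() becomes a table lookup. Below is a stand-alone sketch of that lookup, using raw physical addresses and a made-up section size in place of struct page.

/*
 * One byte per memory section is enough to remember which node the
 * section belongs to; the per-page lookup is then just a shift and an
 * array index.
 */
#include <stdint.h>
#include <stdio.h>

#define SECTION_SHIFT	27		/* 128 MB sections, for example */
#define NR_SECTIONS	64

static uint8_t section_to_node_table[NR_SECTIONS];

static unsigned int addr_to_section(uint64_t phys_addr)
{
	return (unsigned int)(phys_addr >> SECTION_SHIFT) % NR_SECTIONS;
}

/* Called once per section at init time, like sparse_index_init(). */
static void register_section(uint64_t phys_addr, int nid)
{
	section_to_node_table[addr_to_section(phys_addr)] = (uint8_t)nid;
}

/* The fast-path lookup, standing in for page_to_nid(). */
static int phys_to_nid(uint64_t phys_addr)
{
	return section_to_node_table[addr_to_section(phys_addr)];
}

int main(void)
{
	register_section(0x000000000ULL, 0);	/* first 128 MB on node 0 */
	register_section(0x100000000ULL, 1);	/* memory at 4 GB on node 1 */

	printf("%#llx -> node %d\n", 0x1000ULL, phys_to_nid(0x1000ULL));
	printf("%#llx -> node %d\n", 0x100002000ULL,
	       phys_to_nid(0x100002000ULL));
	return 0;
}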
@@ -211,7 +234,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | |||
211 | struct page *page, *ret; | 234 | struct page *page, *ret; |
212 | unsigned long memmap_size = sizeof(struct page) * nr_pages; | 235 | unsigned long memmap_size = sizeof(struct page) * nr_pages; |
213 | 236 | ||
214 | page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); | 237 | page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); |
215 | if (page) | 238 | if (page) |
216 | goto got_map_page; | 239 | goto got_map_page; |
217 | 240 | ||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page) | |||
57 | { | 57 | { |
58 | page = (struct page *)page_private(page); | 58 | page = (struct page *)page_private(page); |
59 | if (put_page_testzero(page)) { | 59 | if (put_page_testzero(page)) { |
60 | void (*dtor)(struct page *page); | 60 | compound_page_dtor *dtor; |
61 | 61 | ||
62 | dtor = (void (*)(struct page *))page[1].lru.next; | 62 | dtor = get_compound_page_dtor(page); |
63 | (*dtor)(page); | 63 | (*dtor)(page); |
64 | } | 64 | } |
65 | } | 65 | } |
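put_compound_page() above stops open-coding a cast of page[1].lru.next and instead fetches a properly typed compound_page_dtor through get_compound_page_dtor(). A small user-space analogue of keeping a typed destructor next to the object follows; the names are illustrative.

/*
 * A typedef'd destructor stored alongside the object can be called
 * through an accessor without any cast at the call site.
 */
#include <stdio.h>
#include <stdlib.h>

struct blob;
typedef void blob_dtor(struct blob *blob);

struct blob {
	void *data;
	blob_dtor *dtor;	/* typed, so no cast is needed to call it */
};

static void free_blob(struct blob *blob)
{
	printf("destroying blob %p\n", (void *)blob);
	free(blob->data);
	free(blob);
}

static blob_dtor *get_blob_dtor(struct blob *blob)
{
	return blob->dtor;
}

int main(void)
{
	struct blob *b = malloc(sizeof(*b));

	b->data = malloc(64);
	b->dtor = free_blob;

	/* release path: fetch the destructor and invoke it */
	(*get_blob_dtor(b))(b);
	return 0;
}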
@@ -216,7 +216,7 @@ void lru_add_drain(void) | |||
216 | } | 216 | } |
217 | 217 | ||
218 | #ifdef CONFIG_NUMA | 218 | #ifdef CONFIG_NUMA |
219 | static void lru_add_drain_per_cpu(void *dummy) | 219 | static void lru_add_drain_per_cpu(struct work_struct *dummy) |
220 | { | 220 | { |
221 | lru_add_drain(); | 221 | lru_add_drain(); |
222 | } | 222 | } |
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy) | |||
226 | */ | 226 | */ |
227 | int lru_add_drain_all(void) | 227 | int lru_add_drain_all(void) |
228 | { | 228 | { |
229 | return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | 229 | return schedule_on_each_cpu(lru_add_drain_per_cpu); |
230 | } | 230 | } |
231 | 231 | ||
232 | #else | 232 | #else |
@@ -514,5 +514,7 @@ void __init swap_setup(void) | |||
514 | * Right now other parts of the system means that we | 514 | * Right now other parts of the system means that we |
515 | * _really_ don't want to cluster much more | 515 | * _really_ don't want to cluster much more |
516 | */ | 516 | */ |
517 | #ifdef CONFIG_HOTPLUG_CPU | ||
517 | hotcpu_notifier(cpu_swap_callback, 0); | 518 | hotcpu_notifier(cpu_swap_callback, 0); |
519 | #endif | ||
518 | } | 520 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index a15def63f2..b9fc0e5de6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry) | |||
427 | 427 | ||
428 | #ifdef CONFIG_SOFTWARE_SUSPEND | 428 | #ifdef CONFIG_SOFTWARE_SUSPEND |
429 | /* | 429 | /* |
430 | * Find the swap type that corresponds to given device (if any) | 430 | * Find the swap type that corresponds to given device (if any). |
431 | * | 431 | * |
432 | * This is needed for software suspend and is done in such a way that inode | 432 | * @offset - number of the PAGE_SIZE-sized block of the device, starting |
433 | * aliasing is allowed. | 433 | * from 0, in which the swap header is expected to be located. |
434 | * | ||
435 | * This is needed for the suspend to disk (aka swsusp). | ||
434 | */ | 436 | */ |
435 | int swap_type_of(dev_t device) | 437 | int swap_type_of(dev_t device, sector_t offset) |
436 | { | 438 | { |
439 | struct block_device *bdev = NULL; | ||
437 | int i; | 440 | int i; |
438 | 441 | ||
442 | if (device) | ||
443 | bdev = bdget(device); | ||
444 | |||
439 | spin_lock(&swap_lock); | 445 | spin_lock(&swap_lock); |
440 | for (i = 0; i < nr_swapfiles; i++) { | 446 | for (i = 0; i < nr_swapfiles; i++) { |
441 | struct inode *inode; | 447 | struct swap_info_struct *sis = swap_info + i; |
442 | 448 | ||
443 | if (!(swap_info[i].flags & SWP_WRITEOK)) | 449 | if (!(sis->flags & SWP_WRITEOK)) |
444 | continue; | 450 | continue; |
445 | 451 | ||
446 | if (!device) { | 452 | if (!bdev) { |
447 | spin_unlock(&swap_lock); | 453 | spin_unlock(&swap_lock); |
448 | return i; | 454 | return i; |
449 | } | 455 | } |
450 | inode = swap_info[i].swap_file->f_dentry->d_inode; | 456 | if (bdev == sis->bdev) { |
451 | if (S_ISBLK(inode->i_mode) && | 457 | struct swap_extent *se; |
452 | device == MKDEV(imajor(inode), iminor(inode))) { | 458 | |
453 | spin_unlock(&swap_lock); | 459 | se = list_entry(sis->extent_list.next, |
454 | return i; | 460 | struct swap_extent, list); |
461 | if (se->start_block == offset) { | ||
462 | spin_unlock(&swap_lock); | ||
463 | bdput(bdev); | ||
464 | return i; | ||
465 | } | ||
455 | } | 466 | } |
456 | } | 467 | } |
457 | spin_unlock(&swap_lock); | 468 | spin_unlock(&swap_lock); |
469 | if (bdev) | ||
470 | bdput(bdev); | ||
471 | |||
458 | return -ENODEV; | 472 | return -ENODEV; |
459 | } | 473 | } |
460 | 474 | ||
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
931 | } | 945 | } |
932 | } | 946 | } |
933 | 947 | ||
948 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
949 | /* | ||
950 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
951 | * corresponding to given index in swap_info (swap type). | ||
952 | */ | ||
953 | sector_t swapdev_block(int swap_type, pgoff_t offset) | ||
954 | { | ||
955 | struct swap_info_struct *sis; | ||
956 | |||
957 | if (swap_type >= nr_swapfiles) | ||
958 | return 0; | ||
959 | |||
960 | sis = swap_info + swap_type; | ||
961 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
962 | } | ||
963 | #endif /* CONFIG_SOFTWARE_SUSPEND */ | ||
964 | |||
934 | /* | 965 | /* |
935 | * Free all of a swapdev's extent information | 966 | * Free all of a swapdev's extent information |
936 | */ | 967 | */ |
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1274 | 1305 | ||
1275 | mutex_lock(&swapon_mutex); | 1306 | mutex_lock(&swapon_mutex); |
1276 | 1307 | ||
1308 | if (!l) | ||
1309 | return SEQ_START_TOKEN; | ||
1310 | |||
1277 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1311 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1278 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1312 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1279 | continue; | 1313 | continue; |
1280 | if (!l--) | 1314 | if (!--l) |
1281 | return ptr; | 1315 | return ptr; |
1282 | } | 1316 | } |
1283 | 1317 | ||
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1286 | 1320 | ||
1287 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1321 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1288 | { | 1322 | { |
1289 | struct swap_info_struct *ptr = v; | 1323 | struct swap_info_struct *ptr; |
1290 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1324 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; |
1291 | 1325 | ||
1292 | for (++ptr; ptr < endptr; ptr++) { | 1326 | if (v == SEQ_START_TOKEN) |
1327 | ptr = swap_info; | ||
1328 | else { | ||
1329 | ptr = v; | ||
1330 | ptr++; | ||
1331 | } | ||
1332 | |||
1333 | for (; ptr < endptr; ptr++) { | ||
1293 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1334 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1294 | continue; | 1335 | continue; |
1295 | ++*pos; | 1336 | ++*pos; |
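The swap_start()/swap_next() changes above adopt the SEQ_START_TOKEN convention: the first iteration position is a sentinel that tells the show routine to emit the header line, and subsequent positions walk the real swap entries. A self-contained sketch of that iterator shape follows; the sample table is invented.

/*
 * The iterator yields a header token first, then each used entry, so
 * the show() routine can print the header without special-casing the
 * first real element.
 */
#include <stdio.h>

#define SEQ_START_TOKEN	((void *)1)

struct swap_entry {
	const char *name;
	unsigned int pages;
	int used;		/* 0 means "skip this slot" */
};

static struct swap_entry table[] = {
	{ "/dev/sda2", 262144, 1 },
	{ "(unused)",       0, 0 },
	{ "/swapfile", 131072, 1 },
};
#define NR_ENTRIES (sizeof(table) / sizeof(table[0]))

/* start(): position 0 is the header token, not a table entry. */
static void *seq_start(unsigned int pos)
{
	if (pos == 0)
		return SEQ_START_TOKEN;
	/* positions 1..N map onto used table slots */
	for (unsigned int i = 0, seen = 0; i < NR_ENTRIES; i++)
		if (table[i].used && ++seen == pos)
			return &table[i];
	return NULL;
}

/* next(): step from the token to the first entry, or to the next slot. */
static void *seq_next(void *v)
{
	struct swap_entry *p =
		(v == SEQ_START_TOKEN) ? table : (struct swap_entry *)v + 1;

	for (; p < table + NR_ENTRIES; p++)
		if (p->used)
			return p;
	return NULL;
}

static void seq_show(void *v)
{
	if (v == SEQ_START_TOKEN) {
		printf("Filename\t\tSize\n");
		return;
	}
	struct swap_entry *e = v;

	printf("%s\t\t%u\n", e->name, e->pages);
}

int main(void)
{
	for (void *v = seq_start(0); v; v = seq_next(v))
		seq_show(v);
	return 0;
}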
@@ -1310,14 +1351,16 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1310 | struct file *file; | 1351 | struct file *file; |
1311 | int len; | 1352 | int len; |
1312 | 1353 | ||
1313 | if (v == swap_info) | 1354 | if (ptr == SEQ_START_TOKEN) { |
1314 | seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1355 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1356 | return 0; | ||
1357 | } | ||
1315 | 1358 | ||
1316 | file = ptr->swap_file; | 1359 | file = ptr->swap_file; |
1317 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); | 1360 | len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\"); |
1318 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1361 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1319 | len < 40 ? 40 - len : 1, " ", | 1362 | len < 40 ? 40 - len : 1, " ", |
1320 | S_ISBLK(file->f_dentry->d_inode->i_mode) ? | 1363 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1321 | "partition" : "file\t", | 1364 | "partition" : "file\t", |
1322 | ptr->pages << (PAGE_SHIFT - 10), | 1365 | ptr->pages << (PAGE_SHIFT - 10), |
1323 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1366 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1325 | return 0; | 1368 | return 0; |
1326 | } | 1369 | } |
1327 | 1370 | ||
1328 | static struct seq_operations swaps_op = { | 1371 | static const struct seq_operations swaps_op = { |
1329 | .start = swap_start, | 1372 | .start = swap_start, |
1330 | .next = swap_next, | 1373 | .next = swap_next, |
1331 | .stop = swap_stop, | 1374 | .stop = swap_stop, |
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file) | |||
1337 | return seq_open(file, &swaps_op); | 1380 | return seq_open(file, &swaps_op); |
1338 | } | 1381 | } |
1339 | 1382 | ||
1340 | static struct file_operations proc_swaps_operations = { | 1383 | static const struct file_operations proc_swaps_operations = { |
1341 | .open = swaps_open, | 1384 | .open = swaps_open, |
1342 | .read = seq_read, | 1385 | .read = seq_read, |
1343 | .llseek = seq_lseek, | 1386 | .llseek = seq_lseek, |
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1540 | error = -EINVAL; | 1583 | error = -EINVAL; |
1541 | if (!maxpages) | 1584 | if (!maxpages) |
1542 | goto bad_swap; | 1585 | goto bad_swap; |
1586 | if (swapfilesize && maxpages > swapfilesize) { | ||
1587 | printk(KERN_WARNING | ||
1588 | "Swap area shorter than signature indicates\n"); | ||
1589 | goto bad_swap; | ||
1590 | } | ||
1543 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1591 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1544 | goto bad_swap; | 1592 | goto bad_swap; |
1545 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1593 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1567 | goto bad_swap; | 1615 | goto bad_swap; |
1568 | } | 1616 | } |
1569 | 1617 | ||
1570 | if (swapfilesize && maxpages > swapfilesize) { | ||
1571 | printk(KERN_WARNING | ||
1572 | "Swap area shorter than signature indicates\n"); | ||
1573 | error = -EINVAL; | ||
1574 | goto bad_swap; | ||
1575 | } | ||
1576 | if (nr_good_pages) { | 1618 | if (nr_good_pages) { |
1577 | p->swap_map[0] = SWAP_MAP_BAD; | 1619 | p->swap_map[0] = SWAP_MAP_BAD; |
1578 | p->max = maxpages; | 1620 | p->max = maxpages; |
diff --git a/mm/thrash.c b/mm/thrash.c index f4c560b4a2..9ef9071f99 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -7,100 +7,74 @@ | |||
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf |
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
10 | */ | 18 | */ |
19 | |||
11 | #include <linux/jiffies.h> | 20 | #include <linux/jiffies.h> |
12 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
13 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
14 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
15 | 24 | ||
16 | static DEFINE_SPINLOCK(swap_token_lock); | 25 | static DEFINE_SPINLOCK(swap_token_lock); |
17 | static unsigned long swap_token_timeout; | 26 | struct mm_struct *swap_token_mm; |
18 | static unsigned long swap_token_check; | 27 | static unsigned int global_faults; |
19 | struct mm_struct * swap_token_mm = &init_mm; | ||
20 | |||
21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | ||
22 | #define SWAP_TOKEN_TIMEOUT (300 * HZ) | ||
23 | /* | ||
24 | * Currently disabled; Needs further code to work at HZ * 300. | ||
25 | */ | ||
26 | unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; | ||
27 | |||
28 | /* | ||
29 | * Take the token away if the process had no page faults | ||
30 | * in the last interval, or if it has held the token for | ||
31 | * too long. | ||
32 | */ | ||
33 | #define SWAP_TOKEN_ENOUGH_RSS 1 | ||
34 | #define SWAP_TOKEN_TIMED_OUT 2 | ||
35 | static int should_release_swap_token(struct mm_struct *mm) | ||
36 | { | ||
37 | int ret = 0; | ||
38 | if (!mm->recent_pagein) | ||
39 | ret = SWAP_TOKEN_ENOUGH_RSS; | ||
40 | else if (time_after(jiffies, swap_token_timeout)) | ||
41 | ret = SWAP_TOKEN_TIMED_OUT; | ||
42 | mm->recent_pagein = 0; | ||
43 | return ret; | ||
44 | } | ||
45 | 28 | ||
46 | /* | ||
47 | * Try to grab the swapout protection token. We only try to | ||
48 | * grab it once every TOKEN_CHECK_INTERVAL, both to prevent | ||
49 | * SMP lock contention and to check that the process that held | ||
50 | * the token before is no longer thrashing. | ||
51 | */ | ||
52 | void grab_swap_token(void) | 29 | void grab_swap_token(void) |
53 | { | 30 | { |
54 | struct mm_struct *mm; | 31 | int current_interval; |
55 | int reason; | ||
56 | 32 | ||
57 | /* We have the token. Let others know we still need it. */ | 33 | global_faults++; |
58 | if (has_swap_token(current->mm)) { | ||
59 | current->mm->recent_pagein = 1; | ||
60 | if (unlikely(!swap_token_default_timeout)) | ||
61 | disable_swap_token(); | ||
62 | return; | ||
63 | } | ||
64 | |||
65 | if (time_after(jiffies, swap_token_check)) { | ||
66 | 34 | ||
67 | if (!swap_token_default_timeout) { | 35 | current_interval = global_faults - current->mm->faultstamp; |
68 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | ||
69 | return; | ||
70 | } | ||
71 | |||
72 | /* ... or if we recently held the token. */ | ||
73 | if (time_before(jiffies, current->mm->swap_token_time)) | ||
74 | return; | ||
75 | 36 | ||
76 | if (!spin_trylock(&swap_token_lock)) | 37 | if (!spin_trylock(&swap_token_lock)) |
77 | return; | 38 | return; |
78 | 39 | ||
79 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 40 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | ||
42 | current->mm->token_priority = current->mm->token_priority + 2; | ||
43 | swap_token_mm = current->mm; | ||
44 | goto out; | ||
45 | } | ||
80 | 46 | ||
81 | mm = swap_token_mm; | 47 | if (current->mm != swap_token_mm) { |
82 | if ((reason = should_release_swap_token(mm))) { | 48 | if (current_interval < current->mm->last_interval) |
83 | unsigned long eligible = jiffies; | 49 | current->mm->token_priority++; |
84 | if (reason == SWAP_TOKEN_TIMED_OUT) { | 50 | else { |
85 | eligible += swap_token_default_timeout; | 51 | current->mm->token_priority--; |
86 | } | 52 | if (unlikely(current->mm->token_priority < 0)) |
87 | mm->swap_token_time = eligible; | 53 | current->mm->token_priority = 0; |
88 | swap_token_timeout = jiffies + swap_token_default_timeout; | 54 | } |
55 | /* Check if we deserve the token */ | ||
56 | if (current->mm->token_priority > | ||
57 | swap_token_mm->token_priority) { | ||
58 | current->mm->token_priority += 2; | ||
89 | swap_token_mm = current->mm; | 59 | swap_token_mm = current->mm; |
90 | } | 60 | } |
91 | spin_unlock(&swap_token_lock); | 61 | } else { |
62 | /* Token holder came in again! */ | ||
63 | current->mm->token_priority += 2; | ||
92 | } | 64 | } |
93 | return; | 65 | |
66 | out: | ||
67 | current->mm->faultstamp = global_faults; | ||
68 | current->mm->last_interval = current_interval; | ||
69 | spin_unlock(&swap_token_lock); | ||
70 | return; | ||
94 | } | 71 | } |
95 | 72 | ||
96 | /* Called on process exit. */ | 73 | /* Called on process exit. */ |
97 | void __put_swap_token(struct mm_struct *mm) | 74 | void __put_swap_token(struct mm_struct *mm) |
98 | { | 75 | { |
99 | spin_lock(&swap_token_lock); | 76 | spin_lock(&swap_token_lock); |
100 | if (likely(mm == swap_token_mm)) { | 77 | if (likely(mm == swap_token_mm)) |
101 | mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 78 | swap_token_mm = NULL; |
102 | swap_token_mm = &init_mm; | ||
103 | swap_token_check = jiffies; | ||
104 | } | ||
105 | spin_unlock(&swap_token_lock); | 79 | spin_unlock(&swap_token_lock); |
106 | } | 80 | } |
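The rewritten grab_swap_token() above drops the jiffies-based timeout in favour of the priority hand-off described in the new header comment: contend sooner than last time and your priority rises, win the token and it is boosted by two so it does not bounce straight back. A toy user-space model of that bookkeeping, with a simplified task structure and no locking, purely to make the hand-off visible:

/* Toy model of the priority-based swap-token hand-off; "struct task"
 * and the single fault counter stand in for mm_struct and the real
 * global_faults, and the spinlock is omitted. */
struct task {
	int token_priority;
	unsigned int faultstamp;	/* global_faults at the last attempt */
	unsigned int last_interval;	/* faults between the last two attempts */
};

static struct task *token_holder;
static unsigned int global_faults;

static void try_grab_token(struct task *t)
{
	unsigned int interval = ++global_faults - t->faultstamp;

	if (token_holder == NULL || token_holder == t) {
		/* First come first served; the holder keeps getting boosted. */
		t->token_priority += 2;
		token_holder = t;
	} else {
		/* Contending sooner than last time means heavier thrashing. */
		if (interval < t->last_interval)
			t->token_priority++;
		else if (t->token_priority > 0)
			t->token_priority--;
		if (t->token_priority > token_holder->token_priority) {
			t->token_priority += 2;
			token_holder = t;	/* take the token over */
		}
	}
	t->faultstamp = global_faults;
	t->last_interval = interval;
}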
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 5f2cbf0f15..c7f6e1914b 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
79 | d_instantiate(dentry, inode); | 79 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 80 | inode->i_nlink = 0; /* It is unlinked */ |
81 | 81 | ||
82 | file->f_vfsmnt = mntget(shm_mnt); | 82 | file->f_path.mnt = mntget(shm_mnt); |
83 | file->f_dentry = dentry; | 83 | file->f_path.dentry = dentry; |
84 | file->f_mapping = inode->i_mapping; | 84 | file->f_mapping = inode->i_mapping; |
85 | file->f_op = &ramfs_file_operations; | 85 | file->f_op = &ramfs_file_operations; |
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | 86 | file->f_mode = FMODE_WRITE | FMODE_READ; |
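Both this tiny-shmem.c hunk and the earlier swapfile.c one are part of the tree-wide conversion that folds a file's vfsmount and dentry pointers into a single embedded path. A layout sketch of what the converted accessors assume; the field names come from the hunks, but the exact header layout is an assumption, not copied from the tree:

/* Assumed shape of the embedded path after the conversion. */
struct path {
	struct vfsmount *mnt;		/* was file->f_vfsmnt */
	struct dentry *dentry;		/* was file->f_dentry */
};

/* so file->f_vfsmnt becomes file->f_path.mnt and
 * file->f_dentry becomes file->f_path.dentry, as seen above. */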
diff --git a/mm/truncate.c b/mm/truncate.c index 11ca480701..9bfb8e8538 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 17 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
17 | do_invalidatepage */ | 18 | do_invalidatepage */ |
18 | 19 | ||
@@ -69,7 +70,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
69 | if (PagePrivate(page)) | 70 | if (PagePrivate(page)) |
70 | do_invalidatepage(page, 0); | 71 | do_invalidatepage(page, 0); |
71 | 72 | ||
72 | clear_page_dirty(page); | 73 | if (test_clear_page_dirty(page)) |
74 | task_io_account_cancelled_write(PAGE_CACHE_SIZE); | ||
73 | ClearPageUptodate(page); | 75 | ClearPageUptodate(page); |
74 | ClearPageMappedToDisk(page); | 76 | ClearPageMappedToDisk(page); |
75 | remove_from_page_cache(page); | 77 | remove_from_page_cache(page); |
@@ -96,7 +98,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
96 | return 0; | 98 | return 0; |
97 | 99 | ||
98 | ret = remove_mapping(mapping, page); | 100 | ret = remove_mapping(mapping, page); |
99 | ClearPageUptodate(page); | ||
100 | 101 | ||
101 | return ret; | 102 | return ret; |
102 | } | 103 | } |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 750ab6ed13..86897ee792 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | |||
160 | return err; | 160 | return err; |
161 | } | 161 | } |
162 | 162 | ||
163 | struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, | 163 | static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, |
164 | unsigned long start, unsigned long end, int node) | 164 | unsigned long start, unsigned long end, |
165 | int node, gfp_t gfp_mask) | ||
165 | { | 166 | { |
166 | struct vm_struct **p, *tmp, *area; | 167 | struct vm_struct **p, *tmp, *area; |
167 | unsigned long align = 1; | 168 | unsigned long align = 1; |
168 | unsigned long addr; | 169 | unsigned long addr; |
169 | 170 | ||
171 | BUG_ON(in_interrupt()); | ||
170 | if (flags & VM_IOREMAP) { | 172 | if (flags & VM_IOREMAP) { |
171 | int bit = fls(size); | 173 | int bit = fls(size); |
172 | 174 | ||
@@ -179,16 +181,13 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, | |||
179 | } | 181 | } |
180 | addr = ALIGN(start, align); | 182 | addr = ALIGN(start, align); |
181 | size = PAGE_ALIGN(size); | 183 | size = PAGE_ALIGN(size); |
184 | if (unlikely(!size)) | ||
185 | return NULL; | ||
182 | 186 | ||
183 | area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); | 187 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); |
184 | if (unlikely(!area)) | 188 | if (unlikely(!area)) |
185 | return NULL; | 189 | return NULL; |
186 | 190 | ||
187 | if (unlikely(!size)) { | ||
188 | kfree (area); | ||
189 | return NULL; | ||
190 | } | ||
191 | |||
192 | /* | 191 | /* |
193 | * We always allocate a guard page. | 192 | * We always allocate a guard page. |
194 | */ | 193 | */ |
@@ -236,7 +235,7 @@ out: | |||
236 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 235 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
237 | unsigned long start, unsigned long end) | 236 | unsigned long start, unsigned long end) |
238 | { | 237 | { |
239 | return __get_vm_area_node(size, flags, start, end, -1); | 238 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); |
240 | } | 239 | } |
241 | 240 | ||
242 | /** | 241 | /** |
@@ -253,9 +252,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
253 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); | 252 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); |
254 | } | 253 | } |
255 | 254 | ||
256 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) | 255 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, |
256 | int node, gfp_t gfp_mask) | ||
257 | { | 257 | { |
258 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); | 258 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, |
259 | gfp_mask); | ||
259 | } | 260 | } |
260 | 261 | ||
261 | /* Caller must hold vmlist_lock */ | 262 | /* Caller must hold vmlist_lock */ |
@@ -428,8 +429,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
428 | if (array_size > PAGE_SIZE) { | 429 | if (array_size > PAGE_SIZE) { |
429 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); | 430 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); |
430 | area->flags |= VM_VPAGES; | 431 | area->flags |= VM_VPAGES; |
431 | } else | 432 | } else { |
432 | pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); | 433 | pages = kmalloc_node(array_size, |
434 | (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)), | ||
435 | node); | ||
436 | } | ||
433 | area->pages = pages; | 437 | area->pages = pages; |
434 | if (!area->pages) { | 438 | if (!area->pages) { |
435 | remove_vm_area(area->addr); | 439 | remove_vm_area(area->addr); |
@@ -484,7 +488,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
484 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 488 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
485 | return NULL; | 489 | return NULL; |
486 | 490 | ||
487 | area = get_vm_area_node(size, VM_ALLOC, node); | 491 | area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); |
488 | if (!area) | 492 | if (!area) |
489 | return NULL; | 493 | return NULL; |
490 | 494 | ||
@@ -525,11 +529,12 @@ void *vmalloc_user(unsigned long size) | |||
525 | void *ret; | 529 | void *ret; |
526 | 530 | ||
527 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 531 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
528 | write_lock(&vmlist_lock); | 532 | if (ret) { |
529 | area = __find_vm_area(ret); | 533 | write_lock(&vmlist_lock); |
530 | area->flags |= VM_USERMAP; | 534 | area = __find_vm_area(ret); |
531 | write_unlock(&vmlist_lock); | 535 | area->flags |= VM_USERMAP; |
532 | 536 | write_unlock(&vmlist_lock); | |
537 | } | ||
533 | return ret; | 538 | return ret; |
534 | } | 539 | } |
535 | EXPORT_SYMBOL(vmalloc_user); | 540 | EXPORT_SYMBOL(vmalloc_user); |
@@ -598,11 +603,12 @@ void *vmalloc_32_user(unsigned long size) | |||
598 | void *ret; | 603 | void *ret; |
599 | 604 | ||
600 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | 605 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); |
601 | write_lock(&vmlist_lock); | 606 | if (ret) { |
602 | area = __find_vm_area(ret); | 607 | write_lock(&vmlist_lock); |
603 | area->flags |= VM_USERMAP; | 608 | area = __find_vm_area(ret); |
604 | write_unlock(&vmlist_lock); | 609 | area->flags |= VM_USERMAP; |
605 | 610 | write_unlock(&vmlist_lock); | |
611 | } | ||
606 | return ret; | 612 | return ret; |
607 | } | 613 | } |
608 | EXPORT_SYMBOL(vmalloc_32_user); | 614 | EXPORT_SYMBOL(vmalloc_32_user); |
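Two separate fixes land in vmalloc.c: __get_vm_area_node() and get_vm_area_node() grow a gfp_mask parameter so the vm_struct itself is allocated under the caller's constraints (and __GFP_ZERO is masked off the page-array kmalloc), and vmalloc_user()/vmalloc_32_user() no longer dereference the result of a failed allocation. A hedged sketch of a caller of the new signature; the function name and size are illustrative only:

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical caller: -1 selects any node, and the mask is forwarded
 * to the internal kmalloc_node() for the vm_struct. */
static void *example_reserve_area(void)
{
	struct vm_struct *area;

	area = get_vm_area_node(4 * PAGE_SIZE, VM_ALLOC, -1, GFP_KERNEL);
	if (!area)
		return NULL;
	/* map pages into area->addr here, as __vmalloc_area_node() does */
	return area->addr;
}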
diff --git a/mm/vmscan.c b/mm/vmscan.c index eca70310ad..093f5fe6dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
39 | #include <linux/freezer.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include <asm/div64.h> | 42 | #include <asm/div64.h> |
@@ -378,6 +379,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
378 | return PAGE_CLEAN; | 379 | return PAGE_CLEAN; |
379 | } | 380 | } |
380 | 381 | ||
382 | /* | ||
383 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | ||
384 | * someone else has a ref on the page, abort and return 0. If it was | ||
385 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
386 | * this page. | ||
387 | */ | ||
381 | int remove_mapping(struct address_space *mapping, struct page *page) | 388 | int remove_mapping(struct address_space *mapping, struct page *page) |
382 | { | 389 | { |
383 | BUG_ON(!PageLocked(page)); | 390 | BUG_ON(!PageLocked(page)); |
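The new comment documents the remove_mapping() contract: it returns 0 if the page is dirty or somebody else still holds a reference, 1 if the page was detached, and it expects the caller to own exactly one reference to a locked page. As a hedged sketch, a caller following that contract (the invalidate_complete_page() hunk in truncate.c earlier is the in-tree example) looks roughly like:

/* Sketch only: locking context and list handling are elided, and the
 * identifiers are the ones used in the hunks shown here. */
if (remove_mapping(mapping, page)) {
	/* Detached from the page cache; ours is the last reference. */
	unlock_page(page);
	/* the caller may now release the page */
} else {
	/* Dirty or still referenced elsewhere: leave it in the cache. */
	unlock_page(page);
}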
@@ -717,6 +724,20 @@ done: | |||
717 | return nr_reclaimed; | 724 | return nr_reclaimed; |
718 | } | 725 | } |
719 | 726 | ||
727 | /* | ||
728 | * We are about to scan this zone at a certain priority level. If that priority | ||
729 | * level is smaller (ie: more urgent) than the previous priority, then note | ||
730 | * that priority level within the zone. This is done so that when the next | ||
731 | * process comes in to scan this zone, it will immediately start out at this | ||
732 | * priority level rather than having to build up its own scanning priority. | ||
733 | * Here, this priority affects only the reclaim-mapped threshold. | ||
734 | */ | ||
735 | static inline void note_zone_scanning_priority(struct zone *zone, int priority) | ||
736 | { | ||
737 | if (priority < zone->prev_priority) | ||
738 | zone->prev_priority = priority; | ||
739 | } | ||
740 | |||
720 | static inline int zone_is_near_oom(struct zone *zone) | 741 | static inline int zone_is_near_oom(struct zone *zone) |
721 | { | 742 | { |
722 | return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; | 743 | return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; |
@@ -740,7 +761,7 @@ static inline int zone_is_near_oom(struct zone *zone) | |||
740 | * But we had to alter page->flags anyway. | 761 | * But we had to alter page->flags anyway. |
741 | */ | 762 | */ |
742 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 763 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
743 | struct scan_control *sc) | 764 | struct scan_control *sc, int priority) |
744 | { | 765 | { |
745 | unsigned long pgmoved; | 766 | unsigned long pgmoved; |
746 | int pgdeactivate = 0; | 767 | int pgdeactivate = 0; |
@@ -764,7 +785,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
764 | * `distress' is a measure of how much trouble we're having | 785 | * `distress' is a measure of how much trouble we're having |
765 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | 786 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. |
766 | */ | 787 | */ |
767 | distress = 100 >> zone->prev_priority; | 788 | distress = 100 >> min(zone->prev_priority, priority); |
768 | 789 | ||
769 | /* | 790 | /* |
770 | * The point of this algorithm is to decide when to start | 791 | * The point of this algorithm is to decide when to start |
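shrink_active_list() now receives the current scanning priority and caps the remembered zone->prev_priority with it, so a stale, relaxed value cannot understate memory pressure. Since distress = 100 >> min(prev_priority, priority), the number climbs steeply as the effective priority approaches zero; a quick worked check (DEF_PRIORITY is 12 in this tree):

#include <stdio.h>

/* Worked values for distress = 100 >> min(prev_priority, priority). */
int main(void)
{
	int prio;

	for (prio = 12; prio >= 0; prio -= 3)
		printf("effective priority %2d -> distress %3d\n",
		       prio, 100 >> prio);
	/* prints 0, 0, 1, 12, 100 as the priority drops 12, 9, 6, 3, 0 */
	return 0;
}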
@@ -916,7 +937,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
916 | nr_to_scan = min(nr_active, | 937 | nr_to_scan = min(nr_active, |
917 | (unsigned long)sc->swap_cluster_max); | 938 | (unsigned long)sc->swap_cluster_max); |
918 | nr_active -= nr_to_scan; | 939 | nr_active -= nr_to_scan; |
919 | shrink_active_list(nr_to_scan, zone, sc); | 940 | shrink_active_list(nr_to_scan, zone, sc, priority); |
920 | } | 941 | } |
921 | 942 | ||
922 | if (nr_inactive) { | 943 | if (nr_inactive) { |
@@ -966,9 +987,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
966 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 987 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
967 | continue; | 988 | continue; |
968 | 989 | ||
969 | zone->temp_priority = priority; | 990 | note_zone_scanning_priority(zone, priority); |
970 | if (zone->prev_priority > priority) | ||
971 | zone->prev_priority = priority; | ||
972 | 991 | ||
973 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 992 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
974 | continue; /* Let kswapd poll it */ | 993 | continue; /* Let kswapd poll it */ |
@@ -1018,7 +1037,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1018 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1037 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1019 | continue; | 1038 | continue; |
1020 | 1039 | ||
1021 | zone->temp_priority = DEF_PRIORITY; | ||
1022 | lru_pages += zone->nr_active + zone->nr_inactive; | 1040 | lru_pages += zone->nr_active + zone->nr_inactive; |
1023 | } | 1041 | } |
1024 | 1042 | ||
@@ -1053,19 +1071,28 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1053 | 1071 | ||
1054 | /* Take a nap, wait for some writeback to complete */ | 1072 | /* Take a nap, wait for some writeback to complete */ |
1055 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 1073 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) |
1056 | blk_congestion_wait(WRITE, HZ/10); | 1074 | congestion_wait(WRITE, HZ/10); |
1057 | } | 1075 | } |
1058 | /* top priority shrink_caches still had more to do? don't OOM, then */ | 1076 | /* top priority shrink_caches still had more to do? don't OOM, then */ |
1059 | if (!sc.all_unreclaimable) | 1077 | if (!sc.all_unreclaimable) |
1060 | ret = 1; | 1078 | ret = 1; |
1061 | out: | 1079 | out: |
1080 | /* | ||
1081 | * Now that we've scanned all the zones at this priority level, note | ||
1082 | * that level within the zone so that the next thread which performs | ||
1083 | * scanning of this zone will immediately start out at this priority | ||
1084 | * level. This affects only the decision whether or not to bring | ||
1085 | * mapped pages onto the inactive list. | ||
1086 | */ | ||
1087 | if (priority < 0) | ||
1088 | priority = 0; | ||
1062 | for (i = 0; zones[i] != 0; i++) { | 1089 | for (i = 0; zones[i] != 0; i++) { |
1063 | struct zone *zone = zones[i]; | 1090 | struct zone *zone = zones[i]; |
1064 | 1091 | ||
1065 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1092 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1066 | continue; | 1093 | continue; |
1067 | 1094 | ||
1068 | zone->prev_priority = zone->temp_priority; | 1095 | zone->prev_priority = priority; |
1069 | } | 1096 | } |
1070 | return ret; | 1097 | return ret; |
1071 | } | 1098 | } |
@@ -1105,6 +1132,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1105 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1132 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1106 | .swappiness = vm_swappiness, | 1133 | .swappiness = vm_swappiness, |
1107 | }; | 1134 | }; |
1135 | /* | ||
1136 | * temp_priority is used to remember the scanning priority at which | ||
1137 | * this zone was successfully refilled to free_pages == pages_high. | ||
1138 | */ | ||
1139 | int temp_priority[MAX_NR_ZONES]; | ||
1108 | 1140 | ||
1109 | loop_again: | 1141 | loop_again: |
1110 | total_scanned = 0; | 1142 | total_scanned = 0; |
@@ -1112,11 +1144,8 @@ loop_again: | |||
1112 | sc.may_writepage = !laptop_mode; | 1144 | sc.may_writepage = !laptop_mode; |
1113 | count_vm_event(PAGEOUTRUN); | 1145 | count_vm_event(PAGEOUTRUN); |
1114 | 1146 | ||
1115 | for (i = 0; i < pgdat->nr_zones; i++) { | 1147 | for (i = 0; i < pgdat->nr_zones; i++) |
1116 | struct zone *zone = pgdat->node_zones + i; | 1148 | temp_priority[i] = DEF_PRIORITY; |
1117 | |||
1118 | zone->temp_priority = DEF_PRIORITY; | ||
1119 | } | ||
1120 | 1149 | ||
1121 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1150 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1122 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1151 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
@@ -1144,11 +1173,12 @@ loop_again: | |||
1144 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1173 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1145 | 0, 0)) { | 1174 | 0, 0)) { |
1146 | end_zone = i; | 1175 | end_zone = i; |
1147 | goto scan; | 1176 | break; |
1148 | } | 1177 | } |
1149 | } | 1178 | } |
1150 | goto out; | 1179 | if (i < 0) |
1151 | scan: | 1180 | goto out; |
1181 | |||
1152 | for (i = 0; i <= end_zone; i++) { | 1182 | for (i = 0; i <= end_zone; i++) { |
1153 | struct zone *zone = pgdat->node_zones + i; | 1183 | struct zone *zone = pgdat->node_zones + i; |
1154 | 1184 | ||
@@ -1177,10 +1207,9 @@ scan: | |||
1177 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1207 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1178 | end_zone, 0)) | 1208 | end_zone, 0)) |
1179 | all_zones_ok = 0; | 1209 | all_zones_ok = 0; |
1180 | zone->temp_priority = priority; | 1210 | temp_priority[i] = priority; |
1181 | if (zone->prev_priority > priority) | ||
1182 | zone->prev_priority = priority; | ||
1183 | sc.nr_scanned = 0; | 1211 | sc.nr_scanned = 0; |
1212 | note_zone_scanning_priority(zone, priority); | ||
1184 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1213 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1185 | reclaim_state->reclaimed_slab = 0; | 1214 | reclaim_state->reclaimed_slab = 0; |
1186 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1215 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
@@ -1208,7 +1237,7 @@ scan: | |||
1208 | * another pass across the zones. | 1237 | * another pass across the zones. |
1209 | */ | 1238 | */ |
1210 | if (total_scanned && priority < DEF_PRIORITY - 2) | 1239 | if (total_scanned && priority < DEF_PRIORITY - 2) |
1211 | blk_congestion_wait(WRITE, HZ/10); | 1240 | congestion_wait(WRITE, HZ/10); |
1212 | 1241 | ||
1213 | /* | 1242 | /* |
1214 | * We do this so kswapd doesn't build up large priorities for | 1243 | * We do this so kswapd doesn't build up large priorities for |
@@ -1220,13 +1249,21 @@ scan: | |||
1220 | break; | 1249 | break; |
1221 | } | 1250 | } |
1222 | out: | 1251 | out: |
1252 | /* | ||
1253 | * Note within each zone the priority level at which this zone was | ||
1254 | * brought into a happy state. So that the next thread which scans this | ||
1255 | * zone will start out at that priority level. | ||
1256 | */ | ||
1223 | for (i = 0; i < pgdat->nr_zones; i++) { | 1257 | for (i = 0; i < pgdat->nr_zones; i++) { |
1224 | struct zone *zone = pgdat->node_zones + i; | 1258 | struct zone *zone = pgdat->node_zones + i; |
1225 | 1259 | ||
1226 | zone->prev_priority = zone->temp_priority; | 1260 | zone->prev_priority = temp_priority[i]; |
1227 | } | 1261 | } |
1228 | if (!all_zones_ok) { | 1262 | if (!all_zones_ok) { |
1229 | cond_resched(); | 1263 | cond_resched(); |
1264 | |||
1265 | try_to_freeze(); | ||
1266 | |||
1230 | goto loop_again; | 1267 | goto loop_again; |
1231 | } | 1268 | } |
1232 | 1269 | ||
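balance_pgdat() now keeps its scanning priorities in a temp_priority[] array on its own stack and publishes them into zone->prev_priority only once, at the out: label, rather than mutating a field in the shared struct zone while the scan is still in flight. The shape of that change in miniature, reusing the identifiers from the hunks above:

/* Per-invocation scratch, copied back once at the end; a sketch of the
 * pattern, not a complete balance_pgdat(). */
int temp_priority[MAX_NR_ZONES];
int i;

for (i = 0; i < pgdat->nr_zones; i++)
	temp_priority[i] = DEF_PRIORITY;

/* ... the priority loop records temp_priority[i] = priority for each
 *     zone it scans, as in the hunk above ... */

for (i = 0; i < pgdat->nr_zones; i++)
	pgdat->node_zones[i].prev_priority = temp_priority[i];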
@@ -1352,7 +1389,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | |||
1352 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | 1389 | if (zone->nr_scan_active >= nr_pages || pass > 3) { |
1353 | zone->nr_scan_active = 0; | 1390 | zone->nr_scan_active = 0; |
1354 | nr_to_scan = min(nr_pages, zone->nr_active); | 1391 | nr_to_scan = min(nr_pages, zone->nr_active); |
1355 | shrink_active_list(nr_to_scan, zone, sc); | 1392 | shrink_active_list(nr_to_scan, zone, sc, prio); |
1356 | } | 1393 | } |
1357 | } | 1394 | } |
1358 | 1395 | ||
@@ -1452,7 +1489,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1452 | goto out; | 1489 | goto out; |
1453 | 1490 | ||
1454 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 1491 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
1455 | blk_congestion_wait(WRITE, HZ / 10); | 1492 | congestion_wait(WRITE, HZ / 10); |
1456 | } | 1493 | } |
1457 | 1494 | ||
1458 | lru_pages = 0; | 1495 | lru_pages = 0; |
@@ -1476,7 +1513,6 @@ out: | |||
1476 | } | 1513 | } |
1477 | #endif | 1514 | #endif |
1478 | 1515 | ||
1479 | #ifdef CONFIG_HOTPLUG_CPU | ||
1480 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 1516 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
1481 | not required for correctness. So if the last cpu in a node goes | 1517 | not required for correctness. So if the last cpu in a node goes |
1482 | away, we get changed to run anywhere: as the first one comes back, | 1518 | away, we get changed to run anywhere: as the first one comes back, |
@@ -1497,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1497 | } | 1533 | } |
1498 | return NOTIFY_OK; | 1534 | return NOTIFY_OK; |
1499 | } | 1535 | } |
1500 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1501 | 1536 | ||
1502 | /* | 1537 | /* |
1503 | * This kswapd start function will be called by init and node-hot-add. | 1538 | * This kswapd start function will be called by init and node-hot-add. |
@@ -1608,6 +1643,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1608 | */ | 1643 | */ |
1609 | priority = ZONE_RECLAIM_PRIORITY; | 1644 | priority = ZONE_RECLAIM_PRIORITY; |
1610 | do { | 1645 | do { |
1646 | note_zone_scanning_priority(zone, priority); | ||
1611 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1647 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1612 | priority--; | 1648 | priority--; |
1613 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 1649 | } while (priority >= 0 && nr_reclaimed < nr_pages); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 45b124e012..dc005a0c96 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
430 | return 0; | 430 | return 0; |
431 | } | 431 | } |
432 | 432 | ||
433 | struct seq_operations fragmentation_op = { | 433 | const struct seq_operations fragmentation_op = { |
434 | .start = frag_start, | 434 | .start = frag_start, |
435 | .next = frag_next, | 435 | .next = frag_next, |
436 | .stop = frag_stop, | 436 | .stop = frag_stop, |
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = { | |||
452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ | 452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ |
453 | TEXT_FOR_HIGHMEM(xx) | 453 | TEXT_FOR_HIGHMEM(xx) |
454 | 454 | ||
455 | static char *vmstat_text[] = { | 455 | static const char * const vmstat_text[] = { |
456 | /* Zoned VM counters */ | 456 | /* Zoned VM counters */ |
457 | "nr_anon_pages", | 457 | "nr_anon_pages", |
458 | "nr_mapped", | 458 | "nr_mapped", |
@@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
587 | seq_printf(m, | 587 | seq_printf(m, |
588 | "\n all_unreclaimable: %u" | 588 | "\n all_unreclaimable: %u" |
589 | "\n prev_priority: %i" | 589 | "\n prev_priority: %i" |
590 | "\n temp_priority: %i" | ||
591 | "\n start_pfn: %lu", | 590 | "\n start_pfn: %lu", |
592 | zone->all_unreclaimable, | 591 | zone->all_unreclaimable, |
593 | zone->prev_priority, | 592 | zone->prev_priority, |
594 | zone->temp_priority, | ||
595 | zone->zone_start_pfn); | 593 | zone->zone_start_pfn); |
596 | spin_unlock_irqrestore(&zone->lock, flags); | 594 | spin_unlock_irqrestore(&zone->lock, flags); |
597 | seq_putc(m, '\n'); | 595 | seq_putc(m, '\n'); |
@@ -599,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
599 | return 0; | 597 | return 0; |
600 | } | 598 | } |
601 | 599 | ||
602 | struct seq_operations zoneinfo_op = { | 600 | const struct seq_operations zoneinfo_op = { |
603 | .start = frag_start, /* iterate over all zones. The same as in | 601 | .start = frag_start, /* iterate over all zones. The same as in |
604 | * fragmentation. */ | 602 | * fragmentation. */ |
605 | .next = frag_next, | 603 | .next = frag_next, |
@@ -662,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg) | |||
662 | m->private = NULL; | 660 | m->private = NULL; |
663 | } | 661 | } |
664 | 662 | ||
665 | struct seq_operations vmstat_op = { | 663 | const struct seq_operations vmstat_op = { |
666 | .start = vmstat_start, | 664 | .start = vmstat_start, |
667 | .next = vmstat_next, | 665 | .next = vmstat_next, |
668 | .stop = vmstat_stop, | 666 | .stop = vmstat_stop, |
@@ -681,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
681 | void *hcpu) | 679 | void *hcpu) |
682 | { | 680 | { |
683 | switch (action) { | 681 | switch (action) { |
684 | case CPU_UP_PREPARE: | 682 | case CPU_UP_PREPARE: |
685 | case CPU_UP_CANCELED: | 683 | case CPU_UP_CANCELED: |
686 | case CPU_DEAD: | 684 | case CPU_DEAD: |
687 | refresh_zone_stat_thresholds(); | 685 | refresh_zone_stat_thresholds(); |
688 | break; | 686 | break; |
689 | default: | 687 | default: |
690 | break; | 688 | break; |
691 | } | 689 | } |
692 | return NOTIFY_OK; | 690 | return NOTIFY_OK; |
693 | } | 691 | } |
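The remaining vmstat.c and swapfile.c changes constify the seq_operations and file_operations tables (swaps_op, proc_swaps_operations, fragmentation_op, zoneinfo_op, vmstat_op), letting them live in read-only data. Registration is unchanged; a sketch reusing the swapfile.c names from the hunks above, assuming seq_open() was constified in the same series:

/* The const qualifier does not change the open path. */
static const struct seq_operations swaps_op = {
	.start	= swap_start,
	.next	= swap_next,
	.stop	= swap_stop,
	.show	= swap_show,
};

static int swaps_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &swaps_op);
}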