diff options
author | James Morris <jmorris@namei.org> | 2009-02-05 19:01:45 -0500 |
---|---|---|
committer | James Morris <jmorris@namei.org> | 2009-02-05 19:01:45 -0500 |
commit | cb5629b10d64a8006622ce3a52bc887d91057d69 (patch) | |
tree | 7c06d8f30783115e3384721046258ce615b129c5 /mm | |
parent | 8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff) | |
parent | f01d1d546abb2f4028b5299092f529eefb01253a (diff) |
Merge branch 'master' into next
Conflicts:
fs/namei.c
Manually merged per:
diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode);
if (err == -EAGAIN)
- err = vfs_permission(nd, MAY_EXEC);
+ err = inode_permission(nd->path.dentry->d_inode,
+ MAY_EXEC);
+ if (!err)
+ err = ima_path_check(&nd->path, MAY_EXEC);
if (err)
break;
@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
flag &= ~O_TRUNC;
}
- error = vfs_permission(nd, acc_mode);
+ error = inode_permission(inode, acc_mode);
if (error)
return error;
+
- error = ima_path_check(&nd->path,
++ error = ima_path_check(path,
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+ if (error)
+ return error;
/*
* An append-only file must be opened in append mode for writing.
*/
Signed-off-by: James Morris <jmorris@namei.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/fadvise.c | 18 | ||||
-rw-r--r-- | mm/filemap.c | 56 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/fremap.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 46 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 1898 | ||||
-rw-r--r-- | mm/memory.c | 234 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 20 | ||||
-rw-r--r-- | mm/mempolicy.c | 24 | ||||
-rw-r--r-- | mm/migrate.c | 139 | ||||
-rw-r--r-- | mm/mincore.c | 4 | ||||
-rw-r--r-- | mm/mlock.c | 64 | ||||
-rw-r--r-- | mm/mmap.c | 117 | ||||
-rw-r--r-- | mm/mprotect.c | 12 | ||||
-rw-r--r-- | mm/mremap.c | 8 | ||||
-rw-r--r-- | mm/msync.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 1054 | ||||
-rw-r--r-- | mm/oom_kill.c | 119 | ||||
-rw-r--r-- | mm/page-writeback.c | 254 | ||||
-rw-r--r-- | mm/page_alloc.c | 143 | ||||
-rw-r--r-- | mm/page_cgroup.c | 209 | ||||
-rw-r--r-- | mm/page_io.c | 6 | ||||
-rw-r--r-- | mm/pdflush.c | 16 | ||||
-rw-r--r-- | mm/rmap.c | 60 | ||||
-rw-r--r-- | mm/shmem.c | 104 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 24 | ||||
-rw-r--r-- | mm/swap.c | 77 | ||||
-rw-r--r-- | mm/swap_state.c | 35 | ||||
-rw-r--r-- | mm/swapfile.c | 607 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 134 | ||||
-rw-r--r-- | mm/vmalloc.c | 57 | ||||
-rw-r--r-- | mm/vmscan.c | 328 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
40 files changed, 4061 insertions, 1854 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 5b5790f8a816..a5b77811fdf2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -181,12 +181,6 @@ config MIGRATION | |||
181 | example on NUMA systems to put pages nearer to the processors accessing | 181 | example on NUMA systems to put pages nearer to the processors accessing |
182 | the page. | 182 | the page. |
183 | 183 | ||
184 | config RESOURCES_64BIT | ||
185 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
186 | default 64BIT | ||
187 | help | ||
188 | This option allows memory and IO resources to be 64 bit. | ||
189 | |||
190 | config PHYS_ADDR_T_64BIT | 184 | config PHYS_ADDR_T_64BIT |
191 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 185 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
192 | 186 | ||
diff --git a/mm/Makefile b/mm/Makefile index 51c27709cc7c..72255be57f89 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | 11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | 15 | ||
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
21 | obj-$(CONFIG_NUMA) += mempolicy.o | 21 | obj-$(CONFIG_NUMA) += mempolicy.o |
22 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 22 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
24 | obj-$(CONFIG_SHMEM) += shmem.o | ||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 24 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | ||
27 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
29 | obj-$(CONFIG_SLAB) += slab.o | 27 | obj-$(CONFIG_SLAB) += slab.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 801c08b046e6..8e8587444132 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -24,9 +24,9 @@ static void bdi_debug_init(void) | |||
24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
25 | { | 25 | { |
26 | struct backing_dev_info *bdi = m->private; | 26 | struct backing_dev_info *bdi = m->private; |
27 | long background_thresh; | 27 | unsigned long background_thresh; |
28 | long dirty_thresh; | 28 | unsigned long dirty_thresh; |
29 | long bdi_thresh; | 29 | unsigned long bdi_thresh; |
30 | 30 | ||
31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); |
32 | 32 | ||
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
223 | bdi->max_prop_frac = PROP_FRAC_BASE; | 223 | bdi->max_prop_frac = PROP_FRAC_BASE; |
224 | 224 | ||
225 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 225 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
226 | err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); | 226 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); |
227 | if (err) | 227 | if (err) |
228 | goto err; | 228 | goto err; |
229 | } | 229 | } |
diff --git a/mm/bootmem.c b/mm/bootmem.c index ac5a891f142a..51a0ccf61e0e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
435 | unsigned long fallback = 0; | 435 | unsigned long fallback = 0; |
436 | unsigned long min, max, start, sidx, midx, step; | 436 | unsigned long min, max, start, sidx, midx, step; |
437 | 437 | ||
438 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
439 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
440 | align, goal, limit); | ||
441 | |||
438 | BUG_ON(!size); | 442 | BUG_ON(!size); |
439 | BUG_ON(align & (align - 1)); | 443 | BUG_ON(align & (align - 1)); |
440 | BUG_ON(limit && goal + size > limit); | 444 | BUG_ON(limit && goal + size > limit); |
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
442 | if (!bdata->node_bootmem_map) | 446 | if (!bdata->node_bootmem_map) |
443 | return NULL; | 447 | return NULL; |
444 | 448 | ||
445 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
446 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
447 | align, goal, limit); | ||
448 | |||
449 | min = bdata->node_min_pfn; | 449 | min = bdata->node_min_pfn; |
450 | max = bdata->node_low_pfn; | 450 | max = bdata->node_low_pfn; |
451 | 451 | ||
diff --git a/mm/fadvise.c b/mm/fadvise.c index a1da969bd980..54a0f8040afa 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | 24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could |
25 | * deactivate the pages and clear PG_Referenced. | 25 | * deactivate the pages and clear PG_Referenced. |
26 | */ | 26 | */ |
27 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 27 | SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) |
28 | { | 28 | { |
29 | struct file *file = fget(fd); | 29 | struct file *file = fget(fd); |
30 | struct address_space *mapping; | 30 | struct address_space *mapping; |
@@ -126,12 +126,26 @@ out: | |||
126 | fput(file); | 126 | fput(file); |
127 | return ret; | 127 | return ret; |
128 | } | 128 | } |
129 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
130 | asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) | ||
131 | { | ||
132 | return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); | ||
133 | } | ||
134 | SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); | ||
135 | #endif | ||
129 | 136 | ||
130 | #ifdef __ARCH_WANT_SYS_FADVISE64 | 137 | #ifdef __ARCH_WANT_SYS_FADVISE64 |
131 | 138 | ||
132 | asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) | 139 | SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) |
133 | { | 140 | { |
134 | return sys_fadvise64_64(fd, offset, len, advice); | 141 | return sys_fadvise64_64(fd, offset, len, advice); |
135 | } | 142 | } |
143 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
144 | asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) | ||
145 | { | ||
146 | return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); | ||
147 | } | ||
148 | SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); | ||
149 | #endif | ||
136 | 150 | ||
137 | #endif | 151 | #endif |
diff --git a/mm/filemap.c b/mm/filemap.c index f3e5f8944d17..23acefe51808 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
210 | int ret; | 210 | int ret; |
211 | struct writeback_control wbc = { | 211 | struct writeback_control wbc = { |
212 | .sync_mode = sync_mode, | 212 | .sync_mode = sync_mode, |
213 | .nr_to_write = mapping->nrpages * 2, | 213 | .nr_to_write = LONG_MAX, |
214 | .range_start = start, | 214 | .range_start = start, |
215 | .range_end = end, | 215 | .range_end = end, |
216 | }; | 216 | }; |
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
460 | VM_BUG_ON(!PageLocked(page)); | 460 | VM_BUG_ON(!PageLocked(page)); |
461 | 461 | ||
462 | error = mem_cgroup_cache_charge(page, current->mm, | 462 | error = mem_cgroup_cache_charge(page, current->mm, |
463 | gfp_mask & ~__GFP_HIGHMEM); | 463 | gfp_mask & GFP_RECLAIM_MASK); |
464 | if (error) | 464 | if (error) |
465 | goto out; | 465 | goto out; |
466 | 466 | ||
@@ -741,7 +741,14 @@ repeat: | |||
741 | page = __page_cache_alloc(gfp_mask); | 741 | page = __page_cache_alloc(gfp_mask); |
742 | if (!page) | 742 | if (!page) |
743 | return NULL; | 743 | return NULL; |
744 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); | 744 | /* |
745 | * We want a regular kernel memory (not highmem or DMA etc) | ||
746 | * allocation for the radix tree nodes, but we need to honour | ||
747 | * the context-specific requirements the caller has asked for. | ||
748 | * GFP_RECLAIM_MASK collects those requirements. | ||
749 | */ | ||
750 | err = add_to_page_cache_lru(page, mapping, index, | ||
751 | (gfp_mask & GFP_RECLAIM_MASK)); | ||
745 | if (unlikely(err)) { | 752 | if (unlikely(err)) { |
746 | page_cache_release(page); | 753 | page_cache_release(page); |
747 | page = NULL; | 754 | page = NULL; |
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
950 | return NULL; | 957 | return NULL; |
951 | } | 958 | } |
952 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | 959 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); |
953 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { | 960 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { |
954 | page_cache_release(page); | 961 | page_cache_release(page); |
955 | page = NULL; | 962 | page = NULL; |
956 | } | 963 | } |
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1317 | goto out; /* skip atime */ | 1324 | goto out; /* skip atime */ |
1318 | size = i_size_read(inode); | 1325 | size = i_size_read(inode); |
1319 | if (pos < size) { | 1326 | if (pos < size) { |
1320 | retval = filemap_write_and_wait(mapping); | 1327 | retval = filemap_write_and_wait_range(mapping, pos, |
1328 | pos + iov_length(iov, nr_segs) - 1); | ||
1321 | if (!retval) { | 1329 | if (!retval) { |
1322 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1330 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1323 | iov, pos, nr_segs); | 1331 | iov, pos, nr_segs); |
@@ -1366,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
1366 | return 0; | 1374 | return 0; |
1367 | } | 1375 | } |
1368 | 1376 | ||
1369 | asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | 1377 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) |
1370 | { | 1378 | { |
1371 | ssize_t ret; | 1379 | ssize_t ret; |
1372 | struct file *file; | 1380 | struct file *file; |
@@ -1385,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1385 | } | 1393 | } |
1386 | return ret; | 1394 | return ret; |
1387 | } | 1395 | } |
1396 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
1397 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
1398 | { | ||
1399 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
1400 | } | ||
1401 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
1402 | #endif | ||
1388 | 1403 | ||
1389 | #ifdef CONFIG_MMU | 1404 | #ifdef CONFIG_MMU |
1390 | /** | 1405 | /** |
@@ -1530,7 +1545,6 @@ retry_find: | |||
1530 | /* | 1545 | /* |
1531 | * Found the page and have a reference on it. | 1546 | * Found the page and have a reference on it. |
1532 | */ | 1547 | */ |
1533 | mark_page_accessed(page); | ||
1534 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | 1548 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
1535 | vmf->page = page; | 1549 | vmf->page = page; |
1536 | return ret | VM_FAULT_LOCKED; | 1550 | return ret | VM_FAULT_LOCKED; |
@@ -1766,7 +1780,7 @@ int should_remove_suid(struct dentry *dentry) | |||
1766 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | 1780 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) |
1767 | kill |= ATTR_KILL_SGID; | 1781 | kill |= ATTR_KILL_SGID; |
1768 | 1782 | ||
1769 | if (unlikely(kill && !capable(CAP_FSETID))) | 1783 | if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) |
1770 | return kill; | 1784 | return kill; |
1771 | 1785 | ||
1772 | return 0; | 1786 | return 0; |
@@ -2060,18 +2074,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2060 | if (count != ocount) | 2074 | if (count != ocount) |
2061 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2075 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2062 | 2076 | ||
2063 | /* | ||
2064 | * Unmap all mmappings of the file up-front. | ||
2065 | * | ||
2066 | * This will cause any pte dirty bits to be propagated into the | ||
2067 | * pageframes for the subsequent filemap_write_and_wait(). | ||
2068 | */ | ||
2069 | write_len = iov_length(iov, *nr_segs); | 2077 | write_len = iov_length(iov, *nr_segs); |
2070 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | 2078 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; |
2071 | if (mapping_mapped(mapping)) | ||
2072 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
2073 | 2079 | ||
2074 | written = filemap_write_and_wait(mapping); | 2080 | written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); |
2075 | if (written) | 2081 | if (written) |
2076 | goto out; | 2082 | goto out; |
2077 | 2083 | ||
@@ -2140,19 +2146,24 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
2140 | * Find or create a page at the given pagecache position. Return the locked | 2146 | * Find or create a page at the given pagecache position. Return the locked |
2141 | * page. This function is specifically for buffered writes. | 2147 | * page. This function is specifically for buffered writes. |
2142 | */ | 2148 | */ |
2143 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) | 2149 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2150 | pgoff_t index, unsigned flags) | ||
2144 | { | 2151 | { |
2145 | int status; | 2152 | int status; |
2146 | struct page *page; | 2153 | struct page *page; |
2154 | gfp_t gfp_notmask = 0; | ||
2155 | if (flags & AOP_FLAG_NOFS) | ||
2156 | gfp_notmask = __GFP_FS; | ||
2147 | repeat: | 2157 | repeat: |
2148 | page = find_lock_page(mapping, index); | 2158 | page = find_lock_page(mapping, index); |
2149 | if (likely(page)) | 2159 | if (likely(page)) |
2150 | return page; | 2160 | return page; |
2151 | 2161 | ||
2152 | page = page_cache_alloc(mapping); | 2162 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); |
2153 | if (!page) | 2163 | if (!page) |
2154 | return NULL; | 2164 | return NULL; |
2155 | status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 2165 | status = add_to_page_cache_lru(page, mapping, index, |
2166 | GFP_KERNEL & ~gfp_notmask); | ||
2156 | if (unlikely(status)) { | 2167 | if (unlikely(status)) { |
2157 | page_cache_release(page); | 2168 | page_cache_release(page); |
2158 | if (status == -EEXIST) | 2169 | if (status == -EEXIST) |
@@ -2161,7 +2172,7 @@ repeat: | |||
2161 | } | 2172 | } |
2162 | return page; | 2173 | return page; |
2163 | } | 2174 | } |
2164 | EXPORT_SYMBOL(__grab_cache_page); | 2175 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
2165 | 2176 | ||
2166 | static ssize_t generic_perform_write(struct file *file, | 2177 | static ssize_t generic_perform_write(struct file *file, |
2167 | struct iov_iter *i, loff_t pos) | 2178 | struct iov_iter *i, loff_t pos) |
@@ -2286,7 +2297,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2286 | * the file data here, to try to honour O_DIRECT expectations. | 2297 | * the file data here, to try to honour O_DIRECT expectations. |
2287 | */ | 2298 | */ |
2288 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2299 | if (unlikely(file->f_flags & O_DIRECT) && written) |
2289 | status = filemap_write_and_wait(mapping); | 2300 | status = filemap_write_and_wait_range(mapping, |
2301 | pos, pos + written - 1); | ||
2290 | 2302 | ||
2291 | return written ? written : status; | 2303 | return written ? written : status; |
2292 | } | 2304 | } |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167dfb2f2d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -193,7 +193,7 @@ retry: | |||
193 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page, vma); | 196 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, file_rss); |
198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/fremap.c b/mm/fremap.c index 7d12ca70ef7b..736ba7f3306a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
37 | if (page) { | 37 | if (page) { |
38 | if (pte_dirty(pte)) | 38 | if (pte_dirty(pte)) |
39 | set_page_dirty(page); | 39 | set_page_dirty(page); |
40 | page_remove_rmap(page, vma); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, file_rss); |
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
120 | * and the vma's default protection is used. Arbitrary protections | 120 | * and the vma's default protection is used. Arbitrary protections |
121 | * might be implemented in the future. | 121 | * might be implemented in the future. |
122 | */ | 122 | */ |
123 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | 123 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, |
124 | unsigned long prot, unsigned long pgoff, unsigned long flags) | 124 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) |
125 | { | 125 | { |
126 | struct mm_struct *mm = current->mm; | 126 | struct mm_struct *mm = current->mm; |
127 | struct address_space *mapping; | 127 | struct address_space *mapping; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53dcb89..618e98304080 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Return the size of the pages allocated when backing a VMA. In the majority | ||
224 | * of cases this will be the same size as used by the page table entries. | ||
225 | */ | ||
226 | unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | ||
227 | { | ||
228 | struct hstate *hstate; | ||
229 | |||
230 | if (!is_vm_hugetlb_page(vma)) | ||
231 | return PAGE_SIZE; | ||
232 | |||
233 | hstate = hstate_vma(vma); | ||
234 | |||
235 | return 1UL << (hstate->order + PAGE_SHIFT); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Return the page size being used by the MMU to back a VMA. In the majority | ||
240 | * of cases, the page size used by the kernel matches the MMU size. On | ||
241 | * architectures where it differs, an architecture-specific version of this | ||
242 | * function is required. | ||
243 | */ | ||
244 | #ifndef vma_mmu_pagesize | ||
245 | unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) | ||
246 | { | ||
247 | return vma_kernel_pagesize(vma); | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | /* | ||
223 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | 252 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom |
224 | * bits of the reservation map pointer, which are always clear due to | 253 | * bits of the reservation map pointer, which are always clear due to |
225 | * alignment. | 254 | * alignment. |
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page, | |||
371 | { | 400 | { |
372 | int i; | 401 | int i; |
373 | 402 | ||
374 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) | 403 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { |
375 | return clear_gigantic_page(page, addr, sz); | 404 | clear_gigantic_page(page, addr, sz); |
405 | return; | ||
406 | } | ||
376 | 407 | ||
377 | might_sleep(); | 408 | might_sleep(); |
378 | for (i = 0; i < sz/PAGE_SIZE; i++) { | 409 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
404 | int i; | 435 | int i; |
405 | struct hstate *h = hstate_vma(vma); | 436 | struct hstate *h = hstate_vma(vma); |
406 | 437 | ||
407 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) | 438 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
408 | return copy_gigantic_page(dst, src, addr, vma); | 439 | copy_gigantic_page(dst, src, addr, vma); |
440 | return; | ||
441 | } | ||
409 | 442 | ||
410 | might_sleep(); | 443 | might_sleep(); |
411 | for (i = 0; i < pages_per_huge_page(h); i++) { | 444 | for (i = 0; i < pages_per_huge_page(h); i++) { |
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
972 | return page; | 1005 | return page; |
973 | } | 1006 | } |
974 | 1007 | ||
975 | __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | 1008 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
976 | { | 1009 | { |
977 | struct huge_bootmem_page *m; | 1010 | struct huge_bootmem_page *m; |
978 | int nr_nodes = nodes_weight(node_online_map); | 1011 | int nr_nodes = nodes_weight(node_online_map); |
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | |||
991 | * puts them into the mem_map). | 1024 | * puts them into the mem_map). |
992 | */ | 1025 | */ |
993 | m = addr; | 1026 | m = addr; |
994 | if (m) | 1027 | goto found; |
995 | goto found; | ||
996 | } | 1028 | } |
997 | hstate_next_node(h); | 1029 | hstate_next_node(h); |
998 | nr_nodes--; | 1030 | nr_nodes--; |
diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb68..478223b73a2a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); | |||
49 | /* | 49 | /* |
50 | * in mm/page_alloc.c | 50 | * in mm/page_alloc.c |
51 | */ | 51 | */ |
52 | extern unsigned long highest_memmap_pfn; | ||
52 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
53 | 54 | ||
54 | /* | 55 | /* |
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
275 | #define GUP_FLAGS_WRITE 0x1 | 276 | #define GUP_FLAGS_WRITE 0x1 |
276 | #define GUP_FLAGS_FORCE 0x2 | 277 | #define GUP_FLAGS_FORCE 0x2 |
277 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 | 278 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 |
279 | #define GUP_FLAGS_IGNORE_SIGKILL 0x8 | ||
278 | 280 | ||
279 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 281 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
280 | unsigned long start, int len, int flags, | 282 | unsigned long start, int len, int flags, |
diff --git a/mm/madvise.c b/mm/madvise.c index f9349c18a1b5..b9ce574827c8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
281 | * -EBADF - map exists, but area maps something that isn't a file. | 281 | * -EBADF - map exists, but area maps something that isn't a file. |
282 | * -EAGAIN - a kernel resource was temporarily unavailable. | 282 | * -EAGAIN - a kernel resource was temporarily unavailable. |
283 | */ | 283 | */ |
284 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | 284 | SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) |
285 | { | 285 | { |
286 | unsigned long end, tmp; | 286 | unsigned long end, tmp; |
287 | struct vm_area_struct * vma, *prev; | 287 | struct vm_area_struct * vma, *prev; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 866dcc7eeb0c..8e4be9cb2a6a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -21,11 +21,13 @@ | |||
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/pagemap.h> | ||
24 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
25 | #include <linux/page-flags.h> | 26 | #include <linux/page-flags.h> |
26 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
27 | #include <linux/bit_spinlock.h> | 28 | #include <linux/bit_spinlock.h> |
28 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
30 | #include <linux/mutex.h> | ||
29 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
30 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
31 | #include <linux/spinlock.h> | 33 | #include <linux/spinlock.h> |
@@ -34,12 +36,23 @@ | |||
34 | #include <linux/vmalloc.h> | 36 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 37 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | 38 | #include <linux/page_cgroup.h> |
39 | #include "internal.h" | ||
37 | 40 | ||
38 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
39 | 42 | ||
40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 43 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 44 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 45 | ||
46 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
47 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | ||
48 | int do_swap_account __read_mostly; | ||
49 | static int really_do_swap_account __initdata = 1; /* remembers the boot option */ | ||
50 | #else | ||
51 | #define do_swap_account (0) | ||
52 | #endif | ||
53 | |||
54 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ | ||
55 | |||
43 | /* | 56 | /* |
44 | * Statistics for memory cgroup. | 57 | * Statistics for memory cgroup. |
45 | */ | 58 | */ |
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu { | |||
60 | } ____cacheline_aligned_in_smp; | 73 | } ____cacheline_aligned_in_smp; |
61 | 74 | ||
62 | struct mem_cgroup_stat { | 75 | struct mem_cgroup_stat { |
63 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | 76 | struct mem_cgroup_stat_cpu cpustat[0]; |
64 | }; | 77 | }; |
65 | 78 | ||
66 | /* | 79 | /* |
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone { | |||
89 | /* | 102 | /* |
90 | * spin_lock to protect the per cgroup LRU | 103 | * spin_lock to protect the per cgroup LRU |
91 | */ | 104 | */ |
92 | spinlock_t lru_lock; | ||
93 | struct list_head lists[NR_LRU_LISTS]; | 105 | struct list_head lists[NR_LRU_LISTS]; |
94 | unsigned long count[NR_LRU_LISTS]; | 106 | unsigned long count[NR_LRU_LISTS]; |
107 | |||
108 | struct zone_reclaim_stat reclaim_stat; | ||
95 | }; | 109 | }; |
96 | /* Macro for accessing counter */ | 110 | /* Macro for accessing counter */ |
97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 111 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -122,44 +136,74 @@ struct mem_cgroup { | |||
122 | */ | 136 | */ |
123 | struct res_counter res; | 137 | struct res_counter res; |
124 | /* | 138 | /* |
139 | * the counter to account for mem+swap usage. | ||
140 | */ | ||
141 | struct res_counter memsw; | ||
142 | /* | ||
125 | * Per cgroup active and inactive list, similar to the | 143 | * Per cgroup active and inactive list, similar to the |
126 | * per zone LRU lists. | 144 | * per zone LRU lists. |
127 | */ | 145 | */ |
128 | struct mem_cgroup_lru_info info; | 146 | struct mem_cgroup_lru_info info; |
129 | 147 | ||
148 | /* | ||
149 | protect against reclaim related member. | ||
150 | */ | ||
151 | spinlock_t reclaim_param_lock; | ||
152 | |||
130 | int prev_priority; /* for recording reclaim priority */ | 153 | int prev_priority; /* for recording reclaim priority */ |
154 | |||
155 | /* | ||
156 | * While reclaiming in a hierarchy, we cache the last child we | ||
157 | * reclaimed from. Protected by hierarchy_mutex | ||
158 | */ | ||
159 | struct mem_cgroup *last_scanned_child; | ||
131 | /* | 160 | /* |
132 | * statistics. | 161 | * Should the accounting and control be hierarchical, per subtree? |
162 | */ | ||
163 | bool use_hierarchy; | ||
164 | unsigned long last_oom_jiffies; | ||
165 | atomic_t refcnt; | ||
166 | |||
167 | unsigned int swappiness; | ||
168 | |||
169 | /* | ||
170 | * statistics. This must be placed at the end of memcg. | ||
133 | */ | 171 | */ |
134 | struct mem_cgroup_stat stat; | 172 | struct mem_cgroup_stat stat; |
135 | }; | 173 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | ||
137 | 174 | ||
138 | enum charge_type { | 175 | enum charge_type { |
139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 176 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 177 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 178 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 179 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
180 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | ||
143 | NR_CHARGE_TYPE, | 181 | NR_CHARGE_TYPE, |
144 | }; | 182 | }; |
145 | 183 | ||
146 | /* only for here (for easy reading.) */ | 184 | /* only for here (for easy reading.) */ |
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | 185 | #define PCGF_CACHE (1UL << PCG_CACHE) |
148 | #define PCGF_USED (1UL << PCG_USED) | 186 | #define PCGF_USED (1UL << PCG_USED) |
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | 187 | #define PCGF_LOCK (1UL << PCG_LOCK) |
151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
152 | static const unsigned long | 188 | static const unsigned long |
153 | pcg_default_flags[NR_CHARGE_TYPE] = { | 189 | pcg_default_flags[NR_CHARGE_TYPE] = { |
154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | 190 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | 191 | PCGF_USED | PCGF_LOCK, /* Anon */ |
156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | 192 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
157 | 0, /* FORCE */ | 193 | 0, /* FORCE */ |
158 | }; | 194 | }; |
159 | 195 | ||
160 | /* | 196 | /* for encoding cft->private value on file */ |
161 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 197 | #define _MEM (0) |
162 | */ | 198 | #define _MEMSWAP (1) |
199 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
200 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | ||
201 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
202 | |||
203 | static void mem_cgroup_get(struct mem_cgroup *mem); | ||
204 | static void mem_cgroup_put(struct mem_cgroup *mem); | ||
205 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | ||
206 | |||
163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 207 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
164 | struct page_cgroup *pc, | 208 | struct page_cgroup *pc, |
165 | bool charge) | 209 | bool charge) |
@@ -167,10 +211,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
167 | int val = (charge)? 1 : -1; | 211 | int val = (charge)? 1 : -1; |
168 | struct mem_cgroup_stat *stat = &mem->stat; | 212 | struct mem_cgroup_stat *stat = &mem->stat; |
169 | struct mem_cgroup_stat_cpu *cpustat; | 213 | struct mem_cgroup_stat_cpu *cpustat; |
214 | int cpu = get_cpu(); | ||
170 | 215 | ||
171 | VM_BUG_ON(!irqs_disabled()); | 216 | cpustat = &stat->cpustat[cpu]; |
172 | |||
173 | cpustat = &stat->cpustat[smp_processor_id()]; | ||
174 | if (PageCgroupCache(pc)) | 217 | if (PageCgroupCache(pc)) |
175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 218 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
176 | else | 219 | else |
@@ -182,6 +225,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
182 | else | 225 | else |
183 | __mem_cgroup_stat_add_safe(cpustat, | 226 | __mem_cgroup_stat_add_safe(cpustat, |
184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 227 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
228 | put_cpu(); | ||
185 | } | 229 | } |
186 | 230 | ||
187 | static struct mem_cgroup_per_zone * | 231 | static struct mem_cgroup_per_zone * |
@@ -197,6 +241,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) | |||
197 | int nid = page_cgroup_nid(pc); | 241 | int nid = page_cgroup_nid(pc); |
198 | int zid = page_cgroup_zid(pc); | 242 | int zid = page_cgroup_zid(pc); |
199 | 243 | ||
244 | if (!mem) | ||
245 | return NULL; | ||
246 | |||
200 | return mem_cgroup_zoneinfo(mem, nid, zid); | 247 | return mem_cgroup_zoneinfo(mem, nid, zid); |
201 | } | 248 | } |
202 | 249 | ||
@@ -236,118 +283,169 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
236 | struct mem_cgroup, css); | 283 | struct mem_cgroup, css); |
237 | } | 284 | } |
238 | 285 | ||
239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 286 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
240 | struct page_cgroup *pc) | ||
241 | { | 287 | { |
242 | int lru = LRU_BASE; | 288 | struct mem_cgroup *mem = NULL; |
243 | 289 | /* | |
244 | if (PageCgroupUnevictable(pc)) | 290 | * Because we have no locks, mm->owner's may be being moved to other |
245 | lru = LRU_UNEVICTABLE; | 291 | * cgroup. We use css_tryget() here even if this looks |
246 | else { | 292 | * pessimistic (rather than adding locks here). |
247 | if (PageCgroupActive(pc)) | 293 | */ |
248 | lru += LRU_ACTIVE; | 294 | rcu_read_lock(); |
249 | if (PageCgroupFile(pc)) | 295 | do { |
250 | lru += LRU_FILE; | 296 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
251 | } | 297 | if (unlikely(!mem)) |
252 | 298 | break; | |
253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 299 | } while (!css_tryget(&mem->css)); |
254 | 300 | rcu_read_unlock(); | |
255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | 301 | return mem; |
256 | list_del(&pc->lru); | ||
257 | } | 302 | } |
258 | 303 | ||
259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 304 | static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) |
260 | struct page_cgroup *pc) | ||
261 | { | 305 | { |
262 | int lru = LRU_BASE; | 306 | if (!mem) |
307 | return true; | ||
308 | return css_is_removed(&mem->css); | ||
309 | } | ||
263 | 310 | ||
264 | if (PageCgroupUnevictable(pc)) | 311 | /* |
265 | lru = LRU_UNEVICTABLE; | 312 | * Following LRU functions are allowed to be used without PCG_LOCK. |
266 | else { | 313 | * Operations are called by routine of global LRU independently from memcg. |
267 | if (PageCgroupActive(pc)) | 314 | * What we have to take care of here is validness of pc->mem_cgroup. |
268 | lru += LRU_ACTIVE; | 315 | * |
269 | if (PageCgroupFile(pc)) | 316 | * Changes to pc->mem_cgroup happens when |
270 | lru += LRU_FILE; | 317 | * 1. charge |
271 | } | 318 | * 2. moving account |
319 | * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. | ||
320 | * It is added to LRU before charge. | ||
321 | * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. | ||
322 | * When moving account, the page is not on LRU. It's isolated. | ||
323 | */ | ||
272 | 324 | ||
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 325 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
274 | list_add(&pc->lru, &mz->lists[lru]); | 326 | { |
327 | struct page_cgroup *pc; | ||
328 | struct mem_cgroup *mem; | ||
329 | struct mem_cgroup_per_zone *mz; | ||
275 | 330 | ||
276 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 331 | if (mem_cgroup_disabled()) |
332 | return; | ||
333 | pc = lookup_page_cgroup(page); | ||
334 | /* can happen while we handle swapcache. */ | ||
335 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | ||
336 | return; | ||
337 | /* | ||
338 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | ||
339 | * removed from global LRU. | ||
340 | */ | ||
341 | mz = page_cgroup_zoneinfo(pc); | ||
342 | mem = pc->mem_cgroup; | ||
343 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | ||
344 | list_del_init(&pc->lru); | ||
345 | return; | ||
277 | } | 346 | } |
278 | 347 | ||
279 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | 348 | void mem_cgroup_del_lru(struct page *page) |
280 | { | 349 | { |
281 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 350 | mem_cgroup_del_lru_list(page, page_lru(page)); |
282 | int active = PageCgroupActive(pc); | 351 | } |
283 | int file = PageCgroupFile(pc); | ||
284 | int unevictable = PageCgroupUnevictable(pc); | ||
285 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | ||
286 | (LRU_FILE * !!file + !!active); | ||
287 | 352 | ||
288 | if (lru == from) | 353 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) |
354 | { | ||
355 | struct mem_cgroup_per_zone *mz; | ||
356 | struct page_cgroup *pc; | ||
357 | |||
358 | if (mem_cgroup_disabled()) | ||
289 | return; | 359 | return; |
290 | 360 | ||
291 | MEM_CGROUP_ZSTAT(mz, from) -= 1; | 361 | pc = lookup_page_cgroup(page); |
292 | /* | 362 | /* |
293 | * However this is done under mz->lru_lock, another flags, which | 363 | * Used bit is set without atomic ops but after smp_wmb(). |
294 | * are not related to LRU, will be modified from out-of-lock. | 364 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
295 | * We have to use atomic set/clear flags. | ||
296 | */ | 365 | */ |
297 | if (is_unevictable_lru(lru)) { | 366 | smp_rmb(); |
298 | ClearPageCgroupActive(pc); | 367 | /* unused page is not rotated. */ |
299 | SetPageCgroupUnevictable(pc); | 368 | if (!PageCgroupUsed(pc)) |
300 | } else { | 369 | return; |
301 | if (is_active_lru(lru)) | 370 | mz = page_cgroup_zoneinfo(pc); |
302 | SetPageCgroupActive(pc); | ||
303 | else | ||
304 | ClearPageCgroupActive(pc); | ||
305 | ClearPageCgroupUnevictable(pc); | ||
306 | } | ||
307 | |||
308 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
309 | list_move(&pc->lru, &mz->lists[lru]); | 371 | list_move(&pc->lru, &mz->lists[lru]); |
310 | } | 372 | } |
311 | 373 | ||
312 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 374 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) |
313 | { | 375 | { |
314 | int ret; | 376 | struct page_cgroup *pc; |
377 | struct mem_cgroup_per_zone *mz; | ||
315 | 378 | ||
316 | task_lock(task); | 379 | if (mem_cgroup_disabled()) |
317 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 380 | return; |
318 | task_unlock(task); | 381 | pc = lookup_page_cgroup(page); |
319 | return ret; | 382 | /* |
383 | * Used bit is set without atomic ops but after smp_wmb(). | ||
384 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | ||
385 | */ | ||
386 | smp_rmb(); | ||
387 | if (!PageCgroupUsed(pc)) | ||
388 | return; | ||
389 | |||
390 | mz = page_cgroup_zoneinfo(pc); | ||
391 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
392 | list_add(&pc->lru, &mz->lists[lru]); | ||
320 | } | 393 | } |
321 | 394 | ||
322 | /* | 395 | /* |
323 | * This routine assumes that the appropriate zone's lru lock is already held | 396 | * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to |
397 | * lru because the page may be reused after it's fully uncharged (because of | ||
398 | * SwapCache behavior). To handle that, unlink page_cgroup from LRU when charging | ||
399 | * it again. This function is only used to charge SwapCache. It's done under | ||
400 | * lock_page and expected that zone->lru_lock is never held. | ||
324 | */ | 401 | */ |
325 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | 402 | static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) |
326 | { | 403 | { |
327 | struct page_cgroup *pc; | ||
328 | struct mem_cgroup_per_zone *mz; | ||
329 | unsigned long flags; | 404 | unsigned long flags; |
405 | struct zone *zone = page_zone(page); | ||
406 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
330 | 407 | ||
331 | if (mem_cgroup_subsys.disabled) | 408 | spin_lock_irqsave(&zone->lru_lock, flags); |
332 | return; | ||
333 | |||
334 | /* | 409 | /* |
335 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 410 | * Forget old LRU when this page_cgroup is *not* used. This Used bit |
336 | * because other holders of lock_page_cgroup can be interrupted | 411 | * is guarded by lock_page() because the page is SwapCache. |
337 | * with an attempt to rotate_reclaimable_page. But we cannot | ||
338 | * safely get to page_cgroup without it, so just try_lock it: | ||
339 | * mem_cgroup_isolate_pages allows for page left on wrong list. | ||
340 | */ | 412 | */ |
341 | pc = lookup_page_cgroup(page); | 413 | if (!PageCgroupUsed(pc)) |
342 | if (!trylock_page_cgroup(pc)) | 414 | mem_cgroup_del_lru_list(page, page_lru(page)); |
415 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
416 | } | ||
417 | |||
418 | static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | ||
419 | { | ||
420 | unsigned long flags; | ||
421 | struct zone *zone = page_zone(page); | ||
422 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
423 | |||
424 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
425 | /* link when the page is linked to LRU but page_cgroup isn't */ | ||
426 | if (PageLRU(page) && list_empty(&pc->lru)) | ||
427 | mem_cgroup_add_lru_list(page, page_lru(page)); | ||
428 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
429 | } | ||
430 | |||
431 | |||
432 | void mem_cgroup_move_lists(struct page *page, | ||
433 | enum lru_list from, enum lru_list to) | ||
434 | { | ||
435 | if (mem_cgroup_disabled()) | ||
343 | return; | 436 | return; |
344 | if (pc && PageCgroupUsed(pc)) { | 437 | mem_cgroup_del_lru_list(page, from); |
345 | mz = page_cgroup_zoneinfo(pc); | 438 | mem_cgroup_add_lru_list(page, to); |
346 | spin_lock_irqsave(&mz->lru_lock, flags); | 439 | } |
347 | __mem_cgroup_move_lists(pc, lru); | 440 | |
348 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 441 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
349 | } | 442 | { |
350 | unlock_page_cgroup(pc); | 443 | int ret; |
444 | |||
445 | task_lock(task); | ||
446 | ret = task->mm && mm_match_cgroup(task->mm, mem); | ||
447 | task_unlock(task); | ||
448 | return ret; | ||
351 | } | 449 | } |
352 | 450 | ||
353 | /* | 451 | /* |
@@ -372,39 +470,116 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | |||
372 | */ | 470 | */ |
373 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 471 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
374 | { | 472 | { |
375 | return mem->prev_priority; | 473 | int prev_priority; |
474 | |||
475 | spin_lock(&mem->reclaim_param_lock); | ||
476 | prev_priority = mem->prev_priority; | ||
477 | spin_unlock(&mem->reclaim_param_lock); | ||
478 | |||
479 | return prev_priority; | ||
376 | } | 480 | } |
377 | 481 | ||
378 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | 482 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) |
379 | { | 483 | { |
484 | spin_lock(&mem->reclaim_param_lock); | ||
380 | if (priority < mem->prev_priority) | 485 | if (priority < mem->prev_priority) |
381 | mem->prev_priority = priority; | 486 | mem->prev_priority = priority; |
487 | spin_unlock(&mem->reclaim_param_lock); | ||
382 | } | 488 | } |
383 | 489 | ||
384 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | 490 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) |
385 | { | 491 | { |
492 | spin_lock(&mem->reclaim_param_lock); | ||
386 | mem->prev_priority = priority; | 493 | mem->prev_priority = priority; |
494 | spin_unlock(&mem->reclaim_param_lock); | ||
387 | } | 495 | } |
388 | 496 | ||
389 | /* | 497 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) |
390 | * Calculate # of pages to be scanned in this priority/zone. | 498 | { |
391 | * See also vmscan.c | 499 | unsigned long active; |
392 | * | 500 | unsigned long inactive; |
393 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | 501 | unsigned long gb; |
394 | * (see include/linux/mmzone.h) | 502 | unsigned long inactive_ratio; |
395 | */ | 503 | |
504 | inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); | ||
505 | active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); | ||
506 | |||
507 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | ||
508 | if (gb) | ||
509 | inactive_ratio = int_sqrt(10 * gb); | ||
510 | else | ||
511 | inactive_ratio = 1; | ||
512 | |||
513 | if (present_pages) { | ||
514 | present_pages[0] = inactive; | ||
515 | present_pages[1] = active; | ||
516 | } | ||
517 | |||
518 | return inactive_ratio; | ||
519 | } | ||
396 | 520 | ||
397 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, | 521 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) |
398 | int priority, enum lru_list lru) | 522 | { |
523 | unsigned long active; | ||
524 | unsigned long inactive; | ||
525 | unsigned long present_pages[2]; | ||
526 | unsigned long inactive_ratio; | ||
527 | |||
528 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); | ||
529 | |||
530 | inactive = present_pages[0]; | ||
531 | active = present_pages[1]; | ||
532 | |||
533 | if (inactive * inactive_ratio < active) | ||
534 | return 1; | ||
535 | |||
536 | return 0; | ||
537 | } | ||
538 | |||
539 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | ||
540 | struct zone *zone, | ||
541 | enum lru_list lru) | ||
399 | { | 542 | { |
400 | long nr_pages; | ||
401 | int nid = zone->zone_pgdat->node_id; | 543 | int nid = zone->zone_pgdat->node_id; |
402 | int zid = zone_idx(zone); | 544 | int zid = zone_idx(zone); |
403 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 545 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
404 | 546 | ||
405 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); | 547 | return MEM_CGROUP_ZSTAT(mz, lru); |
548 | } | ||
549 | |||
550 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | ||
551 | struct zone *zone) | ||
552 | { | ||
553 | int nid = zone->zone_pgdat->node_id; | ||
554 | int zid = zone_idx(zone); | ||
555 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
406 | 556 | ||
407 | return (nr_pages >> priority); | 557 | return &mz->reclaim_stat; |
558 | } | ||
559 | |||
560 | struct zone_reclaim_stat * | ||
561 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | ||
562 | { | ||
563 | struct page_cgroup *pc; | ||
564 | struct mem_cgroup_per_zone *mz; | ||
565 | |||
566 | if (mem_cgroup_disabled()) | ||
567 | return NULL; | ||
568 | |||
569 | pc = lookup_page_cgroup(page); | ||
570 | /* | ||
571 | * Used bit is set without atomic ops but after smp_wmb(). | ||
572 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | ||
573 | */ | ||
574 | smp_rmb(); | ||
575 | if (!PageCgroupUsed(pc)) | ||
576 | return NULL; | ||
577 | |||
578 | mz = page_cgroup_zoneinfo(pc); | ||
579 | if (!mz) | ||
580 | return NULL; | ||
581 | |||
582 | return &mz->reclaim_stat; | ||
408 | } | 583 | } |
409 | 584 | ||
410 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 585 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
@@ -429,94 +604,279 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
429 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 604 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
430 | src = &mz->lists[lru]; | 605 | src = &mz->lists[lru]; |
431 | 606 | ||
432 | spin_lock(&mz->lru_lock); | ||
433 | scan = 0; | 607 | scan = 0; |
434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 608 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
435 | if (scan >= nr_to_scan) | 609 | if (scan >= nr_to_scan) |
436 | break; | 610 | break; |
611 | |||
612 | page = pc->page; | ||
437 | if (unlikely(!PageCgroupUsed(pc))) | 613 | if (unlikely(!PageCgroupUsed(pc))) |
438 | continue; | 614 | continue; |
439 | page = pc->page; | ||
440 | |||
441 | if (unlikely(!PageLRU(page))) | 615 | if (unlikely(!PageLRU(page))) |
442 | continue; | 616 | continue; |
443 | 617 | ||
444 | /* | ||
445 | * TODO: play better with lumpy reclaim, grabbing anything. | ||
446 | */ | ||
447 | if (PageUnevictable(page) || | ||
448 | (PageActive(page) && !active) || | ||
449 | (!PageActive(page) && active)) { | ||
450 | __mem_cgroup_move_lists(pc, page_lru(page)); | ||
451 | continue; | ||
452 | } | ||
453 | |||
454 | scan++; | 618 | scan++; |
455 | list_move(&pc->lru, &pc_list); | ||
456 | |||
457 | if (__isolate_lru_page(page, mode, file) == 0) { | 619 | if (__isolate_lru_page(page, mode, file) == 0) { |
458 | list_move(&page->lru, dst); | 620 | list_move(&page->lru, dst); |
459 | nr_taken++; | 621 | nr_taken++; |
460 | } | 622 | } |
461 | } | 623 | } |
462 | 624 | ||
463 | list_splice(&pc_list, src); | ||
464 | spin_unlock(&mz->lru_lock); | ||
465 | |||
466 | *scanned = scan; | 625 | *scanned = scan; |
467 | return nr_taken; | 626 | return nr_taken; |
468 | } | 627 | } |
469 | 628 | ||
629 | #define mem_cgroup_from_res_counter(counter, member) \ | ||
630 | container_of(counter, struct mem_cgroup, member) | ||
631 | |||
470 | /* | 632 | /* |
471 | * Charge the memory controller for page usage. | 633 | * This routine finds the DFS walk successor. This routine should be |
472 | * Return | 634 | * called with hierarchy_mutex held |
473 | * 0 if the charge was successful | ||
474 | * < 0 if the cgroup is over its limit | ||
475 | */ | 635 | */ |
476 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 636 | static struct mem_cgroup * |
477 | gfp_t gfp_mask, enum charge_type ctype, | 637 | __mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) |
478 | struct mem_cgroup *memcg) | ||
479 | { | 638 | { |
639 | struct cgroup *cgroup, *curr_cgroup, *root_cgroup; | ||
640 | |||
641 | curr_cgroup = curr->css.cgroup; | ||
642 | root_cgroup = root_mem->css.cgroup; | ||
643 | |||
644 | if (!list_empty(&curr_cgroup->children)) { | ||
645 | /* | ||
646 | * Walk down to children | ||
647 | */ | ||
648 | cgroup = list_entry(curr_cgroup->children.next, | ||
649 | struct cgroup, sibling); | ||
650 | curr = mem_cgroup_from_cont(cgroup); | ||
651 | goto done; | ||
652 | } | ||
653 | |||
654 | visit_parent: | ||
655 | if (curr_cgroup == root_cgroup) { | ||
656 | /* caller handles NULL case */ | ||
657 | curr = NULL; | ||
658 | goto done; | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Goto next sibling | ||
663 | */ | ||
664 | if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { | ||
665 | cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, | ||
666 | sibling); | ||
667 | curr = mem_cgroup_from_cont(cgroup); | ||
668 | goto done; | ||
669 | } | ||
670 | |||
671 | /* | ||
672 | * Go up to next parent and next parent's sibling if need be | ||
673 | */ | ||
674 | curr_cgroup = curr_cgroup->parent; | ||
675 | goto visit_parent; | ||
676 | |||
677 | done: | ||
678 | return curr; | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * Visit the first child (need not be the first child as per the ordering | ||
683 | * of the cgroup list, since we track last_scanned_child) of @mem and use | ||
684 | * that to reclaim free pages from. | ||
685 | */ | ||
686 | static struct mem_cgroup * | ||
687 | mem_cgroup_get_next_node(struct mem_cgroup *root_mem) | ||
688 | { | ||
689 | struct cgroup *cgroup; | ||
690 | struct mem_cgroup *orig, *next; | ||
691 | bool obsolete; | ||
692 | |||
693 | /* | ||
694 | * Scan all children under the mem_cgroup mem | ||
695 | */ | ||
696 | mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); | ||
697 | |||
698 | orig = root_mem->last_scanned_child; | ||
699 | obsolete = mem_cgroup_is_obsolete(orig); | ||
700 | |||
701 | if (list_empty(&root_mem->css.cgroup->children)) { | ||
702 | /* | ||
703 | * root_mem might have children before and last_scanned_child | ||
704 | * may point to one of them. We put it later. | ||
705 | */ | ||
706 | if (orig) | ||
707 | VM_BUG_ON(!obsolete); | ||
708 | next = NULL; | ||
709 | goto done; | ||
710 | } | ||
711 | |||
712 | if (!orig || obsolete) { | ||
713 | cgroup = list_first_entry(&root_mem->css.cgroup->children, | ||
714 | struct cgroup, sibling); | ||
715 | next = mem_cgroup_from_cont(cgroup); | ||
716 | } else | ||
717 | next = __mem_cgroup_get_next_node(orig, root_mem); | ||
718 | |||
719 | done: | ||
720 | if (next) | ||
721 | mem_cgroup_get(next); | ||
722 | root_mem->last_scanned_child = next; | ||
723 | if (orig) | ||
724 | mem_cgroup_put(orig); | ||
725 | mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); | ||
726 | return (next) ? next : root_mem; | ||
727 | } | ||
728 | |||
729 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | ||
730 | { | ||
731 | if (do_swap_account) { | ||
732 | if (res_counter_check_under_limit(&mem->res) && | ||
733 | res_counter_check_under_limit(&mem->memsw)) | ||
734 | return true; | ||
735 | } else | ||
736 | if (res_counter_check_under_limit(&mem->res)) | ||
737 | return true; | ||
738 | return false; | ||
739 | } | ||
740 | |||
741 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | ||
742 | { | ||
743 | struct cgroup *cgrp = memcg->css.cgroup; | ||
744 | unsigned int swappiness; | ||
745 | |||
746 | /* root ? */ | ||
747 | if (cgrp->parent == NULL) | ||
748 | return vm_swappiness; | ||
749 | |||
750 | spin_lock(&memcg->reclaim_param_lock); | ||
751 | swappiness = memcg->swappiness; | ||
752 | spin_unlock(&memcg->reclaim_param_lock); | ||
753 | |||
754 | return swappiness; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Dance down the hierarchy if needed to reclaim memory. We remember the | ||
759 | * last child we reclaimed from, so that we don't end up penalizing | ||
760 | * one child extensively based on its position in the children list. | ||
761 | * | ||
762 | * root_mem is the original ancestor that we've been reclaiming from. | ||
763 | */ | ||
764 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | ||
765 | gfp_t gfp_mask, bool noswap) | ||
766 | { | ||
767 | struct mem_cgroup *next_mem; | ||
768 | int ret = 0; | ||
769 | |||
770 | /* | ||
771 | * Reclaim unconditionally and don't check for return value. | ||
772 | * We need to reclaim in the current group and down the tree. | ||
773 | * One might think about checking for children before reclaiming, | ||
774 | * but there might be left over accounting, even after children | ||
775 | * have left. | ||
776 | */ | ||
777 | ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, | ||
778 | get_swappiness(root_mem)); | ||
779 | if (mem_cgroup_check_under_limit(root_mem)) | ||
780 | return 1; /* indicate reclaim has succeeded */ | ||
781 | if (!root_mem->use_hierarchy) | ||
782 | return ret; | ||
783 | |||
784 | next_mem = mem_cgroup_get_next_node(root_mem); | ||
785 | |||
786 | while (next_mem != root_mem) { | ||
787 | if (mem_cgroup_is_obsolete(next_mem)) { | ||
788 | next_mem = mem_cgroup_get_next_node(root_mem); | ||
789 | continue; | ||
790 | } | ||
791 | ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, | ||
792 | get_swappiness(next_mem)); | ||
793 | if (mem_cgroup_check_under_limit(root_mem)) | ||
794 | return 1; /* indicate reclaim has succeeded */ | ||
795 | next_mem = mem_cgroup_get_next_node(root_mem); | ||
796 | } | ||
797 | return ret; | ||
798 | } | ||
799 | |||
800 | bool mem_cgroup_oom_called(struct task_struct *task) | ||
801 | { | ||
802 | bool ret = false; | ||
480 | struct mem_cgroup *mem; | 803 | struct mem_cgroup *mem; |
481 | struct page_cgroup *pc; | 804 | struct mm_struct *mm; |
482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
483 | struct mem_cgroup_per_zone *mz; | ||
484 | unsigned long flags; | ||
485 | 805 | ||
486 | pc = lookup_page_cgroup(page); | 806 | rcu_read_lock(); |
487 | /* can happen at boot */ | 807 | mm = task->mm; |
488 | if (unlikely(!pc)) | 808 | if (!mm) |
809 | mm = &init_mm; | ||
810 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
811 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
812 | ret = true; | ||
813 | rcu_read_unlock(); | ||
814 | return ret; | ||
815 | } | ||
816 | /* | ||
817 | * Unlike exported interface, "oom" parameter is added. if oom==true, | ||
818 | * oom-killer can be invoked. | ||
819 | */ | ||
820 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | ||
821 | gfp_t gfp_mask, struct mem_cgroup **memcg, | ||
822 | bool oom) | ||
823 | { | ||
824 | struct mem_cgroup *mem, *mem_over_limit; | ||
825 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
826 | struct res_counter *fail_res; | ||
827 | |||
828 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | ||
829 | /* Don't account this! */ | ||
830 | *memcg = NULL; | ||
489 | return 0; | 831 | return 0; |
490 | prefetchw(pc); | 832 | } |
833 | |||
491 | /* | 834 | /* |
492 | * We always charge the cgroup the mm_struct belongs to. | 835 | * We always charge the cgroup the mm_struct belongs to. |
493 | * The mm_struct's mem_cgroup changes on task migration if the | 836 | * The mm_struct's mem_cgroup changes on task migration if the |
494 | * thread group leader migrates. It's possible that mm is not | 837 | * thread group leader migrates. It's possible that mm is not |
495 | * set, if so charge the init_mm (happens for pagecache usage). | 838 | * set, if so charge the init_mm (happens for pagecache usage). |
496 | */ | 839 | */ |
497 | 840 | mem = *memcg; | |
498 | if (likely(!memcg)) { | 841 | if (likely(!mem)) { |
499 | rcu_read_lock(); | 842 | mem = try_get_mem_cgroup_from_mm(mm); |
500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 843 | *memcg = mem; |
501 | if (unlikely(!mem)) { | ||
502 | rcu_read_unlock(); | ||
503 | return 0; | ||
504 | } | ||
505 | /* | ||
506 | * For every charge from the cgroup, increment reference count | ||
507 | */ | ||
508 | css_get(&mem->css); | ||
509 | rcu_read_unlock(); | ||
510 | } else { | 844 | } else { |
511 | mem = memcg; | 845 | css_get(&mem->css); |
512 | css_get(&memcg->css); | ||
513 | } | 846 | } |
847 | if (unlikely(!mem)) | ||
848 | return 0; | ||
849 | |||
850 | VM_BUG_ON(mem_cgroup_is_obsolete(mem)); | ||
851 | |||
852 | while (1) { | ||
853 | int ret; | ||
854 | bool noswap = false; | ||
855 | |||
856 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | ||
857 | if (likely(!ret)) { | ||
858 | if (!do_swap_account) | ||
859 | break; | ||
860 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | ||
861 | &fail_res); | ||
862 | if (likely(!ret)) | ||
863 | break; | ||
864 | /* mem+swap counter fails */ | ||
865 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
866 | noswap = true; | ||
867 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
868 | memsw); | ||
869 | } else | ||
870 | /* mem counter fails */ | ||
871 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
872 | res); | ||
514 | 873 | ||
515 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { | ||
516 | if (!(gfp_mask & __GFP_WAIT)) | 874 | if (!(gfp_mask & __GFP_WAIT)) |
517 | goto out; | 875 | goto nomem; |
518 | 876 | ||
519 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 877 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, |
878 | noswap); | ||
879 | if (ret) | ||
520 | continue; | 880 | continue; |
521 | 881 | ||
522 | /* | 882 | /* |
@@ -525,49 +885,221 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
525 | * moved to swap cache or just unmapped from the cgroup. | 885 | * moved to swap cache or just unmapped from the cgroup. |
526 | * Check the limit again to see if the reclaim reduced the | 886 | * Check the limit again to see if the reclaim reduced the |
527 | * current usage of the cgroup before giving up | 887 | * current usage of the cgroup before giving up |
888 | * | ||
528 | */ | 889 | */ |
529 | if (res_counter_check_under_limit(&mem->res)) | 890 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
530 | continue; | 891 | continue; |
531 | 892 | ||
532 | if (!nr_retries--) { | 893 | if (!nr_retries--) { |
533 | mem_cgroup_out_of_memory(mem, gfp_mask); | 894 | if (oom) { |
534 | goto out; | 895 | mutex_lock(&memcg_tasklist); |
896 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | ||
897 | mutex_unlock(&memcg_tasklist); | ||
898 | mem_over_limit->last_oom_jiffies = jiffies; | ||
899 | } | ||
900 | goto nomem; | ||
535 | } | 901 | } |
536 | } | 902 | } |
903 | return 0; | ||
904 | nomem: | ||
905 | css_put(&mem->css); | ||
906 | return -ENOMEM; | ||
907 | } | ||
908 | |||
909 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | ||
910 | { | ||
911 | struct mem_cgroup *mem; | ||
912 | swp_entry_t ent; | ||
913 | |||
914 | if (!PageSwapCache(page)) | ||
915 | return NULL; | ||
537 | 916 | ||
917 | ent.val = page_private(page); | ||
918 | mem = lookup_swap_cgroup(ent); | ||
919 | if (!mem) | ||
920 | return NULL; | ||
921 | if (!css_tryget(&mem->css)) | ||
922 | return NULL; | ||
923 | return mem; | ||
924 | } | ||
925 | |||
926 | /* | ||
927 | * Commit a charge obtained by __mem_cgroup_try_charge() and put page_cgroup into | ||
928 | * the USED state. If it is already USED, uncharge and return. | ||
929 | */ | ||
930 | |||
931 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | ||
932 | struct page_cgroup *pc, | ||
933 | enum charge_type ctype) | ||
934 | { | ||
935 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
936 | if (!mem) | ||
937 | return; | ||
538 | 938 | ||
539 | lock_page_cgroup(pc); | 939 | lock_page_cgroup(pc); |
540 | if (unlikely(PageCgroupUsed(pc))) { | 940 | if (unlikely(PageCgroupUsed(pc))) { |
541 | unlock_page_cgroup(pc); | 941 | unlock_page_cgroup(pc); |
542 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 942 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
943 | if (do_swap_account) | ||
944 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
543 | css_put(&mem->css); | 945 | css_put(&mem->css); |
544 | 946 | return; | |
545 | goto done; | ||
546 | } | 947 | } |
547 | pc->mem_cgroup = mem; | 948 | pc->mem_cgroup = mem; |
548 | /* | 949 | smp_wmb(); |
549 | * If a page is accounted as a page cache, insert to inactive list. | ||
550 | * If anon, insert to active list. | ||
551 | */ | ||
552 | pc->flags = pcg_default_flags[ctype]; | 950 | pc->flags = pcg_default_flags[ctype]; |
553 | 951 | ||
554 | mz = page_cgroup_zoneinfo(pc); | 952 | mem_cgroup_charge_statistics(mem, pc, true); |
555 | 953 | ||
556 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
557 | __mem_cgroup_add_list(mz, pc); | ||
558 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
559 | unlock_page_cgroup(pc); | 954 | unlock_page_cgroup(pc); |
955 | } | ||
560 | 956 | ||
561 | done: | 957 | /** |
562 | return 0; | 958 | * mem_cgroup_move_account - move account of the page |
959 | * @pc: page_cgroup of the page. | ||
960 | * @from: mem_cgroup which the page is moved from. | ||
961 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
962 | * | ||
963 | * The caller must ensure the following: | ||
964 | * - page is not on LRU (isolate_page() is useful.) | ||
965 | * | ||
966 | * returns 0 at success, | ||
967 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
968 | * | ||
969 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | ||
970 | * new cgroup. It should be done by a caller. | ||
971 | */ | ||
972 | |||
973 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
974 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
975 | { | ||
976 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
977 | int nid, zid; | ||
978 | int ret = -EBUSY; | ||
979 | |||
980 | VM_BUG_ON(from == to); | ||
981 | VM_BUG_ON(PageLRU(pc->page)); | ||
982 | |||
983 | nid = page_cgroup_nid(pc); | ||
984 | zid = page_cgroup_zid(pc); | ||
985 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
986 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
987 | |||
988 | if (!trylock_page_cgroup(pc)) | ||
989 | return ret; | ||
990 | |||
991 | if (!PageCgroupUsed(pc)) | ||
992 | goto out; | ||
993 | |||
994 | if (pc->mem_cgroup != from) | ||
995 | goto out; | ||
996 | |||
997 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
998 | mem_cgroup_charge_statistics(from, pc, false); | ||
999 | if (do_swap_account) | ||
1000 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1001 | css_put(&from->css); | ||
1002 | |||
1003 | css_get(&to->css); | ||
1004 | pc->mem_cgroup = to; | ||
1005 | mem_cgroup_charge_statistics(to, pc, true); | ||
1006 | ret = 0; | ||
563 | out: | 1007 | out: |
564 | css_put(&mem->css); | 1008 | unlock_page_cgroup(pc); |
565 | return -ENOMEM; | 1009 | return ret; |
1010 | } | ||
1011 | |||
1012 | /* | ||
1013 | * move charges to its parent. | ||
1014 | */ | ||
1015 | |||
1016 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | ||
1017 | struct mem_cgroup *child, | ||
1018 | gfp_t gfp_mask) | ||
1019 | { | ||
1020 | struct page *page = pc->page; | ||
1021 | struct cgroup *cg = child->css.cgroup; | ||
1022 | struct cgroup *pcg = cg->parent; | ||
1023 | struct mem_cgroup *parent; | ||
1024 | int ret; | ||
1025 | |||
1026 | /* Is ROOT ? */ | ||
1027 | if (!pcg) | ||
1028 | return -EINVAL; | ||
1029 | |||
1030 | |||
1031 | parent = mem_cgroup_from_cont(pcg); | ||
1032 | |||
1033 | |||
1034 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | ||
1035 | if (ret || !parent) | ||
1036 | return ret; | ||
1037 | |||
1038 | if (!get_page_unless_zero(page)) { | ||
1039 | ret = -EBUSY; | ||
1040 | goto uncharge; | ||
1041 | } | ||
1042 | |||
1043 | ret = isolate_lru_page(page); | ||
1044 | |||
1045 | if (ret) | ||
1046 | goto cancel; | ||
1047 | |||
1048 | ret = mem_cgroup_move_account(pc, child, parent); | ||
1049 | |||
1050 | putback_lru_page(page); | ||
1051 | if (!ret) { | ||
1052 | put_page(page); | ||
1053 | /* drop extra refcnt by try_charge() */ | ||
1054 | css_put(&parent->css); | ||
1055 | return 0; | ||
1056 | } | ||
1057 | |||
1058 | cancel: | ||
1059 | put_page(page); | ||
1060 | uncharge: | ||
1061 | /* drop extra refcnt by try_charge() */ | ||
1062 | css_put(&parent->css); | ||
1063 | /* uncharge if move fails */ | ||
1064 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1065 | if (do_swap_account) | ||
1066 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1067 | return ret; | ||
1068 | } | ||
1069 | |||
1070 | /* | ||
1071 | * Charge the memory controller for page usage. | ||
1072 | * Return | ||
1073 | * 0 if the charge was successful | ||
1074 | * < 0 if the cgroup is over its limit | ||
1075 | */ | ||
1076 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
1077 | gfp_t gfp_mask, enum charge_type ctype, | ||
1078 | struct mem_cgroup *memcg) | ||
1079 | { | ||
1080 | struct mem_cgroup *mem; | ||
1081 | struct page_cgroup *pc; | ||
1082 | int ret; | ||
1083 | |||
1084 | pc = lookup_page_cgroup(page); | ||
1085 | /* can happen at boot */ | ||
1086 | if (unlikely(!pc)) | ||
1087 | return 0; | ||
1088 | prefetchw(pc); | ||
1089 | |||
1090 | mem = memcg; | ||
1091 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | ||
1092 | if (ret || !mem) | ||
1093 | return ret; | ||
1094 | |||
1095 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
1096 | return 0; | ||
566 | } | 1097 | } |
567 | 1098 | ||
568 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 1099 | int mem_cgroup_newpage_charge(struct page *page, |
1100 | struct mm_struct *mm, gfp_t gfp_mask) | ||
569 | { | 1101 | { |
570 | if (mem_cgroup_subsys.disabled) | 1102 | if (mem_cgroup_disabled()) |
571 | return 0; | 1103 | return 0; |
572 | if (PageCompound(page)) | 1104 | if (PageCompound(page)) |
573 | return 0; | 1105 | return 0; |
@@ -589,7 +1121,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
589 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 1121 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
590 | gfp_t gfp_mask) | 1122 | gfp_t gfp_mask) |
591 | { | 1123 | { |
592 | if (mem_cgroup_subsys.disabled) | 1124 | struct mem_cgroup *mem = NULL; |
1125 | int ret; | ||
1126 | |||
1127 | if (mem_cgroup_disabled()) | ||
593 | return 0; | 1128 | return 0; |
594 | if (PageCompound(page)) | 1129 | if (PageCompound(page)) |
595 | return 0; | 1130 | return 0; |
@@ -601,6 +1136,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
601 | * For GFP_NOWAIT case, the page may be pre-charged before calling | 1136 | * For GFP_NOWAIT case, the page may be pre-charged before calling |
602 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | 1137 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call |
603 | * charge twice. (It works but has to pay a bit larger cost.) | 1138 | * charge twice. (It works but has to pay a bit larger cost.) |
1139 | * And when the page is SwapCache, it should take swap information | ||
1140 | * into account. This is under lock_page() now. | ||
604 | */ | 1141 | */ |
605 | if (!(gfp_mask & __GFP_WAIT)) { | 1142 | if (!(gfp_mask & __GFP_WAIT)) { |
606 | struct page_cgroup *pc; | 1143 | struct page_cgroup *pc; |
@@ -617,58 +1154,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
617 | unlock_page_cgroup(pc); | 1154 | unlock_page_cgroup(pc); |
618 | } | 1155 | } |
619 | 1156 | ||
620 | if (unlikely(!mm)) | 1157 | if (do_swap_account && PageSwapCache(page)) { |
1158 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1159 | if (mem) | ||
1160 | mm = NULL; | ||
1161 | else | ||
1162 | mem = NULL; | ||
1163 | /* SwapCache may be still linked to LRU now. */ | ||
1164 | mem_cgroup_lru_del_before_commit_swapcache(page); | ||
1165 | } | ||
1166 | |||
1167 | if (unlikely(!mm && !mem)) | ||
621 | mm = &init_mm; | 1168 | mm = &init_mm; |
622 | 1169 | ||
623 | if (page_is_file_cache(page)) | 1170 | if (page_is_file_cache(page)) |
624 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 1171 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
625 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 1172 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
626 | else | 1173 | |
627 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 1174 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
628 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 1175 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); |
1176 | if (mem) | ||
1177 | css_put(&mem->css); | ||
1178 | if (PageSwapCache(page)) | ||
1179 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
1180 | |||
1181 | if (do_swap_account && !ret && PageSwapCache(page)) { | ||
1182 | swp_entry_t ent = {.val = page_private(page)}; | ||
1183 | /* avoid double counting */ | ||
1184 | mem = swap_cgroup_record(ent, NULL); | ||
1185 | if (mem) { | ||
1186 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1187 | mem_cgroup_put(mem); | ||
1188 | } | ||
1189 | } | ||
1190 | return ret; | ||
1191 | } | ||
1192 | |||
1193 | /* | ||
1194 | * While swap-in, try_charge -> commit or cancel, the page is locked. | ||
1195 | * And when try_charge() successfully returns, one refcnt to memcg without | ||
1196 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | ||
1197 | * "commit()" or removed by "cancel()" | ||
1198 | */ | ||
1199 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | ||
1200 | struct page *page, | ||
1201 | gfp_t mask, struct mem_cgroup **ptr) | ||
1202 | { | ||
1203 | struct mem_cgroup *mem; | ||
1204 | int ret; | ||
1205 | |||
1206 | if (mem_cgroup_disabled()) | ||
1207 | return 0; | ||
1208 | |||
1209 | if (!do_swap_account) | ||
1210 | goto charge_cur_mm; | ||
1211 | /* | ||
1212 | * A racing thread's fault, or swapoff, may have already updated | ||
1213 | * the pte, and even removed page from swap cache: return success | ||
1214 | * to go on to do_swap_page()'s pte_same() test, which should fail. | ||
1215 | */ | ||
1216 | if (!PageSwapCache(page)) | ||
1217 | return 0; | ||
1218 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1219 | if (!mem) | ||
1220 | goto charge_cur_mm; | ||
1221 | *ptr = mem; | ||
1222 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | ||
1223 | /* drop extra refcnt from tryget */ | ||
1224 | css_put(&mem->css); | ||
1225 | return ret; | ||
1226 | charge_cur_mm: | ||
1227 | if (unlikely(!mm)) | ||
1228 | mm = &init_mm; | ||
1229 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | ||
1230 | } | ||
1231 | |||
1232 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | ||
1233 | { | ||
1234 | struct page_cgroup *pc; | ||
1235 | |||
1236 | if (mem_cgroup_disabled()) | ||
1237 | return; | ||
1238 | if (!ptr) | ||
1239 | return; | ||
1240 | pc = lookup_page_cgroup(page); | ||
1241 | mem_cgroup_lru_del_before_commit_swapcache(page); | ||
1242 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
1243 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
1244 | /* | ||
1245 | * Now swap is on-memory. This means this page may be | ||
1246 | * counted both as mem and swap....double count. | ||
1247 | * Fix it by uncharging from memsw. Basically, this SwapCache is stable | ||
1248 | * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() | ||
1249 | * may call delete_from_swap_cache() before we reach here. | ||
1250 | */ | ||
1251 | if (do_swap_account && PageSwapCache(page)) { | ||
1252 | swp_entry_t ent = {.val = page_private(page)}; | ||
1253 | struct mem_cgroup *memcg; | ||
1254 | memcg = swap_cgroup_record(ent, NULL); | ||
1255 | if (memcg) { | ||
1256 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1257 | mem_cgroup_put(memcg); | ||
1258 | } | ||
1259 | |||
1260 | } | ||
1261 | /* add this page(page_cgroup) to the LRU we want. */ | ||
1262 | |||
1263 | } | ||
1264 | |||
1265 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | ||
1266 | { | ||
1267 | if (mem_cgroup_disabled()) | ||
1268 | return; | ||
1269 | if (!mem) | ||
1270 | return; | ||
1271 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1272 | if (do_swap_account) | ||
1273 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1274 | css_put(&mem->css); | ||
629 | } | 1275 | } |
630 | 1276 | ||
1277 | |||
631 | /* | 1278 | /* |
632 | * uncharge if !page_mapped(page) | 1279 | * uncharge if !page_mapped(page) |
633 | */ | 1280 | */ |
634 | static void | 1281 | static struct mem_cgroup * |
635 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 1282 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
636 | { | 1283 | { |
637 | struct page_cgroup *pc; | 1284 | struct page_cgroup *pc; |
638 | struct mem_cgroup *mem; | 1285 | struct mem_cgroup *mem = NULL; |
639 | struct mem_cgroup_per_zone *mz; | 1286 | struct mem_cgroup_per_zone *mz; |
640 | unsigned long flags; | ||
641 | 1287 | ||
642 | if (mem_cgroup_subsys.disabled) | 1288 | if (mem_cgroup_disabled()) |
643 | return; | 1289 | return NULL; |
1290 | |||
1291 | if (PageSwapCache(page)) | ||
1292 | return NULL; | ||
644 | 1293 | ||
645 | /* | 1294 | /* |
646 | * Check if our page_cgroup is valid | 1295 | * Check if our page_cgroup is valid |
647 | */ | 1296 | */ |
648 | pc = lookup_page_cgroup(page); | 1297 | pc = lookup_page_cgroup(page); |
649 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 1298 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
650 | return; | 1299 | return NULL; |
651 | 1300 | ||
652 | lock_page_cgroup(pc); | 1301 | lock_page_cgroup(pc); |
653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) | 1302 | |
654 | || !PageCgroupUsed(pc)) { | 1303 | mem = pc->mem_cgroup; |
655 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | 1304 | |
656 | unlock_page_cgroup(pc); | 1305 | if (!PageCgroupUsed(pc)) |
657 | return; | 1306 | goto unlock_out; |
1307 | |||
1308 | switch (ctype) { | ||
1309 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1310 | if (page_mapped(page)) | ||
1311 | goto unlock_out; | ||
1312 | break; | ||
1313 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | ||
1314 | if (!PageAnon(page)) { /* Shared memory */ | ||
1315 | if (page->mapping && !page_is_file_cache(page)) | ||
1316 | goto unlock_out; | ||
1317 | } else if (page_mapped(page)) /* Anon */ | ||
1318 | goto unlock_out; | ||
1319 | break; | ||
1320 | default: | ||
1321 | break; | ||
658 | } | 1322 | } |
1323 | |||
1324 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1325 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1326 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1327 | |||
1328 | mem_cgroup_charge_statistics(mem, pc, false); | ||
659 | ClearPageCgroupUsed(pc); | 1329 | ClearPageCgroupUsed(pc); |
660 | mem = pc->mem_cgroup; | 1330 | /* |
1331 | * pc->mem_cgroup is not cleared here. It will be accessed when it's | ||
1332 | * freed from LRU. This is safe because uncharged page is expected not | ||
1333 | * to be reused (freed soon). Exception is SwapCache, it's handled by | ||
1334 | * special functions. | ||
1335 | */ | ||
661 | 1336 | ||
662 | mz = page_cgroup_zoneinfo(pc); | 1337 | mz = page_cgroup_zoneinfo(pc); |
663 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
664 | __mem_cgroup_remove_list(mz, pc); | ||
665 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
666 | unlock_page_cgroup(pc); | 1338 | unlock_page_cgroup(pc); |
667 | 1339 | ||
668 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1340 | /* at swapout, this memcg will be accessed to record to swap */ |
669 | css_put(&mem->css); | 1341 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1342 | css_put(&mem->css); | ||
670 | 1343 | ||
671 | return; | 1344 | return mem; |
1345 | |||
1346 | unlock_out: | ||
1347 | unlock_page_cgroup(pc); | ||
1348 | return NULL; | ||
672 | } | 1349 | } |
673 | 1350 | ||
674 | void mem_cgroup_uncharge_page(struct page *page) | 1351 | void mem_cgroup_uncharge_page(struct page *page) |
@@ -689,16 +1366,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
689 | } | 1366 | } |
690 | 1367 | ||
691 | /* | 1368 | /* |
692 | * Before starting migration, account against new page. | 1369 | * called from __delete_from_swap_cache() and drop "page" account. |
1370 | * memcg information is recorded to swap_cgroup of "ent" | ||
693 | */ | 1371 | */ |
694 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | 1372 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) |
1373 | { | ||
1374 | struct mem_cgroup *memcg; | ||
1375 | |||
1376 | memcg = __mem_cgroup_uncharge_common(page, | ||
1377 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT); | ||
1378 | /* record memcg information */ | ||
1379 | if (do_swap_account && memcg) { | ||
1380 | swap_cgroup_record(ent, memcg); | ||
1381 | mem_cgroup_get(memcg); | ||
1382 | } | ||
1383 | if (memcg) | ||
1384 | css_put(&memcg->css); | ||
1385 | } | ||
1386 | |||
1387 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
1388 | /* | ||
1389 | * called from swap_entry_free(). remove record in swap_cgroup and | ||
1390 | * uncharge "memsw" account. | ||
1391 | */ | ||
1392 | void mem_cgroup_uncharge_swap(swp_entry_t ent) | ||
1393 | { | ||
1394 | struct mem_cgroup *memcg; | ||
1395 | |||
1396 | if (!do_swap_account) | ||
1397 | return; | ||
1398 | |||
1399 | memcg = swap_cgroup_record(ent, NULL); | ||
1400 | if (memcg) { | ||
1401 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1402 | mem_cgroup_put(memcg); | ||
1403 | } | ||
1404 | } | ||
1405 | #endif | ||
1406 | |||
1407 | /* | ||
1408 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | ||
1409 | * page belongs to. | ||
1410 | */ | ||
1411 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | ||
695 | { | 1412 | { |
696 | struct page_cgroup *pc; | 1413 | struct page_cgroup *pc; |
697 | struct mem_cgroup *mem = NULL; | 1414 | struct mem_cgroup *mem = NULL; |
698 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
699 | int ret = 0; | 1415 | int ret = 0; |
700 | 1416 | ||
701 | if (mem_cgroup_subsys.disabled) | 1417 | if (mem_cgroup_disabled()) |
702 | return 0; | 1418 | return 0; |
703 | 1419 | ||
704 | pc = lookup_page_cgroup(page); | 1420 | pc = lookup_page_cgroup(page); |
@@ -706,41 +1422,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
706 | if (PageCgroupUsed(pc)) { | 1422 | if (PageCgroupUsed(pc)) { |
707 | mem = pc->mem_cgroup; | 1423 | mem = pc->mem_cgroup; |
708 | css_get(&mem->css); | 1424 | css_get(&mem->css); |
709 | if (PageCgroupCache(pc)) { | ||
710 | if (page_is_file_cache(page)) | ||
711 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
712 | else | ||
713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
714 | } | ||
715 | } | 1425 | } |
716 | unlock_page_cgroup(pc); | 1426 | unlock_page_cgroup(pc); |
1427 | |||
717 | if (mem) { | 1428 | if (mem) { |
718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 1429 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
719 | ctype, mem); | ||
720 | css_put(&mem->css); | 1430 | css_put(&mem->css); |
721 | } | 1431 | } |
1432 | *ptr = mem; | ||
722 | return ret; | 1433 | return ret; |
723 | } | 1434 | } |
724 | 1435 | ||
725 | /* remove redundant charge if migration failed*/ | 1436 | /* remove redundant charge if migration failed*/ |
726 | void mem_cgroup_end_migration(struct page *newpage) | 1437 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
1438 | struct page *oldpage, struct page *newpage) | ||
727 | { | 1439 | { |
1440 | struct page *target, *unused; | ||
1441 | struct page_cgroup *pc; | ||
1442 | enum charge_type ctype; | ||
1443 | |||
1444 | if (!mem) | ||
1445 | return; | ||
1446 | |||
1447 | /* at migration success, oldpage->mapping is NULL. */ | ||
1448 | if (oldpage->mapping) { | ||
1449 | target = oldpage; | ||
1450 | unused = NULL; | ||
1451 | } else { | ||
1452 | target = newpage; | ||
1453 | unused = oldpage; | ||
1454 | } | ||
1455 | |||
1456 | if (PageAnon(target)) | ||
1457 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
1458 | else if (page_is_file_cache(target)) | ||
1459 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
1460 | else | ||
1461 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
1462 | |||
1463 | /* unused page is not on radix-tree now. */ | ||
1464 | if (unused) | ||
1465 | __mem_cgroup_uncharge_common(unused, ctype); | ||
1466 | |||
1467 | pc = lookup_page_cgroup(target); | ||
728 | /* | 1468 | /* |
729 | * At success, page->mapping is not NULL. | 1469 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. |
730 | * special rollback care is necessary when | 1470 | * So, double-counting is effectively avoided. |
731 | * 1. at migration failure. (newpage->mapping is cleared in this case) | ||
732 | * 2. the newpage was moved but not remapped again because the task | ||
733 | * exits and the newpage is obsolete. In this case, the new page | ||
734 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() | ||
735 | * always for avoiding mess. The page_cgroup will be removed if | ||
736 | * unnecessary. File cache pages is still on radix-tree. Don't | ||
737 | * care it. | ||
738 | */ | 1471 | */ |
739 | if (!newpage->mapping) | 1472 | __mem_cgroup_commit_charge(mem, pc, ctype); |
740 | __mem_cgroup_uncharge_common(newpage, | 1473 | |
741 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 1474 | /* |
742 | else if (PageAnon(newpage)) | 1475 | * Both of oldpage and newpage are still under lock_page(). |
743 | mem_cgroup_uncharge_page(newpage); | 1476 | * Then, we don't have to care about race in radix-tree. |
1477 | * But we have to be careful about whether this page is still mapped or not. | ||
1478 | * | ||
1479 | * There is a case for !page_mapped(). At the start of | ||
1480 | * migration, oldpage was mapped. But now, it's zapped. | ||
1481 | * But we know *target* page is not freed/reused under us. | ||
1482 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
1483 | */ | ||
1484 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | ||
1485 | mem_cgroup_uncharge_page(target); | ||
744 | } | 1486 | } |
745 | 1487 | ||
746 | /* | 1488 | /* |
@@ -748,29 +1490,26 @@ void mem_cgroup_end_migration(struct page *newpage) | |||
748 | * This is typically used for page reclaiming for shmem for reducing side | 1490 | * This is typically used for page reclaiming for shmem for reducing side |
749 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 1491 | * effect of page allocation from shmem, which is used by some mem_cgroup. |
750 | */ | 1492 | */ |
751 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | 1493 | int mem_cgroup_shrink_usage(struct page *page, |
1494 | struct mm_struct *mm, | ||
1495 | gfp_t gfp_mask) | ||
752 | { | 1496 | { |
753 | struct mem_cgroup *mem; | 1497 | struct mem_cgroup *mem = NULL; |
754 | int progress = 0; | 1498 | int progress = 0; |
755 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | 1499 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
756 | 1500 | ||
757 | if (mem_cgroup_subsys.disabled) | 1501 | if (mem_cgroup_disabled()) |
758 | return 0; | 1502 | return 0; |
759 | if (!mm) | 1503 | if (page) |
1504 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1505 | if (!mem && mm) | ||
1506 | mem = try_get_mem_cgroup_from_mm(mm); | ||
1507 | if (unlikely(!mem)) | ||
760 | return 0; | 1508 | return 0; |
761 | 1509 | ||
762 | rcu_read_lock(); | ||
763 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
764 | if (unlikely(!mem)) { | ||
765 | rcu_read_unlock(); | ||
766 | return 0; | ||
767 | } | ||
768 | css_get(&mem->css); | ||
769 | rcu_read_unlock(); | ||
770 | |||
771 | do { | 1510 | do { |
772 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 1511 | progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); |
773 | progress += res_counter_check_under_limit(&mem->res); | 1512 | progress += mem_cgroup_check_under_limit(mem); |
774 | } while (!progress && --retry); | 1513 | } while (!progress && --retry); |
775 | 1514 | ||
776 | css_put(&mem->css); | 1515 | css_put(&mem->css); |
@@ -779,116 +1518,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
779 | return 0; | 1518 | return 0; |
780 | } | 1519 | } |
781 | 1520 | ||
782 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) | 1521 | static DEFINE_MUTEX(set_limit_mutex); |
1522 | |||
1523 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | ||
1524 | unsigned long long val) | ||
783 | { | 1525 | { |
784 | 1526 | ||
785 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 1527 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
786 | int progress; | 1528 | int progress; |
1529 | u64 memswlimit; | ||
787 | int ret = 0; | 1530 | int ret = 0; |
788 | 1531 | ||
789 | while (res_counter_set_limit(&memcg->res, val)) { | 1532 | while (retry_count) { |
790 | if (signal_pending(current)) { | 1533 | if (signal_pending(current)) { |
791 | ret = -EINTR; | 1534 | ret = -EINTR; |
792 | break; | 1535 | break; |
793 | } | 1536 | } |
794 | if (!retry_count) { | 1537 | /* |
795 | ret = -EBUSY; | 1538 | * Rather than hide all in some function, I do this in |
1539 | * open coded manner. You see what this really does. | ||
1540 | * We have to guarantee mem->res.limit < mem->memsw.limit. | ||
1541 | */ | ||
1542 | mutex_lock(&set_limit_mutex); | ||
1543 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1544 | if (memswlimit < val) { | ||
1545 | ret = -EINVAL; | ||
1546 | mutex_unlock(&set_limit_mutex); | ||
796 | break; | 1547 | break; |
797 | } | 1548 | } |
798 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); | 1549 | ret = res_counter_set_limit(&memcg->res, val); |
799 | if (!progress) | 1550 | mutex_unlock(&set_limit_mutex); |
800 | retry_count--; | 1551 | |
1552 | if (!ret) | ||
1553 | break; | ||
1554 | |||
1555 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | ||
1556 | false); | ||
1557 | if (!progress) retry_count--; | ||
801 | } | 1558 | } |
1559 | |||
802 | return ret; | 1560 | return ret; |
803 | } | 1561 | } |
804 | 1562 | ||
1563 | int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | ||
1564 | unsigned long long val) | ||
1565 | { | ||
1566 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | ||
1567 | u64 memlimit, oldusage, curusage; | ||
1568 | int ret; | ||
1569 | |||
1570 | if (!do_swap_account) | ||
1571 | return -EINVAL; | ||
1572 | |||
1573 | while (retry_count) { | ||
1574 | if (signal_pending(current)) { | ||
1575 | ret = -EINTR; | ||
1576 | break; | ||
1577 | } | ||
1578 | /* | ||
1579 | * Rather than hide all in some function, I do this in | ||
1580 | * open coded manner. You see what this really does. | ||
1581 | * We have to guarantee mem->res.limit < mem->memsw.limit. | ||
1582 | */ | ||
1583 | mutex_lock(&set_limit_mutex); | ||
1584 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1585 | if (memlimit > val) { | ||
1586 | ret = -EINVAL; | ||
1587 | mutex_unlock(&set_limit_mutex); | ||
1588 | break; | ||
1589 | } | ||
1590 | ret = res_counter_set_limit(&memcg->memsw, val); | ||
1591 | mutex_unlock(&set_limit_mutex); | ||
1592 | |||
1593 | if (!ret) | ||
1594 | break; | ||
1595 | |||
1596 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
1597 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); | ||
1598 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
1599 | if (curusage >= oldusage) | ||
1600 | retry_count--; | ||
1601 | } | ||
1602 | return ret; | ||
1603 | } | ||
805 | 1604 | ||
806 | /* | 1605 | /* |
807 | * This routine traverse page_cgroup in given list and drop them all. | 1606 | * This routine traverse page_cgroup in given list and drop them all. |
808 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1607 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
809 | */ | 1608 | */ |
810 | #define FORCE_UNCHARGE_BATCH (128) | 1609 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
811 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 1610 | int node, int zid, enum lru_list lru) |
812 | struct mem_cgroup_per_zone *mz, | ||
813 | enum lru_list lru) | ||
814 | { | 1611 | { |
815 | struct page_cgroup *pc; | 1612 | struct zone *zone; |
816 | struct page *page; | 1613 | struct mem_cgroup_per_zone *mz; |
817 | int count = FORCE_UNCHARGE_BATCH; | 1614 | struct page_cgroup *pc, *busy; |
818 | unsigned long flags; | 1615 | unsigned long flags, loop; |
819 | struct list_head *list; | 1616 | struct list_head *list; |
1617 | int ret = 0; | ||
820 | 1618 | ||
1619 | zone = &NODE_DATA(node)->node_zones[zid]; | ||
1620 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
821 | list = &mz->lists[lru]; | 1621 | list = &mz->lists[lru]; |
822 | 1622 | ||
823 | spin_lock_irqsave(&mz->lru_lock, flags); | 1623 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
824 | while (!list_empty(list)) { | 1624 | /* give some margin against EBUSY etc...*/ |
825 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1625 | loop += 256; |
826 | page = pc->page; | 1626 | busy = NULL; |
827 | if (!PageCgroupUsed(pc)) | 1627 | while (loop--) { |
828 | break; | 1628 | ret = 0; |
829 | get_page(page); | 1629 | spin_lock_irqsave(&zone->lru_lock, flags); |
830 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1630 | if (list_empty(list)) { |
831 | /* | 1631 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
832 | * Check if this page is on LRU. !LRU page can be found | ||
833 | * if it's under page migration. | ||
834 | */ | ||
835 | if (PageLRU(page)) { | ||
836 | __mem_cgroup_uncharge_common(page, | ||
837 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
838 | put_page(page); | ||
839 | if (--count <= 0) { | ||
840 | count = FORCE_UNCHARGE_BATCH; | ||
841 | cond_resched(); | ||
842 | } | ||
843 | } else { | ||
844 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
845 | break; | 1632 | break; |
846 | } | 1633 | } |
847 | spin_lock_irqsave(&mz->lru_lock, flags); | 1634 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1635 | if (busy == pc) { | ||
1636 | list_move(&pc->lru, list); | ||
1637 | busy = 0; | ||
1638 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1639 | continue; | ||
1640 | } | ||
1641 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1642 | |||
1643 | ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); | ||
1644 | if (ret == -ENOMEM) | ||
1645 | break; | ||
1646 | |||
1647 | if (ret == -EBUSY || ret == -EINVAL) { | ||
1648 | /* found lock contention or "pc" is obsolete. */ | ||
1649 | busy = pc; | ||
1650 | cond_resched(); | ||
1651 | } else | ||
1652 | busy = NULL; | ||
848 | } | 1653 | } |
849 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1654 | |
1655 | if (!ret && !list_empty(list)) | ||
1656 | return -EBUSY; | ||
1657 | return ret; | ||
850 | } | 1658 | } |
851 | 1659 | ||
852 | /* | 1660 | /* |
853 | * make mem_cgroup's charge to be 0 if there is no task. | 1661 | * make mem_cgroup's charge to be 0 if there is no task. |
854 | * This enables deleting this mem_cgroup. | 1662 | * This enables deleting this mem_cgroup. |
855 | */ | 1663 | */ |
856 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 1664 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) |
857 | { | 1665 | { |
858 | int ret = -EBUSY; | 1666 | int ret; |
859 | int node, zid; | 1667 | int node, zid, shrink; |
1668 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1669 | struct cgroup *cgrp = mem->css.cgroup; | ||
860 | 1670 | ||
861 | css_get(&mem->css); | 1671 | css_get(&mem->css); |
862 | /* | 1672 | |
863 | * page reclaim code (kswapd etc..) will move pages between | 1673 | shrink = 0; |
864 | * active_list <-> inactive_list while we don't take a lock. | 1674 | /* should free all ? */ |
865 | * So, we have to do loop here until all lists are empty. | 1675 | if (free_all) |
866 | */ | 1676 | goto try_to_free; |
1677 | move_account: | ||
867 | while (mem->res.usage > 0) { | 1678 | while (mem->res.usage > 0) { |
868 | if (atomic_read(&mem->css.cgroup->count) > 0) | 1679 | ret = -EBUSY; |
1680 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | ||
1681 | goto out; | ||
1682 | ret = -EINTR; | ||
1683 | if (signal_pending(current)) | ||
869 | goto out; | 1684 | goto out; |
870 | /* This is for making all *used* pages to be on LRU. */ | 1685 | /* This is for making all *used* pages to be on LRU. */ |
871 | lru_add_drain_all(); | 1686 | lru_add_drain_all(); |
872 | for_each_node_state(node, N_POSSIBLE) | 1687 | ret = 0; |
873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 1688 | for_each_node_state(node, N_HIGH_MEMORY) { |
874 | struct mem_cgroup_per_zone *mz; | 1689 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
875 | enum lru_list l; | 1690 | enum lru_list l; |
876 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 1691 | for_each_lru(l) { |
877 | for_each_lru(l) | 1692 | ret = mem_cgroup_force_empty_list(mem, |
878 | mem_cgroup_force_empty_list(mem, mz, l); | 1693 | node, zid, l); |
1694 | if (ret) | ||
1695 | break; | ||
1696 | } | ||
879 | } | 1697 | } |
1698 | if (ret) | ||
1699 | break; | ||
1700 | } | ||
1701 | /* it seems parent cgroup doesn't have enough mem */ | ||
1702 | if (ret == -ENOMEM) | ||
1703 | goto try_to_free; | ||
880 | cond_resched(); | 1704 | cond_resched(); |
881 | } | 1705 | } |
882 | ret = 0; | 1706 | ret = 0; |
883 | out: | 1707 | out: |
884 | css_put(&mem->css); | 1708 | css_put(&mem->css); |
885 | return ret; | 1709 | return ret; |
1710 | |||
1711 | try_to_free: | ||
1712 | /* returns EBUSY if there is a task or if we come here twice. */ | ||
1713 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | ||
1714 | ret = -EBUSY; | ||
1715 | goto out; | ||
1716 | } | ||
1717 | /* we call try-to-free pages for make this cgroup empty */ | ||
1718 | lru_add_drain_all(); | ||
1719 | /* try to free all pages in this cgroup */ | ||
1720 | shrink = 1; | ||
1721 | while (nr_retries && mem->res.usage > 0) { | ||
1722 | int progress; | ||
1723 | |||
1724 | if (signal_pending(current)) { | ||
1725 | ret = -EINTR; | ||
1726 | goto out; | ||
1727 | } | ||
1728 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | ||
1729 | false, get_swappiness(mem)); | ||
1730 | if (!progress) { | ||
1731 | nr_retries--; | ||
1732 | /* maybe some writeback is necessary */ | ||
1733 | congestion_wait(WRITE, HZ/10); | ||
1734 | } | ||
1735 | |||
1736 | } | ||
1737 | lru_add_drain(); | ||
1738 | /* try move_account...there may be some *locked* pages. */ | ||
1739 | if (mem->res.usage) | ||
1740 | goto move_account; | ||
1741 | ret = 0; | ||
1742 | goto out; | ||
1743 | } | ||
1744 | |||
1745 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | ||
1746 | { | ||
1747 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | ||
1748 | } | ||
1749 | |||
1750 | |||
1751 | static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) | ||
1752 | { | ||
1753 | return mem_cgroup_from_cont(cont)->use_hierarchy; | ||
1754 | } | ||
1755 | |||
1756 | static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | ||
1757 | u64 val) | ||
1758 | { | ||
1759 | int retval = 0; | ||
1760 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1761 | struct cgroup *parent = cont->parent; | ||
1762 | struct mem_cgroup *parent_mem = NULL; | ||
1763 | |||
1764 | if (parent) | ||
1765 | parent_mem = mem_cgroup_from_cont(parent); | ||
1766 | |||
1767 | cgroup_lock(); | ||
1768 | /* | ||
1769 | * If parent's use_hiearchy is set, we can't make any modifications | ||
1770 | * in the child subtrees. If it is unset, then the change can | ||
1771 | * occur, provided the current cgroup has no children. | ||
1772 | * | ||
1773 | * For the root cgroup, parent_mem is NULL, we allow value to be | ||
1774 | * set if there are no children. | ||
1775 | */ | ||
1776 | if ((!parent_mem || !parent_mem->use_hierarchy) && | ||
1777 | (val == 1 || val == 0)) { | ||
1778 | if (list_empty(&cont->children)) | ||
1779 | mem->use_hierarchy = val; | ||
1780 | else | ||
1781 | retval = -EBUSY; | ||
1782 | } else | ||
1783 | retval = -EINVAL; | ||
1784 | cgroup_unlock(); | ||
1785 | |||
1786 | return retval; | ||
886 | } | 1787 | } |
887 | 1788 | ||
888 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 1789 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
889 | { | 1790 | { |
890 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 1791 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
891 | cft->private); | 1792 | u64 val = 0; |
1793 | int type, name; | ||
1794 | |||
1795 | type = MEMFILE_TYPE(cft->private); | ||
1796 | name = MEMFILE_ATTR(cft->private); | ||
1797 | switch (type) { | ||
1798 | case _MEM: | ||
1799 | val = res_counter_read_u64(&mem->res, name); | ||
1800 | break; | ||
1801 | case _MEMSWAP: | ||
1802 | if (do_swap_account) | ||
1803 | val = res_counter_read_u64(&mem->memsw, name); | ||
1804 | break; | ||
1805 | default: | ||
1806 | BUG(); | ||
1807 | break; | ||
1808 | } | ||
1809 | return val; | ||
892 | } | 1810 | } |
893 | /* | 1811 | /* |
894 | * The user of this function is... | 1812 | * The user of this function is... |
@@ -898,15 +1816,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
898 | const char *buffer) | 1816 | const char *buffer) |
899 | { | 1817 | { |
900 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 1818 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
1819 | int type, name; | ||
901 | unsigned long long val; | 1820 | unsigned long long val; |
902 | int ret; | 1821 | int ret; |
903 | 1822 | ||
904 | switch (cft->private) { | 1823 | type = MEMFILE_TYPE(cft->private); |
1824 | name = MEMFILE_ATTR(cft->private); | ||
1825 | switch (name) { | ||
905 | case RES_LIMIT: | 1826 | case RES_LIMIT: |
906 | /* This function does all necessary parse...reuse it */ | 1827 | /* This function does all necessary parse...reuse it */ |
907 | ret = res_counter_memparse_write_strategy(buffer, &val); | 1828 | ret = res_counter_memparse_write_strategy(buffer, &val); |
908 | if (!ret) | 1829 | if (ret) |
1830 | break; | ||
1831 | if (type == _MEM) | ||
909 | ret = mem_cgroup_resize_limit(memcg, val); | 1832 | ret = mem_cgroup_resize_limit(memcg, val); |
1833 | else | ||
1834 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | ||
910 | break; | 1835 | break; |
911 | default: | 1836 | default: |
912 | ret = -EINVAL; /* should be BUG() ? */ | 1837 | ret = -EINVAL; /* should be BUG() ? */ |
@@ -915,27 +1840,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
915 | return ret; | 1840 | return ret; |
916 | } | 1841 | } |
917 | 1842 | ||
1843 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | ||
1844 | unsigned long long *mem_limit, unsigned long long *memsw_limit) | ||
1845 | { | ||
1846 | struct cgroup *cgroup; | ||
1847 | unsigned long long min_limit, min_memsw_limit, tmp; | ||
1848 | |||
1849 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1850 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1851 | cgroup = memcg->css.cgroup; | ||
1852 | if (!memcg->use_hierarchy) | ||
1853 | goto out; | ||
1854 | |||
1855 | while (cgroup->parent) { | ||
1856 | cgroup = cgroup->parent; | ||
1857 | memcg = mem_cgroup_from_cont(cgroup); | ||
1858 | if (!memcg->use_hierarchy) | ||
1859 | break; | ||
1860 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1861 | min_limit = min(min_limit, tmp); | ||
1862 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1863 | min_memsw_limit = min(min_memsw_limit, tmp); | ||
1864 | } | ||
1865 | out: | ||
1866 | *mem_limit = min_limit; | ||
1867 | *memsw_limit = min_memsw_limit; | ||
1868 | return; | ||
1869 | } | ||
1870 | |||
918 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 1871 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
919 | { | 1872 | { |
920 | struct mem_cgroup *mem; | 1873 | struct mem_cgroup *mem; |
1874 | int type, name; | ||
921 | 1875 | ||
922 | mem = mem_cgroup_from_cont(cont); | 1876 | mem = mem_cgroup_from_cont(cont); |
923 | switch (event) { | 1877 | type = MEMFILE_TYPE(event); |
1878 | name = MEMFILE_ATTR(event); | ||
1879 | switch (name) { | ||
924 | case RES_MAX_USAGE: | 1880 | case RES_MAX_USAGE: |
925 | res_counter_reset_max(&mem->res); | 1881 | if (type == _MEM) |
1882 | res_counter_reset_max(&mem->res); | ||
1883 | else | ||
1884 | res_counter_reset_max(&mem->memsw); | ||
926 | break; | 1885 | break; |
927 | case RES_FAILCNT: | 1886 | case RES_FAILCNT: |
928 | res_counter_reset_failcnt(&mem->res); | 1887 | if (type == _MEM) |
1888 | res_counter_reset_failcnt(&mem->res); | ||
1889 | else | ||
1890 | res_counter_reset_failcnt(&mem->memsw); | ||
929 | break; | 1891 | break; |
930 | } | 1892 | } |
931 | return 0; | 1893 | return 0; |
932 | } | 1894 | } |
933 | 1895 | ||
934 | static int mem_force_empty_write(struct cgroup *cont, unsigned int event) | ||
935 | { | ||
936 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); | ||
937 | } | ||
938 | |||
939 | static const struct mem_cgroup_stat_desc { | 1896 | static const struct mem_cgroup_stat_desc { |
940 | const char *msg; | 1897 | const char *msg; |
941 | u64 unit; | 1898 | u64 unit; |
@@ -984,42 +1941,170 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
984 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | 1941 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); |
985 | 1942 | ||
986 | } | 1943 | } |
1944 | { | ||
1945 | unsigned long long limit, memsw_limit; | ||
1946 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | ||
1947 | cb->fill(cb, "hierarchical_memory_limit", limit); | ||
1948 | if (do_swap_account) | ||
1949 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | ||
1950 | } | ||
1951 | |||
1952 | #ifdef CONFIG_DEBUG_VM | ||
1953 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | ||
1954 | |||
1955 | { | ||
1956 | int nid, zid; | ||
1957 | struct mem_cgroup_per_zone *mz; | ||
1958 | unsigned long recent_rotated[2] = {0, 0}; | ||
1959 | unsigned long recent_scanned[2] = {0, 0}; | ||
1960 | |||
1961 | for_each_online_node(nid) | ||
1962 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
1963 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
1964 | |||
1965 | recent_rotated[0] += | ||
1966 | mz->reclaim_stat.recent_rotated[0]; | ||
1967 | recent_rotated[1] += | ||
1968 | mz->reclaim_stat.recent_rotated[1]; | ||
1969 | recent_scanned[0] += | ||
1970 | mz->reclaim_stat.recent_scanned[0]; | ||
1971 | recent_scanned[1] += | ||
1972 | mz->reclaim_stat.recent_scanned[1]; | ||
1973 | } | ||
1974 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | ||
1975 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | ||
1976 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); | ||
1977 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); | ||
1978 | } | ||
1979 | #endif | ||
1980 | |||
987 | return 0; | 1981 | return 0; |
988 | } | 1982 | } |
989 | 1983 | ||
1984 | static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | ||
1985 | { | ||
1986 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
1987 | |||
1988 | return get_swappiness(memcg); | ||
1989 | } | ||
1990 | |||
1991 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | ||
1992 | u64 val) | ||
1993 | { | ||
1994 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
1995 | struct mem_cgroup *parent; | ||
1996 | |||
1997 | if (val > 100) | ||
1998 | return -EINVAL; | ||
1999 | |||
2000 | if (cgrp->parent == NULL) | ||
2001 | return -EINVAL; | ||
2002 | |||
2003 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
2004 | |||
2005 | cgroup_lock(); | ||
2006 | |||
2007 | /* If under hierarchy, only empty-root can set this value */ | ||
2008 | if ((parent->use_hierarchy) || | ||
2009 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | ||
2010 | cgroup_unlock(); | ||
2011 | return -EINVAL; | ||
2012 | } | ||
2013 | |||
2014 | spin_lock(&memcg->reclaim_param_lock); | ||
2015 | memcg->swappiness = val; | ||
2016 | spin_unlock(&memcg->reclaim_param_lock); | ||
2017 | |||
2018 | cgroup_unlock(); | ||
2019 | |||
2020 | return 0; | ||
2021 | } | ||
2022 | |||
2023 | |||
990 | static struct cftype mem_cgroup_files[] = { | 2024 | static struct cftype mem_cgroup_files[] = { |
991 | { | 2025 | { |
992 | .name = "usage_in_bytes", | 2026 | .name = "usage_in_bytes", |
993 | .private = RES_USAGE, | 2027 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
994 | .read_u64 = mem_cgroup_read, | 2028 | .read_u64 = mem_cgroup_read, |
995 | }, | 2029 | }, |
996 | { | 2030 | { |
997 | .name = "max_usage_in_bytes", | 2031 | .name = "max_usage_in_bytes", |
998 | .private = RES_MAX_USAGE, | 2032 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
999 | .trigger = mem_cgroup_reset, | 2033 | .trigger = mem_cgroup_reset, |
1000 | .read_u64 = mem_cgroup_read, | 2034 | .read_u64 = mem_cgroup_read, |
1001 | }, | 2035 | }, |
1002 | { | 2036 | { |
1003 | .name = "limit_in_bytes", | 2037 | .name = "limit_in_bytes", |
1004 | .private = RES_LIMIT, | 2038 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
1005 | .write_string = mem_cgroup_write, | 2039 | .write_string = mem_cgroup_write, |
1006 | .read_u64 = mem_cgroup_read, | 2040 | .read_u64 = mem_cgroup_read, |
1007 | }, | 2041 | }, |
1008 | { | 2042 | { |
1009 | .name = "failcnt", | 2043 | .name = "failcnt", |
1010 | .private = RES_FAILCNT, | 2044 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
1011 | .trigger = mem_cgroup_reset, | 2045 | .trigger = mem_cgroup_reset, |
1012 | .read_u64 = mem_cgroup_read, | 2046 | .read_u64 = mem_cgroup_read, |
1013 | }, | 2047 | }, |
1014 | { | 2048 | { |
2049 | .name = "stat", | ||
2050 | .read_map = mem_control_stat_show, | ||
2051 | }, | ||
2052 | { | ||
1015 | .name = "force_empty", | 2053 | .name = "force_empty", |
1016 | .trigger = mem_force_empty_write, | 2054 | .trigger = mem_cgroup_force_empty_write, |
1017 | }, | 2055 | }, |
1018 | { | 2056 | { |
1019 | .name = "stat", | 2057 | .name = "use_hierarchy", |
1020 | .read_map = mem_control_stat_show, | 2058 | .write_u64 = mem_cgroup_hierarchy_write, |
2059 | .read_u64 = mem_cgroup_hierarchy_read, | ||
1021 | }, | 2060 | }, |
2061 | { | ||
2062 | .name = "swappiness", | ||
2063 | .read_u64 = mem_cgroup_swappiness_read, | ||
2064 | .write_u64 = mem_cgroup_swappiness_write, | ||
2065 | }, | ||
2066 | }; | ||
2067 | |||
2068 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2069 | static struct cftype memsw_cgroup_files[] = { | ||
2070 | { | ||
2071 | .name = "memsw.usage_in_bytes", | ||
2072 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
2073 | .read_u64 = mem_cgroup_read, | ||
2074 | }, | ||
2075 | { | ||
2076 | .name = "memsw.max_usage_in_bytes", | ||
2077 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
2078 | .trigger = mem_cgroup_reset, | ||
2079 | .read_u64 = mem_cgroup_read, | ||
2080 | }, | ||
2081 | { | ||
2082 | .name = "memsw.limit_in_bytes", | ||
2083 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
2084 | .write_string = mem_cgroup_write, | ||
2085 | .read_u64 = mem_cgroup_read, | ||
2086 | }, | ||
2087 | { | ||
2088 | .name = "memsw.failcnt", | ||
2089 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
2090 | .trigger = mem_cgroup_reset, | ||
2091 | .read_u64 = mem_cgroup_read, | ||
2092 | }, | ||
2093 | }; | ||
2094 | |||
2095 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
2096 | { | ||
2097 | if (!do_swap_account) | ||
2098 | return 0; | ||
2099 | return cgroup_add_files(cont, ss, memsw_cgroup_files, | ||
2100 | ARRAY_SIZE(memsw_cgroup_files)); | ||
1022 | }; | 2101 | }; |
2102 | #else | ||
2103 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
2104 | { | ||
2105 | return 0; | ||
2106 | } | ||
2107 | #endif | ||
1023 | 2108 | ||
1024 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 2109 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1025 | { | 2110 | { |
@@ -1046,7 +2131,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1046 | 2131 | ||
1047 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 2132 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1048 | mz = &pn->zoneinfo[zone]; | 2133 | mz = &pn->zoneinfo[zone]; |
1049 | spin_lock_init(&mz->lru_lock); | ||
1050 | for_each_lru(l) | 2134 | for_each_lru(l) |
1051 | INIT_LIST_HEAD(&mz->lists[l]); | 2135 | INIT_LIST_HEAD(&mz->lists[l]); |
1052 | } | 2136 | } |
@@ -1058,55 +2142,133 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1058 | kfree(mem->info.nodeinfo[node]); | 2142 | kfree(mem->info.nodeinfo[node]); |
1059 | } | 2143 | } |
1060 | 2144 | ||
2145 | static int mem_cgroup_size(void) | ||
2146 | { | ||
2147 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
2148 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
2149 | } | ||
2150 | |||
1061 | static struct mem_cgroup *mem_cgroup_alloc(void) | 2151 | static struct mem_cgroup *mem_cgroup_alloc(void) |
1062 | { | 2152 | { |
1063 | struct mem_cgroup *mem; | 2153 | struct mem_cgroup *mem; |
2154 | int size = mem_cgroup_size(); | ||
1064 | 2155 | ||
1065 | if (sizeof(*mem) < PAGE_SIZE) | 2156 | if (size < PAGE_SIZE) |
1066 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); | 2157 | mem = kmalloc(size, GFP_KERNEL); |
1067 | else | 2158 | else |
1068 | mem = vmalloc(sizeof(*mem)); | 2159 | mem = vmalloc(size); |
1069 | 2160 | ||
1070 | if (mem) | 2161 | if (mem) |
1071 | memset(mem, 0, sizeof(*mem)); | 2162 | memset(mem, 0, size); |
1072 | return mem; | 2163 | return mem; |
1073 | } | 2164 | } |
1074 | 2165 | ||
1075 | static void mem_cgroup_free(struct mem_cgroup *mem) | 2166 | /* |
2167 | * At destroying mem_cgroup, references from swap_cgroup can remain. | ||
2168 | * (scanning all at force_empty is too costly...) | ||
2169 | * | ||
2170 | * Instead of clearing all references at force_empty, we remember | ||
2171 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
2172 | * it goes down to 0. | ||
2173 | * | ||
2174 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
2175 | */ | ||
2176 | |||
2177 | static void __mem_cgroup_free(struct mem_cgroup *mem) | ||
1076 | { | 2178 | { |
1077 | if (sizeof(*mem) < PAGE_SIZE) | 2179 | int node; |
2180 | |||
2181 | for_each_node_state(node, N_POSSIBLE) | ||
2182 | free_mem_cgroup_per_zone_info(mem, node); | ||
2183 | |||
2184 | if (mem_cgroup_size() < PAGE_SIZE) | ||
1078 | kfree(mem); | 2185 | kfree(mem); |
1079 | else | 2186 | else |
1080 | vfree(mem); | 2187 | vfree(mem); |
1081 | } | 2188 | } |
1082 | 2189 | ||
2190 | static void mem_cgroup_get(struct mem_cgroup *mem) | ||
2191 | { | ||
2192 | atomic_inc(&mem->refcnt); | ||
2193 | } | ||
1083 | 2194 | ||
1084 | static struct cgroup_subsys_state * | 2195 | static void mem_cgroup_put(struct mem_cgroup *mem) |
2196 | { | ||
2197 | if (atomic_dec_and_test(&mem->refcnt)) { | ||
2198 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | ||
2199 | __mem_cgroup_free(mem); | ||
2200 | if (parent) | ||
2201 | mem_cgroup_put(parent); | ||
2202 | } | ||
2203 | } | ||
2204 | |||
2205 | /* | ||
2206 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | ||
2207 | */ | ||
2208 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) | ||
2209 | { | ||
2210 | if (!mem->res.parent) | ||
2211 | return NULL; | ||
2212 | return mem_cgroup_from_res_counter(mem->res.parent, res); | ||
2213 | } | ||
2214 | |||
2215 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2216 | static void __init enable_swap_cgroup(void) | ||
2217 | { | ||
2218 | if (!mem_cgroup_disabled() && really_do_swap_account) | ||
2219 | do_swap_account = 1; | ||
2220 | } | ||
2221 | #else | ||
2222 | static void __init enable_swap_cgroup(void) | ||
2223 | { | ||
2224 | } | ||
2225 | #endif | ||
2226 | |||
2227 | static struct cgroup_subsys_state * __ref | ||
1085 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 2228 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1086 | { | 2229 | { |
1087 | struct mem_cgroup *mem; | 2230 | struct mem_cgroup *mem, *parent; |
1088 | int node; | 2231 | int node; |
1089 | 2232 | ||
1090 | if (unlikely((cont->parent) == NULL)) { | 2233 | mem = mem_cgroup_alloc(); |
1091 | mem = &init_mem_cgroup; | 2234 | if (!mem) |
1092 | } else { | 2235 | return ERR_PTR(-ENOMEM); |
1093 | mem = mem_cgroup_alloc(); | ||
1094 | if (!mem) | ||
1095 | return ERR_PTR(-ENOMEM); | ||
1096 | } | ||
1097 | |||
1098 | res_counter_init(&mem->res); | ||
1099 | 2236 | ||
1100 | for_each_node_state(node, N_POSSIBLE) | 2237 | for_each_node_state(node, N_POSSIBLE) |
1101 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 2238 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
1102 | goto free_out; | 2239 | goto free_out; |
2240 | /* root ? */ | ||
2241 | if (cont->parent == NULL) { | ||
2242 | enable_swap_cgroup(); | ||
2243 | parent = NULL; | ||
2244 | } else { | ||
2245 | parent = mem_cgroup_from_cont(cont->parent); | ||
2246 | mem->use_hierarchy = parent->use_hierarchy; | ||
2247 | } | ||
1103 | 2248 | ||
2249 | if (parent && parent->use_hierarchy) { | ||
2250 | res_counter_init(&mem->res, &parent->res); | ||
2251 | res_counter_init(&mem->memsw, &parent->memsw); | ||
2252 | /* | ||
2253 | * We increment refcnt of the parent to ensure that we can | ||
2254 | * safely access it on res_counter_charge/uncharge. | ||
2255 | * This refcnt will be decremented when freeing this | ||
2256 | * mem_cgroup(see mem_cgroup_put). | ||
2257 | */ | ||
2258 | mem_cgroup_get(parent); | ||
2259 | } else { | ||
2260 | res_counter_init(&mem->res, NULL); | ||
2261 | res_counter_init(&mem->memsw, NULL); | ||
2262 | } | ||
2263 | mem->last_scanned_child = NULL; | ||
2264 | spin_lock_init(&mem->reclaim_param_lock); | ||
2265 | |||
2266 | if (parent) | ||
2267 | mem->swappiness = get_swappiness(parent); | ||
2268 | atomic_set(&mem->refcnt, 1); | ||
1104 | return &mem->css; | 2269 | return &mem->css; |
1105 | free_out: | 2270 | free_out: |
1106 | for_each_node_state(node, N_POSSIBLE) | 2271 | __mem_cgroup_free(mem); |
1107 | free_mem_cgroup_per_zone_info(mem, node); | ||
1108 | if (cont->parent != NULL) | ||
1109 | mem_cgroup_free(mem); | ||
1110 | return ERR_PTR(-ENOMEM); | 2272 | return ERR_PTR(-ENOMEM); |
1111 | } | 2273 | } |
1112 | 2274 | ||
@@ -1114,26 +2276,33 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | |||
1114 | struct cgroup *cont) | 2276 | struct cgroup *cont) |
1115 | { | 2277 | { |
1116 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2278 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1117 | mem_cgroup_force_empty(mem); | 2279 | mem_cgroup_force_empty(mem, false); |
1118 | } | 2280 | } |
1119 | 2281 | ||
1120 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 2282 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1121 | struct cgroup *cont) | 2283 | struct cgroup *cont) |
1122 | { | 2284 | { |
1123 | int node; | ||
1124 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2285 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2286 | struct mem_cgroup *last_scanned_child = mem->last_scanned_child; | ||
1125 | 2287 | ||
1126 | for_each_node_state(node, N_POSSIBLE) | 2288 | if (last_scanned_child) { |
1127 | free_mem_cgroup_per_zone_info(mem, node); | 2289 | VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); |
1128 | 2290 | mem_cgroup_put(last_scanned_child); | |
1129 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | 2291 | } |
2292 | mem_cgroup_put(mem); | ||
1130 | } | 2293 | } |
1131 | 2294 | ||
1132 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 2295 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1133 | struct cgroup *cont) | 2296 | struct cgroup *cont) |
1134 | { | 2297 | { |
1135 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 2298 | int ret; |
1136 | ARRAY_SIZE(mem_cgroup_files)); | 2299 | |
2300 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, | ||
2301 | ARRAY_SIZE(mem_cgroup_files)); | ||
2302 | |||
2303 | if (!ret) | ||
2304 | ret = register_memsw_files(cont, ss); | ||
2305 | return ret; | ||
1137 | } | 2306 | } |
1138 | 2307 | ||
1139 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 2308 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
@@ -1141,25 +2310,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1141 | struct cgroup *old_cont, | 2310 | struct cgroup *old_cont, |
1142 | struct task_struct *p) | 2311 | struct task_struct *p) |
1143 | { | 2312 | { |
1144 | struct mm_struct *mm; | 2313 | mutex_lock(&memcg_tasklist); |
1145 | struct mem_cgroup *mem, *old_mem; | ||
1146 | |||
1147 | mm = get_task_mm(p); | ||
1148 | if (mm == NULL) | ||
1149 | return; | ||
1150 | |||
1151 | mem = mem_cgroup_from_cont(cont); | ||
1152 | old_mem = mem_cgroup_from_cont(old_cont); | ||
1153 | |||
1154 | /* | 2314 | /* |
1155 | * Only thread group leaders are allowed to migrate, the mm_struct is | 2315 | * FIXME: It's better to move charges of this process from old |
1156 | * in effect owned by the leader | 2316 | * memcg to new memcg. But it's just on TODO-List now. |
1157 | */ | 2317 | */ |
1158 | if (!thread_group_leader(p)) | 2318 | mutex_unlock(&memcg_tasklist); |
1159 | goto out; | ||
1160 | |||
1161 | out: | ||
1162 | mmput(mm); | ||
1163 | } | 2319 | } |
1164 | 2320 | ||
1165 | struct cgroup_subsys mem_cgroup_subsys = { | 2321 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -1172,3 +2328,13 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
1172 | .attach = mem_cgroup_move_task, | 2328 | .attach = mem_cgroup_move_task, |
1173 | .early_init = 0, | 2329 | .early_init = 0, |
1174 | }; | 2330 | }; |
2331 | |||
2332 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2333 | |||
2334 | static int __init disable_swap_account(char *s) | ||
2335 | { | ||
2336 | really_do_swap_account = 0; | ||
2337 | return 1; | ||
2338 | } | ||
2339 | __setup("noswapaccount", disable_swap_account); | ||
2340 | #endif | ||
diff --git a/mm/memory.c b/mm/memory.c index 0a2010a9518c..baa999e87cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -52,6 +52,9 @@ | |||
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | 54 | #include <linux/mmu_notifier.h> |
55 | #include <linux/kallsyms.h> | ||
56 | #include <linux/swapops.h> | ||
57 | #include <linux/elf.h> | ||
55 | 58 | ||
56 | #include <asm/pgalloc.h> | 59 | #include <asm/pgalloc.h> |
57 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
@@ -59,9 +62,6 @@ | |||
59 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
60 | #include <asm/pgtable.h> | 63 | #include <asm/pgtable.h> |
61 | 64 | ||
62 | #include <linux/swapops.h> | ||
63 | #include <linux/elf.h> | ||
64 | |||
65 | #include "internal.h" | 65 | #include "internal.h" |
66 | 66 | ||
67 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 67 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
375 | * | 375 | * |
376 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
377 | */ | 377 | */ |
378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, | 378 | static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, |
379 | unsigned long vaddr) | 379 | pte_t pte, struct page *page) |
380 | { | 380 | { |
381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); |
382 | "vm_flags = %lx, vaddr = %lx\n", | 382 | pud_t *pud = pud_offset(pgd, addr); |
383 | (long long)pte_val(pte), | 383 | pmd_t *pmd = pmd_offset(pud, addr); |
384 | (vma->vm_mm == current->mm ? current->comm : "???"), | 384 | struct address_space *mapping; |
385 | vma->vm_flags, vaddr); | 385 | pgoff_t index; |
386 | static unsigned long resume; | ||
387 | static unsigned long nr_shown; | ||
388 | static unsigned long nr_unshown; | ||
389 | |||
390 | /* | ||
391 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
392 | * or allow a steady drip of one report per second. | ||
393 | */ | ||
394 | if (nr_shown == 60) { | ||
395 | if (time_before(jiffies, resume)) { | ||
396 | nr_unshown++; | ||
397 | return; | ||
398 | } | ||
399 | if (nr_unshown) { | ||
400 | printk(KERN_ALERT | ||
401 | "BUG: Bad page map: %lu messages suppressed\n", | ||
402 | nr_unshown); | ||
403 | nr_unshown = 0; | ||
404 | } | ||
405 | nr_shown = 0; | ||
406 | } | ||
407 | if (nr_shown++ == 0) | ||
408 | resume = jiffies + 60 * HZ; | ||
409 | |||
410 | mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; | ||
411 | index = linear_page_index(vma, addr); | ||
412 | |||
413 | printk(KERN_ALERT | ||
414 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | ||
415 | current->comm, | ||
416 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | ||
417 | if (page) { | ||
418 | printk(KERN_ALERT | ||
419 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
420 | page, (void *)page->flags, page_count(page), | ||
421 | page_mapcount(page), page->mapping, page->index); | ||
422 | } | ||
423 | printk(KERN_ALERT | ||
424 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | ||
425 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | ||
426 | /* | ||
427 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | ||
428 | */ | ||
429 | if (vma->vm_ops) | ||
430 | print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", | ||
431 | (unsigned long)vma->vm_ops->fault); | ||
432 | if (vma->vm_file && vma->vm_file->f_op) | ||
433 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", | ||
434 | (unsigned long)vma->vm_file->f_op->mmap); | ||
386 | dump_stack(); | 435 | dump_stack(); |
436 | add_taint(TAINT_BAD_PAGE); | ||
387 | } | 437 | } |
388 | 438 | ||
389 | static inline int is_cow_mapping(unsigned int flags) | 439 | static inline int is_cow_mapping(unsigned int flags) |
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags) | |||
441 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 491 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
442 | pte_t pte) | 492 | pte_t pte) |
443 | { | 493 | { |
444 | unsigned long pfn; | 494 | unsigned long pfn = pte_pfn(pte); |
445 | 495 | ||
446 | if (HAVE_PTE_SPECIAL) { | 496 | if (HAVE_PTE_SPECIAL) { |
447 | if (likely(!pte_special(pte))) { | 497 | if (likely(!pte_special(pte))) |
448 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 498 | goto check_pfn; |
449 | return pte_page(pte); | 499 | if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) |
450 | } | 500 | print_bad_pte(vma, addr, pte, NULL); |
451 | VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); | ||
452 | return NULL; | 501 | return NULL; |
453 | } | 502 | } |
454 | 503 | ||
455 | /* !HAVE_PTE_SPECIAL case follows: */ | 504 | /* !HAVE_PTE_SPECIAL case follows: */ |
456 | 505 | ||
457 | pfn = pte_pfn(pte); | ||
458 | |||
459 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { | 506 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { |
460 | if (vma->vm_flags & VM_MIXEDMAP) { | 507 | if (vma->vm_flags & VM_MIXEDMAP) { |
461 | if (!pfn_valid(pfn)) | 508 | if (!pfn_valid(pfn)) |
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
471 | } | 518 | } |
472 | } | 519 | } |
473 | 520 | ||
474 | VM_BUG_ON(!pfn_valid(pfn)); | 521 | check_pfn: |
522 | if (unlikely(pfn > highest_memmap_pfn)) { | ||
523 | print_bad_pte(vma, addr, pte, NULL); | ||
524 | return NULL; | ||
525 | } | ||
475 | 526 | ||
476 | /* | 527 | /* |
477 | * NOTE! We still have PageReserved() pages in the page tables. | 528 | * NOTE! We still have PageReserved() pages in the page tables. |
478 | * | ||
479 | * eg. VDSO mappings can cause them to exist. | 529 | * eg. VDSO mappings can cause them to exist. |
480 | */ | 530 | */ |
481 | out: | 531 | out: |
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
767 | else { | 817 | else { |
768 | if (pte_dirty(ptent)) | 818 | if (pte_dirty(ptent)) |
769 | set_page_dirty(page); | 819 | set_page_dirty(page); |
770 | if (pte_young(ptent)) | 820 | if (pte_young(ptent) && |
771 | SetPageReferenced(page); | 821 | likely(!VM_SequentialReadHint(vma))) |
822 | mark_page_accessed(page); | ||
772 | file_rss--; | 823 | file_rss--; |
773 | } | 824 | } |
774 | page_remove_rmap(page, vma); | 825 | page_remove_rmap(page); |
826 | if (unlikely(page_mapcount(page) < 0)) | ||
827 | print_bad_pte(vma, addr, ptent, page); | ||
775 | tlb_remove_page(tlb, page); | 828 | tlb_remove_page(tlb, page); |
776 | continue; | 829 | continue; |
777 | } | 830 | } |
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
781 | */ | 834 | */ |
782 | if (unlikely(details)) | 835 | if (unlikely(details)) |
783 | continue; | 836 | continue; |
784 | if (!pte_file(ptent)) | 837 | if (pte_file(ptent)) { |
785 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 838 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
839 | print_bad_pte(vma, addr, ptent, NULL); | ||
840 | } else if | ||
841 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | ||
842 | print_bad_pte(vma, addr, ptent, NULL); | ||
786 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 843 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
787 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 844 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
788 | 845 | ||
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1153 | int write = !!(flags & GUP_FLAGS_WRITE); | 1210 | int write = !!(flags & GUP_FLAGS_WRITE); |
1154 | int force = !!(flags & GUP_FLAGS_FORCE); | 1211 | int force = !!(flags & GUP_FLAGS_FORCE); |
1155 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | 1212 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); |
1213 | int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); | ||
1156 | 1214 | ||
1157 | if (len <= 0) | 1215 | if (len <= 0) |
1158 | return 0; | 1216 | return 0; |
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1231 | struct page *page; | 1289 | struct page *page; |
1232 | 1290 | ||
1233 | /* | 1291 | /* |
1234 | * If tsk is ooming, cut off its access to large memory | 1292 | * If we have a pending SIGKILL, don't keep faulting |
1235 | * allocations. It has a pending SIGKILL, but it can't | 1293 | * pages and potentially allocating memory, unless |
1236 | * be processed until returning to user space. | 1294 | * current is handling munlock--e.g., on exit. In |
1295 | * that case, we are not allocating memory. Rather, | ||
1296 | * we're only unlocking already resident/mapped pages. | ||
1237 | */ | 1297 | */ |
1238 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) | 1298 | if (unlikely(!ignore_sigkill && |
1239 | return i ? i : -ENOMEM; | 1299 | fatal_signal_pending(current))) |
1300 | return i ? i : -ERESTARTSYS; | ||
1240 | 1301 | ||
1241 | if (write) | 1302 | if (write) |
1242 | foll_flags |= FOLL_WRITE; | 1303 | foll_flags |= FOLL_WRITE; |
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1263 | * do_wp_page has broken COW when necessary, | 1324 | * do_wp_page has broken COW when necessary, |
1264 | * even if maybe_mkwrite decided not to set | 1325 | * even if maybe_mkwrite decided not to set |
1265 | * pte_write. We can thus safely do subsequent | 1326 | * pte_write. We can thus safely do subsequent |
1266 | * page lookups as if they were reads. | 1327 | * page lookups as if they were reads. But only |
1328 | * do so when looping for pte_write is futile: | ||
1329 | * in some cases userspace may also be wanting | ||
1330 | * to write to the gotten user page, which a | ||
1331 | * read fault here might prevent (a readonly | ||
1332 | * page might get reCOWed by userspace write). | ||
1267 | */ | 1333 | */ |
1268 | if (ret & VM_FAULT_WRITE) | 1334 | if ((ret & VM_FAULT_WRITE) && |
1335 | !(vma->vm_flags & VM_WRITE)) | ||
1269 | foll_flags &= ~FOLL_WRITE; | 1336 | foll_flags &= ~FOLL_WRITE; |
1270 | 1337 | ||
1271 | cond_resched(); | 1338 | cond_resched(); |
@@ -1444,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1444 | unsigned long pfn) | 1511 | unsigned long pfn) |
1445 | { | 1512 | { |
1446 | int ret; | 1513 | int ret; |
1514 | pgprot_t pgprot = vma->vm_page_prot; | ||
1447 | /* | 1515 | /* |
1448 | * Technically, architectures with pte_special can avoid all these | 1516 | * Technically, architectures with pte_special can avoid all these |
1449 | * restrictions (same for remap_pfn_range). However we would like | 1517 | * restrictions (same for remap_pfn_range). However we would like |
@@ -1458,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1458 | 1526 | ||
1459 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1527 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1460 | return -EFAULT; | 1528 | return -EFAULT; |
1461 | if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) | 1529 | if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) |
1462 | return -EINVAL; | 1530 | return -EINVAL; |
1463 | 1531 | ||
1464 | ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); | 1532 | ret = insert_pfn(vma, addr, pfn, pgprot); |
1465 | 1533 | ||
1466 | if (ret) | 1534 | if (ret) |
1467 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); | 1535 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); |
@@ -1604,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1604 | 1672 | ||
1605 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1673 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; |
1606 | 1674 | ||
1607 | err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); | 1675 | err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); |
1608 | if (err) | 1676 | if (err) { |
1677 | /* | ||
1678 | * To indicate that track_pfn related cleanup is not | ||
1679 | * needed from higher level routine calling unmap_vmas | ||
1680 | */ | ||
1681 | vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); | ||
1609 | return -EINVAL; | 1682 | return -EINVAL; |
1683 | } | ||
1610 | 1684 | ||
1611 | BUG_ON(addr >= end); | 1685 | BUG_ON(addr >= end); |
1612 | pfn -= addr >> PAGE_SHIFT; | 1686 | pfn -= addr >> PAGE_SHIFT; |
@@ -1644,6 +1718,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1644 | 1718 | ||
1645 | BUG_ON(pmd_huge(*pmd)); | 1719 | BUG_ON(pmd_huge(*pmd)); |
1646 | 1720 | ||
1721 | arch_enter_lazy_mmu_mode(); | ||
1722 | |||
1647 | token = pmd_pgtable(*pmd); | 1723 | token = pmd_pgtable(*pmd); |
1648 | 1724 | ||
1649 | do { | 1725 | do { |
@@ -1652,6 +1728,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1652 | break; | 1728 | break; |
1653 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1729 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1654 | 1730 | ||
1731 | arch_leave_lazy_mmu_mode(); | ||
1732 | |||
1655 | if (mm != &init_mm) | 1733 | if (mm != &init_mm) |
1656 | pte_unmap_unlock(pte-1, ptl); | 1734 | pte_unmap_unlock(pte-1, ptl); |
1657 | return err; | 1735 | return err; |
@@ -1837,10 +1915,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1837 | * not dirty accountable. | 1915 | * not dirty accountable. |
1838 | */ | 1916 | */ |
1839 | if (PageAnon(old_page)) { | 1917 | if (PageAnon(old_page)) { |
1840 | if (trylock_page(old_page)) { | 1918 | if (!trylock_page(old_page)) { |
1841 | reuse = can_share_swap_page(old_page); | 1919 | page_cache_get(old_page); |
1842 | unlock_page(old_page); | 1920 | pte_unmap_unlock(page_table, ptl); |
1921 | lock_page(old_page); | ||
1922 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1923 | &ptl); | ||
1924 | if (!pte_same(*page_table, orig_pte)) { | ||
1925 | unlock_page(old_page); | ||
1926 | page_cache_release(old_page); | ||
1927 | goto unlock; | ||
1928 | } | ||
1929 | page_cache_release(old_page); | ||
1843 | } | 1930 | } |
1931 | reuse = reuse_swap_page(old_page); | ||
1932 | unlock_page(old_page); | ||
1844 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 1933 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
1845 | (VM_WRITE|VM_SHARED))) { | 1934 | (VM_WRITE|VM_SHARED))) { |
1846 | /* | 1935 | /* |
@@ -1910,7 +1999,7 @@ gotten: | |||
1910 | * Don't let another task, with possibly unlocked vma, | 1999 | * Don't let another task, with possibly unlocked vma, |
1911 | * keep the mlocked page. | 2000 | * keep the mlocked page. |
1912 | */ | 2001 | */ |
1913 | if (vma->vm_flags & VM_LOCKED) { | 2002 | if ((vma->vm_flags & VM_LOCKED) && old_page) { |
1914 | lock_page(old_page); /* for LRU manipulation */ | 2003 | lock_page(old_page); /* for LRU manipulation */ |
1915 | clear_page_mlock(old_page); | 2004 | clear_page_mlock(old_page); |
1916 | unlock_page(old_page); | 2005 | unlock_page(old_page); |
@@ -1918,7 +2007,7 @@ gotten: | |||
1918 | cow_user_page(new_page, old_page, address, vma); | 2007 | cow_user_page(new_page, old_page, address, vma); |
1919 | __SetPageUptodate(new_page); | 2008 | __SetPageUptodate(new_page); |
1920 | 2009 | ||
1921 | if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) | 2010 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
1922 | goto oom_free_new; | 2011 | goto oom_free_new; |
1923 | 2012 | ||
1924 | /* | 2013 | /* |
@@ -1943,11 +2032,7 @@ gotten: | |||
1943 | * thread doing COW. | 2032 | * thread doing COW. |
1944 | */ | 2033 | */ |
1945 | ptep_clear_flush_notify(vma, address, page_table); | 2034 | ptep_clear_flush_notify(vma, address, page_table); |
1946 | SetPageSwapBacked(new_page); | ||
1947 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
1948 | page_add_new_anon_rmap(new_page, vma, address); | 2035 | page_add_new_anon_rmap(new_page, vma, address); |
1949 | |||
1950 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
1951 | set_pte_at(mm, address, page_table, entry); | 2036 | set_pte_at(mm, address, page_table, entry); |
1952 | update_mmu_cache(vma, address, entry); | 2037 | update_mmu_cache(vma, address, entry); |
1953 | if (old_page) { | 2038 | if (old_page) { |
@@ -1973,7 +2058,7 @@ gotten: | |||
1973 | * mapcount is visible. So transitively, TLBs to | 2058 | * mapcount is visible. So transitively, TLBs to |
1974 | * old page will be flushed before it can be reused. | 2059 | * old page will be flushed before it can be reused. |
1975 | */ | 2060 | */ |
1976 | page_remove_rmap(old_page, vma); | 2061 | page_remove_rmap(old_page); |
1977 | } | 2062 | } |
1978 | 2063 | ||
1979 | /* Free the old page.. */ | 2064 | /* Free the old page.. */ |
@@ -2266,7 +2351,7 @@ int vmtruncate(struct inode * inode, loff_t offset) | |||
2266 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | 2351 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); |
2267 | } | 2352 | } |
2268 | 2353 | ||
2269 | if (inode->i_op && inode->i_op->truncate) | 2354 | if (inode->i_op->truncate) |
2270 | inode->i_op->truncate(inode); | 2355 | inode->i_op->truncate(inode); |
2271 | return 0; | 2356 | return 0; |
2272 | 2357 | ||
@@ -2286,7 +2371,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
2286 | * a way to truncate a range of blocks (punch a hole) - | 2371 | * a way to truncate a range of blocks (punch a hole) - |
2287 | * we should return failure right now. | 2372 | * we should return failure right now. |
2288 | */ | 2373 | */ |
2289 | if (!inode->i_op || !inode->i_op->truncate_range) | 2374 | if (!inode->i_op->truncate_range) |
2290 | return -ENOSYS; | 2375 | return -ENOSYS; |
2291 | 2376 | ||
2292 | mutex_lock(&inode->i_mutex); | 2377 | mutex_lock(&inode->i_mutex); |
@@ -2314,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2314 | struct page *page; | 2399 | struct page *page; |
2315 | swp_entry_t entry; | 2400 | swp_entry_t entry; |
2316 | pte_t pte; | 2401 | pte_t pte; |
2402 | struct mem_cgroup *ptr = NULL; | ||
2317 | int ret = 0; | 2403 | int ret = 0; |
2318 | 2404 | ||
2319 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2405 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
@@ -2352,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2352 | lock_page(page); | 2438 | lock_page(page); |
2353 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2439 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2354 | 2440 | ||
2355 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2441 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
2356 | ret = VM_FAULT_OOM; | 2442 | ret = VM_FAULT_OOM; |
2357 | unlock_page(page); | 2443 | unlock_page(page); |
2358 | goto out; | 2444 | goto out; |
@@ -2370,22 +2456,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2370 | goto out_nomap; | 2456 | goto out_nomap; |
2371 | } | 2457 | } |
2372 | 2458 | ||
2373 | /* The page isn't present yet, go ahead with the fault. */ | 2459 | /* |
2460 | * The page isn't present yet, go ahead with the fault. | ||
2461 | * | ||
2462 | * Be careful about the sequence of operations here. | ||
2463 | * To get its accounting right, reuse_swap_page() must be called | ||
2464 | * while the page is counted on swap but not yet in mapcount i.e. | ||
2465 | * before page_add_anon_rmap() and swap_free(); try_to_free_swap() | ||
2466 | * must be called after the swap_free(), or it will never succeed. | ||
2467 | * Because delete_from_swap_page() may be called by reuse_swap_page(), | ||
2468 | * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry | ||
2469 | * in page->private. In this case, a record in swap_cgroup is silently | ||
2470 | * discarded at swap_free(). | ||
2471 | */ | ||
2374 | 2472 | ||
2375 | inc_mm_counter(mm, anon_rss); | 2473 | inc_mm_counter(mm, anon_rss); |
2376 | pte = mk_pte(page, vma->vm_page_prot); | 2474 | pte = mk_pte(page, vma->vm_page_prot); |
2377 | if (write_access && can_share_swap_page(page)) { | 2475 | if (write_access && reuse_swap_page(page)) { |
2378 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2476 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2379 | write_access = 0; | 2477 | write_access = 0; |
2380 | } | 2478 | } |
2381 | |||
2382 | flush_icache_page(vma, page); | 2479 | flush_icache_page(vma, page); |
2383 | set_pte_at(mm, address, page_table, pte); | 2480 | set_pte_at(mm, address, page_table, pte); |
2384 | page_add_anon_rmap(page, vma, address); | 2481 | page_add_anon_rmap(page, vma, address); |
2482 | /* It's better to call commit-charge after rmap is established */ | ||
2483 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
2385 | 2484 | ||
2386 | swap_free(entry); | 2485 | swap_free(entry); |
2387 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 2486 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
2388 | remove_exclusive_swap_page(page); | 2487 | try_to_free_swap(page); |
2389 | unlock_page(page); | 2488 | unlock_page(page); |
2390 | 2489 | ||
2391 | if (write_access) { | 2490 | if (write_access) { |
@@ -2402,7 +2501,7 @@ unlock: | |||
2402 | out: | 2501 | out: |
2403 | return ret; | 2502 | return ret; |
2404 | out_nomap: | 2503 | out_nomap: |
2405 | mem_cgroup_uncharge_page(page); | 2504 | mem_cgroup_cancel_charge_swapin(ptr); |
2406 | pte_unmap_unlock(page_table, ptl); | 2505 | pte_unmap_unlock(page_table, ptl); |
2407 | unlock_page(page); | 2506 | unlock_page(page); |
2408 | page_cache_release(page); | 2507 | page_cache_release(page); |
@@ -2432,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2432 | goto oom; | 2531 | goto oom; |
2433 | __SetPageUptodate(page); | 2532 | __SetPageUptodate(page); |
2434 | 2533 | ||
2435 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) | 2534 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) |
2436 | goto oom_free_page; | 2535 | goto oom_free_page; |
2437 | 2536 | ||
2438 | entry = mk_pte(page, vma->vm_page_prot); | 2537 | entry = mk_pte(page, vma->vm_page_prot); |
@@ -2442,8 +2541,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2442 | if (!pte_none(*page_table)) | 2541 | if (!pte_none(*page_table)) |
2443 | goto release; | 2542 | goto release; |
2444 | inc_mm_counter(mm, anon_rss); | 2543 | inc_mm_counter(mm, anon_rss); |
2445 | SetPageSwapBacked(page); | ||
2446 | lru_cache_add_active_or_unevictable(page, vma); | ||
2447 | page_add_new_anon_rmap(page, vma, address); | 2544 | page_add_new_anon_rmap(page, vma, address); |
2448 | set_pte_at(mm, address, page_table, entry); | 2545 | set_pte_at(mm, address, page_table, entry); |
2449 | 2546 | ||
@@ -2525,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | ret = VM_FAULT_OOM; | 2622 | ret = VM_FAULT_OOM; |
2526 | goto out; | 2623 | goto out; |
2527 | } | 2624 | } |
2528 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2625 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { |
2529 | ret = VM_FAULT_OOM; | 2626 | ret = VM_FAULT_OOM; |
2530 | page_cache_release(page); | 2627 | page_cache_release(page); |
2531 | goto out; | 2628 | goto out; |
@@ -2591,8 +2688,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2591 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2688 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2592 | if (anon) { | 2689 | if (anon) { |
2593 | inc_mm_counter(mm, anon_rss); | 2690 | inc_mm_counter(mm, anon_rss); |
2594 | SetPageSwapBacked(page); | ||
2595 | lru_cache_add_active_or_unevictable(page, vma); | ||
2596 | page_add_new_anon_rmap(page, vma, address); | 2691 | page_add_new_anon_rmap(page, vma, address); |
2597 | } else { | 2692 | } else { |
2598 | inc_mm_counter(mm, file_rss); | 2693 | inc_mm_counter(mm, file_rss); |
@@ -2602,7 +2697,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2602 | get_page(dirty_page); | 2697 | get_page(dirty_page); |
2603 | } | 2698 | } |
2604 | } | 2699 | } |
2605 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
2606 | set_pte_at(mm, address, page_table, entry); | 2700 | set_pte_at(mm, address, page_table, entry); |
2607 | 2701 | ||
2608 | /* no need to invalidate: a not-present page won't be cached */ | 2702 | /* no need to invalidate: a not-present page won't be cached */ |
@@ -2666,12 +2760,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2666 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2760 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2667 | return 0; | 2761 | return 0; |
2668 | 2762 | ||
2669 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || | 2763 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
2670 | !(vma->vm_flags & VM_CAN_NONLINEAR))) { | ||
2671 | /* | 2764 | /* |
2672 | * Page table corrupted: show pte and kill process. | 2765 | * Page table corrupted: show pte and kill process. |
2673 | */ | 2766 | */ |
2674 | print_bad_pte(vma, orig_pte, address); | 2767 | print_bad_pte(vma, address, orig_pte, NULL); |
2675 | return VM_FAULT_OOM; | 2768 | return VM_FAULT_OOM; |
2676 | } | 2769 | } |
2677 | 2770 | ||
@@ -2953,7 +3046,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
2953 | { | 3046 | { |
2954 | resource_size_t phys_addr; | 3047 | resource_size_t phys_addr; |
2955 | unsigned long prot = 0; | 3048 | unsigned long prot = 0; |
2956 | void *maddr; | 3049 | void __iomem *maddr; |
2957 | int offset = addr & (PAGE_SIZE-1); | 3050 | int offset = addr & (PAGE_SIZE-1); |
2958 | 3051 | ||
2959 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3052 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
@@ -3079,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3079 | #ifdef CONFIG_PROVE_LOCKING | 3172 | #ifdef CONFIG_PROVE_LOCKING |
3080 | void might_fault(void) | 3173 | void might_fault(void) |
3081 | { | 3174 | { |
3175 | /* | ||
3176 | * Some code (nfs/sunrpc) uses socket ops on kernel memory while | ||
3177 | * holding the mmap_sem, this is safe because kernel memory doesn't | ||
3178 | * get paged out, therefore we'll never actually fault, and the | ||
3179 | * below annotations will generate false positives. | ||
3180 | */ | ||
3181 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
3182 | return; | ||
3183 | |||
3082 | might_sleep(); | 3184 | might_sleep(); |
3083 | /* | 3185 | /* |
3084 | * it would be nicer only to annotate paths which are not under | 3186 | * it would be nicer only to annotate paths which are not under |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b17371185468..c083cf5fd6df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
216 | return 0; | 216 | return 0; |
217 | } | 217 | } |
218 | 218 | ||
219 | static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) | 219 | static int __meminit __add_section(int nid, struct zone *zone, |
220 | unsigned long phys_start_pfn) | ||
220 | { | 221 | { |
221 | int nr_pages = PAGES_PER_SECTION; | 222 | int nr_pages = PAGES_PER_SECTION; |
222 | int ret; | 223 | int ret; |
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p | |||
234 | if (ret < 0) | 235 | if (ret < 0) |
235 | return ret; | 236 | return ret; |
236 | 237 | ||
237 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 238 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
238 | } | 239 | } |
239 | 240 | ||
240 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 241 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
273 | * call this function after deciding the zone to which to | 274 | * call this function after deciding the zone to which to |
274 | * add the new pages. | 275 | * add the new pages. |
275 | */ | 276 | */ |
276 | int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | 277 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, |
277 | unsigned long nr_pages) | 278 | unsigned long nr_pages) |
278 | { | 279 | { |
279 | unsigned long i; | 280 | unsigned long i; |
280 | int err = 0; | 281 | int err = 0; |
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
284 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 285 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
285 | 286 | ||
286 | for (i = start_sec; i <= end_sec; i++) { | 287 | for (i = start_sec; i <= end_sec; i++) { |
287 | err = __add_section(zone, i << PFN_SECTION_SHIFT); | 288 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); |
288 | 289 | ||
289 | /* | 290 | /* |
290 | * EEXIST is finally dealt with by ioresource collision | 291 | * EEXIST is finally dealt with by ioresource collision |
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end) | |||
626 | } | 627 | } |
627 | 628 | ||
628 | static struct page * | 629 | static struct page * |
629 | hotremove_migrate_alloc(struct page *page, | 630 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) |
630 | unsigned long private, | ||
631 | int **x) | ||
632 | { | 631 | { |
633 | /* This should be improoooooved!! */ | 632 | /* This should be improooooved!! */ |
634 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | 633 | return alloc_page(GFP_HIGHUSER_MOVABLE); |
635 | } | 634 | } |
636 | 635 | ||
637 | |||
638 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 636 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
639 | static int | 637 | static int |
640 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 638 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e412ffa8e52e..3eb4a6fdc043 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |||
1068 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | 1068 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; |
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | 1071 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, |
1072 | unsigned long mode, | 1072 | unsigned long, mode, unsigned long __user *, nmask, |
1073 | unsigned long __user *nmask, unsigned long maxnode, | 1073 | unsigned long, maxnode, unsigned, flags) |
1074 | unsigned flags) | ||
1075 | { | 1074 | { |
1076 | nodemask_t nodes; | 1075 | nodemask_t nodes; |
1077 | int err; | 1076 | int err; |
@@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
1091 | } | 1090 | } |
1092 | 1091 | ||
1093 | /* Set the process memory policy */ | 1092 | /* Set the process memory policy */ |
1094 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | 1093 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, |
1095 | unsigned long maxnode) | 1094 | unsigned long, maxnode) |
1096 | { | 1095 | { |
1097 | int err; | 1096 | int err; |
1098 | nodemask_t nodes; | 1097 | nodemask_t nodes; |
@@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
1110 | return do_set_mempolicy(mode, flags, &nodes); | 1109 | return do_set_mempolicy(mode, flags, &nodes); |
1111 | } | 1110 | } |
1112 | 1111 | ||
1113 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | 1112 | SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, |
1114 | const unsigned long __user *old_nodes, | 1113 | const unsigned long __user *, old_nodes, |
1115 | const unsigned long __user *new_nodes) | 1114 | const unsigned long __user *, new_nodes) |
1116 | { | 1115 | { |
1117 | const struct cred *cred = current_cred(), *tcred; | 1116 | const struct cred *cred = current_cred(), *tcred; |
1118 | struct mm_struct *mm; | 1117 | struct mm_struct *mm; |
@@ -1185,10 +1184,9 @@ out: | |||
1185 | 1184 | ||
1186 | 1185 | ||
1187 | /* Retrieve NUMA policy */ | 1186 | /* Retrieve NUMA policy */ |
1188 | asmlinkage long sys_get_mempolicy(int __user *policy, | 1187 | SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, |
1189 | unsigned long __user *nmask, | 1188 | unsigned long __user *, nmask, unsigned long, maxnode, |
1190 | unsigned long maxnode, | 1189 | unsigned long, addr, unsigned long, flags) |
1191 | unsigned long addr, unsigned long flags) | ||
1192 | { | 1190 | { |
1193 | int err; | 1191 | int err; |
1194 | int uninitialized_var(pval); | 1192 | int uninitialized_var(pval); |
diff --git a/mm/migrate.c b/mm/migrate.c index 21631ab8c08b..2bb4e1d63520 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
121 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 121 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
122 | goto out; | 122 | goto out; |
123 | 123 | ||
124 | /* | ||
125 | * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge. | ||
126 | * Failure is not an option here: we're now expected to remove every | ||
127 | * migration pte, and will cause crashes otherwise. Normally this | ||
128 | * is not an issue: mem_cgroup_prepare_migration bumped up the old | ||
129 | * page_cgroup count for safety, that's now attached to the new page, | ||
130 | * so this charge should just be another incrementation of the count, | ||
131 | * to keep in balance with rmap.c's mem_cgroup_uncharging. But if | ||
132 | * there's been a force_empty, those reference counts may no longer | ||
133 | * be reliable, and this charge can actually fail: oh well, we don't | ||
134 | * make the situation any worse by proceeding as if it had succeeded. | ||
135 | */ | ||
136 | mem_cgroup_charge(new, mm, GFP_ATOMIC); | ||
137 | |||
138 | get_page(new); | 124 | get_page(new); |
139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 125 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
140 | if (is_write_migration_entry(entry)) | 126 | if (is_write_migration_entry(entry)) |
@@ -300,12 +286,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
300 | * Now we know that no one else is looking at the page. | 286 | * Now we know that no one else is looking at the page. |
301 | */ | 287 | */ |
302 | get_page(newpage); /* add cache reference */ | 288 | get_page(newpage); /* add cache reference */ |
303 | #ifdef CONFIG_SWAP | ||
304 | if (PageSwapCache(page)) { | 289 | if (PageSwapCache(page)) { |
305 | SetPageSwapCache(newpage); | 290 | SetPageSwapCache(newpage); |
306 | set_page_private(newpage, page_private(page)); | 291 | set_page_private(newpage, page_private(page)); |
307 | } | 292 | } |
308 | #endif | ||
309 | 293 | ||
310 | radix_tree_replace_slot(pslot, newpage); | 294 | radix_tree_replace_slot(pslot, newpage); |
311 | 295 | ||
@@ -373,18 +357,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
373 | 357 | ||
374 | mlock_migrate_page(newpage, page); | 358 | mlock_migrate_page(newpage, page); |
375 | 359 | ||
376 | #ifdef CONFIG_SWAP | ||
377 | ClearPageSwapCache(page); | 360 | ClearPageSwapCache(page); |
378 | #endif | ||
379 | ClearPagePrivate(page); | 361 | ClearPagePrivate(page); |
380 | set_page_private(page, 0); | 362 | set_page_private(page, 0); |
381 | /* page->mapping contains a flag for PageAnon() */ | 363 | /* page->mapping contains a flag for PageAnon() */ |
382 | anon = PageAnon(page); | 364 | anon = PageAnon(page); |
383 | page->mapping = NULL; | 365 | page->mapping = NULL; |
384 | 366 | ||
385 | if (!anon) /* This page was removed from radix-tree. */ | ||
386 | mem_cgroup_uncharge_cache_page(page); | ||
387 | |||
388 | /* | 367 | /* |
389 | * If any waiters have accumulated on the new page then | 368 | * If any waiters have accumulated on the new page then |
390 | * wake them up. | 369 | * wake them up. |
@@ -618,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
618 | struct page *newpage = get_new_page(page, private, &result); | 597 | struct page *newpage = get_new_page(page, private, &result); |
619 | int rcu_locked = 0; | 598 | int rcu_locked = 0; |
620 | int charge = 0; | 599 | int charge = 0; |
600 | struct mem_cgroup *mem; | ||
621 | 601 | ||
622 | if (!newpage) | 602 | if (!newpage) |
623 | return -ENOMEM; | 603 | return -ENOMEM; |
@@ -627,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
627 | goto move_newpage; | 607 | goto move_newpage; |
628 | } | 608 | } |
629 | 609 | ||
630 | charge = mem_cgroup_prepare_migration(page, newpage); | ||
631 | if (charge == -ENOMEM) { | ||
632 | rc = -ENOMEM; | ||
633 | goto move_newpage; | ||
634 | } | ||
635 | /* prepare cgroup just returns 0 or -ENOMEM */ | 610 | /* prepare cgroup just returns 0 or -ENOMEM */ |
636 | BUG_ON(charge); | ||
637 | |||
638 | rc = -EAGAIN; | 611 | rc = -EAGAIN; |
612 | |||
639 | if (!trylock_page(page)) { | 613 | if (!trylock_page(page)) { |
640 | if (!force) | 614 | if (!force) |
641 | goto move_newpage; | 615 | goto move_newpage; |
642 | lock_page(page); | 616 | lock_page(page); |
643 | } | 617 | } |
644 | 618 | ||
619 | /* charge against new page */ | ||
620 | charge = mem_cgroup_prepare_migration(page, &mem); | ||
621 | if (charge == -ENOMEM) { | ||
622 | rc = -ENOMEM; | ||
623 | goto unlock; | ||
624 | } | ||
625 | BUG_ON(charge); | ||
626 | |||
645 | if (PageWriteback(page)) { | 627 | if (PageWriteback(page)) { |
646 | if (!force) | 628 | if (!force) |
647 | goto unlock; | 629 | goto uncharge; |
648 | wait_on_page_writeback(page); | 630 | wait_on_page_writeback(page); |
649 | } | 631 | } |
650 | /* | 632 | /* |
@@ -697,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
697 | rcu_unlock: | 679 | rcu_unlock: |
698 | if (rcu_locked) | 680 | if (rcu_locked) |
699 | rcu_read_unlock(); | 681 | rcu_read_unlock(); |
700 | 682 | uncharge: | |
683 | if (!charge) | ||
684 | mem_cgroup_end_migration(mem, page, newpage); | ||
701 | unlock: | 685 | unlock: |
702 | unlock_page(page); | 686 | unlock_page(page); |
703 | 687 | ||
@@ -713,8 +697,6 @@ unlock: | |||
713 | } | 697 | } |
714 | 698 | ||
715 | move_newpage: | 699 | move_newpage: |
716 | if (!charge) | ||
717 | mem_cgroup_end_migration(newpage); | ||
718 | 700 | ||
719 | /* | 701 | /* |
720 | * Move the new page to the LRU. If migration was not successful | 702 | * Move the new page to the LRU. If migration was not successful |
@@ -848,12 +830,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
848 | struct vm_area_struct *vma; | 830 | struct vm_area_struct *vma; |
849 | struct page *page; | 831 | struct page *page; |
850 | 832 | ||
851 | /* | ||
852 | * A valid page pointer that will not match any of the | ||
853 | * pages that will be moved. | ||
854 | */ | ||
855 | pp->page = ZERO_PAGE(0); | ||
856 | |||
857 | err = -EFAULT; | 833 | err = -EFAULT; |
858 | vma = find_vma(mm, pp->addr); | 834 | vma = find_vma(mm, pp->addr); |
859 | if (!vma || !vma_migratable(vma)) | 835 | if (!vma || !vma_migratable(vma)) |
@@ -919,41 +895,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
919 | const int __user *nodes, | 895 | const int __user *nodes, |
920 | int __user *status, int flags) | 896 | int __user *status, int flags) |
921 | { | 897 | { |
922 | struct page_to_node *pm = NULL; | 898 | struct page_to_node *pm; |
923 | nodemask_t task_nodes; | 899 | nodemask_t task_nodes; |
924 | int err = 0; | 900 | unsigned long chunk_nr_pages; |
925 | int i; | 901 | unsigned long chunk_start; |
902 | int err; | ||
926 | 903 | ||
927 | task_nodes = cpuset_mems_allowed(task); | 904 | task_nodes = cpuset_mems_allowed(task); |
928 | 905 | ||
929 | /* Limit nr_pages so that the multiplication may not overflow */ | 906 | err = -ENOMEM; |
930 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | 907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
931 | err = -E2BIG; | 908 | if (!pm) |
932 | goto out; | ||
933 | } | ||
934 | |||
935 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
936 | if (!pm) { | ||
937 | err = -ENOMEM; | ||
938 | goto out; | 909 | goto out; |
939 | } | ||
940 | |||
941 | /* | 910 | /* |
942 | * Get parameters from user space and initialize the pm | 911 | * Store a chunk of page_to_node array in a page, |
943 | * array. Return various errors if the user did something wrong. | 912 | * but keep the last one as a marker |
944 | */ | 913 | */ |
945 | for (i = 0; i < nr_pages; i++) { | 914 | chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; |
946 | const void __user *p; | ||
947 | 915 | ||
948 | err = -EFAULT; | 916 | for (chunk_start = 0; |
949 | if (get_user(p, pages + i)) | 917 | chunk_start < nr_pages; |
950 | goto out_pm; | 918 | chunk_start += chunk_nr_pages) { |
919 | int j; | ||
920 | |||
921 | if (chunk_start + chunk_nr_pages > nr_pages) | ||
922 | chunk_nr_pages = nr_pages - chunk_start; | ||
951 | 923 | ||
952 | pm[i].addr = (unsigned long)p; | 924 | /* fill the chunk pm with addrs and nodes from user-space */ |
953 | if (nodes) { | 925 | for (j = 0; j < chunk_nr_pages; j++) { |
926 | const void __user *p; | ||
954 | int node; | 927 | int node; |
955 | 928 | ||
956 | if (get_user(node, nodes + i)) | 929 | err = -EFAULT; |
930 | if (get_user(p, pages + j + chunk_start)) | ||
931 | goto out_pm; | ||
932 | pm[j].addr = (unsigned long) p; | ||
933 | |||
934 | if (get_user(node, nodes + j + chunk_start)) | ||
957 | goto out_pm; | 935 | goto out_pm; |
958 | 936 | ||
959 | err = -ENODEV; | 937 | err = -ENODEV; |
@@ -964,22 +942,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
964 | if (!node_isset(node, task_nodes)) | 942 | if (!node_isset(node, task_nodes)) |
965 | goto out_pm; | 943 | goto out_pm; |
966 | 944 | ||
967 | pm[i].node = node; | 945 | pm[j].node = node; |
968 | } else | 946 | } |
969 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | 947 | |
970 | } | 948 | /* End marker for this chunk */ |
971 | /* End marker */ | 949 | pm[chunk_nr_pages].node = MAX_NUMNODES; |
972 | pm[nr_pages].node = MAX_NUMNODES; | 950 | |
951 | /* Migrate this chunk */ | ||
952 | err = do_move_page_to_node_array(mm, pm, | ||
953 | flags & MPOL_MF_MOVE_ALL); | ||
954 | if (err < 0) | ||
955 | goto out_pm; | ||
973 | 956 | ||
974 | err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
975 | if (err >= 0) | ||
976 | /* Return status information */ | 957 | /* Return status information */ |
977 | for (i = 0; i < nr_pages; i++) | 958 | for (j = 0; j < chunk_nr_pages; j++) |
978 | if (put_user(pm[i].status, status + i)) | 959 | if (put_user(pm[j].status, status + j + chunk_start)) { |
979 | err = -EFAULT; | 960 | err = -EFAULT; |
961 | goto out_pm; | ||
962 | } | ||
963 | } | ||
964 | err = 0; | ||
980 | 965 | ||
981 | out_pm: | 966 | out_pm: |
982 | vfree(pm); | 967 | free_page((unsigned long)pm); |
983 | out: | 968 | out: |
984 | return err; | 969 | return err; |
985 | } | 970 | } |
@@ -1070,10 +1055,10 @@ out: | |||
1070 | * Move a list of pages in the address space of the currently executing | 1055 | * Move a list of pages in the address space of the currently executing |
1071 | * process. | 1056 | * process. |
1072 | */ | 1057 | */ |
1073 | asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | 1058 | SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, |
1074 | const void __user * __user *pages, | 1059 | const void __user * __user *, pages, |
1075 | const int __user *nodes, | 1060 | const int __user *, nodes, |
1076 | int __user *status, int flags) | 1061 | int __user *, status, int, flags) |
1077 | { | 1062 | { |
1078 | const struct cred *cred = current_cred(), *tcred; | 1063 | const struct cred *cred = current_cred(), *tcred; |
1079 | struct task_struct *task; | 1064 | struct task_struct *task; |
diff --git a/mm/mincore.c b/mm/mincore.c index 5178800bc129..8cb508f84ea4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -177,8 +177,8 @@ none_mapped: | |||
177 | * mapped | 177 | * mapped |
178 | * -EAGAIN - A kernel resource was temporarily unavailable. | 178 | * -EAGAIN - A kernel resource was temporarily unavailable. |
179 | */ | 179 | */ |
180 | asmlinkage long sys_mincore(unsigned long start, size_t len, | 180 | SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, |
181 | unsigned char __user * vec) | 181 | unsigned char __user *, vec) |
182 | { | 182 | { |
183 | long retval; | 183 | long retval; |
184 | unsigned long pages; | 184 | unsigned long pages; |
diff --git a/mm/mlock.c b/mm/mlock.c index 3035a56e7616..028ec482fdd4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
173 | (atomic_read(&mm->mm_users) != 0)); | 173 | (atomic_read(&mm->mm_users) != 0)); |
174 | 174 | ||
175 | /* | 175 | /* |
176 | * mlock: don't page populate if page has PROT_NONE permission. | 176 | * mlock: don't page populate if vma has PROT_NONE permission. |
177 | * munlock: the pages always do munlock althrough | 177 | * munlock: always do munlock although the vma has PROT_NONE |
178 | * its has PROT_NONE permission. | 178 | * permission, or SIGKILL is pending. |
179 | */ | 179 | */ |
180 | if (!mlock) | 180 | if (!mlock) |
181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; | 181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | |
182 | GUP_FLAGS_IGNORE_SIGKILL; | ||
182 | 183 | ||
183 | if (vma->vm_flags & VM_WRITE) | 184 | if (vma->vm_flags & VM_WRITE) |
184 | gup_flags |= GUP_FLAGS_WRITE; | 185 | gup_flags |= GUP_FLAGS_WRITE; |
@@ -293,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval) | |||
293 | * | 294 | * |
294 | * return number of pages [> 0] to be removed from locked_vm on success | 295 | * return number of pages [> 0] to be removed from locked_vm on success |
295 | * of "special" vmas. | 296 | * of "special" vmas. |
296 | * | ||
297 | * return negative error if vma spanning @start-@range disappears while | ||
298 | * mmap semaphore is dropped. Unlikely? | ||
299 | */ | 297 | */ |
300 | long mlock_vma_pages_range(struct vm_area_struct *vma, | 298 | long mlock_vma_pages_range(struct vm_area_struct *vma, |
301 | unsigned long start, unsigned long end) | 299 | unsigned long start, unsigned long end) |
302 | { | 300 | { |
303 | struct mm_struct *mm = vma->vm_mm; | ||
304 | int nr_pages = (end - start) / PAGE_SIZE; | 301 | int nr_pages = (end - start) / PAGE_SIZE; |
305 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | 302 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); |
306 | 303 | ||
@@ -313,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
313 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 310 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || |
314 | is_vm_hugetlb_page(vma) || | 311 | is_vm_hugetlb_page(vma) || |
315 | vma == get_gate_vma(current))) { | 312 | vma == get_gate_vma(current))) { |
316 | long error; | ||
317 | downgrade_write(&mm->mmap_sem); | ||
318 | |||
319 | error = __mlock_vma_pages_range(vma, start, end, 1); | ||
320 | 313 | ||
321 | up_read(&mm->mmap_sem); | 314 | return __mlock_vma_pages_range(vma, start, end, 1); |
322 | /* vma can change or disappear */ | ||
323 | down_write(&mm->mmap_sem); | ||
324 | vma = find_vma(mm, start); | ||
325 | /* non-NULL vma must contain @start, but need to check @end */ | ||
326 | if (!vma || end > vma->vm_end) | ||
327 | return -ENOMEM; | ||
328 | |||
329 | return 0; /* hide other errors from mmap(), et al */ | ||
330 | } | 315 | } |
331 | 316 | ||
332 | /* | 317 | /* |
@@ -437,41 +422,14 @@ success: | |||
437 | vma->vm_flags = newflags; | 422 | vma->vm_flags = newflags; |
438 | 423 | ||
439 | if (lock) { | 424 | if (lock) { |
440 | /* | ||
441 | * mmap_sem is currently held for write. Downgrade the write | ||
442 | * lock to a read lock so that other faults, mmap scans, ... | ||
443 | * while we fault in all pages. | ||
444 | */ | ||
445 | downgrade_write(&mm->mmap_sem); | ||
446 | |||
447 | ret = __mlock_vma_pages_range(vma, start, end, 1); | 425 | ret = __mlock_vma_pages_range(vma, start, end, 1); |
448 | 426 | ||
449 | /* | 427 | if (ret > 0) { |
450 | * Need to reacquire mmap sem in write mode, as our callers | ||
451 | * expect this. We have no support for atomically upgrading | ||
452 | * a sem to write, so we need to check for ranges while sem | ||
453 | * is unlocked. | ||
454 | */ | ||
455 | up_read(&mm->mmap_sem); | ||
456 | /* vma can change or disappear */ | ||
457 | down_write(&mm->mmap_sem); | ||
458 | *prev = find_vma(mm, start); | ||
459 | /* non-NULL *prev must contain @start, but need to check @end */ | ||
460 | if (!(*prev) || end > (*prev)->vm_end) | ||
461 | ret = -ENOMEM; | ||
462 | else if (ret > 0) { | ||
463 | mm->locked_vm -= ret; | 428 | mm->locked_vm -= ret; |
464 | ret = 0; | 429 | ret = 0; |
465 | } else | 430 | } else |
466 | ret = __mlock_posix_error_return(ret); /* translate if needed */ | 431 | ret = __mlock_posix_error_return(ret); /* translate if needed */ |
467 | } else { | 432 | } else { |
468 | /* | ||
469 | * TODO: for unlocking, pages will already be resident, so | ||
470 | * we don't need to wait for allocations/reclaim/pagein, ... | ||
471 | * However, unlocking a very large region can still take a | ||
472 | * while. Should we downgrade the semaphore for both lock | ||
473 | * AND unlock ? | ||
474 | */ | ||
475 | __mlock_vma_pages_range(vma, start, end, 0); | 433 | __mlock_vma_pages_range(vma, start, end, 0); |
476 | } | 434 | } |
477 | 435 | ||
@@ -529,7 +487,7 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
529 | return error; | 487 | return error; |
530 | } | 488 | } |
531 | 489 | ||
532 | asmlinkage long sys_mlock(unsigned long start, size_t len) | 490 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
533 | { | 491 | { |
534 | unsigned long locked; | 492 | unsigned long locked; |
535 | unsigned long lock_limit; | 493 | unsigned long lock_limit; |
@@ -557,7 +515,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) | |||
557 | return error; | 515 | return error; |
558 | } | 516 | } |
559 | 517 | ||
560 | asmlinkage long sys_munlock(unsigned long start, size_t len) | 518 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) |
561 | { | 519 | { |
562 | int ret; | 520 | int ret; |
563 | 521 | ||
@@ -594,7 +552,7 @@ out: | |||
594 | return 0; | 552 | return 0; |
595 | } | 553 | } |
596 | 554 | ||
597 | asmlinkage long sys_mlockall(int flags) | 555 | SYSCALL_DEFINE1(mlockall, int, flags) |
598 | { | 556 | { |
599 | unsigned long lock_limit; | 557 | unsigned long lock_limit; |
600 | int ret = -EINVAL; | 558 | int ret = -EINVAL; |
@@ -622,7 +580,7 @@ out: | |||
622 | return ret; | 580 | return ret; |
623 | } | 581 | } |
624 | 582 | ||
625 | asmlinkage long sys_munlockall(void) | 583 | SYSCALL_DEFINE0(munlockall) |
626 | { | 584 | { |
627 | int ret; | 585 | int ret; |
628 | 586 | ||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Written by obz. | 4 | * Written by obz. |
5 | * | 5 | * |
6 | * Address space accounting code <alan@redhat.com> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
@@ -246,7 +246,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
246 | return next; | 246 | return next; |
247 | } | 247 | } |
248 | 248 | ||
249 | asmlinkage unsigned long sys_brk(unsigned long brk) | 249 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
250 | { | 250 | { |
251 | unsigned long rlim, retval; | 251 | unsigned long rlim, retval; |
252 | unsigned long newbrk, oldbrk; | 252 | unsigned long newbrk, oldbrk; |
@@ -414,7 +414,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | |||
414 | 414 | ||
415 | static void __vma_link_file(struct vm_area_struct *vma) | 415 | static void __vma_link_file(struct vm_area_struct *vma) |
416 | { | 416 | { |
417 | struct file * file; | 417 | struct file *file; |
418 | 418 | ||
419 | file = vma->vm_file; | 419 | file = vma->vm_file; |
420 | if (file) { | 420 | if (file) { |
@@ -475,11 +475,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
475 | * insert vm structure into list and rbtree and anon_vma, | 475 | * insert vm structure into list and rbtree and anon_vma, |
476 | * but it has already been inserted into prio_tree earlier. | 476 | * but it has already been inserted into prio_tree earlier. |
477 | */ | 477 | */ |
478 | static void | 478 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
479 | __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | ||
480 | { | 479 | { |
481 | struct vm_area_struct * __vma, * prev; | 480 | struct vm_area_struct *__vma, *prev; |
482 | struct rb_node ** rb_link, * rb_parent; | 481 | struct rb_node **rb_link, *rb_parent; |
483 | 482 | ||
484 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 483 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); |
485 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 484 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); |
@@ -660,6 +659,9 @@ again: remove_next = 1 + (end > next->vm_end); | |||
660 | validate_mm(mm); | 659 | validate_mm(mm); |
661 | } | 660 | } |
662 | 661 | ||
662 | /* Flags that can be inherited from an existing mapping when merging */ | ||
663 | #define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) | ||
664 | |||
663 | /* | 665 | /* |
664 | * If the vma has a ->close operation then the driver probably needs to release | 666 | * If the vma has a ->close operation then the driver probably needs to release |
665 | * per-vma resources, so we don't attempt to merge those. | 667 | * per-vma resources, so we don't attempt to merge those. |
@@ -667,7 +669,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
667 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
668 | struct file *file, unsigned long vm_flags) | 670 | struct file *file, unsigned long vm_flags) |
669 | { | 671 | { |
670 | if (vma->vm_flags != vm_flags) | 672 | if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) |
671 | return 0; | 673 | return 0; |
672 | if (vma->vm_file != file) | 674 | if (vma->vm_file != file) |
673 | return 0; | 675 | return 0; |
@@ -909,7 +911,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
909 | * The caller must hold down_write(current->mm->mmap_sem). | 911 | * The caller must hold down_write(current->mm->mmap_sem). |
910 | */ | 912 | */ |
911 | 913 | ||
912 | unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | 914 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
913 | unsigned long len, unsigned long prot, | 915 | unsigned long len, unsigned long prot, |
914 | unsigned long flags, unsigned long pgoff) | 916 | unsigned long flags, unsigned long pgoff) |
915 | { | 917 | { |
@@ -1092,6 +1094,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1092 | mapping_cap_account_dirty(vma->vm_file->f_mapping); | 1094 | mapping_cap_account_dirty(vma->vm_file->f_mapping); |
1093 | } | 1095 | } |
1094 | 1096 | ||
1097 | /* | ||
1098 | * We account for memory if it's a private writeable mapping, | ||
1099 | * and VM_NORESERVE wasn't set. | ||
1100 | */ | ||
1101 | static inline int accountable_mapping(unsigned int vm_flags) | ||
1102 | { | ||
1103 | return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; | ||
1104 | } | ||
1105 | |||
1095 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1106 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1096 | unsigned long len, unsigned long flags, | 1107 | unsigned long len, unsigned long flags, |
1097 | unsigned int vm_flags, unsigned long pgoff, | 1108 | unsigned int vm_flags, unsigned long pgoff, |
@@ -1119,36 +1130,32 @@ munmap_back: | |||
1119 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) | 1130 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1120 | return -ENOMEM; | 1131 | return -ENOMEM; |
1121 | 1132 | ||
1122 | if (flags & MAP_NORESERVE) | 1133 | /* |
1134 | * Set 'VM_NORESERVE' if we should not account for the | ||
1135 | * memory use of this mapping. We only honor MAP_NORESERVE | ||
1136 | * if we're allowed to overcommit memory. | ||
1137 | */ | ||
1138 | if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
1139 | vm_flags |= VM_NORESERVE; | ||
1140 | if (!accountable) | ||
1123 | vm_flags |= VM_NORESERVE; | 1141 | vm_flags |= VM_NORESERVE; |
1124 | 1142 | ||
1125 | if (accountable && (!(flags & MAP_NORESERVE) || | 1143 | /* |
1126 | sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { | 1144 | * Private writable mapping: check memory availability |
1127 | if (vm_flags & VM_SHARED) { | 1145 | */ |
1128 | /* Check memory availability in shmem_file_setup? */ | 1146 | if (accountable_mapping(vm_flags)) { |
1129 | vm_flags |= VM_ACCOUNT; | 1147 | charged = len >> PAGE_SHIFT; |
1130 | } else if (vm_flags & VM_WRITE) { | 1148 | if (security_vm_enough_memory(charged)) |
1131 | /* | 1149 | return -ENOMEM; |
1132 | * Private writable mapping: check memory availability | 1150 | vm_flags |= VM_ACCOUNT; |
1133 | */ | ||
1134 | charged = len >> PAGE_SHIFT; | ||
1135 | if (security_vm_enough_memory(charged)) | ||
1136 | return -ENOMEM; | ||
1137 | vm_flags |= VM_ACCOUNT; | ||
1138 | } | ||
1139 | } | 1151 | } |
1140 | 1152 | ||
1141 | /* | 1153 | /* |
1142 | * Can we just expand an old private anonymous mapping? | 1154 | * Can we just expand an old mapping? |
1143 | * The VM_SHARED test is necessary because shmem_zero_setup | ||
1144 | * will create the file object for a shared anonymous map below. | ||
1145 | */ | 1155 | */ |
1146 | if (!file && !(vm_flags & VM_SHARED)) { | 1156 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); |
1147 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, | 1157 | if (vma) |
1148 | NULL, NULL, pgoff, NULL); | 1158 | goto out; |
1149 | if (vma) | ||
1150 | goto out; | ||
1151 | } | ||
1152 | 1159 | ||
1153 | /* | 1160 | /* |
1154 | * Determine the object being mapped and call the appropriate | 1161 | * Determine the object being mapped and call the appropriate |
@@ -1191,14 +1198,6 @@ munmap_back: | |||
1191 | goto free_vma; | 1198 | goto free_vma; |
1192 | } | 1199 | } |
1193 | 1200 | ||
1194 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | ||
1195 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | ||
1196 | * that memory reservation must be checked; but that reservation | ||
1197 | * belongs to shared memory object, not to vma: so now clear it. | ||
1198 | */ | ||
1199 | if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) | ||
1200 | vma->vm_flags &= ~VM_ACCOUNT; | ||
1201 | |||
1202 | /* Can addr have changed?? | 1201 | /* Can addr have changed?? |
1203 | * | 1202 | * |
1204 | * Answer: Yes, several device drivers can do it in their | 1203 | * Answer: Yes, several device drivers can do it in their |
@@ -1211,17 +1210,8 @@ munmap_back: | |||
1211 | if (vma_wants_writenotify(vma)) | 1210 | if (vma_wants_writenotify(vma)) |
1212 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1211 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1213 | 1212 | ||
1214 | if (file && vma_merge(mm, prev, addr, vma->vm_end, | 1213 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1215 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { | 1214 | file = vma->vm_file; |
1216 | mpol_put(vma_policy(vma)); | ||
1217 | kmem_cache_free(vm_area_cachep, vma); | ||
1218 | fput(file); | ||
1219 | if (vm_flags & VM_EXECUTABLE) | ||
1220 | removed_exe_file_vma(mm); | ||
1221 | } else { | ||
1222 | vma_link(mm, vma, prev, rb_link, rb_parent); | ||
1223 | file = vma->vm_file; | ||
1224 | } | ||
1225 | 1215 | ||
1226 | /* Once vma denies write, undo our temporary denial count */ | 1216 | /* Once vma denies write, undo our temporary denial count */ |
1227 | if (correct_wcount) | 1217 | if (correct_wcount) |
@@ -1468,7 +1458,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1468 | EXPORT_SYMBOL(get_unmapped_area); | 1458 | EXPORT_SYMBOL(get_unmapped_area); |
1469 | 1459 | ||
1470 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1460 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1471 | struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) | 1461 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1472 | { | 1462 | { |
1473 | struct vm_area_struct *vma = NULL; | 1463 | struct vm_area_struct *vma = NULL; |
1474 | 1464 | ||
@@ -1511,7 +1501,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, | |||
1511 | struct vm_area_struct **pprev) | 1501 | struct vm_area_struct **pprev) |
1512 | { | 1502 | { |
1513 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1503 | struct vm_area_struct *vma = NULL, *prev = NULL; |
1514 | struct rb_node * rb_node; | 1504 | struct rb_node *rb_node; |
1515 | if (!mm) | 1505 | if (!mm) |
1516 | goto out; | 1506 | goto out; |
1517 | 1507 | ||
@@ -1545,7 +1535,7 @@ out: | |||
1545 | * update accounting. This is shared with both the | 1535 | * update accounting. This is shared with both the |
1546 | * grow-up and grow-down cases. | 1536 | * grow-up and grow-down cases. |
1547 | */ | 1537 | */ |
1548 | static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) | 1538 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) |
1549 | { | 1539 | { |
1550 | struct mm_struct *mm = vma->vm_mm; | 1540 | struct mm_struct *mm = vma->vm_mm; |
1551 | struct rlimit *rlim = current->signal->rlim; | 1541 | struct rlimit *rlim = current->signal->rlim; |
@@ -1953,7 +1943,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1953 | 1943 | ||
1954 | EXPORT_SYMBOL(do_munmap); | 1944 | EXPORT_SYMBOL(do_munmap); |
1955 | 1945 | ||
1956 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | 1946 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) |
1957 | { | 1947 | { |
1958 | int ret; | 1948 | int ret; |
1959 | struct mm_struct *mm = current->mm; | 1949 | struct mm_struct *mm = current->mm; |
@@ -2095,6 +2085,9 @@ void exit_mmap(struct mm_struct *mm) | |||
2095 | arch_exit_mmap(mm); | 2085 | arch_exit_mmap(mm); |
2096 | mmu_notifier_release(mm); | 2086 | mmu_notifier_release(mm); |
2097 | 2087 | ||
2088 | if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ | ||
2089 | return; | ||
2090 | |||
2098 | if (mm->locked_vm) { | 2091 | if (mm->locked_vm) { |
2099 | vma = mm->mmap; | 2092 | vma = mm->mmap; |
2100 | while (vma) { | 2093 | while (vma) { |
@@ -2107,7 +2100,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2107 | lru_add_drain(); | 2100 | lru_add_drain(); |
2108 | flush_cache_mm(mm); | 2101 | flush_cache_mm(mm); |
2109 | tlb = tlb_gather_mmu(mm, 1); | 2102 | tlb = tlb_gather_mmu(mm, 1); |
2110 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | 2103 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2111 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2104 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2112 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2105 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2113 | vm_unacct_memory(nr_accounted); | 2106 | vm_unacct_memory(nr_accounted); |
@@ -2474,3 +2467,13 @@ void mm_drop_all_locks(struct mm_struct *mm) | |||
2474 | 2467 | ||
2475 | mutex_unlock(&mm_all_locks_mutex); | 2468 | mutex_unlock(&mm_all_locks_mutex); |
2476 | } | 2469 | } |
2470 | |||
2471 | /* | ||
2472 | * initialise the VMA slab | ||
2473 | */ | ||
2474 | void __init mmap_init(void) | ||
2475 | { | ||
2476 | vm_area_cachep = kmem_cache_create("vm_area_struct", | ||
2477 | sizeof(struct vm_area_struct), 0, | ||
2478 | SLAB_PANIC, NULL); | ||
2479 | } | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index fded06f923f4..abe2694e13f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * (C) Copyright 1994 Linus Torvalds | 4 | * (C) Copyright 1994 Linus Torvalds |
5 | * (C) Copyright 2002 Christoph Hellwig | 5 | * (C) Copyright 2002 Christoph Hellwig |
6 | * | 6 | * |
7 | * Address space accounting code <alan@redhat.com> | 7 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
8 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved | 8 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved |
9 | */ | 9 | */ |
10 | 10 | ||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
25 | #include <linux/migrate.h> | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include <asm/pgtable.h> | 27 | #include <asm/pgtable.h> |
27 | #include <asm/cacheflush.h> | 28 | #include <asm/cacheflush.h> |
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
59 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
60 | 61 | ||
61 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
62 | #ifdef CONFIG_MIGRATION | 63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { |
63 | } else if (!pte_file(oldpte)) { | ||
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 72 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 73 | swp_entry_to_pte(entry)); |
74 | } | 74 | } |
75 | #endif | ||
76 | } | 75 | } |
77 | |||
78 | } while (pte++, addr += PAGE_SIZE, addr != end); | 76 | } while (pte++, addr += PAGE_SIZE, addr != end); |
79 | arch_leave_lazy_mmu_mode(); | 77 | arch_leave_lazy_mmu_mode(); |
80 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
@@ -219,8 +217,8 @@ fail: | |||
219 | return error; | 217 | return error; |
220 | } | 218 | } |
221 | 219 | ||
222 | asmlinkage long | 220 | SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, |
223 | sys_mprotect(unsigned long start, size_t len, unsigned long prot) | 221 | unsigned long, prot) |
224 | { | 222 | { |
225 | unsigned long vm_flags, nstart, end, tmp, reqprot; | 223 | unsigned long vm_flags, nstart, end, tmp, reqprot; |
226 | struct vm_area_struct *vma, *prev; | 224 | struct vm_area_struct *vma, *prev; |
diff --git a/mm/mremap.c b/mm/mremap.c index 58a2908f42f5..a39b7b91be46 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) Copyright 1996 Linus Torvalds | 4 | * (C) Copyright 1996 Linus Torvalds |
5 | * | 5 | * |
6 | * Address space accounting code <alan@redhat.com> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved | 7 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved |
8 | */ | 8 | */ |
9 | 9 | ||
@@ -420,9 +420,9 @@ out_nc: | |||
420 | return ret; | 420 | return ret; |
421 | } | 421 | } |
422 | 422 | ||
423 | asmlinkage unsigned long sys_mremap(unsigned long addr, | 423 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, |
424 | unsigned long old_len, unsigned long new_len, | 424 | unsigned long, new_len, unsigned long, flags, |
425 | unsigned long flags, unsigned long new_addr) | 425 | unsigned long, new_addr) |
426 | { | 426 | { |
427 | unsigned long ret; | 427 | unsigned long ret; |
428 | 428 | ||
diff --git a/mm/msync.c b/mm/msync.c index 144a7570535d..4083209b7f02 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -28,7 +28,7 @@ | |||
28 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 28 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to |
29 | * applications. | 29 | * applications. |
30 | */ | 30 | */ |
31 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 31 | SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) |
32 | { | 32 | { |
33 | unsigned long end; | 33 | unsigned long end; |
34 | struct mm_struct *mm = current->mm; | 34 | struct mm_struct *mm = current->mm; |
@@ -82,7 +82,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
82 | (vma->vm_flags & VM_SHARED)) { | 82 | (vma->vm_flags & VM_SHARED)) { |
83 | get_file(file); | 83 | get_file(file); |
84 | up_read(&mm->mmap_sem); | 84 | up_read(&mm->mmap_sem); |
85 | error = do_fsync(file, 0); | 85 | error = vfs_fsync(file, file->f_path.dentry, 0); |
86 | fput(file); | 86 | fput(file); |
87 | if (error || start >= end) | 87 | if (error || start >= end) |
88 | goto out; | 88 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index 7695dc850785..2fcf47d449b4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -6,11 +6,11 @@ | |||
6 | * | 6 | * |
7 | * See Documentation/nommu-mmap.txt | 7 | * See Documentation/nommu-mmap.txt |
8 | * | 8 | * |
9 | * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> | 9 | * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> |
10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
13 | * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -33,6 +33,28 @@ | |||
33 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
36 | #include "internal.h" | ||
37 | |||
38 | static inline __attribute__((format(printf, 1, 2))) | ||
39 | void no_printk(const char *fmt, ...) | ||
40 | { | ||
41 | } | ||
42 | |||
43 | #if 0 | ||
44 | #define kenter(FMT, ...) \ | ||
45 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
46 | #define kleave(FMT, ...) \ | ||
47 | printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
48 | #define kdebug(FMT, ...) \ | ||
49 | printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) | ||
50 | #else | ||
51 | #define kenter(FMT, ...) \ | ||
52 | no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
53 | #define kleave(FMT, ...) \ | ||
54 | no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
55 | #define kdebug(FMT, ...) \ | ||
56 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) | ||
57 | #endif | ||
36 | 58 | ||
37 | #include "internal.h" | 59 | #include "internal.h" |
38 | 60 | ||
@@ -40,19 +62,22 @@ void *high_memory; | |||
40 | struct page *mem_map; | 62 | struct page *mem_map; |
41 | unsigned long max_mapnr; | 63 | unsigned long max_mapnr; |
42 | unsigned long num_physpages; | 64 | unsigned long num_physpages; |
43 | unsigned long askedalloc, realalloc; | ||
44 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 65 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); |
45 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
46 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
47 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
69 | int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ | ||
48 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
49 | 71 | ||
72 | atomic_t mmap_pages_allocated; | ||
73 | |||
50 | EXPORT_SYMBOL(mem_map); | 74 | EXPORT_SYMBOL(mem_map); |
51 | EXPORT_SYMBOL(num_physpages); | 75 | EXPORT_SYMBOL(num_physpages); |
52 | 76 | ||
53 | /* list of shareable VMAs */ | 77 | /* list of mapped, potentially shareable regions */ |
54 | struct rb_root nommu_vma_tree = RB_ROOT; | 78 | static struct kmem_cache *vm_region_jar; |
55 | DECLARE_RWSEM(nommu_vma_sem); | 79 | struct rb_root nommu_region_tree = RB_ROOT; |
80 | DECLARE_RWSEM(nommu_region_sem); | ||
56 | 81 | ||
57 | struct vm_operations_struct generic_file_vm_ops = { | 82 | struct vm_operations_struct generic_file_vm_ops = { |
58 | }; | 83 | }; |
@@ -86,7 +111,7 @@ do_expand: | |||
86 | i_size_write(inode, offset); | 111 | i_size_write(inode, offset); |
87 | 112 | ||
88 | out_truncate: | 113 | out_truncate: |
89 | if (inode->i_op && inode->i_op->truncate) | 114 | if (inode->i_op->truncate) |
90 | inode->i_op->truncate(inode); | 115 | inode->i_op->truncate(inode); |
91 | return 0; | 116 | return 0; |
92 | out_sig: | 117 | out_sig: |
@@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp) | |||
124 | return ksize(objp); | 149 | return ksize(objp); |
125 | 150 | ||
126 | /* | 151 | /* |
152 | * If it's not a compound page, see if we have a matching VMA | ||
153 | * region. This test is intentionally done in reverse order, | ||
154 | * so if there's no VMA, we still fall through and hand back | ||
155 | * PAGE_SIZE for 0-order pages. | ||
156 | */ | ||
157 | if (!PageCompound(page)) { | ||
158 | struct vm_area_struct *vma; | ||
159 | |||
160 | vma = find_vma(current->mm, (unsigned long)objp); | ||
161 | if (vma) | ||
162 | return vma->vm_end - vma->vm_start; | ||
163 | } | ||
164 | |||
165 | /* | ||
127 | * The ksize() function is only guaranteed to work for pointers | 166 | * The ksize() function is only guaranteed to work for pointers |
128 | * returned by kmalloc(). So handle arbitrary pointers here. | 167 | * returned by kmalloc(). So handle arbitrary pointers here. |
129 | */ | 168 | */ |
@@ -355,6 +394,24 @@ void vunmap(const void *addr) | |||
355 | } | 394 | } |
356 | EXPORT_SYMBOL(vunmap); | 395 | EXPORT_SYMBOL(vunmap); |
357 | 396 | ||
397 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
398 | { | ||
399 | BUG(); | ||
400 | return NULL; | ||
401 | } | ||
402 | EXPORT_SYMBOL(vm_map_ram); | ||
403 | |||
404 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
405 | { | ||
406 | BUG(); | ||
407 | } | ||
408 | EXPORT_SYMBOL(vm_unmap_ram); | ||
409 | |||
410 | void vm_unmap_aliases(void) | ||
411 | { | ||
412 | } | ||
413 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
414 | |||
358 | /* | 415 | /* |
359 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | 416 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to |
360 | * have one. | 417 | * have one. |
@@ -377,7 +434,7 @@ EXPORT_SYMBOL(vm_insert_page); | |||
377 | * to a regular file. in this case, the unmapping will need | 434 | * to a regular file. in this case, the unmapping will need |
378 | * to invoke file system routines that need the global lock. | 435 | * to invoke file system routines that need the global lock. |
379 | */ | 436 | */ |
380 | asmlinkage unsigned long sys_brk(unsigned long brk) | 437 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
381 | { | 438 | { |
382 | struct mm_struct *mm = current->mm; | 439 | struct mm_struct *mm = current->mm; |
383 | 440 | ||
@@ -401,129 +458,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk) | |||
401 | return mm->brk = brk; | 458 | return mm->brk = brk; |
402 | } | 459 | } |
403 | 460 | ||
404 | #ifdef DEBUG | 461 | /* |
405 | static void show_process_blocks(void) | 462 | * initialise the VMA and region record slabs |
463 | */ | ||
464 | void __init mmap_init(void) | ||
406 | { | 465 | { |
407 | struct vm_list_struct *vml; | 466 | vm_region_jar = kmem_cache_create("vm_region_jar", |
408 | 467 | sizeof(struct vm_region), 0, | |
409 | printk("Process blocks %d:", current->pid); | 468 | SLAB_PANIC, NULL); |
410 | 469 | vm_area_cachep = kmem_cache_create("vm_area_struct", | |
411 | for (vml = &current->mm->context.vmlist; vml; vml = vml->next) { | 470 | sizeof(struct vm_area_struct), 0, |
412 | printk(" %p: %p", vml, vml->vma); | 471 | SLAB_PANIC, NULL); |
413 | if (vml->vma) | ||
414 | printk(" (%d @%lx #%d)", | ||
415 | kobjsize((void *) vml->vma->vm_start), | ||
416 | vml->vma->vm_start, | ||
417 | atomic_read(&vml->vma->vm_usage)); | ||
418 | printk(vml->next ? " ->" : ".\n"); | ||
419 | } | ||
420 | } | 472 | } |
421 | #endif /* DEBUG */ | ||
422 | 473 | ||
423 | /* | 474 | /* |
424 | * add a VMA into a process's mm_struct in the appropriate place in the list | 475 | * validate the region tree |
425 | * - should be called with mm->mmap_sem held writelocked | 476 | * - the caller must hold the region lock |
426 | */ | 477 | */ |
427 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | 478 | #ifdef CONFIG_DEBUG_NOMMU_REGIONS |
479 | static noinline void validate_nommu_regions(void) | ||
428 | { | 480 | { |
429 | struct vm_list_struct **ppv; | 481 | struct vm_region *region, *last; |
430 | 482 | struct rb_node *p, *lastp; | |
431 | for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | 484 | lastp = rb_first(&nommu_region_tree); |
432 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | 484 | lastp = rb_first(&nommu_region_tree); |
433 | break; | 485 | if (!lastp) |
434 | 486 | return; | |
435 | vml->next = *ppv; | 487 | |
436 | *ppv = vml; | 488 | last = rb_entry(lastp, struct vm_region, vm_rb); |
489 | if (unlikely(last->vm_end <= last->vm_start)) | ||
490 | BUG(); | ||
491 | if (unlikely(last->vm_top < last->vm_end)) | ||
492 | BUG(); | ||
493 | |||
494 | while ((p = rb_next(lastp))) { | ||
495 | region = rb_entry(p, struct vm_region, vm_rb); | ||
496 | last = rb_entry(lastp, struct vm_region, vm_rb); | ||
497 | |||
498 | if (unlikely(region->vm_end <= region->vm_start)) | ||
499 | BUG(); | ||
500 | if (unlikely(region->vm_top < region->vm_end)) | ||
501 | BUG(); | ||
502 | if (unlikely(region->vm_start < last->vm_top)) | ||
503 | BUG(); | ||
504 | |||
505 | lastp = p; | ||
506 | } | ||
437 | } | 507 | } |
508 | #else | ||
509 | #define validate_nommu_regions() do {} while(0) | ||
510 | #endif | ||
438 | 511 | ||
439 | /* | 512 | /* |
440 | * look up the first VMA in which addr resides, NULL if none | 513 | * add a region into the global tree |
441 | * - should be called with mm->mmap_sem at least held readlocked | ||
442 | */ | 514 | */ |
443 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 515 | static void add_nommu_region(struct vm_region *region) |
444 | { | 516 | { |
445 | struct vm_list_struct *loop, *vml; | 517 | struct vm_region *pregion; |
518 | struct rb_node **p, *parent; | ||
446 | 519 | ||
447 | /* search the vm_start ordered list */ | 520 | validate_nommu_regions(); |
448 | vml = NULL; | 521 | |
449 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | 522 | BUG_ON(region->vm_start & ~PAGE_MASK); |
450 | if (loop->vma->vm_start > addr) | 523 | |
451 | break; | 524 | parent = NULL; |
452 | vml = loop; | 525 | p = &nommu_region_tree.rb_node; |
526 | while (*p) { | ||
527 | parent = *p; | ||
528 | pregion = rb_entry(parent, struct vm_region, vm_rb); | ||
529 | if (region->vm_start < pregion->vm_start) | ||
530 | p = &(*p)->rb_left; | ||
531 | else if (region->vm_start > pregion->vm_start) | ||
532 | p = &(*p)->rb_right; | ||
533 | else if (pregion == region) | ||
534 | return; | ||
535 | else | ||
536 | BUG(); | ||
453 | } | 537 | } |
454 | 538 | ||
455 | if (vml && vml->vma->vm_end > addr) | 539 | rb_link_node(&region->vm_rb, parent, p); |
456 | return vml->vma; | 540 | rb_insert_color(&region->vm_rb, &nommu_region_tree); |
457 | 541 | ||
458 | return NULL; | 542 | validate_nommu_regions(); |
459 | } | 543 | } |
460 | EXPORT_SYMBOL(find_vma); | ||
461 | 544 | ||
462 | /* | 545 | /* |
463 | * find a VMA | 546 | * delete a region from the global tree |
464 | * - we don't extend stack VMAs under NOMMU conditions | ||
465 | */ | 547 | */ |
466 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 548 | static void delete_nommu_region(struct vm_region *region) |
467 | { | 549 | { |
468 | return find_vma(mm, addr); | 550 | BUG_ON(!nommu_region_tree.rb_node); |
469 | } | ||
470 | 551 | ||
471 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 552 | validate_nommu_regions(); |
472 | { | 553 | rb_erase(&region->vm_rb, &nommu_region_tree); |
473 | return -ENOMEM; | 554 | validate_nommu_regions(); |
474 | } | 555 | } |
475 | 556 | ||
476 | /* | 557 | /* |
477 | * look up the first VMA exactly that exactly matches addr | 558 | * free a contiguous series of pages |
478 | * - should be called with mm->mmap_sem at least held readlocked | ||
479 | */ | 559 | */ |
480 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | 560 | static void free_page_series(unsigned long from, unsigned long to) |
481 | unsigned long addr) | ||
482 | { | 561 | { |
483 | struct vm_list_struct *vml; | 562 | for (; from < to; from += PAGE_SIZE) { |
484 | 563 | struct page *page = virt_to_page(from); | |
485 | /* search the vm_start ordered list */ | 564 | |
486 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | 565 | kdebug("- free %lx", from); |
487 | if (vml->vma->vm_start == addr) | 566 | atomic_dec(&mmap_pages_allocated); |
488 | return vml->vma; | 567 | if (page_count(page) != 1) |
489 | if (vml->vma->vm_start > addr) | 568 | kdebug("free page %p [%d]", page, page_count(page)); |
490 | break; | 569 | put_page(page); |
491 | } | 570 | } |
492 | |||
493 | return NULL; | ||
494 | } | 571 | } |
495 | 572 | ||
496 | /* | 573 | /* |
497 | * find a VMA in the global tree | 574 | * release a reference to a region |
575 | * - the caller must hold the region semaphore, which this releases | ||
576 | * - the region may not have been added to the tree yet, in which case vm_top | ||
577 | * will equal vm_start | ||
498 | */ | 578 | */ |
499 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 579 | static void __put_nommu_region(struct vm_region *region) |
580 | __releases(nommu_region_sem) | ||
500 | { | 581 | { |
501 | struct vm_area_struct *vma; | 582 | kenter("%p{%d}", region, atomic_read(&region->vm_usage)); |
502 | struct rb_node *n = nommu_vma_tree.rb_node; | ||
503 | 583 | ||
504 | while (n) { | 584 | BUG_ON(!nommu_region_tree.rb_node); |
505 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
506 | 585 | ||
507 | if (start < vma->vm_start) | 586 | if (atomic_dec_and_test(&region->vm_usage)) { |
508 | n = n->rb_left; | 587 | if (region->vm_top > region->vm_start) |
509 | else if (start > vma->vm_start) | 588 | delete_nommu_region(region); |
510 | n = n->rb_right; | 589 | up_write(&nommu_region_sem); |
511 | else | 590 | |
512 | return vma; | 591 | if (region->vm_file) |
592 | fput(region->vm_file); | ||
593 | |||
594 | /* IO memory and memory shared directly out of the pagecache | ||
595 | * from ramfs/tmpfs mustn't be released here */ | ||
596 | if (region->vm_flags & VM_MAPPED_COPY) { | ||
597 | kdebug("free series"); | ||
598 | free_page_series(region->vm_start, region->vm_top); | ||
599 | } | ||
600 | kmem_cache_free(vm_region_jar, region); | ||
601 | } else { | ||
602 | up_write(&nommu_region_sem); | ||
513 | } | 603 | } |
604 | } | ||
514 | 605 | ||
515 | return NULL; | 606 | /* |
607 | * release a reference to a region | ||
608 | */ | ||
609 | static void put_nommu_region(struct vm_region *region) | ||
610 | { | ||
611 | down_write(&nommu_region_sem); | ||
612 | __put_nommu_region(region); | ||
516 | } | 613 | } |
517 | 614 | ||
518 | /* | 615 | /* |
519 | * add a VMA in the global tree | 616 | * add a VMA into a process's mm_struct in the appropriate place in the list |
617 | * and tree and add to the address space's page tree also if not an anonymous | ||
618 | * page | ||
619 | * - should be called with mm->mmap_sem held writelocked | ||
520 | */ | 620 | */ |
521 | static void add_nommu_vma(struct vm_area_struct *vma) | 621 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) |
522 | { | 622 | { |
523 | struct vm_area_struct *pvma; | 623 | struct vm_area_struct *pvma, **pp; |
524 | struct address_space *mapping; | 624 | struct address_space *mapping; |
525 | struct rb_node **p = &nommu_vma_tree.rb_node; | 625 | struct rb_node **p, *parent; |
526 | struct rb_node *parent = NULL; | 626 | |
627 | kenter(",%p", vma); | ||
628 | |||
629 | BUG_ON(!vma->vm_region); | ||
630 | |||
631 | mm->map_count++; | ||
632 | vma->vm_mm = mm; | ||
527 | 633 | ||
528 | /* add the VMA to the mapping */ | 634 | /* add the VMA to the mapping */ |
529 | if (vma->vm_file) { | 635 | if (vma->vm_file) { |
@@ -534,42 +640,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) | |||
534 | flush_dcache_mmap_unlock(mapping); | 640 | flush_dcache_mmap_unlock(mapping); |
535 | } | 641 | } |
536 | 642 | ||
537 | /* add the VMA to the master list */ | 643 | /* add the VMA to the tree */ |
644 | parent = NULL; | ||
645 | p = &mm->mm_rb.rb_node; | ||
538 | while (*p) { | 646 | while (*p) { |
539 | parent = *p; | 647 | parent = *p; |
540 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); | 648 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); |
541 | 649 | ||
542 | if (vma->vm_start < pvma->vm_start) { | 650 | /* sort by: start addr, end addr, VMA struct addr in that order |
651 | * (the latter is necessary as we may get identical VMAs) */ | ||
652 | if (vma->vm_start < pvma->vm_start) | ||
543 | p = &(*p)->rb_left; | 653 | p = &(*p)->rb_left; |
544 | } | 654 | else if (vma->vm_start > pvma->vm_start) |
545 | else if (vma->vm_start > pvma->vm_start) { | ||
546 | p = &(*p)->rb_right; | 655 | p = &(*p)->rb_right; |
547 | } | 656 | else if (vma->vm_end < pvma->vm_end) |
548 | else { | 657 | p = &(*p)->rb_left; |
549 | /* mappings are at the same address - this can only | 658 | else if (vma->vm_end > pvma->vm_end) |
550 | * happen for shared-mem chardevs and shared file | 659 | p = &(*p)->rb_right; |
551 | * mappings backed by ramfs/tmpfs */ | 660 | else if (vma < pvma) |
552 | BUG_ON(!(pvma->vm_flags & VM_SHARED)); | 661 | p = &(*p)->rb_left; |
553 | 662 | else if (vma > pvma) | |
554 | if (vma < pvma) | 663 | p = &(*p)->rb_right; |
555 | p = &(*p)->rb_left; | 664 | else |
556 | else if (vma > pvma) | 665 | BUG(); |
557 | p = &(*p)->rb_right; | ||
558 | else | ||
559 | BUG(); | ||
560 | } | ||
561 | } | 666 | } |
562 | 667 | ||
563 | rb_link_node(&vma->vm_rb, parent, p); | 668 | rb_link_node(&vma->vm_rb, parent, p); |
564 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 669 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
670 | |||
671 | /* add VMA to the VMA list also */ | ||
672 | for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { | ||
673 | if (pvma->vm_start > vma->vm_start) | ||
674 | break; | ||
675 | if (pvma->vm_start < vma->vm_start) | ||
676 | continue; | ||
677 | if (pvma->vm_end < vma->vm_end) | ||
678 | break; | ||
679 | } | ||
680 | |||
681 | vma->vm_next = *pp; | ||
682 | *pp = vma; | ||
565 | } | 683 | } |
566 | 684 | ||
567 | /* | 685 | /* |
568 | * delete a VMA from the global list | 686 | * delete a VMA from its owning mm_struct and address space |
569 | */ | 687 | */ |
570 | static void delete_nommu_vma(struct vm_area_struct *vma) | 688 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
571 | { | 689 | { |
690 | struct vm_area_struct **pp; | ||
572 | struct address_space *mapping; | 691 | struct address_space *mapping; |
692 | struct mm_struct *mm = vma->vm_mm; | ||
693 | |||
694 | kenter("%p", vma); | ||
695 | |||
696 | mm->map_count--; | ||
697 | if (mm->mmap_cache == vma) | ||
698 | mm->mmap_cache = NULL; | ||
573 | 699 | ||
574 | /* remove the VMA from the mapping */ | 700 | /* remove the VMA from the mapping */ |
575 | if (vma->vm_file) { | 701 | if (vma->vm_file) { |
@@ -580,8 +706,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) | |||
580 | flush_dcache_mmap_unlock(mapping); | 706 | flush_dcache_mmap_unlock(mapping); |
581 | } | 707 | } |
582 | 708 | ||
583 | /* remove from the master list */ | 709 | /* remove from the MM's tree and list */ |
584 | rb_erase(&vma->vm_rb, &nommu_vma_tree); | 710 | rb_erase(&vma->vm_rb, &mm->mm_rb); |
711 | for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { | ||
712 | if (*pp == vma) { | ||
713 | *pp = vma->vm_next; | ||
714 | break; | ||
715 | } | ||
716 | } | ||
717 | |||
718 | vma->vm_mm = NULL; | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * destroy a VMA record | ||
723 | */ | ||
724 | static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | ||
725 | { | ||
726 | kenter("%p", vma); | ||
727 | if (vma->vm_ops && vma->vm_ops->close) | ||
728 | vma->vm_ops->close(vma); | ||
729 | if (vma->vm_file) { | ||
730 | fput(vma->vm_file); | ||
731 | if (vma->vm_flags & VM_EXECUTABLE) | ||
732 | removed_exe_file_vma(mm); | ||
733 | } | ||
734 | put_nommu_region(vma->vm_region); | ||
735 | kmem_cache_free(vm_area_cachep, vma); | ||
736 | } | ||
737 | |||
738 | /* | ||
739 | * look up the first VMA in which addr resides, NULL if none | ||
740 | * - should be called with mm->mmap_sem at least held readlocked | ||
741 | */ | ||
742 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
743 | { | ||
744 | struct vm_area_struct *vma; | ||
745 | struct rb_node *n = mm->mm_rb.rb_node; | ||
746 | |||
747 | /* check the cache first */ | ||
748 | vma = mm->mmap_cache; | ||
749 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | ||
750 | return vma; | ||
751 | |||
752 | /* trawl the tree (there may be multiple mappings in which addr | ||
753 | * resides) */ | ||
754 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
755 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
756 | if (vma->vm_start > addr) | ||
757 | return NULL; | ||
758 | if (vma->vm_end > addr) { | ||
759 | mm->mmap_cache = vma; | ||
760 | return vma; | ||
761 | } | ||
762 | } | ||
763 | |||
764 | return NULL; | ||
765 | } | ||
766 | EXPORT_SYMBOL(find_vma); | ||
767 | |||
768 | /* | ||
769 | * find a VMA | ||
770 | * - we don't extend stack VMAs under NOMMU conditions | ||
771 | */ | ||
772 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
773 | { | ||
774 | return find_vma(mm, addr); | ||
775 | } | ||
776 | |||
777 | /* | ||
778 | * expand a stack to a given address | ||
779 | * - not supported under NOMMU conditions | ||
780 | */ | ||
781 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
782 | { | ||
783 | return -ENOMEM; | ||
784 | } | ||
785 | |||
786 | /* | ||
787 | * look up the first VMA exactly that exactly matches addr | ||
788 | * - should be called with mm->mmap_sem at least held readlocked | ||
789 | */ | ||
790 | static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
791 | unsigned long addr, | ||
792 | unsigned long len) | ||
793 | { | ||
794 | struct vm_area_struct *vma; | ||
795 | struct rb_node *n = mm->mm_rb.rb_node; | ||
796 | unsigned long end = addr + len; | ||
797 | |||
798 | /* check the cache first */ | ||
799 | vma = mm->mmap_cache; | ||
800 | if (vma && vma->vm_start == addr && vma->vm_end == end) | ||
801 | return vma; | ||
802 | |||
803 | /* trawl the tree (there may be multiple mappings in which addr | ||
804 | * resides) */ | ||
805 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
806 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
807 | if (vma->vm_start < addr) | ||
808 | continue; | ||
809 | if (vma->vm_start > addr) | ||
810 | return NULL; | ||
811 | if (vma->vm_end == end) { | ||
812 | mm->mmap_cache = vma; | ||
813 | return vma; | ||
814 | } | ||
815 | } | ||
816 | |||
817 | return NULL; | ||
585 | } | 818 | } |
586 | 819 | ||
587 | /* | 820 | /* |
@@ -596,7 +829,7 @@ static int validate_mmap_request(struct file *file, | |||
596 | unsigned long pgoff, | 829 | unsigned long pgoff, |
597 | unsigned long *_capabilities) | 830 | unsigned long *_capabilities) |
598 | { | 831 | { |
599 | unsigned long capabilities; | 832 | unsigned long capabilities, rlen; |
600 | unsigned long reqprot = prot; | 833 | unsigned long reqprot = prot; |
601 | int ret; | 834 | int ret; |
602 | 835 | ||
@@ -616,12 +849,12 @@ static int validate_mmap_request(struct file *file, | |||
616 | return -EINVAL; | 849 | return -EINVAL; |
617 | 850 | ||
618 | /* Careful about overflows.. */ | 851 | /* Careful about overflows.. */ |
619 | len = PAGE_ALIGN(len); | 852 | rlen = PAGE_ALIGN(len); |
620 | if (!len || len > TASK_SIZE) | 853 | if (!rlen || rlen > TASK_SIZE) |
621 | return -ENOMEM; | 854 | return -ENOMEM; |
622 | 855 | ||
623 | /* offset overflow? */ | 856 | /* offset overflow? */ |
624 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 857 | if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) |
625 | return -EOVERFLOW; | 858 | return -EOVERFLOW; |
626 | 859 | ||
627 | if (file) { | 860 | if (file) { |
@@ -795,13 +1028,18 @@ static unsigned long determine_vm_flags(struct file *file, | |||
795 | } | 1028 | } |
796 | 1029 | ||
797 | /* | 1030 | /* |
798 | * set up a shared mapping on a file | 1031 | * set up a shared mapping on a file (the driver or filesystem provides and |
1032 | * pins the storage) | ||
799 | */ | 1033 | */ |
800 | static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | 1034 | static int do_mmap_shared_file(struct vm_area_struct *vma) |
801 | { | 1035 | { |
802 | int ret; | 1036 | int ret; |
803 | 1037 | ||
804 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1038 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1039 | if (ret == 0) { | ||
1040 | vma->vm_region->vm_top = vma->vm_region->vm_end; | ||
1041 | return ret; | ||
1042 | } | ||
805 | if (ret != -ENOSYS) | 1043 | if (ret != -ENOSYS) |
806 | return ret; | 1044 | return ret; |
807 | 1045 | ||
@@ -815,10 +1053,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | |||
815 | /* | 1053 | /* |
816 | * set up a private mapping or an anonymous shared mapping | 1054 | * set up a private mapping or an anonymous shared mapping |
817 | */ | 1055 | */ |
818 | static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | 1056 | static int do_mmap_private(struct vm_area_struct *vma, |
1057 | struct vm_region *region, | ||
1058 | unsigned long len) | ||
819 | { | 1059 | { |
1060 | struct page *pages; | ||
1061 | unsigned long total, point, n, rlen; | ||
820 | void *base; | 1062 | void *base; |
821 | int ret; | 1063 | int ret, order; |
822 | 1064 | ||
823 | /* invoke the file's mapping function so that it can keep track of | 1065 | /* invoke the file's mapping function so that it can keep track of |
824 | * shared mappings on devices or memory | 1066 | * shared mappings on devices or memory |
@@ -826,34 +1068,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
826 | */ | 1068 | */ |
827 | if (vma->vm_file) { | 1069 | if (vma->vm_file) { |
828 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1070 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
829 | if (ret != -ENOSYS) { | 1071 | if (ret == 0) { |
830 | /* shouldn't return success if we're not sharing */ | 1072 | /* shouldn't return success if we're not sharing */ |
831 | BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); | 1073 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
832 | return ret; /* success or a real error */ | 1074 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1075 | return ret; | ||
833 | } | 1076 | } |
1077 | if (ret != -ENOSYS) | ||
1078 | return ret; | ||
834 | 1079 | ||
835 | /* getting an ENOSYS error indicates that direct mmap isn't | 1080 | /* getting an ENOSYS error indicates that direct mmap isn't |
836 | * possible (as opposed to tried but failed) so we'll try to | 1081 | * possible (as opposed to tried but failed) so we'll try to |
837 | * make a private copy of the data and map that instead */ | 1082 | * make a private copy of the data and map that instead */ |
838 | } | 1083 | } |
839 | 1084 | ||
1085 | rlen = PAGE_ALIGN(len); | ||
1086 | |||
840 | /* allocate some memory to hold the mapping | 1087 | /* allocate some memory to hold the mapping |
841 | * - note that this may not return a page-aligned address if the object | 1088 | * - note that this may not return a page-aligned address if the object |
842 | * we're allocating is smaller than a page | 1089 | * we're allocating is smaller than a page |
843 | */ | 1090 | */ |
844 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); | 1091 | order = get_order(rlen); |
845 | if (!base) | 1092 | kdebug("alloc order %d for %lx", order, len); |
1093 | |||
1094 | pages = alloc_pages(GFP_KERNEL, order); | ||
1095 | if (!pages) | ||
846 | goto enomem; | 1096 | goto enomem; |
847 | 1097 | ||
848 | vma->vm_start = (unsigned long) base; | 1098 | total = 1 << order; |
849 | vma->vm_end = vma->vm_start + len; | 1099 | atomic_add(total, &mmap_pages_allocated); |
850 | vma->vm_flags |= VM_MAPPED_COPY; | 1100 | |
1101 | point = rlen >> PAGE_SHIFT; | ||
1102 | |||
1103 | /* we allocated a power-of-2 sized page set, so we may want to trim off | ||
1104 | * the excess */ | ||
1105 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | ||
1106 | while (total > point) { | ||
1107 | order = ilog2(total - point); | ||
1108 | n = 1 << order; | ||
1109 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | ||
1110 | atomic_sub(n, &mmap_pages_allocated); | ||
1111 | total -= n; | ||
1112 | set_page_refcounted(pages + total); | ||
1113 | __free_pages(pages + total, order); | ||
1114 | } | ||
1115 | } | ||
851 | 1116 | ||
852 | #ifdef WARN_ON_SLACK | 1117 | for (point = 1; point < total; point++) |
853 | if (len + WARN_ON_SLACK <= kobjsize(result)) | 1118 | set_page_refcounted(&pages[point]); |
854 | printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", | 1119 | |
855 | len, current->pid, kobjsize(result) - len); | 1120 | base = page_address(pages); |
856 | #endif | 1121 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
1122 | region->vm_start = (unsigned long) base; | ||
1123 | region->vm_end = region->vm_start + rlen; | ||
1124 | region->vm_top = region->vm_start + (total << PAGE_SHIFT); | ||
1125 | |||
1126 | vma->vm_start = region->vm_start; | ||
1127 | vma->vm_end = region->vm_start + len; | ||
857 | 1128 | ||
858 | if (vma->vm_file) { | 1129 | if (vma->vm_file) { |
859 | /* read the contents of a file into the copy */ | 1130 | /* read the contents of a file into the copy */ |
@@ -865,31 +1136,33 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
865 | 1136 | ||
866 | old_fs = get_fs(); | 1137 | old_fs = get_fs(); |
867 | set_fs(KERNEL_DS); | 1138 | set_fs(KERNEL_DS); |
868 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); | 1139 | ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); |
869 | set_fs(old_fs); | 1140 | set_fs(old_fs); |
870 | 1141 | ||
871 | if (ret < 0) | 1142 | if (ret < 0) |
872 | goto error_free; | 1143 | goto error_free; |
873 | 1144 | ||
874 | /* clear the last little bit */ | 1145 | /* clear the last little bit */ |
875 | if (ret < len) | 1146 | if (ret < rlen) |
876 | memset(base + ret, 0, len - ret); | 1147 | memset(base + ret, 0, rlen - ret); |
877 | 1148 | ||
878 | } else { | 1149 | } else { |
879 | /* if it's an anonymous mapping, then just clear it */ | 1150 | /* if it's an anonymous mapping, then just clear it */ |
880 | memset(base, 0, len); | 1151 | memset(base, 0, rlen); |
881 | } | 1152 | } |
882 | 1153 | ||
883 | return 0; | 1154 | return 0; |
884 | 1155 | ||
885 | error_free: | 1156 | error_free: |
886 | kfree(base); | 1157 | free_page_series(region->vm_start, region->vm_end); |
887 | vma->vm_start = 0; | 1158 | region->vm_start = vma->vm_start = 0; |
1159 | region->vm_end = vma->vm_end = 0; | ||
1160 | region->vm_top = 0; | ||
888 | return ret; | 1161 | return ret; |
889 | 1162 | ||
890 | enomem: | 1163 | enomem: |
891 | printk("Allocation of length %lu from process %d failed\n", | 1164 | printk("Allocation of length %lu from process %d (%s) failed\n", |
892 | len, current->pid); | 1165 | len, current->pid, current->comm); |
893 | show_free_areas(); | 1166 | show_free_areas(); |
894 | return -ENOMEM; | 1167 | return -ENOMEM; |
895 | } | 1168 | } |
@@ -904,13 +1177,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
904 | unsigned long flags, | 1177 | unsigned long flags, |
905 | unsigned long pgoff) | 1178 | unsigned long pgoff) |
906 | { | 1179 | { |
907 | struct vm_list_struct *vml = NULL; | 1180 | struct vm_area_struct *vma; |
908 | struct vm_area_struct *vma = NULL; | 1181 | struct vm_region *region; |
909 | struct rb_node *rb; | 1182 | struct rb_node *rb; |
910 | unsigned long capabilities, vm_flags; | 1183 | unsigned long capabilities, vm_flags, result; |
911 | void *result; | ||
912 | int ret; | 1184 | int ret; |
913 | 1185 | ||
1186 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | ||
1187 | |||
914 | if (!(flags & MAP_FIXED)) | 1188 | if (!(flags & MAP_FIXED)) |
915 | addr = round_hint_to_min(addr); | 1189 | addr = round_hint_to_min(addr); |
916 | 1190 | ||
@@ -918,73 +1192,120 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
918 | * mapping */ | 1192 | * mapping */ |
919 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1193 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
920 | &capabilities); | 1194 | &capabilities); |
921 | if (ret < 0) | 1195 | if (ret < 0) { |
1196 | kleave(" = %d [val]", ret); | ||
922 | return ret; | 1197 | return ret; |
1198 | } | ||
923 | 1199 | ||
924 | /* we've determined that we can make the mapping, now translate what we | 1200 | /* we've determined that we can make the mapping, now translate what we |
925 | * now know into VMA flags */ | 1201 | * now know into VMA flags */ |
926 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1202 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
927 | 1203 | ||
928 | /* we're going to need to record the mapping if it works */ | 1204 | /* we're going to need to record the mapping */ |
929 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 1205 | region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); |
930 | if (!vml) | 1206 | if (!region) |
931 | goto error_getting_vml; | 1207 | goto error_getting_region; |
1208 | |||
1209 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | ||
1210 | if (!vma) | ||
1211 | goto error_getting_vma; | ||
1212 | |||
1213 | atomic_set(&region->vm_usage, 1); | ||
1214 | region->vm_flags = vm_flags; | ||
1215 | region->vm_pgoff = pgoff; | ||
1216 | |||
1217 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1218 | vma->vm_flags = vm_flags; | ||
1219 | vma->vm_pgoff = pgoff; | ||
1220 | |||
1221 | if (file) { | ||
1222 | region->vm_file = file; | ||
1223 | get_file(file); | ||
1224 | vma->vm_file = file; | ||
1225 | get_file(file); | ||
1226 | if (vm_flags & VM_EXECUTABLE) { | ||
1227 | added_exe_file_vma(current->mm); | ||
1228 | vma->vm_mm = current->mm; | ||
1229 | } | ||
1230 | } | ||
932 | 1231 | ||
933 | down_write(&nommu_vma_sem); | 1232 | down_write(&nommu_region_sem); |
934 | 1233 | ||
935 | /* if we want to share, we need to check for VMAs created by other | 1234 | /* if we want to share, we need to check for regions created by other |
936 | * mmap() calls that overlap with our proposed mapping | 1235 | * mmap() calls that overlap with our proposed mapping |
937 | * - we can only share with an exact match on most regular files | 1236 | * - we can only share with a superset match on most regular files |
938 | * - shared mappings on character devices and memory backed files are | 1237 | * - shared mappings on character devices and memory backed files are |
939 | * permitted to overlap inexactly as far as we are concerned for in | 1238 | * permitted to overlap inexactly as far as we are concerned for in |
940 | * these cases, sharing is handled in the driver or filesystem rather | 1239 | * these cases, sharing is handled in the driver or filesystem rather |
941 | * than here | 1240 | * than here |
942 | */ | 1241 | */ |
943 | if (vm_flags & VM_MAYSHARE) { | 1242 | if (vm_flags & VM_MAYSHARE) { |
944 | unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1243 | struct vm_region *pregion; |
945 | unsigned long vmpglen; | 1244 | unsigned long pglen, rpglen, pgend, rpgend, start; |
946 | 1245 | ||
947 | /* suppress VMA sharing for shared regions */ | 1246 | pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
948 | if (vm_flags & VM_SHARED && | 1247 | pgend = pgoff + pglen; |
949 | capabilities & BDI_CAP_MAP_DIRECT) | ||
950 | goto dont_share_VMAs; | ||
951 | 1248 | ||
952 | for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { | 1249 | for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { |
953 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | 1250 | pregion = rb_entry(rb, struct vm_region, vm_rb); |
954 | 1251 | ||
955 | if (!(vma->vm_flags & VM_MAYSHARE)) | 1252 | if (!(pregion->vm_flags & VM_MAYSHARE)) |
956 | continue; | 1253 | continue; |
957 | 1254 | ||
958 | /* search for overlapping mappings on the same file */ | 1255 | /* search for overlapping mappings on the same file */ |
959 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) | 1256 | if (pregion->vm_file->f_path.dentry->d_inode != |
1257 | file->f_path.dentry->d_inode) | ||
960 | continue; | 1258 | continue; |
961 | 1259 | ||
962 | if (vma->vm_pgoff >= pgoff + pglen) | 1260 | if (pregion->vm_pgoff >= pgend) |
963 | continue; | 1261 | continue; |
964 | 1262 | ||
965 | vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; | 1263 | rpglen = pregion->vm_end - pregion->vm_start; |
966 | vmpglen >>= PAGE_SHIFT; | 1264 | rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
967 | if (pgoff >= vma->vm_pgoff + vmpglen) | 1265 | rpgend = pregion->vm_pgoff + rpglen; |
1266 | if (pgoff >= rpgend) | ||
968 | continue; | 1267 | continue; |
969 | 1268 | ||
970 | /* handle inexactly overlapping matches between mappings */ | 1269 | /* handle inexactly overlapping matches between |
971 | if (vma->vm_pgoff != pgoff || vmpglen != pglen) { | 1270 | * mappings */ |
1271 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | ||
1272 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | ||
1273 | /* new mapping is not a subset of the region */ | ||
972 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1274 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
973 | goto sharing_violation; | 1275 | goto sharing_violation; |
974 | continue; | 1276 | continue; |
975 | } | 1277 | } |
976 | 1278 | ||
977 | /* we've found a VMA we can share */ | 1279 | /* we've found a region we can share */ |
978 | atomic_inc(&vma->vm_usage); | 1280 | atomic_inc(&pregion->vm_usage); |
979 | 1281 | vma->vm_region = pregion; | |
980 | vml->vma = vma; | 1282 | start = pregion->vm_start; |
981 | result = (void *) vma->vm_start; | 1283 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; |
982 | goto shared; | 1284 | vma->vm_start = start; |
1285 | vma->vm_end = start + len; | ||
1286 | |||
1287 | if (pregion->vm_flags & VM_MAPPED_COPY) { | ||
1288 | kdebug("share copy"); | ||
1289 | vma->vm_flags |= VM_MAPPED_COPY; | ||
1290 | } else { | ||
1291 | kdebug("share mmap"); | ||
1292 | ret = do_mmap_shared_file(vma); | ||
1293 | if (ret < 0) { | ||
1294 | vma->vm_region = NULL; | ||
1295 | vma->vm_start = 0; | ||
1296 | vma->vm_end = 0; | ||
1297 | atomic_dec(&pregion->vm_usage); | ||
1298 | pregion = NULL; | ||
1299 | goto error_just_free; | ||
1300 | } | ||
1301 | } | ||
1302 | fput(region->vm_file); | ||
1303 | kmem_cache_free(vm_region_jar, region); | ||
1304 | region = pregion; | ||
1305 | result = start; | ||
1306 | goto share; | ||
983 | } | 1307 | } |
984 | 1308 | ||
985 | dont_share_VMAs: | ||
986 | vma = NULL; | ||
987 | |||
988 | /* obtain the address at which to make a shared mapping | 1309 | /* obtain the address at which to make a shared mapping |
989 | * - this is the hook for quasi-memory character devices to | 1310 | * - this is the hook for quasi-memory character devices to |
990 | * tell us the location of a shared mapping | 1311 | * tell us the location of a shared mapping |
@@ -995,113 +1316,93 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
995 | if (IS_ERR((void *) addr)) { | 1316 | if (IS_ERR((void *) addr)) { |
996 | ret = addr; | 1317 | ret = addr; |
997 | if (ret != (unsigned long) -ENOSYS) | 1318 | if (ret != (unsigned long) -ENOSYS) |
998 | goto error; | 1319 | goto error_just_free; |
999 | 1320 | ||
1000 | /* the driver refused to tell us where to site | 1321 | /* the driver refused to tell us where to site |
1001 | * the mapping so we'll have to attempt to copy | 1322 | * the mapping so we'll have to attempt to copy |
1002 | * it */ | 1323 | * it */ |
1003 | ret = (unsigned long) -ENODEV; | 1324 | ret = (unsigned long) -ENODEV; |
1004 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1325 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
1005 | goto error; | 1326 | goto error_just_free; |
1006 | 1327 | ||
1007 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1328 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
1329 | } else { | ||
1330 | vma->vm_start = region->vm_start = addr; | ||
1331 | vma->vm_end = region->vm_end = addr + len; | ||
1008 | } | 1332 | } |
1009 | } | 1333 | } |
1010 | } | 1334 | } |
1011 | 1335 | ||
1012 | /* we're going to need a VMA struct as well */ | 1336 | vma->vm_region = region; |
1013 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | ||
1014 | if (!vma) | ||
1015 | goto error_getting_vma; | ||
1016 | |||
1017 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1018 | atomic_set(&vma->vm_usage, 1); | ||
1019 | if (file) { | ||
1020 | get_file(file); | ||
1021 | if (vm_flags & VM_EXECUTABLE) { | ||
1022 | added_exe_file_vma(current->mm); | ||
1023 | vma->vm_mm = current->mm; | ||
1024 | } | ||
1025 | } | ||
1026 | vma->vm_file = file; | ||
1027 | vma->vm_flags = vm_flags; | ||
1028 | vma->vm_start = addr; | ||
1029 | vma->vm_end = addr + len; | ||
1030 | vma->vm_pgoff = pgoff; | ||
1031 | |||
1032 | vml->vma = vma; | ||
1033 | 1337 | ||
1034 | /* set up the mapping */ | 1338 | /* set up the mapping */ |
1035 | if (file && vma->vm_flags & VM_SHARED) | 1339 | if (file && vma->vm_flags & VM_SHARED) |
1036 | ret = do_mmap_shared_file(vma, len); | 1340 | ret = do_mmap_shared_file(vma); |
1037 | else | 1341 | else |
1038 | ret = do_mmap_private(vma, len); | 1342 | ret = do_mmap_private(vma, region, len); |
1039 | if (ret < 0) | 1343 | if (ret < 0) |
1040 | goto error; | 1344 | goto error_put_region; |
1041 | |||
1042 | /* okay... we have a mapping; now we have to register it */ | ||
1043 | result = (void *) vma->vm_start; | ||
1044 | 1345 | ||
1045 | if (vma->vm_flags & VM_MAPPED_COPY) { | 1346 | add_nommu_region(region); |
1046 | realalloc += kobjsize(result); | ||
1047 | askedalloc += len; | ||
1048 | } | ||
1049 | 1347 | ||
1050 | realalloc += kobjsize(vma); | 1348 | /* okay... we have a mapping; now we have to register it */ |
1051 | askedalloc += sizeof(*vma); | 1349 | result = vma->vm_start; |
1052 | 1350 | ||
1053 | current->mm->total_vm += len >> PAGE_SHIFT; | 1351 | current->mm->total_vm += len >> PAGE_SHIFT; |
1054 | 1352 | ||
1055 | add_nommu_vma(vma); | 1353 | share: |
1056 | 1354 | add_vma_to_mm(current->mm, vma); | |
1057 | shared: | ||
1058 | realalloc += kobjsize(vml); | ||
1059 | askedalloc += sizeof(*vml); | ||
1060 | 1355 | ||
1061 | add_vma_to_mm(current->mm, vml); | 1356 | up_write(&nommu_region_sem); |
1062 | |||
1063 | up_write(&nommu_vma_sem); | ||
1064 | 1357 | ||
1065 | if (prot & PROT_EXEC) | 1358 | if (prot & PROT_EXEC) |
1066 | flush_icache_range((unsigned long) result, | 1359 | flush_icache_range(result, result + len); |
1067 | (unsigned long) result + len); | ||
1068 | 1360 | ||
1069 | #ifdef DEBUG | 1361 | kleave(" = %lx", result); |
1070 | printk("do_mmap:\n"); | 1362 | return result; |
1071 | show_process_blocks(); | ||
1072 | #endif | ||
1073 | 1363 | ||
1074 | return (unsigned long) result; | 1364 | error_put_region: |
1075 | 1365 | __put_nommu_region(region); | |
1076 | error: | ||
1077 | up_write(&nommu_vma_sem); | ||
1078 | kfree(vml); | ||
1079 | if (vma) { | 1366 | if (vma) { |
1080 | if (vma->vm_file) { | 1367 | if (vma->vm_file) { |
1081 | fput(vma->vm_file); | 1368 | fput(vma->vm_file); |
1082 | if (vma->vm_flags & VM_EXECUTABLE) | 1369 | if (vma->vm_flags & VM_EXECUTABLE) |
1083 | removed_exe_file_vma(vma->vm_mm); | 1370 | removed_exe_file_vma(vma->vm_mm); |
1084 | } | 1371 | } |
1085 | kfree(vma); | 1372 | kmem_cache_free(vm_area_cachep, vma); |
1086 | } | 1373 | } |
1374 | kleave(" = %d [pr]", ret); | ||
1087 | return ret; | 1375 | return ret; |
1088 | 1376 | ||
1089 | sharing_violation: | 1377 | error_just_free: |
1090 | up_write(&nommu_vma_sem); | 1378 | up_write(&nommu_region_sem); |
1091 | printk("Attempt to share mismatched mappings\n"); | 1379 | error: |
1092 | kfree(vml); | 1380 | fput(region->vm_file); |
1093 | return -EINVAL; | 1381 | kmem_cache_free(vm_region_jar, region); |
1382 | fput(vma->vm_file); | ||
1383 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1384 | removed_exe_file_vma(vma->vm_mm); | ||
1385 | kmem_cache_free(vm_area_cachep, vma); | ||
1386 | kleave(" = %d", ret); | ||
1387 | return ret; | ||
1094 | 1388 | ||
1095 | error_getting_vma: | 1389 | sharing_violation: |
1096 | up_write(&nommu_vma_sem); | 1390 | up_write(&nommu_region_sem); |
1097 | kfree(vml); | 1391 | printk(KERN_WARNING "Attempt to share mismatched mappings\n"); |
1098 | printk("Allocation of vma for %lu byte allocation from process %d failed\n", | 1392 | ret = -EINVAL; |
1393 | goto error; | ||
1394 | |||
1395 | error_getting_vma: | ||
1396 | kmem_cache_free(vm_region_jar, region); | ||
1397 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" | ||
1398 | " from process %d failed\n", | ||
1099 | len, current->pid); | 1399 | len, current->pid); |
1100 | show_free_areas(); | 1400 | show_free_areas(); |
1101 | return -ENOMEM; | 1401 | return -ENOMEM; |
1102 | 1402 | ||
1103 | error_getting_vml: | 1403 | error_getting_region: |
1104 | printk("Allocation of vml for %lu byte allocation from process %d failed\n", | 1404 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" |
1405 | " from process %d failed\n", | ||
1105 | len, current->pid); | 1406 | len, current->pid); |
1106 | show_free_areas(); | 1407 | show_free_areas(); |
1107 | return -ENOMEM; | 1408 | return -ENOMEM; |
@@ -1109,90 +1410,188 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1109 | EXPORT_SYMBOL(do_mmap_pgoff); | 1410 | EXPORT_SYMBOL(do_mmap_pgoff); |
1110 | 1411 | ||
1111 | /* | 1412 | /* |
1112 | * handle mapping disposal for uClinux | 1413 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1414 | * for the first part or the tail. | ||
1113 | */ | 1415 | */ |
1114 | static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) | 1416 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
1417 | unsigned long addr, int new_below) | ||
1115 | { | 1418 | { |
1116 | if (vma) { | 1419 | struct vm_area_struct *new; |
1117 | down_write(&nommu_vma_sem); | 1420 | struct vm_region *region; |
1421 | unsigned long npages; | ||
1118 | 1422 | ||
1119 | if (atomic_dec_and_test(&vma->vm_usage)) { | 1423 | kenter(""); |
1120 | delete_nommu_vma(vma); | ||
1121 | 1424 | ||
1122 | if (vma->vm_ops && vma->vm_ops->close) | 1425 | /* we're only permitted to split anonymous regions that have a single |
1123 | vma->vm_ops->close(vma); | 1426 | * owner */ |
1427 | if (vma->vm_file || | ||
1428 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
1429 | return -ENOMEM; | ||
1124 | 1430 | ||
1125 | /* IO memory and memory shared directly out of the pagecache from | 1431 | if (mm->map_count >= sysctl_max_map_count) |
1126 | * ramfs/tmpfs mustn't be released here */ | 1432 | return -ENOMEM; |
1127 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
1128 | realalloc -= kobjsize((void *) vma->vm_start); | ||
1129 | askedalloc -= vma->vm_end - vma->vm_start; | ||
1130 | kfree((void *) vma->vm_start); | ||
1131 | } | ||
1132 | 1433 | ||
1133 | realalloc -= kobjsize(vma); | 1434 | region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); |
1134 | askedalloc -= sizeof(*vma); | 1435 | if (!region) |
1436 | return -ENOMEM; | ||
1135 | 1437 | ||
1136 | if (vma->vm_file) { | 1438 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1137 | fput(vma->vm_file); | 1439 | if (!new) { |
1138 | if (vma->vm_flags & VM_EXECUTABLE) | 1440 | kmem_cache_free(vm_region_jar, region); |
1139 | removed_exe_file_vma(mm); | 1441 | return -ENOMEM; |
1140 | } | 1442 | } |
1141 | kfree(vma); | 1443 | |
1142 | } | 1444 | /* most fields are the same, copy all, and then fixup */ |
1445 | *new = *vma; | ||
1446 | *region = *vma->vm_region; | ||
1447 | new->vm_region = region; | ||
1448 | |||
1449 | npages = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
1143 | 1450 | ||
1144 | up_write(&nommu_vma_sem); | 1451 | if (new_below) { |
1452 | region->vm_top = region->vm_end = new->vm_end = addr; | ||
1453 | } else { | ||
1454 | region->vm_start = new->vm_start = addr; | ||
1455 | region->vm_pgoff = new->vm_pgoff += npages; | ||
1145 | } | 1456 | } |
1457 | |||
1458 | if (new->vm_ops && new->vm_ops->open) | ||
1459 | new->vm_ops->open(new); | ||
1460 | |||
1461 | delete_vma_from_mm(vma); | ||
1462 | down_write(&nommu_region_sem); | ||
1463 | delete_nommu_region(vma->vm_region); | ||
1464 | if (new_below) { | ||
1465 | vma->vm_region->vm_start = vma->vm_start = addr; | ||
1466 | vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; | ||
1467 | } else { | ||
1468 | vma->vm_region->vm_end = vma->vm_end = addr; | ||
1469 | vma->vm_region->vm_top = addr; | ||
1470 | } | ||
1471 | add_nommu_region(vma->vm_region); | ||
1472 | add_nommu_region(new->vm_region); | ||
1473 | up_write(&nommu_region_sem); | ||
1474 | add_vma_to_mm(mm, vma); | ||
1475 | add_vma_to_mm(mm, new); | ||
1476 | return 0; | ||
1146 | } | 1477 | } |
1147 | 1478 | ||
1148 | /* | 1479 | /* |
1149 | * release a mapping | 1480 | * shrink a VMA by removing the specified chunk from either the beginning or |
1150 | * - under NOMMU conditions the parameters must match exactly to the mapping to | 1481 | * the end |
1151 | * be removed | ||
1152 | */ | 1482 | */ |
1153 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1483 | static int shrink_vma(struct mm_struct *mm, |
1484 | struct vm_area_struct *vma, | ||
1485 | unsigned long from, unsigned long to) | ||
1154 | { | 1486 | { |
1155 | struct vm_list_struct *vml, **parent; | 1487 | struct vm_region *region; |
1156 | unsigned long end = addr + len; | ||
1157 | 1488 | ||
1158 | #ifdef DEBUG | 1489 | kenter(""); |
1159 | printk("do_munmap:\n"); | ||
1160 | #endif | ||
1161 | 1490 | ||
1162 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { | 1491 | /* adjust the VMA's pointers, which may reposition it in the MM's tree |
1163 | if ((*parent)->vma->vm_start > addr) | 1492 | * and list */ |
1164 | break; | 1493 | delete_vma_from_mm(vma); |
1165 | if ((*parent)->vma->vm_start == addr && | 1494 | if (from > vma->vm_start) |
1166 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1495 | vma->vm_end = from; |
1167 | goto found; | 1496 | else |
1497 | vma->vm_start = to; | ||
1498 | add_vma_to_mm(mm, vma); | ||
1499 | |||
1500 | /* cut the backing region down to size */ | ||
1501 | region = vma->vm_region; | ||
1502 | BUG_ON(atomic_read(&region->vm_usage) != 1); | ||
1503 | |||
1504 | down_write(&nommu_region_sem); | ||
1505 | delete_nommu_region(region); | ||
1506 | if (from > region->vm_start) { | ||
1507 | to = region->vm_top; | ||
1508 | region->vm_top = region->vm_end = from; | ||
1509 | } else { | ||
1510 | region->vm_start = to; | ||
1168 | } | 1511 | } |
1512 | add_nommu_region(region); | ||
1513 | up_write(&nommu_region_sem); | ||
1169 | 1514 | ||
1170 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1515 | free_page_series(from, to); |
1171 | current->pid, current->comm, (void *) addr); | 1516 | return 0; |
1172 | return -EINVAL; | 1517 | } |
1173 | 1518 | ||
1174 | found: | 1519 | /* |
1175 | vml = *parent; | 1520 | * release a mapping |
1521 | * - under NOMMU conditions the chunk to be unmapped must be backed by a single | ||
1522 | * VMA, though it need not cover the whole VMA | ||
1523 | */ | ||
1524 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | ||
1525 | { | ||
1526 | struct vm_area_struct *vma; | ||
1527 | struct rb_node *rb; | ||
1528 | unsigned long end = start + len; | ||
1529 | int ret; | ||
1176 | 1530 | ||
1177 | put_vma(mm, vml->vma); | 1531 | kenter(",%lx,%zx", start, len); |
1178 | 1532 | ||
1179 | *parent = vml->next; | 1533 | if (len == 0) |
1180 | realalloc -= kobjsize(vml); | 1534 | return -EINVAL; |
1181 | askedalloc -= sizeof(*vml); | ||
1182 | kfree(vml); | ||
1183 | 1535 | ||
1184 | update_hiwater_vm(mm); | 1536 | /* find the first potentially overlapping VMA */ |
1185 | mm->total_vm -= len >> PAGE_SHIFT; | 1537 | vma = find_vma(mm, start); |
1538 | if (!vma) { | ||
1539 | printk(KERN_WARNING | ||
1540 | "munmap of memory not mmapped by process %d (%s):" | ||
1541 | " 0x%lx-0x%lx\n", | ||
1542 | current->pid, current->comm, start, start + len - 1); | ||
1543 | return -EINVAL; | ||
1544 | } | ||
1186 | 1545 | ||
1187 | #ifdef DEBUG | 1546 | /* we're allowed to split an anonymous VMA but not a file-backed one */ |
1188 | show_process_blocks(); | 1547 | if (vma->vm_file) { |
1189 | #endif | 1548 | do { |
1549 | if (start > vma->vm_start) { | ||
1550 | kleave(" = -EINVAL [miss]"); | ||
1551 | return -EINVAL; | ||
1552 | } | ||
1553 | if (end == vma->vm_end) | ||
1554 | goto erase_whole_vma; | ||
1555 | rb = rb_next(&vma->vm_rb); | ||
1556 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | ||
1557 | } while (rb); | ||
1558 | kleave(" = -EINVAL [split file]"); | ||
1559 | return -EINVAL; | ||
1560 | } else { | ||
1561 | /* the chunk must be a subset of the VMA found */ | ||
1562 | if (start == vma->vm_start && end == vma->vm_end) | ||
1563 | goto erase_whole_vma; | ||
1564 | if (start < vma->vm_start || end > vma->vm_end) { | ||
1565 | kleave(" = -EINVAL [superset]"); | ||
1566 | return -EINVAL; | ||
1567 | } | ||
1568 | if (start & ~PAGE_MASK) { | ||
1569 | kleave(" = -EINVAL [unaligned start]"); | ||
1570 | return -EINVAL; | ||
1571 | } | ||
1572 | if (end != vma->vm_end && end & ~PAGE_MASK) { | ||
1573 | kleave(" = -EINVAL [unaligned split]"); | ||
1574 | return -EINVAL; | ||
1575 | } | ||
1576 | if (start != vma->vm_start && end != vma->vm_end) { | ||
1577 | ret = split_vma(mm, vma, start, 1); | ||
1578 | if (ret < 0) { | ||
1579 | kleave(" = %d [split]", ret); | ||
1580 | return ret; | ||
1581 | } | ||
1582 | } | ||
1583 | return shrink_vma(mm, vma, start, end); | ||
1584 | } | ||
1190 | 1585 | ||
1586 | erase_whole_vma: | ||
1587 | delete_vma_from_mm(vma); | ||
1588 | delete_vma(mm, vma); | ||
1589 | kleave(" = 0"); | ||
1191 | return 0; | 1590 | return 0; |
1192 | } | 1591 | } |
1193 | EXPORT_SYMBOL(do_munmap); | 1592 | EXPORT_SYMBOL(do_munmap); |
1194 | 1593 | ||
1195 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | 1594 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) |
1196 | { | 1595 | { |
1197 | int ret; | 1596 | int ret; |
1198 | struct mm_struct *mm = current->mm; | 1597 | struct mm_struct *mm = current->mm; |
@@ -1204,32 +1603,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) | |||
1204 | } | 1603 | } |
1205 | 1604 | ||
1206 | /* | 1605 | /* |
1207 | * Release all mappings | 1606 | * release all the mappings made in a process's VM space |
1208 | */ | 1607 | */ |
1209 | void exit_mmap(struct mm_struct * mm) | 1608 | void exit_mmap(struct mm_struct *mm) |
1210 | { | 1609 | { |
1211 | struct vm_list_struct *tmp; | 1610 | struct vm_area_struct *vma; |
1212 | |||
1213 | if (mm) { | ||
1214 | #ifdef DEBUG | ||
1215 | printk("Exit_mmap:\n"); | ||
1216 | #endif | ||
1217 | 1611 | ||
1218 | mm->total_vm = 0; | 1612 | if (!mm) |
1613 | return; | ||
1219 | 1614 | ||
1220 | while ((tmp = mm->context.vmlist)) { | 1615 | kenter(""); |
1221 | mm->context.vmlist = tmp->next; | ||
1222 | put_vma(mm, tmp->vma); | ||
1223 | 1616 | ||
1224 | realalloc -= kobjsize(tmp); | 1617 | mm->total_vm = 0; |
1225 | askedalloc -= sizeof(*tmp); | ||
1226 | kfree(tmp); | ||
1227 | } | ||
1228 | 1618 | ||
1229 | #ifdef DEBUG | 1619 | while ((vma = mm->mmap)) { |
1230 | show_process_blocks(); | 1620 | mm->mmap = vma->vm_next; |
1231 | #endif | 1621 | delete_vma_from_mm(vma); |
1622 | delete_vma(mm, vma); | ||
1232 | } | 1623 | } |
1624 | |||
1625 | kleave(""); | ||
1233 | } | 1626 | } |
1234 | 1627 | ||
1235 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1628 | unsigned long do_brk(unsigned long addr, unsigned long len) |
@@ -1242,8 +1635,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1242 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1635 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
1243 | * | 1636 | * |
1244 | * under NOMMU conditions, we only permit changing a mapping's size, and only | 1637 | * under NOMMU conditions, we only permit changing a mapping's size, and only |
1245 | * as long as it stays within the hole allocated by the kmalloc() call in | 1638 | * as long as it stays within the region allocated by do_mmap_private() and the |
1246 | * do_mmap_pgoff() and the block is not shareable | 1639 | * block is not shareable |
1247 | * | 1640 | * |
1248 | * MREMAP_FIXED is not supported under NOMMU conditions | 1641 | * MREMAP_FIXED is not supported under NOMMU conditions |
1249 | */ | 1642 | */ |
@@ -1254,13 +1647,16 @@ unsigned long do_mremap(unsigned long addr, | |||
1254 | struct vm_area_struct *vma; | 1647 | struct vm_area_struct *vma; |
1255 | 1648 | ||
1256 | /* insanity checks first */ | 1649 | /* insanity checks first */ |
1257 | if (new_len == 0) | 1650 | if (old_len == 0 || new_len == 0) |
1258 | return (unsigned long) -EINVAL; | 1651 | return (unsigned long) -EINVAL; |
1259 | 1652 | ||
1653 | if (addr & ~PAGE_MASK) | ||
1654 | return -EINVAL; | ||
1655 | |||
1260 | if (flags & MREMAP_FIXED && new_addr != addr) | 1656 | if (flags & MREMAP_FIXED && new_addr != addr) |
1261 | return (unsigned long) -EINVAL; | 1657 | return (unsigned long) -EINVAL; |
1262 | 1658 | ||
1263 | vma = find_vma_exact(current->mm, addr); | 1659 | vma = find_vma_exact(current->mm, addr, old_len); |
1264 | if (!vma) | 1660 | if (!vma) |
1265 | return (unsigned long) -EINVAL; | 1661 | return (unsigned long) -EINVAL; |
1266 | 1662 | ||
@@ -1270,22 +1666,18 @@ unsigned long do_mremap(unsigned long addr, | |||
1270 | if (vma->vm_flags & VM_MAYSHARE) | 1666 | if (vma->vm_flags & VM_MAYSHARE) |
1271 | return (unsigned long) -EPERM; | 1667 | return (unsigned long) -EPERM; |
1272 | 1668 | ||
1273 | if (new_len > kobjsize((void *) addr)) | 1669 | if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) |
1274 | return (unsigned long) -ENOMEM; | 1670 | return (unsigned long) -ENOMEM; |
1275 | 1671 | ||
1276 | /* all checks complete - do it */ | 1672 | /* all checks complete - do it */ |
1277 | vma->vm_end = vma->vm_start + new_len; | 1673 | vma->vm_end = vma->vm_start + new_len; |
1278 | |||
1279 | askedalloc -= old_len; | ||
1280 | askedalloc += new_len; | ||
1281 | |||
1282 | return vma->vm_start; | 1674 | return vma->vm_start; |
1283 | } | 1675 | } |
1284 | EXPORT_SYMBOL(do_mremap); | 1676 | EXPORT_SYMBOL(do_mremap); |
1285 | 1677 | ||
1286 | asmlinkage unsigned long sys_mremap(unsigned long addr, | 1678 | SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, |
1287 | unsigned long old_len, unsigned long new_len, | 1679 | unsigned long, new_len, unsigned long, flags, |
1288 | unsigned long flags, unsigned long new_addr) | 1680 | unsigned long, new_addr) |
1289 | { | 1681 | { |
1290 | unsigned long ret; | 1682 | unsigned long ret; |
1291 | 1683 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 558f9afe6e4e..40ba05061a4f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -31,7 +31,7 @@ | |||
31 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
32 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
33 | int sysctl_oom_dump_tasks; | 33 | int sysctl_oom_dump_tasks; |
34 | static DEFINE_SPINLOCK(zone_scan_mutex); | 34 | static DEFINE_SPINLOCK(zone_scan_lock); |
35 | /* #define DEBUG */ | 35 | /* #define DEBUG */ |
36 | 36 | ||
37 | /** | 37 | /** |
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
392 | printk(KERN_WARNING "%s invoked oom-killer: " | 392 | printk(KERN_WARNING "%s invoked oom-killer: " |
393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
394 | current->comm, gfp_mask, order, current->oomkilladj); | 394 | current->comm, gfp_mask, order, current->oomkilladj); |
395 | task_lock(current); | ||
396 | cpuset_print_task_mems_allowed(current); | ||
397 | task_unlock(current); | ||
395 | dump_stack(); | 398 | dump_stack(); |
396 | show_mem(); | 399 | show_mem(); |
397 | if (sysctl_oom_dump_tasks) | 400 | if (sysctl_oom_dump_tasks) |
@@ -426,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
426 | unsigned long points = 0; | 429 | unsigned long points = 0; |
427 | struct task_struct *p; | 430 | struct task_struct *p; |
428 | 431 | ||
429 | cgroup_lock(); | ||
430 | read_lock(&tasklist_lock); | 432 | read_lock(&tasklist_lock); |
431 | retry: | 433 | retry: |
432 | p = select_bad_process(&points, mem); | 434 | p = select_bad_process(&points, mem); |
@@ -441,7 +443,6 @@ retry: | |||
441 | goto retry; | 443 | goto retry; |
442 | out: | 444 | out: |
443 | read_unlock(&tasklist_lock); | 445 | read_unlock(&tasklist_lock); |
444 | cgroup_unlock(); | ||
445 | } | 446 | } |
446 | #endif | 447 | #endif |
447 | 448 | ||
@@ -470,7 +471,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
470 | struct zone *zone; | 471 | struct zone *zone; |
471 | int ret = 1; | 472 | int ret = 1; |
472 | 473 | ||
473 | spin_lock(&zone_scan_mutex); | 474 | spin_lock(&zone_scan_lock); |
474 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 475 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
475 | if (zone_is_oom_locked(zone)) { | 476 | if (zone_is_oom_locked(zone)) { |
476 | ret = 0; | 477 | ret = 0; |
@@ -480,7 +481,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
480 | 481 | ||
481 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 482 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
482 | /* | 483 | /* |
483 | * Lock each zone in the zonelist under zone_scan_mutex so a | 484 | * Lock each zone in the zonelist under zone_scan_lock so a |
484 | * parallel invocation of try_set_zone_oom() doesn't succeed | 485 | * parallel invocation of try_set_zone_oom() doesn't succeed |
485 | * when it shouldn't. | 486 | * when it shouldn't. |
486 | */ | 487 | */ |
@@ -488,7 +489,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
488 | } | 489 | } |
489 | 490 | ||
490 | out: | 491 | out: |
491 | spin_unlock(&zone_scan_mutex); | 492 | spin_unlock(&zone_scan_lock); |
492 | return ret; | 493 | return ret; |
493 | } | 494 | } |
494 | 495 | ||
@@ -502,11 +503,82 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
502 | struct zoneref *z; | 503 | struct zoneref *z; |
503 | struct zone *zone; | 504 | struct zone *zone; |
504 | 505 | ||
505 | spin_lock(&zone_scan_mutex); | 506 | spin_lock(&zone_scan_lock); |
506 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 507 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
507 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 508 | zone_clear_flag(zone, ZONE_OOM_LOCKED); |
508 | } | 509 | } |
509 | spin_unlock(&zone_scan_mutex); | 510 | spin_unlock(&zone_scan_lock); |
511 | } | ||
512 | |||
513 | /* | ||
514 | * Must be called with tasklist_lock held for read. | ||
515 | */ | ||
516 | static void __out_of_memory(gfp_t gfp_mask, int order) | ||
517 | { | ||
518 | if (sysctl_oom_kill_allocating_task) { | ||
519 | oom_kill_process(current, gfp_mask, order, 0, NULL, | ||
520 | "Out of memory (oom_kill_allocating_task)"); | ||
521 | |||
522 | } else { | ||
523 | unsigned long points; | ||
524 | struct task_struct *p; | ||
525 | |||
526 | retry: | ||
527 | /* | ||
528 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
529 | * issues we may have. | ||
530 | */ | ||
531 | p = select_bad_process(&points, NULL); | ||
532 | |||
533 | if (PTR_ERR(p) == -1UL) | ||
534 | return; | ||
535 | |||
536 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
537 | if (!p) { | ||
538 | read_unlock(&tasklist_lock); | ||
539 | panic("Out of memory and no killable processes...\n"); | ||
540 | } | ||
541 | |||
542 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
543 | "Out of memory")) | ||
544 | goto retry; | ||
545 | } | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * pagefault handler calls into here because it is out of memory but | ||
550 | * doesn't know exactly how or why. | ||
551 | */ | ||
552 | void pagefault_out_of_memory(void) | ||
553 | { | ||
554 | unsigned long freed = 0; | ||
555 | |||
556 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | ||
557 | if (freed > 0) | ||
558 | /* Got some memory back in the last second. */ | ||
559 | return; | ||
560 | |||
561 | /* | ||
562 | * If this is from memcg, the oom-killer has already been invoked, | ||
563 | * and it is not worth going to a system-wide OOM. | ||
564 | */ | ||
565 | if (mem_cgroup_oom_called(current)) | ||
566 | goto rest_and_return; | ||
567 | |||
568 | if (sysctl_panic_on_oom) | ||
569 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | ||
570 | |||
571 | read_lock(&tasklist_lock); | ||
572 | __out_of_memory(0, 0); /* unknown gfp_mask and order */ | ||
573 | read_unlock(&tasklist_lock); | ||
574 | |||
575 | /* | ||
576 | * Give "p" a good chance of killing itself before we | ||
577 | * retry to allocate memory. | ||
578 | */ | ||
579 | rest_and_return: | ||
580 | if (!test_thread_flag(TIF_MEMDIE)) | ||
581 | schedule_timeout_uninterruptible(1); | ||
510 | } | 582 | } |
511 | 583 | ||
512 | /** | 584 | /** |
@@ -522,8 +594,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
522 | */ | 594 | */ |
523 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 595 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) |
524 | { | 596 | { |
525 | struct task_struct *p; | ||
526 | unsigned long points = 0; | ||
527 | unsigned long freed = 0; | 597 | unsigned long freed = 0; |
528 | enum oom_constraint constraint; | 598 | enum oom_constraint constraint; |
529 | 599 | ||
@@ -544,7 +614,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
544 | 614 | ||
545 | switch (constraint) { | 615 | switch (constraint) { |
546 | case CONSTRAINT_MEMORY_POLICY: | 616 | case CONSTRAINT_MEMORY_POLICY: |
547 | oom_kill_process(current, gfp_mask, order, points, NULL, | 617 | oom_kill_process(current, gfp_mask, order, 0, NULL, |
548 | "No available memory (MPOL_BIND)"); | 618 | "No available memory (MPOL_BIND)"); |
549 | break; | 619 | break; |
550 | 620 | ||
@@ -553,35 +623,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
553 | panic("out of memory. panic_on_oom is selected\n"); | 623 | panic("out of memory. panic_on_oom is selected\n"); |
554 | /* Fall-through */ | 624 | /* Fall-through */ |
555 | case CONSTRAINT_CPUSET: | 625 | case CONSTRAINT_CPUSET: |
556 | if (sysctl_oom_kill_allocating_task) { | 626 | __out_of_memory(gfp_mask, order); |
557 | oom_kill_process(current, gfp_mask, order, points, NULL, | ||
558 | "Out of memory (oom_kill_allocating_task)"); | ||
559 | break; | ||
560 | } | ||
561 | retry: | ||
562 | /* | ||
563 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
564 | * issues we may have. | ||
565 | */ | ||
566 | p = select_bad_process(&points, NULL); | ||
567 | |||
568 | if (PTR_ERR(p) == -1UL) | ||
569 | goto out; | ||
570 | |||
571 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
572 | if (!p) { | ||
573 | read_unlock(&tasklist_lock); | ||
574 | panic("Out of memory and no killable processes...\n"); | ||
575 | } | ||
576 | |||
577 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
578 | "Out of memory")) | ||
579 | goto retry; | ||
580 | |||
581 | break; | 627 | break; |
582 | } | 628 | } |
583 | 629 | ||
584 | out: | ||
585 | read_unlock(&tasklist_lock); | 630 | read_unlock(&tasklist_lock); |
586 | 631 | ||
587 | /* | 632 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03f..dc32dae01e5f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) | |||
69 | int dirty_background_ratio = 5; | 69 | int dirty_background_ratio = 5; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * dirty_background_bytes starts at 0 (disabled) so that it is a function of | ||
73 | * dirty_background_ratio * the amount of dirtyable memory | ||
74 | */ | ||
75 | unsigned long dirty_background_bytes; | ||
76 | |||
77 | /* | ||
72 | * free highmem will not be subtracted from the total free memory | 78 | * free highmem will not be subtracted from the total free memory |
73 | * for calculating free ratios if vm_highmem_is_dirtyable is true | 79 | * for calculating free ratios if vm_highmem_is_dirtyable is true |
74 | */ | 80 | */ |
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; | |||
80 | int vm_dirty_ratio = 10; | 86 | int vm_dirty_ratio = 10; |
81 | 87 | ||
82 | /* | 88 | /* |
89 | * vm_dirty_bytes starts at 0 (disabled) so that it is a function of | ||
90 | * vm_dirty_ratio * the amount of dirtyable memory | ||
91 | */ | ||
92 | unsigned long vm_dirty_bytes; | ||
93 | |||
94 | /* | ||
83 | * The interval between `kupdate'-style writebacks, in jiffies | 95 | * The interval between `kupdate'-style writebacks, in jiffies |
84 | */ | 96 | */ |
85 | int dirty_writeback_interval = 5 * HZ; | 97 | int dirty_writeback_interval = 5 * HZ; |
@@ -135,23 +147,75 @@ static int calc_period_shift(void) | |||
135 | { | 147 | { |
136 | unsigned long dirty_total; | 148 | unsigned long dirty_total; |
137 | 149 | ||
138 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | 150 | if (vm_dirty_bytes) |
151 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
152 | else | ||
153 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | ||
154 | 100; | ||
139 | return 2 + ilog2(dirty_total - 1); | 155 | return 2 + ilog2(dirty_total - 1); |
140 | } | 156 | } |
141 | 157 | ||
142 | /* | 158 | /* |
143 | * update the period when the dirty ratio changes. | 159 | * update the period when the dirty threshold changes. |
144 | */ | 160 | */ |
161 | static void update_completion_period(void) | ||
162 | { | ||
163 | int shift = calc_period_shift(); | ||
164 | prop_change_shift(&vm_completions, shift); | ||
165 | prop_change_shift(&vm_dirties, shift); | ||
166 | } | ||
167 | |||
168 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | ||
169 | struct file *filp, void __user *buffer, size_t *lenp, | ||
170 | loff_t *ppos) | ||
171 | { | ||
172 | int ret; | ||
173 | |||
174 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
175 | if (ret == 0 && write) | ||
176 | dirty_background_bytes = 0; | ||
177 | return ret; | ||
178 | } | ||
179 | |||
180 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | ||
181 | struct file *filp, void __user *buffer, size_t *lenp, | ||
182 | loff_t *ppos) | ||
183 | { | ||
184 | int ret; | ||
185 | |||
186 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
187 | if (ret == 0 && write) | ||
188 | dirty_background_ratio = 0; | ||
189 | return ret; | ||
190 | } | ||
191 | |||
145 | int dirty_ratio_handler(struct ctl_table *table, int write, | 192 | int dirty_ratio_handler(struct ctl_table *table, int write, |
146 | struct file *filp, void __user *buffer, size_t *lenp, | 193 | struct file *filp, void __user *buffer, size_t *lenp, |
147 | loff_t *ppos) | 194 | loff_t *ppos) |
148 | { | 195 | { |
149 | int old_ratio = vm_dirty_ratio; | 196 | int old_ratio = vm_dirty_ratio; |
150 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 197 | int ret; |
198 | |||
199 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
151 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 200 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
152 | int shift = calc_period_shift(); | 201 | update_completion_period(); |
153 | prop_change_shift(&vm_completions, shift); | 202 | vm_dirty_bytes = 0; |
154 | prop_change_shift(&vm_dirties, shift); | 203 | } |
204 | return ret; | ||
205 | } | ||
206 | |||
207 | |||
208 | int dirty_bytes_handler(struct ctl_table *table, int write, | ||
209 | struct file *filp, void __user *buffer, size_t *lenp, | ||
210 | loff_t *ppos) | ||
211 | { | ||
212 | int old_bytes = vm_dirty_bytes; | ||
213 | int ret; | ||
214 | |||
215 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
216 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | ||
217 | update_completion_period(); | ||
218 | vm_dirty_ratio = 0; | ||
155 | } | 219 | } |
156 | return ret; | 220 | return ret; |
157 | } | 221 | } |
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) | |||
362 | } | 426 | } |
363 | 427 | ||
364 | void | 428 | void |
365 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, | 429 | get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, |
366 | struct backing_dev_info *bdi) | 430 | unsigned long *pbdi_dirty, struct backing_dev_info *bdi) |
367 | { | 431 | { |
368 | int background_ratio; /* Percentages */ | 432 | unsigned long background; |
369 | int dirty_ratio; | 433 | unsigned long dirty; |
370 | long background; | ||
371 | long dirty; | ||
372 | unsigned long available_memory = determine_dirtyable_memory(); | 434 | unsigned long available_memory = determine_dirtyable_memory(); |
373 | struct task_struct *tsk; | 435 | struct task_struct *tsk; |
374 | 436 | ||
375 | dirty_ratio = vm_dirty_ratio; | 437 | if (vm_dirty_bytes) |
376 | if (dirty_ratio < 5) | 438 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
377 | dirty_ratio = 5; | 439 | else { |
440 | int dirty_ratio; | ||
441 | |||
442 | dirty_ratio = vm_dirty_ratio; | ||
443 | if (dirty_ratio < 5) | ||
444 | dirty_ratio = 5; | ||
445 | dirty = (dirty_ratio * available_memory) / 100; | ||
446 | } | ||
378 | 447 | ||
379 | background_ratio = dirty_background_ratio; | 448 | if (dirty_background_bytes) |
380 | if (background_ratio >= dirty_ratio) | 449 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); |
381 | background_ratio = dirty_ratio / 2; | 450 | else |
451 | background = (dirty_background_ratio * available_memory) / 100; | ||
382 | 452 | ||
383 | background = (background_ratio * available_memory) / 100; | 453 | if (background >= dirty) |
384 | dirty = (dirty_ratio * available_memory) / 100; | 454 | background = dirty / 2; |
385 | tsk = current; | 455 | tsk = current; |
386 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 456 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
387 | background += background / 4; | 457 | background += background / 4; |
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
423 | { | 493 | { |
424 | long nr_reclaimable, bdi_nr_reclaimable; | 494 | long nr_reclaimable, bdi_nr_reclaimable; |
425 | long nr_writeback, bdi_nr_writeback; | 495 | long nr_writeback, bdi_nr_writeback; |
426 | long background_thresh; | 496 | unsigned long background_thresh; |
427 | long dirty_thresh; | 497 | unsigned long dirty_thresh; |
428 | long bdi_thresh; | 498 | unsigned long bdi_thresh; |
429 | unsigned long pages_written = 0; | 499 | unsigned long pages_written = 0; |
430 | unsigned long write_chunk = sync_writeback_pages(); | 500 | unsigned long write_chunk = sync_writeback_pages(); |
431 | 501 | ||
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
580 | 650 | ||
581 | void throttle_vm_writeout(gfp_t gfp_mask) | 651 | void throttle_vm_writeout(gfp_t gfp_mask) |
582 | { | 652 | { |
583 | long background_thresh; | 653 | unsigned long background_thresh; |
584 | long dirty_thresh; | 654 | unsigned long dirty_thresh; |
585 | 655 | ||
586 | for ( ; ; ) { | 656 | for ( ; ; ) { |
587 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 657 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) | |||
624 | }; | 694 | }; |
625 | 695 | ||
626 | for ( ; ; ) { | 696 | for ( ; ; ) { |
627 | long background_thresh; | 697 | unsigned long background_thresh; |
628 | long dirty_thresh; | 698 | unsigned long dirty_thresh; |
629 | 699 | ||
630 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 700 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
631 | if (global_page_state(NR_FILE_DIRTY) + | 701 | if (global_page_state(NR_FILE_DIRTY) + |
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, | |||
868 | int done = 0; | 938 | int done = 0; |
869 | struct pagevec pvec; | 939 | struct pagevec pvec; |
870 | int nr_pages; | 940 | int nr_pages; |
941 | pgoff_t uninitialized_var(writeback_index); | ||
871 | pgoff_t index; | 942 | pgoff_t index; |
872 | pgoff_t end; /* Inclusive */ | 943 | pgoff_t end; /* Inclusive */ |
873 | int scanned = 0; | 944 | pgoff_t done_index; |
945 | int cycled; | ||
874 | int range_whole = 0; | 946 | int range_whole = 0; |
875 | long nr_to_write = wbc->nr_to_write; | 947 | long nr_to_write = wbc->nr_to_write; |
876 | 948 | ||
@@ -881,83 +953,143 @@ int write_cache_pages(struct address_space *mapping, | |||
881 | 953 | ||
882 | pagevec_init(&pvec, 0); | 954 | pagevec_init(&pvec, 0); |
883 | if (wbc->range_cyclic) { | 955 | if (wbc->range_cyclic) { |
884 | index = mapping->writeback_index; /* Start from prev offset */ | 956 | writeback_index = mapping->writeback_index; /* prev offset */ |
957 | index = writeback_index; | ||
958 | if (index == 0) | ||
959 | cycled = 1; | ||
960 | else | ||
961 | cycled = 0; | ||
885 | end = -1; | 962 | end = -1; |
886 | } else { | 963 | } else { |
887 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 964 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
888 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 965 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
889 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 966 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
890 | range_whole = 1; | 967 | range_whole = 1; |
891 | scanned = 1; | 968 | cycled = 1; /* ignore range_cyclic tests */ |
892 | } | 969 | } |
893 | retry: | 970 | retry: |
894 | while (!done && (index <= end) && | 971 | done_index = index; |
895 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 972 | while (!done && (index <= end)) { |
896 | PAGECACHE_TAG_DIRTY, | 973 | int i; |
897 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 974 | |
898 | unsigned i; | 975 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
976 | PAGECACHE_TAG_DIRTY, | ||
977 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | ||
978 | if (nr_pages == 0) | ||
979 | break; | ||
899 | 980 | ||
900 | scanned = 1; | ||
901 | for (i = 0; i < nr_pages; i++) { | 981 | for (i = 0; i < nr_pages; i++) { |
902 | struct page *page = pvec.pages[i]; | 982 | struct page *page = pvec.pages[i]; |
903 | 983 | ||
904 | /* | 984 | /* |
905 | * At this point we hold neither mapping->tree_lock nor | 985 | * At this point, the page may be truncated or |
906 | * lock on the page itself: the page may be truncated or | 986 | * invalidated (changing page->mapping to NULL), or |
907 | * invalidated (changing page->mapping to NULL), or even | 987 | * even swizzled back from swapper_space to tmpfs file |
908 | * swizzled back from swapper_space to tmpfs file | 988 | * mapping. However, page->index will not change |
909 | * mapping | 989 | * because we have a reference on the page. |
910 | */ | 990 | */ |
991 | if (page->index > end) { | ||
992 | /* | ||
993 | * can't be range_cyclic (1st pass) because | ||
994 | * end == -1 in that case. | ||
995 | */ | ||
996 | done = 1; | ||
997 | break; | ||
998 | } | ||
999 | |||
1000 | done_index = page->index + 1; | ||
1001 | |||
911 | lock_page(page); | 1002 | lock_page(page); |
912 | 1003 | ||
1004 | /* | ||
1005 | * Page truncated or invalidated. We can freely skip it | ||
1006 | * then, even for data integrity operations: the page | ||
1007 | * has disappeared concurrently, so there could be no | ||
1008 | * real expectation of this data integrity operation | ||
1009 | * even if there is now a new, dirty page at the same | ||
1010 | * pagecache address. | ||
1011 | */ | ||
913 | if (unlikely(page->mapping != mapping)) { | 1012 | if (unlikely(page->mapping != mapping)) { |
1013 | continue_unlock: | ||
914 | unlock_page(page); | 1014 | unlock_page(page); |
915 | continue; | 1015 | continue; |
916 | } | 1016 | } |
917 | 1017 | ||
918 | if (!wbc->range_cyclic && page->index > end) { | 1018 | if (!PageDirty(page)) { |
919 | done = 1; | 1019 | /* someone wrote it for us */ |
920 | unlock_page(page); | 1020 | goto continue_unlock; |
921 | continue; | ||
922 | } | 1021 | } |
923 | 1022 | ||
924 | if (wbc->sync_mode != WB_SYNC_NONE) | 1023 | if (PageWriteback(page)) { |
925 | wait_on_page_writeback(page); | 1024 | if (wbc->sync_mode != WB_SYNC_NONE) |
926 | 1025 | wait_on_page_writeback(page); | |
927 | if (PageWriteback(page) || | 1026 | else |
928 | !clear_page_dirty_for_io(page)) { | 1027 | goto continue_unlock; |
929 | unlock_page(page); | ||
930 | continue; | ||
931 | } | 1028 | } |
932 | 1029 | ||
933 | ret = (*writepage)(page, wbc, data); | 1030 | BUG_ON(PageWriteback(page)); |
1031 | if (!clear_page_dirty_for_io(page)) | ||
1032 | goto continue_unlock; | ||
934 | 1033 | ||
935 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { | 1034 | ret = (*writepage)(page, wbc, data); |
936 | unlock_page(page); | 1035 | if (unlikely(ret)) { |
937 | ret = 0; | 1036 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
938 | } | 1037 | unlock_page(page); |
939 | if (ret || (--nr_to_write <= 0)) | 1038 | ret = 0; |
1039 | } else { | ||
1040 | /* | ||
1041 | * done_index is set past this page, | ||
1042 | * so media errors will not choke | ||
1043 | * background writeout for the entire | ||
1044 | * file. This has consequences for | ||
1045 | * range_cyclic semantics (ie. it may | ||
1046 | * not be suitable for data integrity | ||
1047 | * writeout). | ||
1048 | */ | ||
1049 | done = 1; | ||
1050 | break; | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | if (nr_to_write > 0) | ||
1055 | nr_to_write--; | ||
1056 | else if (wbc->sync_mode == WB_SYNC_NONE) { | ||
1057 | /* | ||
1058 | * We stop writing back only if we are not | ||
1059 | * doing integrity sync. In case of integrity | ||
1060 | * sync we have to keep going because someone | ||
1061 | * may be concurrently dirtying pages, and we | ||
1062 | * might have synced a lot of newly appeared | ||
1063 | * dirty pages, but have not synced all of the | ||
1064 | * old dirty pages. | ||
1065 | */ | ||
940 | done = 1; | 1066 | done = 1; |
1067 | break; | ||
1068 | } | ||
1069 | |||
941 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 1070 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
942 | wbc->encountered_congestion = 1; | 1071 | wbc->encountered_congestion = 1; |
943 | done = 1; | 1072 | done = 1; |
1073 | break; | ||
944 | } | 1074 | } |
945 | } | 1075 | } |
946 | pagevec_release(&pvec); | 1076 | pagevec_release(&pvec); |
947 | cond_resched(); | 1077 | cond_resched(); |
948 | } | 1078 | } |
949 | if (!scanned && !done) { | 1079 | if (!cycled) { |
950 | /* | 1080 | /* |
1081 | * range_cyclic: | ||
951 | * We hit the last page and there is more work to be done: wrap | 1082 | * We hit the last page and there is more work to be done: wrap |
952 | * back to the start of the file | 1083 | * back to the start of the file |
953 | */ | 1084 | */ |
954 | scanned = 1; | 1085 | cycled = 1; |
955 | index = 0; | 1086 | index = 0; |
1087 | end = writeback_index - 1; | ||
956 | goto retry; | 1088 | goto retry; |
957 | } | 1089 | } |
958 | if (!wbc->no_nrwrite_index_update) { | 1090 | if (!wbc->no_nrwrite_index_update) { |
959 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) | 1091 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) |
960 | mapping->writeback_index = index; | 1092 | mapping->writeback_index = done_index; |
961 | wbc->nr_to_write = nr_to_write; | 1093 | wbc->nr_to_write = nr_to_write; |
962 | } | 1094 | } |
963 | 1095 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac01474563..5675b3073854 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); | |||
69 | 69 | ||
70 | unsigned long totalram_pages __read_mostly; | 70 | unsigned long totalram_pages __read_mostly; |
71 | unsigned long totalreserve_pages __read_mostly; | 71 | unsigned long totalreserve_pages __read_mostly; |
72 | long nr_swap_pages; | 72 | unsigned long highest_memmap_pfn __read_mostly; |
73 | int percpu_pagelist_fraction; | 73 | int percpu_pagelist_fraction; |
74 | 74 | ||
75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | static unsigned long resume; |
227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | static unsigned long nr_shown; |
228 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | static unsigned long nr_unshown; |
229 | (unsigned long)page->flags, page->mapping, | 229 | |
230 | page_mapcount(page), page_count(page)); | 230 | /* |
231 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
232 | * or allow a steady drip of one report per second. | ||
233 | */ | ||
234 | if (nr_shown == 60) { | ||
235 | if (time_before(jiffies, resume)) { | ||
236 | nr_unshown++; | ||
237 | goto out; | ||
238 | } | ||
239 | if (nr_unshown) { | ||
240 | printk(KERN_ALERT | ||
241 | "BUG: Bad page state: %lu messages suppressed\n", | ||
242 | nr_unshown); | ||
243 | nr_unshown = 0; | ||
244 | } | ||
245 | nr_shown = 0; | ||
246 | } | ||
247 | if (nr_shown++ == 0) | ||
248 | resume = jiffies + 60 * HZ; | ||
249 | |||
250 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | ||
251 | current->comm, page_to_pfn(page)); | ||
252 | printk(KERN_ALERT | ||
253 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
254 | page, (void *)page->flags, page_count(page), | ||
255 | page_mapcount(page), page->mapping, page->index); | ||
231 | 256 | ||
232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | ||
233 | KERN_EMERG "Backtrace:\n"); | ||
234 | dump_stack(); | 257 | dump_stack(); |
235 | page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; | 258 | out: |
236 | set_page_count(page, 0); | 259 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
237 | reset_page_mapcount(page); | 260 | __ClearPageBuddy(page); |
238 | page->mapping = NULL; | ||
239 | add_taint(TAINT_BAD_PAGE); | 261 | add_taint(TAINT_BAD_PAGE); |
240 | } | 262 | } |
241 | 263 | ||
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
292 | } | 314 | } |
293 | #endif | 315 | #endif |
294 | 316 | ||
295 | static void destroy_compound_page(struct page *page, unsigned long order) | 317 | static int destroy_compound_page(struct page *page, unsigned long order) |
296 | { | 318 | { |
297 | int i; | 319 | int i; |
298 | int nr_pages = 1 << order; | 320 | int nr_pages = 1 << order; |
321 | int bad = 0; | ||
299 | 322 | ||
300 | if (unlikely(compound_order(page) != order)) | 323 | if (unlikely(compound_order(page) != order) || |
324 | unlikely(!PageHead(page))) { | ||
301 | bad_page(page); | 325 | bad_page(page); |
326 | bad++; | ||
327 | } | ||
302 | 328 | ||
303 | if (unlikely(!PageHead(page))) | ||
304 | bad_page(page); | ||
305 | __ClearPageHead(page); | 329 | __ClearPageHead(page); |
330 | |||
306 | for (i = 1; i < nr_pages; i++) { | 331 | for (i = 1; i < nr_pages; i++) { |
307 | struct page *p = page + i; | 332 | struct page *p = page + i; |
308 | 333 | ||
309 | if (unlikely(!PageTail(p) | | 334 | if (unlikely(!PageTail(p) | (p->first_page != page))) { |
310 | (p->first_page != page))) | ||
311 | bad_page(page); | 335 | bad_page(page); |
336 | bad++; | ||
337 | } | ||
312 | __ClearPageTail(p); | 338 | __ClearPageTail(p); |
313 | } | 339 | } |
340 | |||
341 | return bad; | ||
314 | } | 342 | } |
315 | 343 | ||
316 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 344 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page, | |||
430 | int migratetype = get_pageblock_migratetype(page); | 458 | int migratetype = get_pageblock_migratetype(page); |
431 | 459 | ||
432 | if (unlikely(PageCompound(page))) | 460 | if (unlikely(PageCompound(page))) |
433 | destroy_compound_page(page, order); | 461 | if (unlikely(destroy_compound_page(page, order))) |
462 | return; | ||
434 | 463 | ||
435 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
436 | 465 | ||
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page) | |||
467 | if (unlikely(page_mapcount(page) | | 496 | if (unlikely(page_mapcount(page) | |
468 | (page->mapping != NULL) | | 497 | (page->mapping != NULL) | |
469 | (page_count(page) != 0) | | 498 | (page_count(page) != 0) | |
470 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
471 | bad_page(page); | 500 | bad_page(page); |
472 | if (PageDirty(page)) | 501 | return 1; |
473 | __ClearPageDirty(page); | 502 | } |
474 | if (PageSwapBacked(page)) | 503 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
475 | __ClearPageSwapBacked(page); | 504 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
476 | /* | 505 | return 0; |
477 | * For now, we report if PG_reserved was found set, but do not | ||
478 | * clear it, and do not free the page. But we shall soon need | ||
479 | * to do more, for when the ZERO_PAGE count wraps negative. | ||
480 | */ | ||
481 | return PageReserved(page); | ||
482 | } | 506 | } |
483 | 507 | ||
484 | /* | 508 | /* |
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
523 | { | 547 | { |
524 | unsigned long flags; | 548 | unsigned long flags; |
525 | int i; | 549 | int i; |
526 | int reserved = 0; | 550 | int bad = 0; |
527 | 551 | ||
528 | for (i = 0 ; i < (1 << order) ; ++i) | 552 | for (i = 0 ; i < (1 << order) ; ++i) |
529 | reserved += free_pages_check(page + i); | 553 | bad += free_pages_check(page + i); |
530 | if (reserved) | 554 | if (bad) |
531 | return; | 555 | return; |
532 | 556 | ||
533 | if (!PageHighMem(page)) { | 557 | if (!PageHighMem(page)) { |
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
612 | if (unlikely(page_mapcount(page) | | 636 | if (unlikely(page_mapcount(page) | |
613 | (page->mapping != NULL) | | 637 | (page->mapping != NULL) | |
614 | (page_count(page) != 0) | | 638 | (page_count(page) != 0) | |
615 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
616 | bad_page(page); | 640 | bad_page(page); |
617 | |||
618 | /* | ||
619 | * For now, we report if PG_reserved was found set, but do not | ||
620 | * clear it, and do not allocate the page: as a safety net. | ||
621 | */ | ||
622 | if (PageReserved(page)) | ||
623 | return 1; | 641 | return 1; |
642 | } | ||
624 | 643 | ||
625 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | ||
626 | 1 << PG_referenced | 1 << PG_arch_1 | | ||
627 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk | ||
628 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
629 | | 1 << PG_mlocked | ||
630 | #endif | ||
631 | ); | ||
632 | set_page_private(page, 0); | 644 | set_page_private(page, 0); |
633 | set_page_refcounted(page); | 645 | set_page_refcounted(page); |
634 | 646 | ||
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2609 | unsigned long pfn; | 2621 | unsigned long pfn; |
2610 | struct zone *z; | 2622 | struct zone *z; |
2611 | 2623 | ||
2624 | if (highest_memmap_pfn < end_pfn - 1) | ||
2625 | highest_memmap_pfn = end_pfn - 1; | ||
2626 | |||
2612 | z = &NODE_DATA(nid)->node_zones[zone]; | 2627 | z = &NODE_DATA(nid)->node_zones[zone]; |
2613 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 2628 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
2614 | /* | 2629 | /* |
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
3381 | { | 3396 | { |
3382 | unsigned long usemapsize = usemap_size(zonesize); | 3397 | unsigned long usemapsize = usemap_size(zonesize); |
3383 | zone->pageblock_flags = NULL; | 3398 | zone->pageblock_flags = NULL; |
3384 | if (usemapsize) { | 3399 | if (usemapsize) |
3385 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 3400 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
3386 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3387 | } | ||
3388 | } | 3401 | } |
3389 | #else | 3402 | #else |
3390 | static void inline setup_usemap(struct pglist_data *pgdat, | 3403 | static void inline setup_usemap(struct pglist_data *pgdat, |
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3469 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3482 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
3470 | if (realsize >= memmap_pages) { | 3483 | if (realsize >= memmap_pages) { |
3471 | realsize -= memmap_pages; | 3484 | realsize -= memmap_pages; |
3472 | printk(KERN_DEBUG | 3485 | if (memmap_pages) |
3473 | " %s zone: %lu pages used for memmap\n", | 3486 | printk(KERN_DEBUG |
3474 | zone_names[j], memmap_pages); | 3487 | " %s zone: %lu pages used for memmap\n", |
3488 | zone_names[j], memmap_pages); | ||
3475 | } else | 3489 | } else |
3476 | printk(KERN_WARNING | 3490 | printk(KERN_WARNING |
3477 | " %s zone: %lu pages exceeds realsize %lu\n", | 3491 | " %s zone: %lu pages exceeds realsize %lu\n", |
@@ -3509,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3509 | INIT_LIST_HEAD(&zone->lru[l].list); | 3523 | INIT_LIST_HEAD(&zone->lru[l].list); |
3510 | zone->lru[l].nr_scan = 0; | 3524 | zone->lru[l].nr_scan = 0; |
3511 | } | 3525 | } |
3512 | zone->recent_rotated[0] = 0; | 3526 | zone->reclaim_stat.recent_rotated[0] = 0; |
3513 | zone->recent_rotated[1] = 0; | 3527 | zone->reclaim_stat.recent_rotated[1] = 0; |
3514 | zone->recent_scanned[0] = 0; | 3528 | zone->reclaim_stat.recent_scanned[0] = 0; |
3515 | zone->recent_scanned[1] = 0; | 3529 | zone->reclaim_stat.recent_scanned[1] = 0; |
3516 | zap_zone_vm_stats(zone); | 3530 | zap_zone_vm_stats(zone); |
3517 | zone->flags = 0; | 3531 | zone->flags = 0; |
3518 | if (!size) | 3532 | if (!size) |
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void) | |||
4316 | * 1TB 101 10GB | 4330 | * 1TB 101 10GB |
4317 | * 10TB 320 32GB | 4331 | * 10TB 320 32GB |
4318 | */ | 4332 | */ |
4319 | void setup_per_zone_inactive_ratio(void) | 4333 | static void setup_per_zone_inactive_ratio(void) |
4320 | { | 4334 | { |
4321 | struct zone *zone; | 4335 | struct zone *zone; |
4322 | 4336 | ||
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4573 | return table; | 4587 | return table; |
4574 | } | 4588 | } |
4575 | 4589 | ||
4576 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | ||
4577 | struct page *pfn_to_page(unsigned long pfn) | ||
4578 | { | ||
4579 | return __pfn_to_page(pfn); | ||
4580 | } | ||
4581 | unsigned long page_to_pfn(struct page *page) | ||
4582 | { | ||
4583 | return __page_to_pfn(page); | ||
4584 | } | ||
4585 | EXPORT_SYMBOL(pfn_to_page); | ||
4586 | EXPORT_SYMBOL(page_to_pfn); | ||
4587 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | ||
4588 | |||
4589 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 4590 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
4590 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 4591 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
4591 | unsigned long pfn) | 4592 | unsigned long pfn) |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ab27ff750519..7006a11350c8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/vmalloc.h> | 9 | #include <linux/vmalloc.h> |
10 | #include <linux/cgroup.h> | 10 | #include <linux/cgroup.h> |
11 | #include <linux/swapops.h> | ||
11 | 12 | ||
12 | static void __meminit | 13 | static void __meminit |
13 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | 14 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) |
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | |||
15 | pc->flags = 0; | 16 | pc->flags = 0; |
16 | pc->mem_cgroup = NULL; | 17 | pc->mem_cgroup = NULL; |
17 | pc->page = pfn_to_page(pfn); | 18 | pc->page = pfn_to_page(pfn); |
19 | INIT_LIST_HEAD(&pc->lru); | ||
18 | } | 20 | } |
19 | static unsigned long total_usage; | 21 | static unsigned long total_usage; |
20 | 22 | ||
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void) | |||
72 | 74 | ||
73 | int nid, fail; | 75 | int nid, fail; |
74 | 76 | ||
75 | if (mem_cgroup_subsys.disabled) | 77 | if (mem_cgroup_disabled()) |
76 | return; | 78 | return; |
77 | 79 | ||
78 | for_each_online_node(nid) { | 80 | for_each_online_node(nid) { |
@@ -101,15 +103,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
101 | } | 103 | } |
102 | 104 | ||
103 | /* __alloc_bootmem...() is protected by !slab_available() */ | 105 | /* __alloc_bootmem...() is protected by !slab_available() */ |
104 | int __init_refok init_section_page_cgroup(unsigned long pfn) | 106 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
105 | { | 107 | { |
106 | struct mem_section *section; | 108 | struct mem_section *section = __pfn_to_section(pfn); |
107 | struct page_cgroup *base, *pc; | 109 | struct page_cgroup *base, *pc; |
108 | unsigned long table_size; | 110 | unsigned long table_size; |
109 | int nid, index; | 111 | int nid, index; |
110 | 112 | ||
111 | section = __pfn_to_section(pfn); | ||
112 | |||
113 | if (!section->page_cgroup) { | 113 | if (!section->page_cgroup) { |
114 | nid = page_to_nid(pfn_to_page(pfn)); | 114 | nid = page_to_nid(pfn_to_page(pfn)); |
115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
@@ -145,7 +145,6 @@ int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
145 | __init_page_cgroup(pc, pfn + index); | 145 | __init_page_cgroup(pc, pfn + index); |
146 | } | 146 | } |
147 | 147 | ||
148 | section = __pfn_to_section(pfn); | ||
149 | section->page_cgroup = base - pfn; | 148 | section->page_cgroup = base - pfn; |
150 | total_usage += table_size; | 149 | total_usage += table_size; |
151 | return 0; | 150 | return 0; |
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void) | |||
248 | unsigned long pfn; | 247 | unsigned long pfn; |
249 | int fail = 0; | 248 | int fail = 0; |
250 | 249 | ||
251 | if (mem_cgroup_subsys.disabled) | 250 | if (mem_cgroup_disabled()) |
252 | return; | 251 | return; |
253 | 252 | ||
254 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | 253 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { |
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
273 | } | 272 | } |
274 | 273 | ||
275 | #endif | 274 | #endif |
275 | |||
276 | |||
277 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
278 | |||
279 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
280 | struct swap_cgroup_ctrl { | ||
281 | struct page **map; | ||
282 | unsigned long length; | ||
283 | }; | ||
284 | |||
285 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
286 | |||
287 | /* | ||
288 | * This 8bytes seems big..maybe we can reduce this when we can use "id" for | ||
289 | * cgroup rather than pointer. | ||
290 | */ | ||
291 | struct swap_cgroup { | ||
292 | struct mem_cgroup *val; | ||
293 | }; | ||
294 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
295 | #define SC_POS_MASK (SC_PER_PAGE - 1) | ||
296 | |||
297 | /* | ||
298 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
299 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
300 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
301 | * | ||
302 | * This means, | ||
303 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
304 | * SwapCache(and its swp_entry) is under lock. | ||
305 | * - When called via swap_free(), there is no user of this entry and no race. | ||
306 | * Then, we don't need lock around "exchange". | ||
307 | * | ||
308 | * TODO: we can push these buffers out to HIGHMEM. | ||
309 | */ | ||
310 | |||
311 | /* | ||
312 | * allocate buffer for swap_cgroup. | ||
313 | */ | ||
314 | static int swap_cgroup_prepare(int type) | ||
315 | { | ||
316 | struct page *page; | ||
317 | struct swap_cgroup_ctrl *ctrl; | ||
318 | unsigned long idx, max; | ||
319 | |||
320 | if (!do_swap_account) | ||
321 | return 0; | ||
322 | ctrl = &swap_cgroup_ctrl[type]; | ||
323 | |||
324 | for (idx = 0; idx < ctrl->length; idx++) { | ||
325 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
326 | if (!page) | ||
327 | goto not_enough_page; | ||
328 | ctrl->map[idx] = page; | ||
329 | } | ||
330 | return 0; | ||
331 | not_enough_page: | ||
332 | max = idx; | ||
333 | for (idx = 0; idx < max; idx++) | ||
334 | __free_page(ctrl->map[idx]); | ||
335 | |||
336 | return -ENOMEM; | ||
337 | } | ||
338 | |||
339 | /** | ||
340 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
341 | * @ent: swap entry to be recorded into | ||
342 | * @mem: mem_cgroup to be recorded | ||
343 | * | ||
344 | * Returns old value at success, NULL at failure. | ||
345 | * (Of course, old value can be NULL.) | ||
346 | */ | ||
347 | struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) | ||
348 | { | ||
349 | int type = swp_type(ent); | ||
350 | unsigned long offset = swp_offset(ent); | ||
351 | unsigned long idx = offset / SC_PER_PAGE; | ||
352 | unsigned long pos = offset & SC_POS_MASK; | ||
353 | struct swap_cgroup_ctrl *ctrl; | ||
354 | struct page *mappage; | ||
355 | struct swap_cgroup *sc; | ||
356 | struct mem_cgroup *old; | ||
357 | |||
358 | if (!do_swap_account) | ||
359 | return NULL; | ||
360 | |||
361 | ctrl = &swap_cgroup_ctrl[type]; | ||
362 | |||
363 | mappage = ctrl->map[idx]; | ||
364 | sc = page_address(mappage); | ||
365 | sc += pos; | ||
366 | old = sc->val; | ||
367 | sc->val = mem; | ||
368 | |||
369 | return old; | ||
370 | } | ||
371 | |||
372 | /** | ||
373 | * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry | ||
374 | * @ent: swap entry to be looked up. | ||
375 | * | ||
376 | * Returns pointer to mem_cgroup at success. NULL at failure. | ||
377 | */ | ||
378 | struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) | ||
379 | { | ||
380 | int type = swp_type(ent); | ||
381 | unsigned long offset = swp_offset(ent); | ||
382 | unsigned long idx = offset / SC_PER_PAGE; | ||
383 | unsigned long pos = offset & SC_POS_MASK; | ||
384 | struct swap_cgroup_ctrl *ctrl; | ||
385 | struct page *mappage; | ||
386 | struct swap_cgroup *sc; | ||
387 | struct mem_cgroup *ret; | ||
388 | |||
389 | if (!do_swap_account) | ||
390 | return NULL; | ||
391 | |||
392 | ctrl = &swap_cgroup_ctrl[type]; | ||
393 | mappage = ctrl->map[idx]; | ||
394 | sc = page_address(mappage); | ||
395 | sc += pos; | ||
396 | ret = sc->val; | ||
397 | return ret; | ||
398 | } | ||
399 | |||
400 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
401 | { | ||
402 | void *array; | ||
403 | unsigned long array_size; | ||
404 | unsigned long length; | ||
405 | struct swap_cgroup_ctrl *ctrl; | ||
406 | |||
407 | if (!do_swap_account) | ||
408 | return 0; | ||
409 | |||
410 | length = ((max_pages/SC_PER_PAGE) + 1); | ||
411 | array_size = length * sizeof(void *); | ||
412 | |||
413 | array = vmalloc(array_size); | ||
414 | if (!array) | ||
415 | goto nomem; | ||
416 | |||
417 | memset(array, 0, array_size); | ||
418 | ctrl = &swap_cgroup_ctrl[type]; | ||
419 | mutex_lock(&swap_cgroup_mutex); | ||
420 | ctrl->length = length; | ||
421 | ctrl->map = array; | ||
422 | if (swap_cgroup_prepare(type)) { | ||
423 | /* memory shortage */ | ||
424 | ctrl->map = NULL; | ||
425 | ctrl->length = 0; | ||
426 | vfree(array); | ||
427 | mutex_unlock(&swap_cgroup_mutex); | ||
428 | goto nomem; | ||
429 | } | ||
430 | mutex_unlock(&swap_cgroup_mutex); | ||
431 | |||
432 | printk(KERN_INFO | ||
433 | "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" | ||
434 | " and %ld bytes to hold mem_cgroup pointers on swap\n", | ||
435 | array_size, length * PAGE_SIZE); | ||
436 | printk(KERN_INFO | ||
437 | "swap_cgroup can be disabled by noswapaccount boot option.\n"); | ||
438 | |||
439 | return 0; | ||
440 | nomem: | ||
441 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
442 | printk(KERN_INFO | ||
443 | "swap_cgroup can be disabled by noswapaccount boot option\n"); | ||
444 | return -ENOMEM; | ||
445 | } | ||
446 | |||
447 | void swap_cgroup_swapoff(int type) | ||
448 | { | ||
449 | int i; | ||
450 | struct swap_cgroup_ctrl *ctrl; | ||
451 | |||
452 | if (!do_swap_account) | ||
453 | return; | ||
454 | |||
455 | mutex_lock(&swap_cgroup_mutex); | ||
456 | ctrl = &swap_cgroup_ctrl[type]; | ||
457 | if (ctrl->map) { | ||
458 | for (i = 0; i < ctrl->length; i++) { | ||
459 | struct page *page = ctrl->map[i]; | ||
460 | if (page) | ||
461 | __free_page(page); | ||
462 | } | ||
463 | vfree(ctrl->map); | ||
464 | ctrl->map = NULL; | ||
465 | ctrl->length = 0; | ||
466 | } | ||
467 | mutex_unlock(&swap_cgroup_mutex); | ||
468 | } | ||
469 | |||
470 | #endif | ||
diff --git a/mm/page_io.c b/mm/page_io.c index 065c4480eaf0..dc6ce0afbded 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
98 | struct bio *bio; | 98 | struct bio *bio; |
99 | int ret = 0, rw = WRITE; | 99 | int ret = 0, rw = WRITE; |
100 | 100 | ||
101 | if (remove_exclusive_swap_page(page)) { | 101 | if (try_to_free_swap(page)) { |
102 | unlock_page(page); | 102 | unlock_page(page); |
103 | goto out; | 103 | goto out; |
104 | } | 104 | } |
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page) | |||
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
127 | 127 | ||
128 | BUG_ON(!PageLocked(page)); | 128 | VM_BUG_ON(!PageLocked(page)); |
129 | BUG_ON(PageUptodate(page)); | 129 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
131 | end_swap_bio_read); | 131 | end_swap_bio_read); |
132 | if (bio == NULL) { | 132 | if (bio == NULL) { |
diff --git a/mm/pdflush.c b/mm/pdflush.c index a0a14c4d5072..15de509b68fd 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -172,7 +172,16 @@ static int __pdflush(struct pdflush_work *my_work) | |||
172 | static int pdflush(void *dummy) | 172 | static int pdflush(void *dummy) |
173 | { | 173 | { |
174 | struct pdflush_work my_work; | 174 | struct pdflush_work my_work; |
175 | cpumask_t cpus_allowed; | 175 | cpumask_var_t cpus_allowed; |
176 | |||
177 | /* | ||
178 | * Since the caller doesn't even check kthread_run() worked, let's not | ||
179 | * freak out too much if this fails. | ||
180 | */ | ||
181 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | ||
182 | printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); | ||
183 | return 0; | ||
184 | } | ||
176 | 185 | ||
177 | /* | 186 | /* |
178 | * pdflush can spend a lot of time doing encryption via dm-crypt. We | 187 | * pdflush can spend a lot of time doing encryption via dm-crypt. We |
@@ -187,8 +196,9 @@ static int pdflush(void *dummy) | |||
187 | * This is needed as pdflush's are dynamically created and destroyed. | 196 | * This is needed as pdflush's are dynamically created and destroyed. |
188 | * The boottime pdflush's are easily placed w/o these 2 lines. | 197 | * The boottime pdflush's are easily placed w/o these 2 lines. |
189 | */ | 198 | */ |
190 | cpuset_cpus_allowed(current, &cpus_allowed); | 199 | cpuset_cpus_allowed(current, cpus_allowed); |
191 | set_cpus_allowed_ptr(current, &cpus_allowed); | 200 | set_cpus_allowed_ptr(current, cpus_allowed); |
201 | free_cpumask_var(cpus_allowed); | ||
192 | 202 | ||
193 | return __pdflush(&my_work); | 203 | return __pdflush(&my_work); |
194 | } | 204 | } |
@@ -47,9 +47,9 @@ | |||
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | ||
51 | #include <linux/memcontrol.h> | 50 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | 51 | #include <linux/mmu_notifier.h> |
52 | #include <linux/migrate.h> | ||
53 | 53 | ||
54 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
55 | 55 | ||
@@ -191,7 +191,7 @@ void __init anon_vma_init(void) | |||
191 | * Getting a lock on a stable anon_vma from a page off the LRU is | 191 | * Getting a lock on a stable anon_vma from a page off the LRU is |
192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
193 | */ | 193 | */ |
194 | struct anon_vma *page_lock_anon_vma(struct page *page) | 194 | static struct anon_vma *page_lock_anon_vma(struct page *page) |
195 | { | 195 | { |
196 | struct anon_vma *anon_vma; | 196 | struct anon_vma *anon_vma; |
197 | unsigned long anon_mapping; | 197 | unsigned long anon_mapping; |
@@ -211,7 +211,7 @@ out: | |||
211 | return NULL; | 211 | return NULL; |
212 | } | 212 | } |
213 | 213 | ||
214 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 214 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) |
215 | { | 215 | { |
216 | spin_unlock(&anon_vma->lock); | 216 | spin_unlock(&anon_vma->lock); |
217 | rcu_read_unlock(); | 217 | rcu_read_unlock(); |
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page, | |||
359 | goto out_unmap; | 359 | goto out_unmap; |
360 | } | 360 | } |
361 | 361 | ||
362 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 362 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
363 | referenced++; | 363 | /* |
364 | * Don't treat a reference through a sequentially read | ||
365 | * mapping as such. If the page has been used in | ||
366 | * another mapping, we will catch it; if this other | ||
367 | * mapping is already gone, the unmap path will have | ||
368 | * set PG_referenced or activated the page. | ||
369 | */ | ||
370 | if (likely(!VM_SequentialReadHint(vma))) | ||
371 | referenced++; | ||
372 | } | ||
364 | 373 | ||
365 | /* Pretend the page is referenced if the task has the | 374 | /* Pretend the page is referenced if the task has the |
366 | swap token and is in the middle of a page fault. */ | 375 | swap token and is in the middle of a page fault. */ |
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page, | |||
661 | void page_add_new_anon_rmap(struct page *page, | 670 | void page_add_new_anon_rmap(struct page *page, |
662 | struct vm_area_struct *vma, unsigned long address) | 671 | struct vm_area_struct *vma, unsigned long address) |
663 | { | 672 | { |
664 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 673 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
665 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 674 | SetPageSwapBacked(page); |
675 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | ||
666 | __page_set_anon_rmap(page, vma, address); | 676 | __page_set_anon_rmap(page, vma, address); |
677 | if (page_evictable(page, vma)) | ||
678 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | ||
679 | else | ||
680 | add_page_to_unevictable_list(page); | ||
667 | } | 681 | } |
668 | 682 | ||
669 | /** | 683 | /** |
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page) | |||
693 | */ | 707 | */ |
694 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | 708 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) |
695 | { | 709 | { |
696 | BUG_ON(page_mapcount(page) == 0); | ||
697 | if (PageAnon(page)) | 710 | if (PageAnon(page)) |
698 | __page_check_anon_rmap(page, vma, address); | 711 | __page_check_anon_rmap(page, vma, address); |
699 | atomic_inc(&page->_mapcount); | 712 | atomic_inc(&page->_mapcount); |
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long | |||
703 | /** | 716 | /** |
704 | * page_remove_rmap - take down pte mapping from a page | 717 | * page_remove_rmap - take down pte mapping from a page |
705 | * @page: page to remove mapping from | 718 | * @page: page to remove mapping from |
706 | * @vma: the vm area in which the mapping is removed | ||
707 | * | 719 | * |
708 | * The caller needs to hold the pte lock. | 720 | * The caller needs to hold the pte lock. |
709 | */ | 721 | */ |
710 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | 722 | void page_remove_rmap(struct page *page) |
711 | { | 723 | { |
712 | if (atomic_add_negative(-1, &page->_mapcount)) { | 724 | if (atomic_add_negative(-1, &page->_mapcount)) { |
713 | if (unlikely(page_mapcount(page) < 0)) { | ||
714 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | ||
715 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | ||
716 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | ||
717 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | ||
718 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | ||
719 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | ||
720 | if (vma->vm_ops) { | ||
721 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); | ||
722 | } | ||
723 | if (vma->vm_file && vma->vm_file->f_op) | ||
724 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | ||
725 | BUG(); | ||
726 | } | ||
727 | |||
728 | /* | 725 | /* |
729 | * Now that the last pte has gone, s390 must transfer dirty | 726 | * Now that the last pte has gone, s390 must transfer dirty |
730 | * flag from storage key to struct page. We can usually skip | 727 | * flag from storage key to struct page. We can usually skip |
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
818 | spin_unlock(&mmlist_lock); | 815 | spin_unlock(&mmlist_lock); |
819 | } | 816 | } |
820 | dec_mm_counter(mm, anon_rss); | 817 | dec_mm_counter(mm, anon_rss); |
821 | #ifdef CONFIG_MIGRATION | 818 | } else if (PAGE_MIGRATION) { |
822 | } else { | ||
823 | /* | 819 | /* |
824 | * Store the pfn of the page in a special migration | 820 | * Store the pfn of the page in a special migration |
825 | * pte. do_swap_page() will wait until the migration | 821 | * pte. do_swap_page() will wait until the migration |
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
827 | */ | 823 | */ |
828 | BUG_ON(!migration); | 824 | BUG_ON(!migration); |
829 | entry = make_migration_entry(page, pte_write(pteval)); | 825 | entry = make_migration_entry(page, pte_write(pteval)); |
830 | #endif | ||
831 | } | 826 | } |
832 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 827 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
833 | BUG_ON(pte_file(*pte)); | 828 | BUG_ON(pte_file(*pte)); |
834 | } else | 829 | } else if (PAGE_MIGRATION && migration) { |
835 | #ifdef CONFIG_MIGRATION | ||
836 | if (migration) { | ||
837 | /* Establish migration entry for a file page */ | 830 | /* Establish migration entry for a file page */ |
838 | swp_entry_t entry; | 831 | swp_entry_t entry; |
839 | entry = make_migration_entry(page, pte_write(pteval)); | 832 | entry = make_migration_entry(page, pte_write(pteval)); |
840 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 833 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
841 | } else | 834 | } else |
842 | #endif | ||
843 | dec_mm_counter(mm, file_rss); | 835 | dec_mm_counter(mm, file_rss); |
844 | 836 | ||
845 | 837 | ||
846 | page_remove_rmap(page, vma); | 838 | page_remove_rmap(page); |
847 | page_cache_release(page); | 839 | page_cache_release(page); |
848 | 840 | ||
849 | out_unmap: | 841 | out_unmap: |
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
958 | if (pte_dirty(pteval)) | 950 | if (pte_dirty(pteval)) |
959 | set_page_dirty(page); | 951 | set_page_dirty(page); |
960 | 952 | ||
961 | page_remove_rmap(page, vma); | 953 | page_remove_rmap(page); |
962 | page_cache_release(page); | 954 | page_cache_release(page); |
963 | dec_mm_counter(mm, file_rss); | 955 | dec_mm_counter(mm, file_rss); |
964 | (*mapcount)--; | 956 | (*mapcount)--; |
diff --git a/mm/shmem.c b/mm/shmem.c index dd5588f5d939..75199888a6bd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -14,31 +14,39 @@ | |||
14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> | 14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> |
15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | 15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> |
16 | * | 16 | * |
17 | * tiny-shmem: | ||
18 | * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> | ||
19 | * | ||
17 | * This file is released under the GPL. | 20 | * This file is released under the GPL. |
18 | */ | 21 | */ |
19 | 22 | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/vfs.h> | ||
26 | #include <linux/mount.h> | ||
27 | #include <linux/file.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/swap.h> | ||
31 | |||
32 | static struct vfsmount *shm_mnt; | ||
33 | |||
34 | #ifdef CONFIG_SHMEM | ||
20 | /* | 35 | /* |
21 | * This virtual memory filesystem is heavily based on the ramfs. It | 36 | * This virtual memory filesystem is heavily based on the ramfs. It |
22 | * extends ramfs by the ability to use swap and honor resource limits | 37 | * extends ramfs by the ability to use swap and honor resource limits |
23 | * which makes it a completely usable filesystem. | 38 | * which makes it a completely usable filesystem. |
24 | */ | 39 | */ |
25 | 40 | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/fs.h> | ||
29 | #include <linux/xattr.h> | 41 | #include <linux/xattr.h> |
30 | #include <linux/exportfs.h> | 42 | #include <linux/exportfs.h> |
31 | #include <linux/generic_acl.h> | 43 | #include <linux/generic_acl.h> |
32 | #include <linux/mm.h> | ||
33 | #include <linux/mman.h> | 44 | #include <linux/mman.h> |
34 | #include <linux/file.h> | ||
35 | #include <linux/swap.h> | ||
36 | #include <linux/pagemap.h> | 45 | #include <linux/pagemap.h> |
37 | #include <linux/string.h> | 46 | #include <linux/string.h> |
38 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
39 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
40 | #include <linux/shmem_fs.h> | 49 | #include <linux/shmem_fs.h> |
41 | #include <linux/mount.h> | ||
42 | #include <linux/writeback.h> | 50 | #include <linux/writeback.h> |
43 | #include <linux/vfs.h> | 51 | #include <linux/vfs.h> |
44 | #include <linux/blkdev.h> | 52 | #include <linux/blkdev.h> |
@@ -921,7 +929,11 @@ found: | |||
921 | error = 1; | 929 | error = 1; |
922 | if (!inode) | 930 | if (!inode) |
923 | goto out; | 931 | goto out; |
924 | /* Precharge page using GFP_KERNEL while we can wait */ | 932 | /* |
933 | * Charge page using GFP_KERNEL while we can wait. | ||
934 | * Charged back to the user(not to caller) when swap account is used. | ||
935 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
936 | */ | ||
925 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 937 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
926 | if (error) | 938 | if (error) |
927 | goto out; | 939 | goto out; |
@@ -1313,15 +1325,19 @@ repeat: | |||
1313 | } else { | 1325 | } else { |
1314 | shmem_swp_unmap(entry); | 1326 | shmem_swp_unmap(entry); |
1315 | spin_unlock(&info->lock); | 1327 | spin_unlock(&info->lock); |
1316 | unlock_page(swappage); | ||
1317 | page_cache_release(swappage); | ||
1318 | if (error == -ENOMEM) { | 1328 | if (error == -ENOMEM) { |
1319 | /* allow reclaim from this memory cgroup */ | 1329 | /* allow reclaim from this memory cgroup */ |
1320 | error = mem_cgroup_shrink_usage(current->mm, | 1330 | error = mem_cgroup_shrink_usage(swappage, |
1331 | current->mm, | ||
1321 | gfp); | 1332 | gfp); |
1322 | if (error) | 1333 | if (error) { |
1334 | unlock_page(swappage); | ||
1335 | page_cache_release(swappage); | ||
1323 | goto failed; | 1336 | goto failed; |
1337 | } | ||
1324 | } | 1338 | } |
1339 | unlock_page(swappage); | ||
1340 | page_cache_release(swappage); | ||
1325 | goto repeat; | 1341 | goto repeat; |
1326 | } | 1342 | } |
1327 | } else if (sgp == SGP_READ && !filepage) { | 1343 | } else if (sgp == SGP_READ && !filepage) { |
@@ -1372,7 +1388,7 @@ repeat: | |||
1372 | 1388 | ||
1373 | /* Precharge page while we can wait, compensate after */ | 1389 | /* Precharge page while we can wait, compensate after */ |
1374 | error = mem_cgroup_cache_charge(filepage, current->mm, | 1390 | error = mem_cgroup_cache_charge(filepage, current->mm, |
1375 | gfp & ~__GFP_HIGHMEM); | 1391 | GFP_KERNEL); |
1376 | if (error) { | 1392 | if (error) { |
1377 | page_cache_release(filepage); | 1393 | page_cache_release(filepage); |
1378 | shmem_unacct_blocks(info->flags, 1); | 1394 | shmem_unacct_blocks(info->flags, 1); |
@@ -1445,7 +1461,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1445 | if (error) | 1461 | if (error) |
1446 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1462 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1447 | 1463 | ||
1448 | mark_page_accessed(vmf->page); | ||
1449 | return ret | VM_FAULT_LOCKED; | 1464 | return ret | VM_FAULT_LOCKED; |
1450 | } | 1465 | } |
1451 | 1466 | ||
@@ -2487,7 +2502,6 @@ static struct file_system_type tmpfs_fs_type = { | |||
2487 | .get_sb = shmem_get_sb, | 2502 | .get_sb = shmem_get_sb, |
2488 | .kill_sb = kill_litter_super, | 2503 | .kill_sb = kill_litter_super, |
2489 | }; | 2504 | }; |
2490 | static struct vfsmount *shm_mnt; | ||
2491 | 2505 | ||
2492 | static int __init init_tmpfs(void) | 2506 | static int __init init_tmpfs(void) |
2493 | { | 2507 | { |
@@ -2526,7 +2540,51 @@ out4: | |||
2526 | shm_mnt = ERR_PTR(error); | 2540 | shm_mnt = ERR_PTR(error); |
2527 | return error; | 2541 | return error; |
2528 | } | 2542 | } |
2529 | module_init(init_tmpfs) | 2543 | |
2544 | #else /* !CONFIG_SHMEM */ | ||
2545 | |||
2546 | /* | ||
2547 | * tiny-shmem: simple shmemfs and tmpfs using ramfs code | ||
2548 | * | ||
2549 | * This is intended for small system where the benefits of the full | ||
2550 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
2551 | * their complexity. On systems without swap this code should be | ||
2552 | * effectively equivalent, but much lighter weight. | ||
2553 | */ | ||
2554 | |||
2555 | #include <linux/ramfs.h> | ||
2556 | |||
2557 | static struct file_system_type tmpfs_fs_type = { | ||
2558 | .name = "tmpfs", | ||
2559 | .get_sb = ramfs_get_sb, | ||
2560 | .kill_sb = kill_litter_super, | ||
2561 | }; | ||
2562 | |||
2563 | static int __init init_tmpfs(void) | ||
2564 | { | ||
2565 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
2566 | |||
2567 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
2568 | BUG_ON(IS_ERR(shm_mnt)); | ||
2569 | |||
2570 | return 0; | ||
2571 | } | ||
2572 | |||
2573 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
2574 | { | ||
2575 | return 0; | ||
2576 | } | ||
2577 | |||
2578 | #define shmem_file_operations ramfs_file_operations | ||
2579 | #define shmem_vm_ops generic_file_vm_ops | ||
2580 | #define shmem_get_inode ramfs_get_inode | ||
2581 | #define shmem_acct_size(a, b) 0 | ||
2582 | #define shmem_unacct_size(a, b) do {} while (0) | ||
2583 | #define SHMEM_MAX_BYTES LLONG_MAX | ||
2584 | |||
2585 | #endif /* CONFIG_SHMEM */ | ||
2586 | |||
2587 | /* common code */ | ||
2530 | 2588 | ||
2531 | /** | 2589 | /** |
2532 | * shmem_file_setup - get an unlinked file living in tmpfs | 2590 | * shmem_file_setup - get an unlinked file living in tmpfs |
@@ -2570,12 +2628,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2570 | if (!inode) | 2628 | if (!inode) |
2571 | goto close_file; | 2629 | goto close_file; |
2572 | 2630 | ||
2573 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; | 2631 | #ifdef CONFIG_SHMEM |
2632 | SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; | ||
2633 | #endif | ||
2574 | d_instantiate(dentry, inode); | 2634 | d_instantiate(dentry, inode); |
2575 | inode->i_size = size; | 2635 | inode->i_size = size; |
2576 | inode->i_nlink = 0; /* It is unlinked */ | 2636 | inode->i_nlink = 0; /* It is unlinked */ |
2577 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | 2637 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2578 | &shmem_file_operations); | 2638 | &shmem_file_operations); |
2639 | |||
2640 | #ifndef CONFIG_MMU | ||
2641 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
2642 | if (error) | ||
2643 | goto close_file; | ||
2644 | #endif | ||
2579 | return file; | 2645 | return file; |
2580 | 2646 | ||
2581 | close_file: | 2647 | close_file: |
@@ -2608,3 +2674,5 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2608 | vma->vm_ops = &shmem_vm_ops; | 2674 | vma->vm_ops = &shmem_vm_ops; |
2609 | return 0; | 2675 | return 0; |
2610 | } | 2676 | } |
2677 | |||
2678 | module_init(init_tmpfs) | ||
@@ -2157,7 +2157,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2157 | 2157 | ||
2158 | /* | 2158 | /* |
2159 | * We use cache_chain_mutex to ensure a consistent view of | 2159 | * We use cache_chain_mutex to ensure a consistent view of |
2160 | * cpu_online_map as well. Please see cpuup_callback | 2160 | * cpu_online_mask as well. Please see cpuup_callback |
2161 | */ | 2161 | */ |
2162 | get_online_cpus(); | 2162 | get_online_cpus(); |
2163 | mutex_lock(&cache_chain_mutex); | 2163 | mutex_lock(&cache_chain_mutex); |
@@ -1970,7 +1970,7 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu, | |||
1970 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | 1970 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; |
1971 | 1971 | ||
1972 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | 1972 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); |
1973 | static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; | 1973 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); |
1974 | 1974 | ||
1975 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | 1975 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, |
1976 | int cpu, gfp_t flags) | 1976 | int cpu, gfp_t flags) |
@@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | |||
1996 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | 1996 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) |
1997 | { | 1997 | { |
1998 | if (c < per_cpu(kmem_cache_cpu, cpu) || | 1998 | if (c < per_cpu(kmem_cache_cpu, cpu) || |
1999 | c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | 1999 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { |
2000 | kfree(c); | 2000 | kfree(c); |
2001 | return; | 2001 | return; |
2002 | } | 2002 | } |
@@ -2045,13 +2045,13 @@ static void init_alloc_cpu_cpu(int cpu) | |||
2045 | { | 2045 | { |
2046 | int i; | 2046 | int i; |
2047 | 2047 | ||
2048 | if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) | 2048 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) |
2049 | return; | 2049 | return; |
2050 | 2050 | ||
2051 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | 2051 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) |
2052 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | 2052 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); |
2053 | 2053 | ||
2054 | cpu_set(cpu, kmem_cach_cpu_free_init_once); | 2054 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); |
2055 | } | 2055 | } |
2056 | 2056 | ||
2057 | static void __init init_alloc_cpu(void) | 2057 | static void __init init_alloc_cpu(void) |
@@ -2254,7 +2254,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2254 | * Add some empty padding so that we can catch | 2254 | * Add some empty padding so that we can catch |
2255 | * overwrites from earlier objects rather than let | 2255 | * overwrites from earlier objects rather than let |
2256 | * tracking information or the free pointer be | 2256 | * tracking information or the free pointer be |
2257 | * corrupted if an user writes before the start | 2257 | * corrupted if a user writes before the start |
2258 | * of the object. | 2258 | * of the object. |
2259 | */ | 2259 | */ |
2260 | size += sizeof(void *); | 2260 | size += sizeof(void *); |
@@ -3451,7 +3451,7 @@ struct location { | |||
3451 | long max_time; | 3451 | long max_time; |
3452 | long min_pid; | 3452 | long min_pid; |
3453 | long max_pid; | 3453 | long max_pid; |
3454 | cpumask_t cpus; | 3454 | DECLARE_BITMAP(cpus, NR_CPUS); |
3455 | nodemask_t nodes; | 3455 | nodemask_t nodes; |
3456 | }; | 3456 | }; |
3457 | 3457 | ||
@@ -3526,7 +3526,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
3526 | if (track->pid > l->max_pid) | 3526 | if (track->pid > l->max_pid) |
3527 | l->max_pid = track->pid; | 3527 | l->max_pid = track->pid; |
3528 | 3528 | ||
3529 | cpu_set(track->cpu, l->cpus); | 3529 | cpumask_set_cpu(track->cpu, |
3530 | to_cpumask(l->cpus)); | ||
3530 | } | 3531 | } |
3531 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | 3532 | node_set(page_to_nid(virt_to_page(track)), l->nodes); |
3532 | return 1; | 3533 | return 1; |
@@ -3556,8 +3557,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
3556 | l->max_time = age; | 3557 | l->max_time = age; |
3557 | l->min_pid = track->pid; | 3558 | l->min_pid = track->pid; |
3558 | l->max_pid = track->pid; | 3559 | l->max_pid = track->pid; |
3559 | cpus_clear(l->cpus); | 3560 | cpumask_clear(to_cpumask(l->cpus)); |
3560 | cpu_set(track->cpu, l->cpus); | 3561 | cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); |
3561 | nodes_clear(l->nodes); | 3562 | nodes_clear(l->nodes); |
3562 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | 3563 | node_set(page_to_nid(virt_to_page(track)), l->nodes); |
3563 | return 1; | 3564 | return 1; |
@@ -3638,11 +3639,12 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3638 | len += sprintf(buf + len, " pid=%ld", | 3639 | len += sprintf(buf + len, " pid=%ld", |
3639 | l->min_pid); | 3640 | l->min_pid); |
3640 | 3641 | ||
3641 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && | 3642 | if (num_online_cpus() > 1 && |
3643 | !cpumask_empty(to_cpumask(l->cpus)) && | ||
3642 | len < PAGE_SIZE - 60) { | 3644 | len < PAGE_SIZE - 60) { |
3643 | len += sprintf(buf + len, " cpus="); | 3645 | len += sprintf(buf + len, " cpus="); |
3644 | len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3646 | len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
3645 | l->cpus); | 3647 | to_cpumask(l->cpus)); |
3646 | } | 3648 | } |
3647 | 3649 | ||
3648 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3650 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && |
@@ -151,6 +151,26 @@ void rotate_reclaimable_page(struct page *page) | |||
151 | } | 151 | } |
152 | } | 152 | } |
153 | 153 | ||
154 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, | ||
155 | int file, int rotated) | ||
156 | { | ||
157 | struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; | ||
158 | struct zone_reclaim_stat *memcg_reclaim_stat; | ||
159 | |||
160 | memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); | ||
161 | |||
162 | reclaim_stat->recent_scanned[file]++; | ||
163 | if (rotated) | ||
164 | reclaim_stat->recent_rotated[file]++; | ||
165 | |||
166 | if (!memcg_reclaim_stat) | ||
167 | return; | ||
168 | |||
169 | memcg_reclaim_stat->recent_scanned[file]++; | ||
170 | if (rotated) | ||
171 | memcg_reclaim_stat->recent_rotated[file]++; | ||
172 | } | ||
173 | |||
154 | /* | 174 | /* |
155 | * FIXME: speed this up? | 175 | * FIXME: speed this up? |
156 | */ | 176 | */ |
@@ -168,10 +188,8 @@ void activate_page(struct page *page) | |||
168 | lru += LRU_ACTIVE; | 188 | lru += LRU_ACTIVE; |
169 | add_page_to_lru_list(zone, page, lru); | 189 | add_page_to_lru_list(zone, page, lru); |
170 | __count_vm_event(PGACTIVATE); | 190 | __count_vm_event(PGACTIVATE); |
171 | mem_cgroup_move_lists(page, lru); | ||
172 | 191 | ||
173 | zone->recent_rotated[!!file]++; | 192 | update_page_reclaim_stat(zone, page, !!file, 1); |
174 | zone->recent_scanned[!!file]++; | ||
175 | } | 193 | } |
176 | spin_unlock_irq(&zone->lru_lock); | 194 | spin_unlock_irq(&zone->lru_lock); |
177 | } | 195 | } |
@@ -246,25 +264,6 @@ void add_page_to_unevictable_list(struct page *page) | |||
246 | spin_unlock_irq(&zone->lru_lock); | 264 | spin_unlock_irq(&zone->lru_lock); |
247 | } | 265 | } |
248 | 266 | ||
249 | /** | ||
250 | * lru_cache_add_active_or_unevictable | ||
251 | * @page: the page to be added to LRU | ||
252 | * @vma: vma in which page is mapped for determining reclaimability | ||
253 | * | ||
254 | * place @page on active or unevictable LRU list, depending on | ||
255 | * page_evictable(). Note that if the page is not evictable, | ||
256 | * it goes directly back onto it's zone's unevictable list. It does | ||
257 | * NOT use a per cpu pagevec. | ||
258 | */ | ||
259 | void lru_cache_add_active_or_unevictable(struct page *page, | ||
260 | struct vm_area_struct *vma) | ||
261 | { | ||
262 | if (page_evictable(page, vma)) | ||
263 | lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); | ||
264 | else | ||
265 | add_page_to_unevictable_list(page); | ||
266 | } | ||
267 | |||
268 | /* | 267 | /* |
269 | * Drain pages out of the cpu's pagevecs. | 268 | * Drain pages out of the cpu's pagevecs. |
270 | * Either "cpu" is the current CPU, and preemption has already been | 269 | * Either "cpu" is the current CPU, and preemption has already been |
@@ -398,28 +397,6 @@ void __pagevec_release(struct pagevec *pvec) | |||
398 | EXPORT_SYMBOL(__pagevec_release); | 397 | EXPORT_SYMBOL(__pagevec_release); |
399 | 398 | ||
400 | /* | 399 | /* |
401 | * pagevec_release() for pages which are known to not be on the LRU | ||
402 | * | ||
403 | * This function reinitialises the caller's pagevec. | ||
404 | */ | ||
405 | void __pagevec_release_nonlru(struct pagevec *pvec) | ||
406 | { | ||
407 | int i; | ||
408 | struct pagevec pages_to_free; | ||
409 | |||
410 | pagevec_init(&pages_to_free, pvec->cold); | ||
411 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
412 | struct page *page = pvec->pages[i]; | ||
413 | |||
414 | VM_BUG_ON(PageLRU(page)); | ||
415 | if (put_page_testzero(page)) | ||
416 | pagevec_add(&pages_to_free, page); | ||
417 | } | ||
418 | pagevec_free(&pages_to_free); | ||
419 | pagevec_reinit(pvec); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * Add the passed pages to the LRU, then drop the caller's refcount | 400 | * Add the passed pages to the LRU, then drop the caller's refcount |
424 | * on them. Reinitialises the caller's pagevec. | 401 | * on them. Reinitialises the caller's pagevec. |
425 | */ | 402 | */ |
@@ -427,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | |||
427 | { | 404 | { |
428 | int i; | 405 | int i; |
429 | struct zone *zone = NULL; | 406 | struct zone *zone = NULL; |
407 | |||
430 | VM_BUG_ON(is_unevictable_lru(lru)); | 408 | VM_BUG_ON(is_unevictable_lru(lru)); |
431 | 409 | ||
432 | for (i = 0; i < pagevec_count(pvec); i++) { | 410 | for (i = 0; i < pagevec_count(pvec); i++) { |
433 | struct page *page = pvec->pages[i]; | 411 | struct page *page = pvec->pages[i]; |
434 | struct zone *pagezone = page_zone(page); | 412 | struct zone *pagezone = page_zone(page); |
435 | int file; | 413 | int file; |
414 | int active; | ||
436 | 415 | ||
437 | if (pagezone != zone) { | 416 | if (pagezone != zone) { |
438 | if (zone) | 417 | if (zone) |
@@ -444,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | |||
444 | VM_BUG_ON(PageUnevictable(page)); | 423 | VM_BUG_ON(PageUnevictable(page)); |
445 | VM_BUG_ON(PageLRU(page)); | 424 | VM_BUG_ON(PageLRU(page)); |
446 | SetPageLRU(page); | 425 | SetPageLRU(page); |
426 | active = is_active_lru(lru); | ||
447 | file = is_file_lru(lru); | 427 | file = is_file_lru(lru); |
448 | zone->recent_scanned[file]++; | 428 | if (active) |
449 | if (is_active_lru(lru)) { | ||
450 | SetPageActive(page); | 429 | SetPageActive(page); |
451 | zone->recent_rotated[file]++; | 430 | update_page_reclaim_stat(zone, page, file, active); |
452 | } | ||
453 | add_page_to_lru_list(zone, page, lru); | 431 | add_page_to_lru_list(zone, page, lru); |
454 | } | 432 | } |
455 | if (zone) | 433 | if (zone) |
@@ -495,8 +473,7 @@ void pagevec_swap_free(struct pagevec *pvec) | |||
495 | struct page *page = pvec->pages[i]; | 473 | struct page *page = pvec->pages[i]; |
496 | 474 | ||
497 | if (PageSwapCache(page) && trylock_page(page)) { | 475 | if (PageSwapCache(page) && trylock_page(page)) { |
498 | if (PageSwapCache(page)) | 476 | try_to_free_swap(page); |
499 | remove_exclusive_swap_page_ref(page); | ||
500 | unlock_page(page); | 477 | unlock_page(page); |
501 | } | 478 | } |
502 | } | 479 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3353c9029cef..3ecea98ecb45 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/backing-dev.h> | 17 | #include <linux/backing-dev.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
20 | #include <linux/page_cgroup.h> | ||
20 | 21 | ||
21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
22 | 23 | ||
@@ -72,10 +73,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
72 | { | 73 | { |
73 | int error; | 74 | int error; |
74 | 75 | ||
75 | BUG_ON(!PageLocked(page)); | 76 | VM_BUG_ON(!PageLocked(page)); |
76 | BUG_ON(PageSwapCache(page)); | 77 | VM_BUG_ON(PageSwapCache(page)); |
77 | BUG_ON(PagePrivate(page)); | 78 | VM_BUG_ON(!PageSwapBacked(page)); |
78 | BUG_ON(!PageSwapBacked(page)); | 79 | |
79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
80 | if (!error) { | 81 | if (!error) { |
81 | page_cache_get(page); | 82 | page_cache_get(page); |
@@ -108,10 +109,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
108 | */ | 109 | */ |
109 | void __delete_from_swap_cache(struct page *page) | 110 | void __delete_from_swap_cache(struct page *page) |
110 | { | 111 | { |
111 | BUG_ON(!PageLocked(page)); | 112 | swp_entry_t ent = {.val = page_private(page)}; |
112 | BUG_ON(!PageSwapCache(page)); | 113 | |
113 | BUG_ON(PageWriteback(page)); | 114 | VM_BUG_ON(!PageLocked(page)); |
114 | BUG_ON(PagePrivate(page)); | 115 | VM_BUG_ON(!PageSwapCache(page)); |
116 | VM_BUG_ON(PageWriteback(page)); | ||
115 | 117 | ||
116 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 118 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); |
117 | set_page_private(page, 0); | 119 | set_page_private(page, 0); |
@@ -119,6 +121,7 @@ void __delete_from_swap_cache(struct page *page) | |||
119 | total_swapcache_pages--; | 121 | total_swapcache_pages--; |
120 | __dec_zone_page_state(page, NR_FILE_PAGES); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
121 | INC_CACHE_INFO(del_total); | 123 | INC_CACHE_INFO(del_total); |
124 | mem_cgroup_uncharge_swapcache(page, ent); | ||
122 | } | 125 | } |
123 | 126 | ||
124 | /** | 127 | /** |
@@ -129,13 +132,13 @@ void __delete_from_swap_cache(struct page *page) | |||
129 | * Allocate swap space for the page and add the page to the | 132 | * Allocate swap space for the page and add the page to the |
130 | * swap cache. Caller needs to hold the page lock. | 133 | * swap cache. Caller needs to hold the page lock. |
131 | */ | 134 | */ |
132 | int add_to_swap(struct page * page, gfp_t gfp_mask) | 135 | int add_to_swap(struct page *page) |
133 | { | 136 | { |
134 | swp_entry_t entry; | 137 | swp_entry_t entry; |
135 | int err; | 138 | int err; |
136 | 139 | ||
137 | BUG_ON(!PageLocked(page)); | 140 | VM_BUG_ON(!PageLocked(page)); |
138 | BUG_ON(!PageUptodate(page)); | 141 | VM_BUG_ON(!PageUptodate(page)); |
139 | 142 | ||
140 | for (;;) { | 143 | for (;;) { |
141 | entry = get_swap_page(); | 144 | entry = get_swap_page(); |
@@ -154,7 +157,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) | |||
154 | * Add it to the swap cache and mark it dirty | 157 | * Add it to the swap cache and mark it dirty |
155 | */ | 158 | */ |
156 | err = add_to_swap_cache(page, entry, | 159 | err = add_to_swap_cache(page, entry, |
157 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); | 160 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); |
158 | 161 | ||
159 | switch (err) { | 162 | switch (err) { |
160 | case 0: /* Success */ | 163 | case 0: /* Success */ |
@@ -196,14 +199,14 @@ void delete_from_swap_cache(struct page *page) | |||
196 | * If we are the only user, then try to free up the swap cache. | 199 | * If we are the only user, then try to free up the swap cache. |
197 | * | 200 | * |
198 | * Its ok to check for PageSwapCache without the page lock | 201 | * Its ok to check for PageSwapCache without the page lock |
199 | * here because we are going to recheck again inside | 202 | * here because we are going to recheck again inside |
200 | * exclusive_swap_page() _with_ the lock. | 203 | * try_to_free_swap() _with_ the lock. |
201 | * - Marcelo | 204 | * - Marcelo |
202 | */ | 205 | */ |
203 | static inline void free_swap_cache(struct page *page) | 206 | static inline void free_swap_cache(struct page *page) |
204 | { | 207 | { |
205 | if (PageSwapCache(page) && trylock_page(page)) { | 208 | if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { |
206 | remove_exclusive_swap_page(page); | 209 | try_to_free_swap(page); |
207 | unlock_page(page); | 210 | unlock_page(page); |
208 | } | 211 | } |
209 | } | 212 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e5162..7e6304dfafab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shm.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | ||
19 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
20 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
21 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
@@ -32,9 +33,11 @@ | |||
32 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
33 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
34 | #include <linux/swapops.h> | 35 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | ||
35 | 37 | ||
36 | static DEFINE_SPINLOCK(swap_lock); | 38 | static DEFINE_SPINLOCK(swap_lock); |
37 | static unsigned int nr_swapfiles; | 39 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | ||
38 | long total_swap_pages; | 41 | long total_swap_pages; |
39 | static int swap_overflow; | 42 | static int swap_overflow; |
40 | static int least_priority; | 43 | static int least_priority; |
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
83 | up_read(&swap_unplug_sem); | 86 | up_read(&swap_unplug_sem); |
84 | } | 87 | } |
85 | 88 | ||
89 | /* | ||
90 | * swapon tell device that all the old swap contents can be discarded, | ||
91 | * to allow the swap device to optimize its wear-levelling. | ||
92 | */ | ||
93 | static int discard_swap(struct swap_info_struct *si) | ||
94 | { | ||
95 | struct swap_extent *se; | ||
96 | int err = 0; | ||
97 | |||
98 | list_for_each_entry(se, &si->extent_list, list) { | ||
99 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | ||
100 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | ||
101 | |||
102 | if (se->start_page == 0) { | ||
103 | /* Do not discard the swap header page! */ | ||
104 | start_block += 1 << (PAGE_SHIFT - 9); | ||
105 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
106 | if (!nr_blocks) | ||
107 | continue; | ||
108 | } | ||
109 | |||
110 | err = blkdev_issue_discard(si->bdev, start_block, | ||
111 | nr_blocks, GFP_KERNEL); | ||
112 | if (err) | ||
113 | break; | ||
114 | |||
115 | cond_resched(); | ||
116 | } | ||
117 | return err; /* That will often be -EOPNOTSUPP */ | ||
118 | } | ||
119 | |||
120 | /* | ||
121 | * swap allocation tell device that a cluster of swap can now be discarded, | ||
122 | * to allow the swap device to optimize its wear-levelling. | ||
123 | */ | ||
124 | static void discard_swap_cluster(struct swap_info_struct *si, | ||
125 | pgoff_t start_page, pgoff_t nr_pages) | ||
126 | { | ||
127 | struct swap_extent *se = si->curr_swap_extent; | ||
128 | int found_extent = 0; | ||
129 | |||
130 | while (nr_pages) { | ||
131 | struct list_head *lh; | ||
132 | |||
133 | if (se->start_page <= start_page && | ||
134 | start_page < se->start_page + se->nr_pages) { | ||
135 | pgoff_t offset = start_page - se->start_page; | ||
136 | sector_t start_block = se->start_block + offset; | ||
137 | sector_t nr_blocks = se->nr_pages - offset; | ||
138 | |||
139 | if (nr_blocks > nr_pages) | ||
140 | nr_blocks = nr_pages; | ||
141 | start_page += nr_blocks; | ||
142 | nr_pages -= nr_blocks; | ||
143 | |||
144 | if (!found_extent++) | ||
145 | si->curr_swap_extent = se; | ||
146 | |||
147 | start_block <<= PAGE_SHIFT - 9; | ||
148 | nr_blocks <<= PAGE_SHIFT - 9; | ||
149 | if (blkdev_issue_discard(si->bdev, start_block, | ||
150 | nr_blocks, GFP_NOIO)) | ||
151 | break; | ||
152 | } | ||
153 | |||
154 | lh = se->list.next; | ||
155 | if (lh == &si->extent_list) | ||
156 | lh = lh->next; | ||
157 | se = list_entry(lh, struct swap_extent, list); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static int wait_for_discard(void *word) | ||
162 | { | ||
163 | schedule(); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
86 | #define SWAPFILE_CLUSTER 256 | 167 | #define SWAPFILE_CLUSTER 256 |
87 | #define LATENCY_LIMIT 256 | 168 | #define LATENCY_LIMIT 256 |
88 | 169 | ||
89 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) |
90 | { | 171 | { |
91 | unsigned long offset, last_in_cluster; | 172 | unsigned long offset; |
173 | unsigned long scan_base; | ||
174 | unsigned long last_in_cluster = 0; | ||
92 | int latency_ration = LATENCY_LIMIT; | 175 | int latency_ration = LATENCY_LIMIT; |
176 | int found_free_cluster = 0; | ||
93 | 177 | ||
94 | /* | 178 | /* |
95 | * We try to cluster swap pages by allocating them sequentially | 179 | * We try to cluster swap pages by allocating them sequentially |
96 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this | 180 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this |
97 | * way, however, we resort to first-free allocation, starting | 181 | * way, however, we resort to first-free allocation, starting |
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
99 | * all over the entire swap partition, so that we reduce | 183 | * all over the entire swap partition, so that we reduce |
100 | * overall disk seek times between swap pages. -- sct | 184 | * overall disk seek times between swap pages. -- sct |
101 | * But we do now try to find an empty cluster. -Andrea | 185 | * But we do now try to find an empty cluster. -Andrea |
186 | * And we let swap pages go all over an SSD partition. Hugh | ||
102 | */ | 187 | */ |
103 | 188 | ||
104 | si->flags += SWP_SCANNING; | 189 | si->flags += SWP_SCANNING; |
105 | if (unlikely(!si->cluster_nr)) { | 190 | scan_base = offset = si->cluster_next; |
106 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 191 | |
107 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) | 192 | if (unlikely(!si->cluster_nr--)) { |
108 | goto lowest; | 193 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
194 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
195 | goto checks; | ||
196 | } | ||
197 | if (si->flags & SWP_DISCARDABLE) { | ||
198 | /* | ||
199 | * Start range check on racing allocations, in case | ||
200 | * they overlap the cluster we eventually decide on | ||
201 | * (we scan without swap_lock to allow preemption). | ||
202 | * It's hardly conceivable that cluster_nr could be | ||
203 | * wrapped during our scan, but don't depend on it. | ||
204 | */ | ||
205 | if (si->lowest_alloc) | ||
206 | goto checks; | ||
207 | si->lowest_alloc = si->max; | ||
208 | si->highest_alloc = 0; | ||
209 | } | ||
109 | spin_unlock(&swap_lock); | 210 | spin_unlock(&swap_lock); |
110 | 211 | ||
111 | offset = si->lowest_bit; | 212 | /* |
213 | * If seek is expensive, start searching for new cluster from | ||
214 | * start of partition, to minimize the span of allocated swap. | ||
215 | * But if seek is cheap, search from our current position, so | ||
216 | * that swap is allocated from all over the partition: if the | ||
217 | * Flash Translation Layer only remaps within limited zones, | ||
218 | * we don't want to wear out the first zone too quickly. | ||
219 | */ | ||
220 | if (!(si->flags & SWP_SOLIDSTATE)) | ||
221 | scan_base = offset = si->lowest_bit; | ||
112 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 222 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
113 | 223 | ||
114 | /* Locate the first empty (unaligned) cluster */ | 224 | /* Locate the first empty (unaligned) cluster */ |
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
117 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 227 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
118 | else if (offset == last_in_cluster) { | 228 | else if (offset == last_in_cluster) { |
119 | spin_lock(&swap_lock); | 229 | spin_lock(&swap_lock); |
120 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; | 230 | offset -= SWAPFILE_CLUSTER - 1; |
121 | goto cluster; | 231 | si->cluster_next = offset; |
232 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
233 | found_free_cluster = 1; | ||
234 | goto checks; | ||
122 | } | 235 | } |
123 | if (unlikely(--latency_ration < 0)) { | 236 | if (unlikely(--latency_ration < 0)) { |
124 | cond_resched(); | 237 | cond_resched(); |
125 | latency_ration = LATENCY_LIMIT; | 238 | latency_ration = LATENCY_LIMIT; |
126 | } | 239 | } |
127 | } | 240 | } |
241 | |||
242 | offset = si->lowest_bit; | ||
243 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
244 | |||
245 | /* Locate the first empty (unaligned) cluster */ | ||
246 | for (; last_in_cluster < scan_base; offset++) { | ||
247 | if (si->swap_map[offset]) | ||
248 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
249 | else if (offset == last_in_cluster) { | ||
250 | spin_lock(&swap_lock); | ||
251 | offset -= SWAPFILE_CLUSTER - 1; | ||
252 | si->cluster_next = offset; | ||
253 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
254 | found_free_cluster = 1; | ||
255 | goto checks; | ||
256 | } | ||
257 | if (unlikely(--latency_ration < 0)) { | ||
258 | cond_resched(); | ||
259 | latency_ration = LATENCY_LIMIT; | ||
260 | } | ||
261 | } | ||
262 | |||
263 | offset = scan_base; | ||
128 | spin_lock(&swap_lock); | 264 | spin_lock(&swap_lock); |
129 | goto lowest; | 265 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
266 | si->lowest_alloc = 0; | ||
130 | } | 267 | } |
131 | 268 | ||
132 | si->cluster_nr--; | 269 | checks: |
133 | cluster: | 270 | if (!(si->flags & SWP_WRITEOK)) |
134 | offset = si->cluster_next; | ||
135 | if (offset > si->highest_bit) | ||
136 | lowest: offset = si->lowest_bit; | ||
137 | checks: if (!(si->flags & SWP_WRITEOK)) | ||
138 | goto no_page; | 271 | goto no_page; |
139 | if (!si->highest_bit) | 272 | if (!si->highest_bit) |
140 | goto no_page; | 273 | goto no_page; |
141 | if (!si->swap_map[offset]) { | 274 | if (offset > si->highest_bit) |
142 | if (offset == si->lowest_bit) | 275 | scan_base = offset = si->lowest_bit; |
143 | si->lowest_bit++; | 276 | if (si->swap_map[offset]) |
144 | if (offset == si->highest_bit) | 277 | goto scan; |
145 | si->highest_bit--; | 278 | |
146 | si->inuse_pages++; | 279 | if (offset == si->lowest_bit) |
147 | if (si->inuse_pages == si->pages) { | 280 | si->lowest_bit++; |
148 | si->lowest_bit = si->max; | 281 | if (offset == si->highest_bit) |
149 | si->highest_bit = 0; | 282 | si->highest_bit--; |
283 | si->inuse_pages++; | ||
284 | if (si->inuse_pages == si->pages) { | ||
285 | si->lowest_bit = si->max; | ||
286 | si->highest_bit = 0; | ||
287 | } | ||
288 | si->swap_map[offset] = 1; | ||
289 | si->cluster_next = offset + 1; | ||
290 | si->flags -= SWP_SCANNING; | ||
291 | |||
292 | if (si->lowest_alloc) { | ||
293 | /* | ||
294 | * Only set when SWP_DISCARDABLE, and there's a scan | ||
295 | * for a free cluster in progress or just completed. | ||
296 | */ | ||
297 | if (found_free_cluster) { | ||
298 | /* | ||
299 | * To optimize wear-levelling, discard the | ||
300 | * old data of the cluster, taking care not to | ||
301 | * discard any of its pages that have already | ||
302 | * been allocated by racing tasks (offset has | ||
303 | * already stepped over any at the beginning). | ||
304 | */ | ||
305 | if (offset < si->highest_alloc && | ||
306 | si->lowest_alloc <= last_in_cluster) | ||
307 | last_in_cluster = si->lowest_alloc - 1; | ||
308 | si->flags |= SWP_DISCARDING; | ||
309 | spin_unlock(&swap_lock); | ||
310 | |||
311 | if (offset < last_in_cluster) | ||
312 | discard_swap_cluster(si, offset, | ||
313 | last_in_cluster - offset + 1); | ||
314 | |||
315 | spin_lock(&swap_lock); | ||
316 | si->lowest_alloc = 0; | ||
317 | si->flags &= ~SWP_DISCARDING; | ||
318 | |||
319 | smp_mb(); /* wake_up_bit advises this */ | ||
320 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
321 | |||
322 | } else if (si->flags & SWP_DISCARDING) { | ||
323 | /* | ||
324 | * Delay using pages allocated by racing tasks | ||
325 | * until the whole discard has been issued. We | ||
326 | * could defer that delay until swap_writepage, | ||
327 | * but it's easier to keep this self-contained. | ||
328 | */ | ||
329 | spin_unlock(&swap_lock); | ||
330 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
331 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
332 | spin_lock(&swap_lock); | ||
333 | } else { | ||
334 | /* | ||
335 | * Note pages allocated by racing tasks while | ||
336 | * scan for a free cluster is in progress, so | ||
337 | * that its final discard can exclude them. | ||
338 | */ | ||
339 | if (offset < si->lowest_alloc) | ||
340 | si->lowest_alloc = offset; | ||
341 | if (offset > si->highest_alloc) | ||
342 | si->highest_alloc = offset; | ||
150 | } | 343 | } |
151 | si->swap_map[offset] = 1; | ||
152 | si->cluster_next = offset + 1; | ||
153 | si->flags -= SWP_SCANNING; | ||
154 | return offset; | ||
155 | } | 344 | } |
345 | return offset; | ||
156 | 346 | ||
347 | scan: | ||
157 | spin_unlock(&swap_lock); | 348 | spin_unlock(&swap_lock); |
158 | while (++offset <= si->highest_bit) { | 349 | while (++offset <= si->highest_bit) { |
159 | if (!si->swap_map[offset]) { | 350 | if (!si->swap_map[offset]) { |
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) | |||
165 | latency_ration = LATENCY_LIMIT; | 356 | latency_ration = LATENCY_LIMIT; |
166 | } | 357 | } |
167 | } | 358 | } |
359 | offset = si->lowest_bit; | ||
360 | while (++offset < scan_base) { | ||
361 | if (!si->swap_map[offset]) { | ||
362 | spin_lock(&swap_lock); | ||
363 | goto checks; | ||
364 | } | ||
365 | if (unlikely(--latency_ration < 0)) { | ||
366 | cond_resched(); | ||
367 | latency_ration = LATENCY_LIMIT; | ||
368 | } | ||
369 | } | ||
168 | spin_lock(&swap_lock); | 370 | spin_lock(&swap_lock); |
169 | goto lowest; | ||
170 | 371 | ||
171 | no_page: | 372 | no_page: |
172 | si->flags -= SWP_SCANNING; | 373 | si->flags -= SWP_SCANNING; |
@@ -268,10 +469,11 @@ bad_nofile: | |||
268 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 469 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); |
269 | out: | 470 | out: |
270 | return NULL; | 471 | return NULL; |
271 | } | 472 | } |
272 | 473 | ||
273 | static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | 474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) |
274 | { | 475 | { |
476 | unsigned long offset = swp_offset(ent); | ||
275 | int count = p->swap_map[offset]; | 477 | int count = p->swap_map[offset]; |
276 | 478 | ||
277 | if (count < SWAP_MAP_MAX) { | 479 | if (count < SWAP_MAP_MAX) { |
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | |||
286 | swap_list.next = p - swap_info; | 488 | swap_list.next = p - swap_info; |
287 | nr_swap_pages++; | 489 | nr_swap_pages++; |
288 | p->inuse_pages--; | 490 | p->inuse_pages--; |
491 | mem_cgroup_uncharge_swap(ent); | ||
289 | } | 492 | } |
290 | } | 493 | } |
291 | return count; | 494 | return count; |
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry) | |||
301 | 504 | ||
302 | p = swap_info_get(entry); | 505 | p = swap_info_get(entry); |
303 | if (p) { | 506 | if (p) { |
304 | swap_entry_free(p, swp_offset(entry)); | 507 | swap_entry_free(p, entry); |
305 | spin_unlock(&swap_lock); | 508 | spin_unlock(&swap_lock); |
306 | } | 509 | } |
307 | } | 510 | } |
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page) | |||
326 | } | 529 | } |
327 | 530 | ||
328 | /* | 531 | /* |
329 | * We can use this swap cache entry directly | 532 | * We can write to an anon page without COW if there are no other references |
330 | * if there are no other references to it. | 533 | * to it. And as a side-effect, free up its swap: because the old content |
534 | * on disk will never be read, and seeking back there to write new content | ||
535 | * later would only waste time away from clustering. | ||
331 | */ | 536 | */ |
332 | int can_share_swap_page(struct page *page) | 537 | int reuse_swap_page(struct page *page) |
333 | { | 538 | { |
334 | int count; | 539 | int count; |
335 | 540 | ||
336 | BUG_ON(!PageLocked(page)); | 541 | VM_BUG_ON(!PageLocked(page)); |
337 | count = page_mapcount(page); | 542 | count = page_mapcount(page); |
338 | if (count <= 1 && PageSwapCache(page)) | 543 | if (count <= 1 && PageSwapCache(page)) { |
339 | count += page_swapcount(page); | 544 | count += page_swapcount(page); |
545 | if (count == 1 && !PageWriteback(page)) { | ||
546 | delete_from_swap_cache(page); | ||
547 | SetPageDirty(page); | ||
548 | } | ||
549 | } | ||
340 | return count == 1; | 550 | return count == 1; |
341 | } | 551 | } |
342 | 552 | ||
343 | /* | 553 | /* |
344 | * Work out if there are any other processes sharing this | 554 | * If swap is getting full, or if there are no more mappings of this page, |
345 | * swap cache page. Free it if you can. Return success. | 555 | * then try_to_free_swap is called to free its swap space. |
346 | */ | 556 | */ |
347 | static int remove_exclusive_swap_page_count(struct page *page, int count) | 557 | int try_to_free_swap(struct page *page) |
348 | { | 558 | { |
349 | int retval; | 559 | VM_BUG_ON(!PageLocked(page)); |
350 | struct swap_info_struct * p; | ||
351 | swp_entry_t entry; | ||
352 | |||
353 | BUG_ON(PagePrivate(page)); | ||
354 | BUG_ON(!PageLocked(page)); | ||
355 | 560 | ||
356 | if (!PageSwapCache(page)) | 561 | if (!PageSwapCache(page)) |
357 | return 0; | 562 | return 0; |
358 | if (PageWriteback(page)) | 563 | if (PageWriteback(page)) |
359 | return 0; | 564 | return 0; |
360 | if (page_count(page) != count) /* us + cache + ptes */ | 565 | if (page_swapcount(page)) |
361 | return 0; | 566 | return 0; |
362 | 567 | ||
363 | entry.val = page_private(page); | 568 | delete_from_swap_cache(page); |
364 | p = swap_info_get(entry); | 569 | SetPageDirty(page); |
365 | if (!p) | 570 | return 1; |
366 | return 0; | ||
367 | |||
368 | /* Is the only swap cache user the cache itself? */ | ||
369 | retval = 0; | ||
370 | if (p->swap_map[swp_offset(entry)] == 1) { | ||
371 | /* Recheck the page count with the swapcache lock held.. */ | ||
372 | spin_lock_irq(&swapper_space.tree_lock); | ||
373 | if ((page_count(page) == count) && !PageWriteback(page)) { | ||
374 | __delete_from_swap_cache(page); | ||
375 | SetPageDirty(page); | ||
376 | retval = 1; | ||
377 | } | ||
378 | spin_unlock_irq(&swapper_space.tree_lock); | ||
379 | } | ||
380 | spin_unlock(&swap_lock); | ||
381 | |||
382 | if (retval) { | ||
383 | swap_free(entry); | ||
384 | page_cache_release(page); | ||
385 | } | ||
386 | |||
387 | return retval; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Most of the time the page should have two references: one for the | ||
392 | * process and one for the swap cache. | ||
393 | */ | ||
394 | int remove_exclusive_swap_page(struct page *page) | ||
395 | { | ||
396 | return remove_exclusive_swap_page_count(page, 2); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The pageout code holds an extra reference to the page. That raises | ||
401 | * the reference count to test for to 2 for a page that is only in the | ||
402 | * swap cache plus 1 for each process that maps the page. | ||
403 | */ | ||
404 | int remove_exclusive_swap_page_ref(struct page *page) | ||
405 | { | ||
406 | return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); | ||
407 | } | 571 | } |
408 | 572 | ||
409 | /* | 573 | /* |
410 | * Free the swap entry like above, but also try to | 574 | * Free the swap entry like above, but also try to |
411 | * free the page cache entry if it is the last user. | 575 | * free the page cache entry if it is the last user. |
412 | */ | 576 | */ |
413 | void free_swap_and_cache(swp_entry_t entry) | 577 | int free_swap_and_cache(swp_entry_t entry) |
414 | { | 578 | { |
415 | struct swap_info_struct * p; | 579 | struct swap_info_struct *p; |
416 | struct page *page = NULL; | 580 | struct page *page = NULL; |
417 | 581 | ||
418 | if (is_migration_entry(entry)) | 582 | if (is_migration_entry(entry)) |
419 | return; | 583 | return 1; |
420 | 584 | ||
421 | p = swap_info_get(entry); | 585 | p = swap_info_get(entry); |
422 | if (p) { | 586 | if (p) { |
423 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 587 | if (swap_entry_free(p, entry) == 1) { |
424 | page = find_get_page(&swapper_space, entry.val); | 588 | page = find_get_page(&swapper_space, entry.val); |
425 | if (page && !trylock_page(page)) { | 589 | if (page && !trylock_page(page)) { |
426 | page_cache_release(page); | 590 | page_cache_release(page); |
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry) | |||
430 | spin_unlock(&swap_lock); | 594 | spin_unlock(&swap_lock); |
431 | } | 595 | } |
432 | if (page) { | 596 | if (page) { |
433 | int one_user; | 597 | /* |
434 | 598 | * Not mapped elsewhere, or swap space full? Free it! | |
435 | BUG_ON(PagePrivate(page)); | 599 | * Also recheck PageSwapCache now page is locked (above). |
436 | one_user = (page_count(page) == 2); | 600 | */ |
437 | /* Only cache user (+us), or swap space full? Free it! */ | ||
438 | /* Also recheck PageSwapCache after page is locked (above) */ | ||
439 | if (PageSwapCache(page) && !PageWriteback(page) && | 601 | if (PageSwapCache(page) && !PageWriteback(page) && |
440 | (one_user || vm_swap_full())) { | 602 | (!page_mapped(page) || vm_swap_full())) { |
441 | delete_from_swap_cache(page); | 603 | delete_from_swap_cache(page); |
442 | SetPageDirty(page); | 604 | SetPageDirty(page); |
443 | } | 605 | } |
444 | unlock_page(page); | 606 | unlock_page(page); |
445 | page_cache_release(page); | 607 | page_cache_release(page); |
446 | } | 608 | } |
609 | return p != NULL; | ||
447 | } | 610 | } |
448 | 611 | ||
449 | #ifdef CONFIG_HIBERNATION | 612 | #ifdef CONFIG_HIBERNATION |
@@ -530,17 +693,20 @@ unsigned int count_swap_pages(int type, int free) | |||
530 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 693 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
531 | unsigned long addr, swp_entry_t entry, struct page *page) | 694 | unsigned long addr, swp_entry_t entry, struct page *page) |
532 | { | 695 | { |
696 | struct mem_cgroup *ptr = NULL; | ||
533 | spinlock_t *ptl; | 697 | spinlock_t *ptl; |
534 | pte_t *pte; | 698 | pte_t *pte; |
535 | int ret = 1; | 699 | int ret = 1; |
536 | 700 | ||
537 | if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) | 701 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { |
538 | ret = -ENOMEM; | 702 | ret = -ENOMEM; |
703 | goto out_nolock; | ||
704 | } | ||
539 | 705 | ||
540 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 706 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
541 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 707 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
542 | if (ret > 0) | 708 | if (ret > 0) |
543 | mem_cgroup_uncharge_page(page); | 709 | mem_cgroup_cancel_charge_swapin(ptr); |
544 | ret = 0; | 710 | ret = 0; |
545 | goto out; | 711 | goto out; |
546 | } | 712 | } |
@@ -550,6 +716,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
550 | set_pte_at(vma->vm_mm, addr, pte, | 716 | set_pte_at(vma->vm_mm, addr, pte, |
551 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 717 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
552 | page_add_anon_rmap(page, vma, addr); | 718 | page_add_anon_rmap(page, vma, addr); |
719 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
553 | swap_free(entry); | 720 | swap_free(entry); |
554 | /* | 721 | /* |
555 | * Move the page to the active list so it is not | 722 | * Move the page to the active list so it is not |
@@ -558,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
558 | activate_page(page); | 725 | activate_page(page); |
559 | out: | 726 | out: |
560 | pte_unmap_unlock(pte, ptl); | 727 | pte_unmap_unlock(pte, ptl); |
728 | out_nolock: | ||
561 | return ret; | 729 | return ret; |
562 | } | 730 | } |
563 | 731 | ||
@@ -776,10 +944,10 @@ static int try_to_unuse(unsigned int type) | |||
776 | break; | 944 | break; |
777 | } | 945 | } |
778 | 946 | ||
779 | /* | 947 | /* |
780 | * Get a page for the entry, using the existing swap | 948 | * Get a page for the entry, using the existing swap |
781 | * cache page if there is one. Otherwise, get a clean | 949 | * cache page if there is one. Otherwise, get a clean |
782 | * page and read the swap into it. | 950 | * page and read the swap into it. |
783 | */ | 951 | */ |
784 | swap_map = &si->swap_map[i]; | 952 | swap_map = &si->swap_map[i]; |
785 | entry = swp_entry(type, i); | 953 | entry = swp_entry(type, i); |
@@ -930,7 +1098,16 @@ static int try_to_unuse(unsigned int type) | |||
930 | lock_page(page); | 1098 | lock_page(page); |
931 | wait_on_page_writeback(page); | 1099 | wait_on_page_writeback(page); |
932 | } | 1100 | } |
933 | if (PageSwapCache(page)) | 1101 | |
1102 | /* | ||
1103 | * It is conceivable that a racing task removed this page from | ||
1104 | * swap cache just before we acquired the page lock at the top, | ||
1105 | * or while we dropped it in unuse_mm(). The page might even | ||
1106 | * be back in swap cache on another swap area: that we must not | ||
1107 | * delete, since it may not have been written out to swap yet. | ||
1108 | */ | ||
1109 | if (PageSwapCache(page) && | ||
1110 | likely(page_private(page) == entry.val)) | ||
934 | delete_from_swap_cache(page); | 1111 | delete_from_swap_cache(page); |
935 | 1112 | ||
936 | /* | 1113 | /* |
@@ -1203,27 +1380,7 @@ out: | |||
1203 | return ret; | 1380 | return ret; |
1204 | } | 1381 | } |
1205 | 1382 | ||
1206 | #if 0 /* We don't need this yet */ | 1383 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1207 | #include <linux/backing-dev.h> | ||
1208 | int page_queue_congested(struct page *page) | ||
1209 | { | ||
1210 | struct backing_dev_info *bdi; | ||
1211 | |||
1212 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ | ||
1213 | |||
1214 | if (PageSwapCache(page)) { | ||
1215 | swp_entry_t entry = { .val = page_private(page) }; | ||
1216 | struct swap_info_struct *sis; | ||
1217 | |||
1218 | sis = get_swap_info_struct(swp_type(entry)); | ||
1219 | bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; | ||
1220 | } else | ||
1221 | bdi = page->mapping->backing_dev_info; | ||
1222 | return bdi_write_congested(bdi); | ||
1223 | } | ||
1224 | #endif | ||
1225 | |||
1226 | asmlinkage long sys_swapoff(const char __user * specialfile) | ||
1227 | { | 1384 | { |
1228 | struct swap_info_struct * p = NULL; | 1385 | struct swap_info_struct * p = NULL; |
1229 | unsigned short *swap_map; | 1386 | unsigned short *swap_map; |
@@ -1233,7 +1390,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1233 | char * pathname; | 1390 | char * pathname; |
1234 | int i, type, prev; | 1391 | int i, type, prev; |
1235 | int err; | 1392 | int err; |
1236 | 1393 | ||
1237 | if (!capable(CAP_SYS_ADMIN)) | 1394 | if (!capable(CAP_SYS_ADMIN)) |
1238 | return -EPERM; | 1395 | return -EPERM; |
1239 | 1396 | ||
@@ -1253,7 +1410,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1253 | spin_lock(&swap_lock); | 1410 | spin_lock(&swap_lock); |
1254 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1411 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { |
1255 | p = swap_info + type; | 1412 | p = swap_info + type; |
1256 | if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { | 1413 | if (p->flags & SWP_WRITEOK) { |
1257 | if (p->swap_file->f_mapping == mapping) | 1414 | if (p->swap_file->f_mapping == mapping) |
1258 | break; | 1415 | break; |
1259 | } | 1416 | } |
@@ -1343,6 +1500,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1343 | spin_unlock(&swap_lock); | 1500 | spin_unlock(&swap_lock); |
1344 | mutex_unlock(&swapon_mutex); | 1501 | mutex_unlock(&swapon_mutex); |
1345 | vfree(swap_map); | 1502 | vfree(swap_map); |
1503 | /* Destroy swap account information */ | ||
1504 | swap_cgroup_swapoff(type); | ||
1505 | |||
1346 | inode = mapping->host; | 1506 | inode = mapping->host; |
1347 | if (S_ISBLK(inode->i_mode)) { | 1507 | if (S_ISBLK(inode->i_mode)) { |
1348 | struct block_device *bdev = I_BDEV(inode); | 1508 | struct block_device *bdev = I_BDEV(inode); |
@@ -1426,12 +1586,12 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1426 | file = ptr->swap_file; | 1586 | file = ptr->swap_file; |
1427 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1587 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1428 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1588 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1429 | len < 40 ? 40 - len : 1, " ", | 1589 | len < 40 ? 40 - len : 1, " ", |
1430 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1590 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1431 | "partition" : "file\t", | 1591 | "partition" : "file\t", |
1432 | ptr->pages << (PAGE_SHIFT - 10), | 1592 | ptr->pages << (PAGE_SHIFT - 10), |
1433 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1593 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
1434 | ptr->prio); | 1594 | ptr->prio); |
1435 | return 0; | 1595 | return 0; |
1436 | } | 1596 | } |
1437 | 1597 | ||
@@ -1476,7 +1636,7 @@ late_initcall(max_swapfiles_check); | |||
1476 | * | 1636 | * |
1477 | * The swapon system call | 1637 | * The swapon system call |
1478 | */ | 1638 | */ |
1479 | asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | 1639 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1480 | { | 1640 | { |
1481 | struct swap_info_struct * p; | 1641 | struct swap_info_struct * p; |
1482 | char *name = NULL; | 1642 | char *name = NULL; |
@@ -1487,12 +1647,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1487 | int i, prev; | 1647 | int i, prev; |
1488 | int error; | 1648 | int error; |
1489 | union swap_header *swap_header = NULL; | 1649 | union swap_header *swap_header = NULL; |
1490 | int swap_header_version; | ||
1491 | unsigned int nr_good_pages = 0; | 1650 | unsigned int nr_good_pages = 0; |
1492 | int nr_extents = 0; | 1651 | int nr_extents = 0; |
1493 | sector_t span; | 1652 | sector_t span; |
1494 | unsigned long maxpages = 1; | 1653 | unsigned long maxpages = 1; |
1495 | int swapfilesize; | 1654 | unsigned long swapfilepages; |
1496 | unsigned short *swap_map = NULL; | 1655 | unsigned short *swap_map = NULL; |
1497 | struct page *page = NULL; | 1656 | struct page *page = NULL; |
1498 | struct inode *inode = NULL; | 1657 | struct inode *inode = NULL; |
@@ -1570,7 +1729,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1570 | goto bad_swap; | 1729 | goto bad_swap; |
1571 | } | 1730 | } |
1572 | 1731 | ||
1573 | swapfilesize = i_size_read(inode) >> PAGE_SHIFT; | 1732 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1574 | 1733 | ||
1575 | /* | 1734 | /* |
1576 | * Read the swap header. | 1735 | * Read the swap header. |
@@ -1584,102 +1743,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1584 | error = PTR_ERR(page); | 1743 | error = PTR_ERR(page); |
1585 | goto bad_swap; | 1744 | goto bad_swap; |
1586 | } | 1745 | } |
1587 | kmap(page); | 1746 | swap_header = kmap(page); |
1588 | swap_header = page_address(page); | ||
1589 | 1747 | ||
1590 | if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) | 1748 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1591 | swap_header_version = 1; | ||
1592 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) | ||
1593 | swap_header_version = 2; | ||
1594 | else { | ||
1595 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1749 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1596 | error = -EINVAL; | 1750 | error = -EINVAL; |
1597 | goto bad_swap; | 1751 | goto bad_swap; |
1598 | } | 1752 | } |
1599 | 1753 | ||
1600 | switch (swap_header_version) { | 1754 | /* swap partition endianess hack... */ |
1601 | case 1: | 1755 | if (swab32(swap_header->info.version) == 1) { |
1602 | printk(KERN_ERR "version 0 swap is no longer supported. " | 1756 | swab32s(&swap_header->info.version); |
1603 | "Use mkswap -v1 %s\n", name); | 1757 | swab32s(&swap_header->info.last_page); |
1758 | swab32s(&swap_header->info.nr_badpages); | ||
1759 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1760 | swab32s(&swap_header->info.badpages[i]); | ||
1761 | } | ||
1762 | /* Check the swap header's sub-version */ | ||
1763 | if (swap_header->info.version != 1) { | ||
1764 | printk(KERN_WARNING | ||
1765 | "Unable to handle swap header version %d\n", | ||
1766 | swap_header->info.version); | ||
1604 | error = -EINVAL; | 1767 | error = -EINVAL; |
1605 | goto bad_swap; | 1768 | goto bad_swap; |
1606 | case 2: | 1769 | } |
1607 | /* swap partition endianess hack... */ | ||
1608 | if (swab32(swap_header->info.version) == 1) { | ||
1609 | swab32s(&swap_header->info.version); | ||
1610 | swab32s(&swap_header->info.last_page); | ||
1611 | swab32s(&swap_header->info.nr_badpages); | ||
1612 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1613 | swab32s(&swap_header->info.badpages[i]); | ||
1614 | } | ||
1615 | /* Check the swap header's sub-version and the size of | ||
1616 | the swap file and bad block lists */ | ||
1617 | if (swap_header->info.version != 1) { | ||
1618 | printk(KERN_WARNING | ||
1619 | "Unable to handle swap header version %d\n", | ||
1620 | swap_header->info.version); | ||
1621 | error = -EINVAL; | ||
1622 | goto bad_swap; | ||
1623 | } | ||
1624 | 1770 | ||
1625 | p->lowest_bit = 1; | 1771 | p->lowest_bit = 1; |
1626 | p->cluster_next = 1; | 1772 | p->cluster_next = 1; |
1627 | 1773 | ||
1628 | /* | 1774 | /* |
1629 | * Find out how many pages are allowed for a single swap | 1775 | * Find out how many pages are allowed for a single swap |
1630 | * device. There are two limiting factors: 1) the number of | 1776 | * device. There are two limiting factors: 1) the number of |
1631 | * bits for the swap offset in the swp_entry_t type and | 1777 | * bits for the swap offset in the swp_entry_t type and |
1632 | * 2) the number of bits in the a swap pte as defined by | 1778 | * 2) the number of bits in the a swap pte as defined by |
1632 | * 2) the number of bits in a swap pte as defined by | 1778 | * 2) the number of bits in a swap pte as defined by |
1634 | * largest possible bit mask a swap entry with swap type 0 | 1780 | * largest possible bit mask a swap entry with swap type 0 |
1635 | * and swap offset ~0UL is created, encoded to a swap pte, | 1781 | * and swap offset ~0UL is created, encoded to a swap pte, |
1636 | * decoded to a swp_entry_t again and finally the swap | 1782 | * decoded to a swp_entry_t again and finally the swap |
1637 | * offset is extracted. This will mask all the bits from | 1783 | * offset is extracted. This will mask all the bits from |
1638 | * the initial ~0UL mask that can't be encoded in either | 1784 | * the initial ~0UL mask that can't be encoded in either |
1639 | * the swp_entry_t or the architecture definition of a | 1785 | * the swp_entry_t or the architecture definition of a |
1640 | * swap pte. | 1786 | * swap pte. |
1641 | */ | 1787 | */ |
1642 | maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; | 1788 | maxpages = swp_offset(pte_to_swp_entry( |
1643 | if (maxpages > swap_header->info.last_page) | 1789 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; |
1644 | maxpages = swap_header->info.last_page; | 1790 | if (maxpages > swap_header->info.last_page) |
1645 | p->highest_bit = maxpages - 1; | 1791 | maxpages = swap_header->info.last_page; |
1792 | p->highest_bit = maxpages - 1; | ||
1646 | 1793 | ||
1647 | error = -EINVAL; | 1794 | error = -EINVAL; |
1648 | if (!maxpages) | 1795 | if (!maxpages) |
1649 | goto bad_swap; | 1796 | goto bad_swap; |
1650 | if (swapfilesize && maxpages > swapfilesize) { | 1797 | if (swapfilepages && maxpages > swapfilepages) { |
1651 | printk(KERN_WARNING | 1798 | printk(KERN_WARNING |
1652 | "Swap area shorter than signature indicates\n"); | 1799 | "Swap area shorter than signature indicates\n"); |
1653 | goto bad_swap; | 1800 | goto bad_swap; |
1654 | } | 1801 | } |
1655 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1802 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1656 | goto bad_swap; | 1803 | goto bad_swap; |
1657 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1804 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1658 | goto bad_swap; | 1805 | goto bad_swap; |
1659 | 1806 | ||
1660 | /* OK, set up the swap map and apply the bad block list */ | 1807 | /* OK, set up the swap map and apply the bad block list */ |
1661 | swap_map = vmalloc(maxpages * sizeof(short)); | 1808 | swap_map = vmalloc(maxpages * sizeof(short)); |
1662 | if (!swap_map) { | 1809 | if (!swap_map) { |
1663 | error = -ENOMEM; | 1810 | error = -ENOMEM; |
1664 | goto bad_swap; | 1811 | goto bad_swap; |
1665 | } | 1812 | } |
1666 | 1813 | ||
1667 | error = 0; | 1814 | memset(swap_map, 0, maxpages * sizeof(short)); |
1668 | memset(swap_map, 0, maxpages * sizeof(short)); | 1815 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1669 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1816 | int page_nr = swap_header->info.badpages[i]; |
1670 | int page_nr = swap_header->info.badpages[i]; | 1817 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
1671 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) | 1818 | error = -EINVAL; |
1672 | error = -EINVAL; | ||
1673 | else | ||
1674 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1675 | } | ||
1676 | nr_good_pages = swap_header->info.last_page - | ||
1677 | swap_header->info.nr_badpages - | ||
1678 | 1 /* header page */; | ||
1679 | if (error) | ||
1680 | goto bad_swap; | 1819 | goto bad_swap; |
1820 | } | ||
1821 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1681 | } | 1822 | } |
1682 | 1823 | ||
1824 | error = swap_cgroup_swapon(type, maxpages); | ||
1825 | if (error) | ||
1826 | goto bad_swap; | ||
1827 | |||
1828 | nr_good_pages = swap_header->info.last_page - | ||
1829 | swap_header->info.nr_badpages - | ||
1830 | 1 /* header page */; | ||
1831 | |||
1683 | if (nr_good_pages) { | 1832 | if (nr_good_pages) { |
1684 | swap_map[0] = SWAP_MAP_BAD; | 1833 | swap_map[0] = SWAP_MAP_BAD; |
1685 | p->max = maxpages; | 1834 | p->max = maxpages; |
@@ -1697,6 +1846,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1697 | goto bad_swap; | 1846 | goto bad_swap; |
1698 | } | 1847 | } |
1699 | 1848 | ||
1849 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | ||
1850 | p->flags |= SWP_SOLIDSTATE; | ||
1851 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
1852 | } | ||
1853 | if (discard_swap(p) == 0) | ||
1854 | p->flags |= SWP_DISCARDABLE; | ||
1855 | |||
1700 | mutex_lock(&swapon_mutex); | 1856 | mutex_lock(&swapon_mutex); |
1701 | spin_lock(&swap_lock); | 1857 | spin_lock(&swap_lock); |
1702 | if (swap_flags & SWAP_FLAG_PREFER) | 1858 | if (swap_flags & SWAP_FLAG_PREFER) |
@@ -1705,14 +1861,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1705 | else | 1861 | else |
1706 | p->prio = --least_priority; | 1862 | p->prio = --least_priority; |
1707 | p->swap_map = swap_map; | 1863 | p->swap_map = swap_map; |
1708 | p->flags = SWP_ACTIVE; | 1864 | p->flags |= SWP_WRITEOK; |
1709 | nr_swap_pages += nr_good_pages; | 1865 | nr_swap_pages += nr_good_pages; |
1710 | total_swap_pages += nr_good_pages; | 1866 | total_swap_pages += nr_good_pages; |
1711 | 1867 | ||
1712 | printk(KERN_INFO "Adding %uk swap on %s. " | 1868 | printk(KERN_INFO "Adding %uk swap on %s. " |
1713 | "Priority:%d extents:%d across:%lluk\n", | 1869 | "Priority:%d extents:%d across:%lluk %s%s\n", |
1714 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 1870 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, |
1715 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); | 1871 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
1872 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | ||
1873 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | ||
1716 | 1874 | ||
1717 | /* insert swap space into swap_list: */ | 1875 | /* insert swap space into swap_list: */ |
1718 | prev = -1; | 1876 | prev = -1; |
@@ -1738,6 +1896,7 @@ bad_swap: | |||
1738 | bd_release(bdev); | 1896 | bd_release(bdev); |
1739 | } | 1897 | } |
1740 | destroy_swap_extents(p); | 1898 | destroy_swap_extents(p); |
1899 | swap_cgroup_swapoff(type); | ||
1741 | bad_swap_2: | 1900 | bad_swap_2: |
1742 | spin_lock(&swap_lock); | 1901 | spin_lock(&swap_lock); |
1743 | p->swap_file = NULL; | 1902 | p->swap_file = NULL; |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c deleted file mode 100644 index 3e67d575ee6e..000000000000 --- a/mm/tiny-shmem.c +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /* | ||
2 | * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code | ||
3 | * | ||
4 | * Matt Mackall <mpm@selenic.com> January, 2004 | ||
5 | * derived from mm/shmem.c and fs/ramfs/inode.c | ||
6 | * | ||
7 | * This is intended for small system where the benefits of the full | ||
8 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
9 | * their complexity. On systems without swap this code should be | ||
10 | * effectively equivalent, but much lighter weight. | ||
11 | */ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/vfs.h> | ||
16 | #include <linux/mount.h> | ||
17 | #include <linux/file.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/swap.h> | ||
21 | #include <linux/ramfs.h> | ||
22 | |||
23 | static struct file_system_type tmpfs_fs_type = { | ||
24 | .name = "tmpfs", | ||
25 | .get_sb = ramfs_get_sb, | ||
26 | .kill_sb = kill_litter_super, | ||
27 | }; | ||
28 | |||
29 | static struct vfsmount *shm_mnt; | ||
30 | |||
31 | static int __init init_tmpfs(void) | ||
32 | { | ||
33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
34 | |||
35 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
36 | BUG_ON(IS_ERR(shm_mnt)); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | module_init(init_tmpfs) | ||
41 | |||
42 | /** | ||
43 | * shmem_file_setup - get an unlinked file living in tmpfs | ||
44 | * @name: name for dentry (to be seen in /proc/<pid>/maps | ||
45 | * @size: size to be set for the file | ||
46 | * @flags: vm_flags | ||
47 | */ | ||
48 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | ||
49 | { | ||
50 | int error; | ||
51 | struct file *file; | ||
52 | struct inode *inode; | ||
53 | struct dentry *dentry, *root; | ||
54 | struct qstr this; | ||
55 | |||
56 | if (IS_ERR(shm_mnt)) | ||
57 | return (void *)shm_mnt; | ||
58 | |||
59 | error = -ENOMEM; | ||
60 | this.name = name; | ||
61 | this.len = strlen(name); | ||
62 | this.hash = 0; /* will go */ | ||
63 | root = shm_mnt->mnt_root; | ||
64 | dentry = d_alloc(root, &this); | ||
65 | if (!dentry) | ||
66 | goto put_memory; | ||
67 | |||
68 | error = -ENFILE; | ||
69 | file = get_empty_filp(); | ||
70 | if (!file) | ||
71 | goto put_dentry; | ||
72 | |||
73 | error = -ENOSPC; | ||
74 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | ||
75 | if (!inode) | ||
76 | goto close_file; | ||
77 | |||
78 | d_instantiate(dentry, inode); | ||
79 | inode->i_size = size; | ||
80 | inode->i_nlink = 0; /* It is unlinked */ | ||
81 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
82 | &ramfs_file_operations); | ||
83 | |||
84 | #ifndef CONFIG_MMU | ||
85 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
86 | if (error) | ||
87 | goto close_file; | ||
88 | #endif | ||
89 | return file; | ||
90 | |||
91 | close_file: | ||
92 | put_filp(file); | ||
93 | put_dentry: | ||
94 | dput(dentry); | ||
95 | put_memory: | ||
96 | return ERR_PTR(error); | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
99 | |||
100 | /** | ||
101 | * shmem_zero_setup - setup a shared anonymous mapping | ||
102 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | ||
103 | */ | ||
104 | int shmem_zero_setup(struct vm_area_struct *vma) | ||
105 | { | ||
106 | struct file *file; | ||
107 | loff_t size = vma->vm_end - vma->vm_start; | ||
108 | |||
109 | file = shmem_file_setup("dev/zero", size, vma->vm_flags); | ||
110 | if (IS_ERR(file)) | ||
111 | return PTR_ERR(file); | ||
112 | |||
113 | if (vma->vm_file) | ||
114 | fput(vma->vm_file); | ||
115 | vma->vm_file = file; | ||
116 | vma->vm_ops = &generic_file_vm_ops; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
121 | { | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | #ifndef CONFIG_MMU | ||
126 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
127 | unsigned long addr, | ||
128 | unsigned long len, | ||
129 | unsigned long pgoff, | ||
130 | unsigned long flags) | ||
131 | { | ||
132 | return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); | ||
133 | } | ||
134 | #endif | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1ddb77ba3995..75f49d312e8c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
24 | #include <linux/radix-tree.h> | 24 | #include <linux/radix-tree.h> |
25 | #include <linux/rcupdate.h> | 25 | #include <linux/rcupdate.h> |
26 | #include <linux/bootmem.h> | ||
26 | 27 | ||
27 | #include <asm/atomic.h> | 28 | #include <asm/atomic.h> |
28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
@@ -151,11 +152,12 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
151 | * | 152 | * |
152 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | 153 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] |
153 | */ | 154 | */ |
154 | static int vmap_page_range(unsigned long addr, unsigned long end, | 155 | static int vmap_page_range(unsigned long start, unsigned long end, |
155 | pgprot_t prot, struct page **pages) | 156 | pgprot_t prot, struct page **pages) |
156 | { | 157 | { |
157 | pgd_t *pgd; | 158 | pgd_t *pgd; |
158 | unsigned long next; | 159 | unsigned long next; |
160 | unsigned long addr = start; | ||
159 | int err = 0; | 161 | int err = 0; |
160 | int nr = 0; | 162 | int nr = 0; |
161 | 163 | ||
@@ -167,7 +169,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end, | |||
167 | if (err) | 169 | if (err) |
168 | break; | 170 | break; |
169 | } while (pgd++, addr = next, addr != end); | 171 | } while (pgd++, addr = next, addr != end); |
170 | flush_cache_vmap(addr, end); | 172 | flush_cache_vmap(start, end); |
171 | 173 | ||
172 | if (unlikely(err)) | 174 | if (unlikely(err)) |
173 | return err; | 175 | return err; |
@@ -380,8 +382,9 @@ found: | |||
380 | goto retry; | 382 | goto retry; |
381 | } | 383 | } |
382 | if (printk_ratelimit()) | 384 | if (printk_ratelimit()) |
383 | printk(KERN_WARNING "vmap allocation failed: " | 385 | printk(KERN_WARNING |
384 | "use vmalloc=<size> to increase size.\n"); | 386 | "vmap allocation for size %lu failed: " |
387 | "use vmalloc=<size> to increase size.\n", size); | ||
385 | return ERR_PTR(-EBUSY); | 388 | return ERR_PTR(-EBUSY); |
386 | } | 389 | } |
387 | 390 | ||
@@ -431,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va) | |||
431 | vunmap_page_range(va->va_start, va->va_end); | 434 | vunmap_page_range(va->va_start, va->va_end); |
432 | } | 435 | } |
433 | 436 | ||
437 | static void vmap_debug_free_range(unsigned long start, unsigned long end) | ||
438 | { | ||
439 | /* | ||
440 | * Unmap page tables and force a TLB flush immediately if | ||
441 | * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free | ||
442 | * bugs similarly to those in linear kernel virtual address | ||
443 | * space after a page has been freed. | ||
444 | * | ||
445 | * All the lazy freeing logic is still retained, in order to | ||
446 | * minimise intrusiveness of this debugging feature. | ||
447 | * | ||
448 | * This is going to be *slow* (linear kernel virtual address | ||
449 | * debugging doesn't do a broadcast TLB flush so it is a lot | ||
450 | * faster). | ||
451 | */ | ||
452 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
453 | vunmap_page_range(start, end); | ||
454 | flush_tlb_kernel_range(start, end); | ||
455 | #endif | ||
456 | } | ||
457 | |||
434 | /* | 458 | /* |
435 | * lazy_max_pages is the maximum amount of virtual address space we gather up | 459 | * lazy_max_pages is the maximum amount of virtual address space we gather up |
436 | * before attempting to purge with a TLB flush. | 460 | * before attempting to purge with a TLB flush. |
@@ -911,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) | |||
911 | BUG_ON(addr & (PAGE_SIZE-1)); | 935 | BUG_ON(addr & (PAGE_SIZE-1)); |
912 | 936 | ||
913 | debug_check_no_locks_freed(mem, size); | 937 | debug_check_no_locks_freed(mem, size); |
938 | vmap_debug_free_range(addr, addr+size); | ||
914 | 939 | ||
915 | if (likely(count <= VMAP_MAX_ALLOC)) | 940 | if (likely(count <= VMAP_MAX_ALLOC)) |
916 | vb_free(mem, size); | 941 | vb_free(mem, size); |
@@ -959,6 +984,8 @@ EXPORT_SYMBOL(vm_map_ram); | |||
959 | 984 | ||
960 | void __init vmalloc_init(void) | 985 | void __init vmalloc_init(void) |
961 | { | 986 | { |
987 | struct vmap_area *va; | ||
988 | struct vm_struct *tmp; | ||
962 | int i; | 989 | int i; |
963 | 990 | ||
964 | for_each_possible_cpu(i) { | 991 | for_each_possible_cpu(i) { |
@@ -971,6 +998,14 @@ void __init vmalloc_init(void) | |||
971 | vbq->nr_dirty = 0; | 998 | vbq->nr_dirty = 0; |
972 | } | 999 | } |
973 | 1000 | ||
1001 | /* Import existing vmlist entries. */ | ||
1002 | for (tmp = vmlist; tmp; tmp = tmp->next) { | ||
1003 | va = alloc_bootmem(sizeof(struct vmap_area)); | ||
1004 | va->flags = tmp->flags | VM_VM_AREA; | ||
1005 | va->va_start = (unsigned long)tmp->addr; | ||
1006 | va->va_end = va->va_start + tmp->size; | ||
1007 | __insert_vmap_area(va); | ||
1008 | } | ||
974 | vmap_initialized = true; | 1009 | vmap_initialized = true; |
975 | } | 1010 | } |
976 | 1011 | ||
@@ -1127,6 +1162,8 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1127 | if (va && va->flags & VM_VM_AREA) { | 1162 | if (va && va->flags & VM_VM_AREA) { |
1128 | struct vm_struct *vm = va->private; | 1163 | struct vm_struct *vm = va->private; |
1129 | struct vm_struct *tmp, **p; | 1164 | struct vm_struct *tmp, **p; |
1165 | |||
1166 | vmap_debug_free_range(va->va_start, va->va_end); | ||
1130 | free_unmap_vmap_area(va); | 1167 | free_unmap_vmap_area(va); |
1131 | vm->size -= PAGE_SIZE; | 1168 | vm->size -= PAGE_SIZE; |
1132 | 1169 | ||
@@ -1374,7 +1411,8 @@ void *vmalloc_user(unsigned long size) | |||
1374 | struct vm_struct *area; | 1411 | struct vm_struct *area; |
1375 | void *ret; | 1412 | void *ret; |
1376 | 1413 | ||
1377 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1414 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
1415 | PAGE_KERNEL, -1, __builtin_return_address(0)); | ||
1378 | if (ret) { | 1416 | if (ret) { |
1379 | area = find_vm_area(ret); | 1417 | area = find_vm_area(ret); |
1380 | area->flags |= VM_USERMAP; | 1418 | area->flags |= VM_USERMAP; |
@@ -1419,7 +1457,8 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1419 | 1457 | ||
1420 | void *vmalloc_exec(unsigned long size) | 1458 | void *vmalloc_exec(unsigned long size) |
1421 | { | 1459 | { |
1422 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 1460 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1461 | -1, __builtin_return_address(0)); | ||
1423 | } | 1462 | } |
1424 | 1463 | ||
1425 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1464 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
@@ -1439,7 +1478,8 @@ void *vmalloc_exec(unsigned long size) | |||
1439 | */ | 1478 | */ |
1440 | void *vmalloc_32(unsigned long size) | 1479 | void *vmalloc_32(unsigned long size) |
1441 | { | 1480 | { |
1442 | return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); | 1481 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, |
1482 | -1, __builtin_return_address(0)); | ||
1443 | } | 1483 | } |
1444 | EXPORT_SYMBOL(vmalloc_32); | 1484 | EXPORT_SYMBOL(vmalloc_32); |
1445 | 1485 | ||
@@ -1455,7 +1495,8 @@ void *vmalloc_32_user(unsigned long size) | |||
1455 | struct vm_struct *area; | 1495 | struct vm_struct *area; |
1456 | void *ret; | 1496 | void *ret; |
1457 | 1497 | ||
1458 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1498 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1499 | -1, __builtin_return_address(0)); | ||
1459 | if (ret) { | 1500 | if (ret) { |
1460 | area = find_vm_area(ret); | 1501 | area = find_vm_area(ret); |
1461 | area->flags |= VM_USERMAP; | 1502 | area->flags |= VM_USERMAP; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 62e7f62fb559..9a27c44aa327 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -52,6 +52,9 @@ struct scan_control { | |||
52 | /* Incremented by the number of inactive pages that were scanned */ | 52 | /* Incremented by the number of inactive pages that were scanned */ |
53 | unsigned long nr_scanned; | 53 | unsigned long nr_scanned; |
54 | 54 | ||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | ||
56 | unsigned long nr_reclaimed; | ||
57 | |||
55 | /* This context's GFP mask */ | 58 | /* This context's GFP mask */ |
56 | gfp_t gfp_mask; | 59 | gfp_t gfp_mask; |
57 | 60 | ||
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list); | |||
122 | static DECLARE_RWSEM(shrinker_rwsem); | 125 | static DECLARE_RWSEM(shrinker_rwsem); |
123 | 126 | ||
124 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 127 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
125 | #define scan_global_lru(sc) (!(sc)->mem_cgroup) | 128 | #define scanning_global_lru(sc) (!(sc)->mem_cgroup) |
126 | #else | 129 | #else |
127 | #define scan_global_lru(sc) (1) | 130 | #define scanning_global_lru(sc) (1) |
128 | #endif | 131 | #endif |
129 | 132 | ||
133 | static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, | ||
134 | struct scan_control *sc) | ||
135 | { | ||
136 | if (!scanning_global_lru(sc)) | ||
137 | return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); | ||
138 | |||
139 | return &zone->reclaim_stat; | ||
140 | } | ||
141 | |||
142 | static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, | ||
143 | enum lru_list lru) | ||
144 | { | ||
145 | if (!scanning_global_lru(sc)) | ||
146 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); | ||
147 | |||
148 | return zone_page_state(zone, NR_LRU_BASE + lru); | ||
149 | } | ||
150 | |||
151 | |||
130 | /* | 152 | /* |
131 | * Add a shrinker callback to be called from the vm | 153 | * Add a shrinker callback to be called from the vm |
132 | */ | 154 | */ |
@@ -509,7 +531,6 @@ redo: | |||
509 | lru = LRU_UNEVICTABLE; | 531 | lru = LRU_UNEVICTABLE; |
510 | add_page_to_unevictable_list(page); | 532 | add_page_to_unevictable_list(page); |
511 | } | 533 | } |
512 | mem_cgroup_move_lists(page, lru); | ||
513 | 534 | ||
514 | /* | 535 | /* |
515 | * page's status can change while we move it among lru. If an evictable | 536 | * page's status can change while we move it among lru. If an evictable |
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page) | |||
544 | 565 | ||
545 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | 566 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); |
546 | lru_cache_add_lru(page, lru); | 567 | lru_cache_add_lru(page, lru); |
547 | mem_cgroup_move_lists(page, lru); | ||
548 | put_page(page); | 568 | put_page(page); |
549 | } | 569 | } |
550 | #endif /* CONFIG_UNEVICTABLE_LRU */ | 570 | #endif /* CONFIG_UNEVICTABLE_LRU */ |
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
617 | referenced && page_mapping_inuse(page)) | 637 | referenced && page_mapping_inuse(page)) |
618 | goto activate_locked; | 638 | goto activate_locked; |
619 | 639 | ||
620 | #ifdef CONFIG_SWAP | ||
621 | /* | 640 | /* |
622 | * Anonymous process memory has backing store? | 641 | * Anonymous process memory has backing store? |
623 | * Try to allocate it some swap space here. | 642 | * Try to allocate it some swap space here. |
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
625 | if (PageAnon(page) && !PageSwapCache(page)) { | 644 | if (PageAnon(page) && !PageSwapCache(page)) { |
626 | if (!(sc->gfp_mask & __GFP_IO)) | 645 | if (!(sc->gfp_mask & __GFP_IO)) |
627 | goto keep_locked; | 646 | goto keep_locked; |
628 | switch (try_to_munlock(page)) { | 647 | if (!add_to_swap(page)) |
629 | case SWAP_FAIL: /* shouldn't happen */ | ||
630 | case SWAP_AGAIN: | ||
631 | goto keep_locked; | ||
632 | case SWAP_MLOCK: | ||
633 | goto cull_mlocked; | ||
634 | case SWAP_SUCCESS: | ||
635 | ; /* fall thru'; add to swap cache */ | ||
636 | } | ||
637 | if (!add_to_swap(page, GFP_ATOMIC)) | ||
638 | goto activate_locked; | 648 | goto activate_locked; |
639 | may_enter_fs = 1; | 649 | may_enter_fs = 1; |
640 | } | 650 | } |
641 | #endif /* CONFIG_SWAP */ | ||
642 | 651 | ||
643 | mapping = page_mapping(page); | 652 | mapping = page_mapping(page); |
644 | 653 | ||
@@ -752,6 +761,8 @@ free_it: | |||
752 | continue; | 761 | continue; |
753 | 762 | ||
754 | cull_mlocked: | 763 | cull_mlocked: |
764 | if (PageSwapCache(page)) | ||
765 | try_to_free_swap(page); | ||
755 | unlock_page(page); | 766 | unlock_page(page); |
756 | putback_lru_page(page); | 767 | putback_lru_page(page); |
757 | continue; | 768 | continue; |
@@ -759,7 +770,7 @@ cull_mlocked: | |||
759 | activate_locked: | 770 | activate_locked: |
760 | /* Not a candidate for swapping, so reclaim swap space. */ | 771 | /* Not a candidate for swapping, so reclaim swap space. */ |
761 | if (PageSwapCache(page) && vm_swap_full()) | 772 | if (PageSwapCache(page) && vm_swap_full()) |
762 | remove_exclusive_swap_page_ref(page); | 773 | try_to_free_swap(page); |
763 | VM_BUG_ON(PageActive(page)); | 774 | VM_BUG_ON(PageActive(page)); |
764 | SetPageActive(page); | 775 | SetPageActive(page); |
765 | pgactivate++; | 776 | pgactivate++; |
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
819 | return ret; | 830 | return ret; |
820 | 831 | ||
821 | ret = -EBUSY; | 832 | ret = -EBUSY; |
833 | |||
822 | if (likely(get_page_unless_zero(page))) { | 834 | if (likely(get_page_unless_zero(page))) { |
823 | /* | 835 | /* |
824 | * Be careful not to clear PageLRU until after we're | 836 | * Be careful not to clear PageLRU until after we're |
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
827 | */ | 839 | */ |
828 | ClearPageLRU(page); | 840 | ClearPageLRU(page); |
829 | ret = 0; | 841 | ret = 0; |
842 | mem_cgroup_del_lru(page); | ||
830 | } | 843 | } |
831 | 844 | ||
832 | return ret; | 845 | return ret; |
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1035 | struct pagevec pvec; | 1048 | struct pagevec pvec; |
1036 | unsigned long nr_scanned = 0; | 1049 | unsigned long nr_scanned = 0; |
1037 | unsigned long nr_reclaimed = 0; | 1050 | unsigned long nr_reclaimed = 0; |
1051 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
1038 | 1052 | ||
1039 | pagevec_init(&pvec, 1); | 1053 | pagevec_init(&pvec, 1); |
1040 | 1054 | ||
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1076 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | 1090 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, |
1077 | -count[LRU_INACTIVE_ANON]); | 1091 | -count[LRU_INACTIVE_ANON]); |
1078 | 1092 | ||
1079 | if (scan_global_lru(sc)) { | 1093 | if (scanning_global_lru(sc)) |
1080 | zone->pages_scanned += nr_scan; | 1094 | zone->pages_scanned += nr_scan; |
1081 | zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1095 | |
1082 | zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1096 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; |
1083 | zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | 1097 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; |
1084 | zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | 1098 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; |
1085 | } | 1099 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; |
1100 | |||
1086 | spin_unlock_irq(&zone->lru_lock); | 1101 | spin_unlock_irq(&zone->lru_lock); |
1087 | 1102 | ||
1088 | nr_scanned += nr_scan; | 1103 | nr_scanned += nr_scan; |
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1114 | if (current_is_kswapd()) { | 1129 | if (current_is_kswapd()) { |
1115 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | 1130 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
1116 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 1131 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
1117 | } else if (scan_global_lru(sc)) | 1132 | } else if (scanning_global_lru(sc)) |
1118 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 1133 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
1119 | 1134 | ||
1120 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 1135 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1140 | SetPageLRU(page); | 1155 | SetPageLRU(page); |
1141 | lru = page_lru(page); | 1156 | lru = page_lru(page); |
1142 | add_page_to_lru_list(zone, page, lru); | 1157 | add_page_to_lru_list(zone, page, lru); |
1143 | mem_cgroup_move_lists(page, lru); | 1158 | if (PageActive(page)) { |
1144 | if (PageActive(page) && scan_global_lru(sc)) { | ||
1145 | int file = !!page_is_file_cache(page); | 1159 | int file = !!page_is_file_cache(page); |
1146 | zone->recent_rotated[file]++; | 1160 | reclaim_stat->recent_rotated[file]++; |
1147 | } | 1161 | } |
1148 | if (!pagevec_add(&pvec, page)) { | 1162 | if (!pagevec_add(&pvec, page)) { |
1149 | spin_unlock_irq(&zone->lru_lock); | 1163 | spin_unlock_irq(&zone->lru_lock); |
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1173 | zone->prev_priority = priority; | 1187 | zone->prev_priority = priority; |
1174 | } | 1188 | } |
1175 | 1189 | ||
1176 | static inline int zone_is_near_oom(struct zone *zone) | ||
1177 | { | ||
1178 | return zone->pages_scanned >= (zone_lru_pages(zone) * 3); | ||
1179 | } | ||
1180 | |||
1181 | /* | 1190 | /* |
1182 | * This moves pages from the active list to the inactive list. | 1191 | * This moves pages from the active list to the inactive list. |
1183 | * | 1192 | * |
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1208 | struct page *page; | 1217 | struct page *page; |
1209 | struct pagevec pvec; | 1218 | struct pagevec pvec; |
1210 | enum lru_list lru; | 1219 | enum lru_list lru; |
1220 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
1211 | 1221 | ||
1212 | lru_add_drain(); | 1222 | lru_add_drain(); |
1213 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1218 | * zone->pages_scanned is used for detect zone's oom | 1228 | * zone->pages_scanned is used for detect zone's oom |
1219 | * mem_cgroup remembers nr_scan by itself. | 1229 | * mem_cgroup remembers nr_scan by itself. |
1220 | */ | 1230 | */ |
1221 | if (scan_global_lru(sc)) { | 1231 | if (scanning_global_lru(sc)) { |
1222 | zone->pages_scanned += pgscanned; | 1232 | zone->pages_scanned += pgscanned; |
1223 | zone->recent_scanned[!!file] += pgmoved; | ||
1224 | } | 1233 | } |
1234 | reclaim_stat->recent_scanned[!!file] += pgmoved; | ||
1225 | 1235 | ||
1226 | if (file) | 1236 | if (file) |
1227 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1237 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1248 | list_add(&page->lru, &l_inactive); | 1258 | list_add(&page->lru, &l_inactive); |
1249 | } | 1259 | } |
1250 | 1260 | ||
1261 | /* | ||
1262 | * Move the pages to the [file or anon] inactive list. | ||
1263 | */ | ||
1264 | pagevec_init(&pvec, 1); | ||
1265 | pgmoved = 0; | ||
1266 | lru = LRU_BASE + file * LRU_FILE; | ||
1267 | |||
1251 | spin_lock_irq(&zone->lru_lock); | 1268 | spin_lock_irq(&zone->lru_lock); |
1252 | /* | 1269 | /* |
1253 | * Count referenced pages from currently used mappings as | 1270 | * Count referenced pages from currently used mappings as |
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1255 | * This helps balance scan pressure between file and anonymous | 1272 | * This helps balance scan pressure between file and anonymous |
1256 | * pages in get_scan_ratio. | 1273 | * pages in get_scan_ratio. |
1257 | */ | 1274 | */ |
1258 | zone->recent_rotated[!!file] += pgmoved; | 1275 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
1259 | 1276 | ||
1260 | /* | ||
1261 | * Move the pages to the [file or anon] inactive list. | ||
1262 | */ | ||
1263 | pagevec_init(&pvec, 1); | ||
1264 | |||
1265 | pgmoved = 0; | ||
1266 | lru = LRU_BASE + file * LRU_FILE; | ||
1267 | while (!list_empty(&l_inactive)) { | 1277 | while (!list_empty(&l_inactive)) { |
1268 | page = lru_to_page(&l_inactive); | 1278 | page = lru_to_page(&l_inactive); |
1269 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1279 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1273 | ClearPageActive(page); | 1283 | ClearPageActive(page); |
1274 | 1284 | ||
1275 | list_move(&page->lru, &zone->lru[lru].list); | 1285 | list_move(&page->lru, &zone->lru[lru].list); |
1276 | mem_cgroup_move_lists(page, lru); | 1286 | mem_cgroup_add_lru_list(page, lru); |
1277 | pgmoved++; | 1287 | pgmoved++; |
1278 | if (!pagevec_add(&pvec, page)) { | 1288 | if (!pagevec_add(&pvec, page)) { |
1279 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | 1289 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1302 | pagevec_release(&pvec); | 1312 | pagevec_release(&pvec); |
1303 | } | 1313 | } |
1304 | 1314 | ||
1315 | static int inactive_anon_is_low_global(struct zone *zone) | ||
1316 | { | ||
1317 | unsigned long active, inactive; | ||
1318 | |||
1319 | active = zone_page_state(zone, NR_ACTIVE_ANON); | ||
1320 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | ||
1321 | |||
1322 | if (inactive * zone->inactive_ratio < active) | ||
1323 | return 1; | ||
1324 | |||
1325 | return 0; | ||
1326 | } | ||
1327 | |||
1328 | /** | ||
1329 | * inactive_anon_is_low - check if anonymous pages need to be deactivated | ||
1330 | * @zone: zone to check | ||
1331 | * @sc: scan control of this context | ||
1332 | * | ||
1333 | * Returns true if the zone does not have enough inactive anon pages, | ||
1334 | * meaning some active anon pages need to be deactivated. | ||
1335 | */ | ||
1336 | static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | ||
1337 | { | ||
1338 | int low; | ||
1339 | |||
1340 | if (scanning_global_lru(sc)) | ||
1341 | low = inactive_anon_is_low_global(zone); | ||
1342 | else | ||
1343 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | ||
1344 | return low; | ||
1345 | } | ||
1346 | |||
1305 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1347 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1306 | struct zone *zone, struct scan_control *sc, int priority) | 1348 | struct zone *zone, struct scan_control *sc, int priority) |
1307 | { | 1349 | { |
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1312 | return 0; | 1354 | return 0; |
1313 | } | 1355 | } |
1314 | 1356 | ||
1315 | if (lru == LRU_ACTIVE_ANON && | 1357 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { |
1316 | (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { | ||
1317 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1358 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
1318 | return 0; | 1359 | return 0; |
1319 | } | 1360 | } |
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1335 | unsigned long anon, file, free; | 1376 | unsigned long anon, file, free; |
1336 | unsigned long anon_prio, file_prio; | 1377 | unsigned long anon_prio, file_prio; |
1337 | unsigned long ap, fp; | 1378 | unsigned long ap, fp; |
1338 | 1379 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | |
1339 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
1340 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
1341 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1342 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1343 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1344 | 1380 | ||
1345 | /* If we have no swap space, do not bother scanning anon pages. */ | 1381 | /* If we have no swap space, do not bother scanning anon pages. */ |
1346 | if (nr_swap_pages <= 0) { | 1382 | if (nr_swap_pages <= 0) { |
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1349 | return; | 1385 | return; |
1350 | } | 1386 | } |
1351 | 1387 | ||
1352 | /* If we have very few page cache pages, force-scan anon pages. */ | 1388 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
1353 | if (unlikely(file + free <= zone->pages_high)) { | 1389 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
1354 | percent[0] = 100; | 1390 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
1355 | percent[1] = 0; | 1391 | zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); |
1356 | return; | 1392 | |
1393 | if (scanning_global_lru(sc)) { | ||
1394 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1395 | /* If we have very few page cache pages, | ||
1396 | force-scan anon pages. */ | ||
1397 | if (unlikely(file + free <= zone->pages_high)) { | ||
1398 | percent[0] = 100; | ||
1399 | percent[1] = 0; | ||
1400 | return; | ||
1401 | } | ||
1357 | } | 1402 | } |
1358 | 1403 | ||
1359 | /* | 1404 | /* |
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1367 | * | 1412 | * |
1368 | * anon in [0], file in [1] | 1413 | * anon in [0], file in [1] |
1369 | */ | 1414 | */ |
1370 | if (unlikely(zone->recent_scanned[0] > anon / 4)) { | 1415 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1371 | spin_lock_irq(&zone->lru_lock); | 1416 | spin_lock_irq(&zone->lru_lock); |
1372 | zone->recent_scanned[0] /= 2; | 1417 | reclaim_stat->recent_scanned[0] /= 2; |
1373 | zone->recent_rotated[0] /= 2; | 1418 | reclaim_stat->recent_rotated[0] /= 2; |
1374 | spin_unlock_irq(&zone->lru_lock); | 1419 | spin_unlock_irq(&zone->lru_lock); |
1375 | } | 1420 | } |
1376 | 1421 | ||
1377 | if (unlikely(zone->recent_scanned[1] > file / 4)) { | 1422 | if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { |
1378 | spin_lock_irq(&zone->lru_lock); | 1423 | spin_lock_irq(&zone->lru_lock); |
1379 | zone->recent_scanned[1] /= 2; | 1424 | reclaim_stat->recent_scanned[1] /= 2; |
1380 | zone->recent_rotated[1] /= 2; | 1425 | reclaim_stat->recent_rotated[1] /= 2; |
1381 | spin_unlock_irq(&zone->lru_lock); | 1426 | spin_unlock_irq(&zone->lru_lock); |
1382 | } | 1427 | } |
1383 | 1428 | ||
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1393 | * proportional to the fraction of recently scanned pages on | 1438 | * proportional to the fraction of recently scanned pages on |
1394 | * each list that were recently referenced and in active use. | 1439 | * each list that were recently referenced and in active use. |
1395 | */ | 1440 | */ |
1396 | ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); | 1441 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); |
1397 | ap /= zone->recent_rotated[0] + 1; | 1442 | ap /= reclaim_stat->recent_rotated[0] + 1; |
1398 | 1443 | ||
1399 | fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); | 1444 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); |
1400 | fp /= zone->recent_rotated[1] + 1; | 1445 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1401 | 1446 | ||
1402 | /* Normalize to percentages */ | 1447 | /* Normalize to percentages */ |
1403 | percent[0] = 100 * ap / (ap + fp + 1); | 1448 | percent[0] = 100 * ap / (ap + fp + 1); |
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1408 | /* | 1453 | /* |
1409 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1454 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1410 | */ | 1455 | */ |
1411 | static unsigned long shrink_zone(int priority, struct zone *zone, | 1456 | static void shrink_zone(int priority, struct zone *zone, |
1412 | struct scan_control *sc) | 1457 | struct scan_control *sc) |
1413 | { | 1458 | { |
1414 | unsigned long nr[NR_LRU_LISTS]; | 1459 | unsigned long nr[NR_LRU_LISTS]; |
1415 | unsigned long nr_to_scan; | 1460 | unsigned long nr_to_scan; |
1416 | unsigned long nr_reclaimed = 0; | ||
1417 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1461 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1418 | enum lru_list l; | 1462 | enum lru_list l; |
1463 | unsigned long nr_reclaimed = sc->nr_reclaimed; | ||
1464 | unsigned long swap_cluster_max = sc->swap_cluster_max; | ||
1419 | 1465 | ||
1420 | get_scan_ratio(zone, sc, percent); | 1466 | get_scan_ratio(zone, sc, percent); |
1421 | 1467 | ||
1422 | for_each_evictable_lru(l) { | 1468 | for_each_evictable_lru(l) { |
1423 | if (scan_global_lru(sc)) { | 1469 | int file = is_file_lru(l); |
1424 | int file = is_file_lru(l); | 1470 | int scan; |
1425 | int scan; | 1471 | |
1426 | 1472 | scan = zone_page_state(zone, NR_LRU_BASE + l); | |
1427 | scan = zone_page_state(zone, NR_LRU_BASE + l); | 1473 | if (priority) { |
1428 | if (priority) { | 1474 | scan >>= priority; |
1429 | scan >>= priority; | 1475 | scan = (scan * percent[file]) / 100; |
1430 | scan = (scan * percent[file]) / 100; | 1476 | } |
1431 | } | 1477 | if (scanning_global_lru(sc)) { |
1432 | zone->lru[l].nr_scan += scan; | 1478 | zone->lru[l].nr_scan += scan; |
1433 | nr[l] = zone->lru[l].nr_scan; | 1479 | nr[l] = zone->lru[l].nr_scan; |
1434 | if (nr[l] >= sc->swap_cluster_max) | 1480 | if (nr[l] >= swap_cluster_max) |
1435 | zone->lru[l].nr_scan = 0; | 1481 | zone->lru[l].nr_scan = 0; |
1436 | else | 1482 | else |
1437 | nr[l] = 0; | 1483 | nr[l] = 0; |
1438 | } else { | 1484 | } else |
1439 | /* | 1485 | nr[l] = scan; |
1440 | * This reclaim occurs not because zone memory shortage | ||
1441 | * but because memory controller hits its limit. | ||
1442 | * Don't modify zone reclaim related data. | ||
1443 | */ | ||
1444 | nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, | ||
1445 | priority, l); | ||
1446 | } | ||
1447 | } | 1486 | } |
1448 | 1487 | ||
1449 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1488 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1450 | nr[LRU_INACTIVE_FILE]) { | 1489 | nr[LRU_INACTIVE_FILE]) { |
1451 | for_each_evictable_lru(l) { | 1490 | for_each_evictable_lru(l) { |
1452 | if (nr[l]) { | 1491 | if (nr[l]) { |
1453 | nr_to_scan = min(nr[l], | 1492 | nr_to_scan = min(nr[l], swap_cluster_max); |
1454 | (unsigned long)sc->swap_cluster_max); | ||
1455 | nr[l] -= nr_to_scan; | 1493 | nr[l] -= nr_to_scan; |
1456 | 1494 | ||
1457 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1495 | nr_reclaimed += shrink_list(l, nr_to_scan, |
1458 | zone, sc, priority); | 1496 | zone, sc, priority); |
1459 | } | 1497 | } |
1460 | } | 1498 | } |
1499 | /* | ||
1500 | * On large memory systems, scan >> priority can become | ||
1501 | * really large. This is fine for the starting priority; | ||
1502 | * we want to put equal scanning pressure on each zone. | ||
1503 | * However, if the VM has a harder time of freeing pages, | ||
1504 | * with multiple processes reclaiming pages, the total | ||
1505 | * freeing target can get unreasonably large. | ||
1506 | */ | ||
1507 | if (nr_reclaimed > swap_cluster_max && | ||
1508 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1509 | break; | ||
1461 | } | 1510 | } |
1462 | 1511 | ||
1512 | sc->nr_reclaimed = nr_reclaimed; | ||
1513 | |||
1463 | /* | 1514 | /* |
1464 | * Even if we did not try to evict anon pages at all, we want to | 1515 | * Even if we did not try to evict anon pages at all, we want to |
1465 | * rebalance the anon lru active/inactive ratio. | 1516 | * rebalance the anon lru active/inactive ratio. |
1466 | */ | 1517 | */ |
1467 | if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) | 1518 | if (inactive_anon_is_low(zone, sc)) |
1468 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
1469 | else if (!scan_global_lru(sc)) | ||
1470 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1519 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1471 | 1520 | ||
1472 | throttle_vm_writeout(sc->gfp_mask); | 1521 | throttle_vm_writeout(sc->gfp_mask); |
1473 | return nr_reclaimed; | ||
1474 | } | 1522 | } |
1475 | 1523 | ||
1476 | /* | 1524 | /* |
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1484 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1532 | * b) The zones may be over pages_high but they must go *over* pages_high to |
1485 | * satisfy the `incremental min' zone defense algorithm. | 1533 | * satisfy the `incremental min' zone defense algorithm. |
1486 | * | 1534 | * |
1487 | * Returns the number of reclaimed pages. | ||
1488 | * | ||
1489 | * If a zone is deemed to be full of pinned pages then just give it a light | 1535 | * If a zone is deemed to be full of pinned pages then just give it a light |
1490 | * scan then give up on it. | 1536 | * scan then give up on it. |
1491 | */ | 1537 | */ |
1492 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 1538 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1493 | struct scan_control *sc) | 1539 | struct scan_control *sc) |
1494 | { | 1540 | { |
1495 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1541 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1496 | unsigned long nr_reclaimed = 0; | ||
1497 | struct zoneref *z; | 1542 | struct zoneref *z; |
1498 | struct zone *zone; | 1543 | struct zone *zone; |
1499 | 1544 | ||
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1505 | * Take care memory controller reclaiming has small influence | 1550 | * Take care memory controller reclaiming has small influence |
1506 | * to global LRU. | 1551 | * to global LRU. |
1507 | */ | 1552 | */ |
1508 | if (scan_global_lru(sc)) { | 1553 | if (scanning_global_lru(sc)) { |
1509 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1554 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1510 | continue; | 1555 | continue; |
1511 | note_zone_scanning_priority(zone, priority); | 1556 | note_zone_scanning_priority(zone, priority); |
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1524 | priority); | 1569 | priority); |
1525 | } | 1570 | } |
1526 | 1571 | ||
1527 | nr_reclaimed += shrink_zone(priority, zone, sc); | 1572 | shrink_zone(priority, zone, sc); |
1528 | } | 1573 | } |
1529 | |||
1530 | return nr_reclaimed; | ||
1531 | } | 1574 | } |
1532 | 1575 | ||
1533 | /* | 1576 | /* |
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1552 | int priority; | 1595 | int priority; |
1553 | unsigned long ret = 0; | 1596 | unsigned long ret = 0; |
1554 | unsigned long total_scanned = 0; | 1597 | unsigned long total_scanned = 0; |
1555 | unsigned long nr_reclaimed = 0; | ||
1556 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1598 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1557 | unsigned long lru_pages = 0; | 1599 | unsigned long lru_pages = 0; |
1558 | struct zoneref *z; | 1600 | struct zoneref *z; |
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1561 | 1603 | ||
1562 | delayacct_freepages_start(); | 1604 | delayacct_freepages_start(); |
1563 | 1605 | ||
1564 | if (scan_global_lru(sc)) | 1606 | if (scanning_global_lru(sc)) |
1565 | count_vm_event(ALLOCSTALL); | 1607 | count_vm_event(ALLOCSTALL); |
1566 | /* | 1608 | /* |
1567 | * mem_cgroup will not do shrink_slab. | 1609 | * mem_cgroup will not do shrink_slab. |
1568 | */ | 1610 | */ |
1569 | if (scan_global_lru(sc)) { | 1611 | if (scanning_global_lru(sc)) { |
1570 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1612 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1571 | 1613 | ||
1572 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1614 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1580 | sc->nr_scanned = 0; | 1622 | sc->nr_scanned = 0; |
1581 | if (!priority) | 1623 | if (!priority) |
1582 | disable_swap_token(); | 1624 | disable_swap_token(); |
1583 | nr_reclaimed += shrink_zones(priority, zonelist, sc); | 1625 | shrink_zones(priority, zonelist, sc); |
1584 | /* | 1626 | /* |
1585 | * Don't shrink slabs when reclaiming memory from | 1627 | * Don't shrink slabs when reclaiming memory from |
1586 | * over limit cgroups | 1628 | * over limit cgroups |
1587 | */ | 1629 | */ |
1588 | if (scan_global_lru(sc)) { | 1630 | if (scanning_global_lru(sc)) { |
1589 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); | 1631 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); |
1590 | if (reclaim_state) { | 1632 | if (reclaim_state) { |
1591 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1633 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
1592 | reclaim_state->reclaimed_slab = 0; | 1634 | reclaim_state->reclaimed_slab = 0; |
1593 | } | 1635 | } |
1594 | } | 1636 | } |
1595 | total_scanned += sc->nr_scanned; | 1637 | total_scanned += sc->nr_scanned; |
1596 | if (nr_reclaimed >= sc->swap_cluster_max) { | 1638 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { |
1597 | ret = nr_reclaimed; | 1639 | ret = sc->nr_reclaimed; |
1598 | goto out; | 1640 | goto out; |
1599 | } | 1641 | } |
1600 | 1642 | ||
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1616 | congestion_wait(WRITE, HZ/10); | 1658 | congestion_wait(WRITE, HZ/10); |
1617 | } | 1659 | } |
1618 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1660 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1619 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1661 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) |
1620 | ret = nr_reclaimed; | 1662 | ret = sc->nr_reclaimed; |
1621 | out: | 1663 | out: |
1622 | /* | 1664 | /* |
1623 | * Now that we've scanned all the zones at this priority level, note | 1665 | * Now that we've scanned all the zones at this priority level, note |
@@ -1629,7 +1671,7 @@ out: | |||
1629 | if (priority < 0) | 1671 | if (priority < 0) |
1630 | priority = 0; | 1672 | priority = 0; |
1631 | 1673 | ||
1632 | if (scan_global_lru(sc)) { | 1674 | if (scanning_global_lru(sc)) { |
1633 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1675 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1634 | 1676 | ||
1635 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1677 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1665 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1707 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
1666 | 1708 | ||
1667 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 1709 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
1668 | gfp_t gfp_mask) | 1710 | gfp_t gfp_mask, |
1711 | bool noswap, | ||
1712 | unsigned int swappiness) | ||
1669 | { | 1713 | { |
1670 | struct scan_control sc = { | 1714 | struct scan_control sc = { |
1671 | .may_writepage = !laptop_mode, | 1715 | .may_writepage = !laptop_mode, |
1672 | .may_swap = 1, | 1716 | .may_swap = 1, |
1673 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1717 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1674 | .swappiness = vm_swappiness, | 1718 | .swappiness = swappiness, |
1675 | .order = 0, | 1719 | .order = 0, |
1676 | .mem_cgroup = mem_cont, | 1720 | .mem_cgroup = mem_cont, |
1677 | .isolate_pages = mem_cgroup_isolate_pages, | 1721 | .isolate_pages = mem_cgroup_isolate_pages, |
1678 | }; | 1722 | }; |
1679 | struct zonelist *zonelist; | 1723 | struct zonelist *zonelist; |
1680 | 1724 | ||
1725 | if (noswap) | ||
1726 | sc.may_swap = 0; | ||
1727 | |||
1681 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 1728 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
1682 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 1729 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
1683 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; | 1730 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; |
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1712 | int priority; | 1759 | int priority; |
1713 | int i; | 1760 | int i; |
1714 | unsigned long total_scanned; | 1761 | unsigned long total_scanned; |
1715 | unsigned long nr_reclaimed; | ||
1716 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1762 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1717 | struct scan_control sc = { | 1763 | struct scan_control sc = { |
1718 | .gfp_mask = GFP_KERNEL, | 1764 | .gfp_mask = GFP_KERNEL, |
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1731 | 1777 | ||
1732 | loop_again: | 1778 | loop_again: |
1733 | total_scanned = 0; | 1779 | total_scanned = 0; |
1734 | nr_reclaimed = 0; | 1780 | sc.nr_reclaimed = 0; |
1735 | sc.may_writepage = !laptop_mode; | 1781 | sc.may_writepage = !laptop_mode; |
1736 | count_vm_event(PAGEOUTRUN); | 1782 | count_vm_event(PAGEOUTRUN); |
1737 | 1783 | ||
@@ -1766,7 +1812,7 @@ loop_again: | |||
1766 | * Do some background aging of the anon list, to give | 1812 | * Do some background aging of the anon list, to give |
1767 | * pages a chance to be referenced before reclaiming. | 1813 | * pages a chance to be referenced before reclaiming. |
1768 | */ | 1814 | */ |
1769 | if (inactive_anon_is_low(zone)) | 1815 | if (inactive_anon_is_low(zone, &sc)) |
1770 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1816 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
1771 | &sc, priority, 0); | 1817 | &sc, priority, 0); |
1772 | 1818 | ||
@@ -1817,11 +1863,11 @@ loop_again: | |||
1817 | */ | 1863 | */ |
1818 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1864 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, |
1819 | end_zone, 0)) | 1865 | end_zone, 0)) |
1820 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1866 | shrink_zone(priority, zone, &sc); |
1821 | reclaim_state->reclaimed_slab = 0; | 1867 | reclaim_state->reclaimed_slab = 0; |
1822 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1868 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1823 | lru_pages); | 1869 | lru_pages); |
1824 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1870 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
1825 | total_scanned += sc.nr_scanned; | 1871 | total_scanned += sc.nr_scanned; |
1826 | if (zone_is_all_unreclaimable(zone)) | 1872 | if (zone_is_all_unreclaimable(zone)) |
1827 | continue; | 1873 | continue; |
@@ -1835,7 +1881,7 @@ loop_again: | |||
1835 | * even in laptop mode | 1881 | * even in laptop mode |
1836 | */ | 1882 | */ |
1837 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1883 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1838 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1884 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
1839 | sc.may_writepage = 1; | 1885 | sc.may_writepage = 1; |
1840 | } | 1886 | } |
1841 | if (all_zones_ok) | 1887 | if (all_zones_ok) |
@@ -1853,7 +1899,7 @@ loop_again: | |||
1853 | * matches the direct reclaim path behaviour in terms of impact | 1899 | * matches the direct reclaim path behaviour in terms of impact |
1854 | * on zone->*_priority. | 1900 | * on zone->*_priority. |
1855 | */ | 1901 | */ |
1856 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) | 1902 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
1857 | break; | 1903 | break; |
1858 | } | 1904 | } |
1859 | out: | 1905 | out: |
@@ -1872,10 +1918,27 @@ out: | |||
1872 | 1918 | ||
1873 | try_to_freeze(); | 1919 | try_to_freeze(); |
1874 | 1920 | ||
1921 | /* | ||
1922 | * Fragmentation may mean that the system cannot be | ||
1923 | * rebalanced for high-order allocations in all zones. | ||
1924 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | ||
1925 | * it means the zones have been fully scanned and are still | ||
1926 | * not balanced. For high-order allocations, there is | ||
1927 | * little point trying all over again as kswapd may | ||
1928 | * infinite loop. | ||
1929 | * | ||
1930 | * Instead, recheck all watermarks at order-0 as they | ||
1931 | * are the most important. If watermarks are ok, kswapd will go | ||
1932 | * back to sleep. High-order users can still perform direct | ||
1933 | * reclaim if they wish. | ||
1934 | */ | ||
1935 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | ||
1936 | order = sc.order = 0; | ||
1937 | |||
1875 | goto loop_again; | 1938 | goto loop_again; |
1876 | } | 1939 | } |
1877 | 1940 | ||
1878 | return nr_reclaimed; | 1941 | return sc.nr_reclaimed; |
1879 | } | 1942 | } |
1880 | 1943 | ||
1881 | /* | 1944 | /* |
@@ -1902,7 +1965,7 @@ static int kswapd(void *p) | |||
1902 | }; | 1965 | }; |
1903 | node_to_cpumask_ptr(cpumask, pgdat->node_id); | 1966 | node_to_cpumask_ptr(cpumask, pgdat->node_id); |
1904 | 1967 | ||
1905 | if (!cpus_empty(*cpumask)) | 1968 | if (!cpumask_empty(cpumask)) |
1906 | set_cpus_allowed_ptr(tsk, cpumask); | 1969 | set_cpus_allowed_ptr(tsk, cpumask); |
1907 | current->reclaim_state = &reclaim_state; | 1970 | current->reclaim_state = &reclaim_state; |
1908 | 1971 | ||
@@ -2141,7 +2204,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
2141 | pg_data_t *pgdat = NODE_DATA(nid); | 2204 | pg_data_t *pgdat = NODE_DATA(nid); |
2142 | node_to_cpumask_ptr(mask, pgdat->node_id); | 2205 | node_to_cpumask_ptr(mask, pgdat->node_id); |
2143 | 2206 | ||
2144 | if (any_online_cpu(*mask) < nr_cpu_ids) | 2207 | if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) |
2145 | /* One of our CPUs online: restore mask */ | 2208 | /* One of our CPUs online: restore mask */ |
2146 | set_cpus_allowed_ptr(pgdat->kswapd, mask); | 2209 | set_cpus_allowed_ptr(pgdat->kswapd, mask); |
2147 | } | 2210 | } |
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2227 | struct task_struct *p = current; | 2290 | struct task_struct *p = current; |
2228 | struct reclaim_state reclaim_state; | 2291 | struct reclaim_state reclaim_state; |
2229 | int priority; | 2292 | int priority; |
2230 | unsigned long nr_reclaimed = 0; | ||
2231 | struct scan_control sc = { | 2293 | struct scan_control sc = { |
2232 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2294 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2233 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2295 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2260 | priority = ZONE_RECLAIM_PRIORITY; | 2322 | priority = ZONE_RECLAIM_PRIORITY; |
2261 | do { | 2323 | do { |
2262 | note_zone_scanning_priority(zone, priority); | 2324 | note_zone_scanning_priority(zone, priority); |
2263 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 2325 | shrink_zone(priority, zone, &sc); |
2264 | priority--; | 2326 | priority--; |
2265 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 2327 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); |
2266 | } | 2328 | } |
2267 | 2329 | ||
2268 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2330 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2286 | * Update nr_reclaimed by the number of slab pages we | 2348 | * Update nr_reclaimed by the number of slab pages we |
2287 | * reclaimed from this zone. | 2349 | * reclaimed from this zone. |
2288 | */ | 2350 | */ |
2289 | nr_reclaimed += slab_reclaimable - | 2351 | sc.nr_reclaimed += slab_reclaimable - |
2290 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2352 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
2291 | } | 2353 | } |
2292 | 2354 | ||
2293 | p->reclaim_state = NULL; | 2355 | p->reclaim_state = NULL; |
2294 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2356 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2295 | return nr_reclaimed >= nr_pages; | 2357 | return sc.nr_reclaimed >= nr_pages; |
2296 | } | 2358 | } |
2297 | 2359 | ||
2298 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 2360 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
@@ -2393,6 +2455,7 @@ retry: | |||
2393 | 2455 | ||
2394 | __dec_zone_state(zone, NR_UNEVICTABLE); | 2456 | __dec_zone_state(zone, NR_UNEVICTABLE); |
2395 | list_move(&page->lru, &zone->lru[l].list); | 2457 | list_move(&page->lru, &zone->lru[l].list); |
2458 | mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); | ||
2396 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | 2459 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); |
2397 | __count_vm_event(UNEVICTABLE_PGRESCUED); | 2460 | __count_vm_event(UNEVICTABLE_PGRESCUED); |
2398 | } else { | 2461 | } else { |
@@ -2401,6 +2464,7 @@ retry: | |||
2401 | */ | 2464 | */ |
2402 | SetPageUnevictable(page); | 2465 | SetPageUnevictable(page); |
2403 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | 2466 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); |
2467 | mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); | ||
2404 | if (page_evictable(page, NULL)) | 2468 | if (page_evictable(page, NULL)) |
2405 | goto retry; | 2469 | goto retry; |
2406 | } | 2470 | } |
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
2472 | * back onto @zone's unevictable list. | 2536 | * back onto @zone's unevictable list. |
2473 | */ | 2537 | */ |
2474 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | 2538 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ |
2475 | void scan_zone_unevictable_pages(struct zone *zone) | 2539 | static void scan_zone_unevictable_pages(struct zone *zone) |
2476 | { | 2540 | { |
2477 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 2541 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; |
2478 | unsigned long scan; | 2542 | unsigned long scan; |
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone) | |||
2514 | * that has possibly/probably made some previously unevictable pages | 2578 | * that has possibly/probably made some previously unevictable pages |
2515 | * evictable. | 2579 | * evictable. |
2516 | */ | 2580 | */ |
2517 | void scan_all_zones_unevictable_pages(void) | 2581 | static void scan_all_zones_unevictable_pages(void) |
2518 | { | 2582 | { |
2519 | struct zone *zone; | 2583 | struct zone *zone; |
2520 | 2584 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index c3ccfda23adc..91149746bb8d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -20,7 +20,7 @@ | |||
20 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 20 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
21 | EXPORT_PER_CPU_SYMBOL(vm_event_states); | 21 | EXPORT_PER_CPU_SYMBOL(vm_event_states); |
22 | 22 | ||
23 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | 23 | static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) |
24 | { | 24 | { |
25 | int cpu; | 25 | int cpu; |
26 | int i; | 26 | int i; |
@@ -43,7 +43,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | |||
43 | void all_vm_events(unsigned long *ret) | 43 | void all_vm_events(unsigned long *ret) |
44 | { | 44 | { |
45 | get_online_cpus(); | 45 | get_online_cpus(); |
46 | sum_vm_events(ret, &cpu_online_map); | 46 | sum_vm_events(ret, cpu_online_mask); |
47 | put_online_cpus(); | 47 | put_online_cpus(); |
48 | } | 48 | } |
49 | EXPORT_SYMBOL_GPL(all_vm_events); | 49 | EXPORT_SYMBOL_GPL(all_vm_events); |