aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.mailmap1
-rw-r--r--MAINTAINERS12
-rw-r--r--arch/blackfin/mach-bf533/boards/stamp.c1
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c5
-rw-r--r--fs/ocfs2/namei.c43
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/rmap.h10
-rw-r--r--include/linux/writeback.h1
-rw-r--r--include/uapi/asm-generic/fcntl.h2
-rw-r--r--kernel/exit.c12
-rw-r--r--mm/Kconfig.debug9
-rw-r--r--mm/memcontrol.c17
-rw-r--r--mm/memory.c27
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/rmap.c42
-rw-r--r--mm/vmscan.c24
17 files changed, 155 insertions, 101 deletions
diff --git a/.mailmap b/.mailmap
index ada8ad696b2e..d357e1bd2a43 100644
--- a/.mailmap
+++ b/.mailmap
@@ -51,6 +51,7 @@ Greg Kroah-Hartman <gregkh@suse.de>
51Greg Kroah-Hartman <greg@kroah.com> 51Greg Kroah-Hartman <greg@kroah.com>
52Henk Vergonet <Henk.Vergonet@gmail.com> 52Henk Vergonet <Henk.Vergonet@gmail.com>
53Henrik Kretzschmar <henne@nachtwindheim.de> 53Henrik Kretzschmar <henne@nachtwindheim.de>
54Henrik Rydberg <rydberg@bitmath.org>
54Herbert Xu <herbert@gondor.apana.org.au> 55Herbert Xu <herbert@gondor.apana.org.au>
55Jacob Shin <Jacob.Shin@amd.com> 56Jacob Shin <Jacob.Shin@amd.com>
56James Bottomley <jejb@mulgrave.(none)> 57James Bottomley <jejb@mulgrave.(none)>
diff --git a/MAINTAINERS b/MAINTAINERS
index ddb9ac8d32b3..79b2e4ba78ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -724,15 +724,15 @@ F: include/uapi/linux/apm_bios.h
724F: drivers/char/apm-emulation.c 724F: drivers/char/apm-emulation.c
725 725
726APPLE BCM5974 MULTITOUCH DRIVER 726APPLE BCM5974 MULTITOUCH DRIVER
727M: Henrik Rydberg <rydberg@euromail.se> 727M: Henrik Rydberg <rydberg@bitmath.org>
728L: linux-input@vger.kernel.org 728L: linux-input@vger.kernel.org
729S: Maintained 729S: Odd fixes
730F: drivers/input/mouse/bcm5974.c 730F: drivers/input/mouse/bcm5974.c
731 731
732APPLE SMC DRIVER 732APPLE SMC DRIVER
733M: Henrik Rydberg <rydberg@euromail.se> 733M: Henrik Rydberg <rydberg@bitmath.org>
734L: lm-sensors@lm-sensors.org 734L: lm-sensors@lm-sensors.org
735S: Maintained 735S: Odd fixes
736F: drivers/hwmon/applesmc.c 736F: drivers/hwmon/applesmc.c
737 737
738APPLETALK NETWORK LAYER 738APPLETALK NETWORK LAYER
@@ -4940,10 +4940,10 @@ F: include/uapi/linux/input.h
4940F: include/linux/input/ 4940F: include/linux/input/
4941 4941
4942INPUT MULTITOUCH (MT) PROTOCOL 4942INPUT MULTITOUCH (MT) PROTOCOL
4943M: Henrik Rydberg <rydberg@euromail.se> 4943M: Henrik Rydberg <rydberg@bitmath.org>
4944L: linux-input@vger.kernel.org 4944L: linux-input@vger.kernel.org
4945T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git 4945T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git
4946S: Maintained 4946S: Odd fixes
4947F: Documentation/input/multi-touch-protocol.txt 4947F: Documentation/input/multi-touch-protocol.txt
4948F: drivers/input/input-mt.c 4948F: drivers/input/input-mt.c
4949K: \b(ABS|SYN)_MT_ 4949K: \b(ABS|SYN)_MT_
diff --git a/arch/blackfin/mach-bf533/boards/stamp.c b/arch/blackfin/mach-bf533/boards/stamp.c
index 6f4bac969bf7..23eada79439c 100644
--- a/arch/blackfin/mach-bf533/boards/stamp.c
+++ b/arch/blackfin/mach-bf533/boards/stamp.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/device.h> 9#include <linux/device.h>
10#include <linux/delay.h>
10#include <linux/platform_device.h> 11#include <linux/platform_device.h>
11#include <linux/mtd/mtd.h> 12#include <linux/mtd/mtd.h>
12#include <linux/mtd/partitions.h> 13#include <linux/mtd/partitions.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 99d440a4a6ba..ee85cd4e136a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
741 * is defined as O_NONBLOCK on some platforms and not on others. 741 * is defined as O_NONBLOCK on some platforms and not on others.
742 */ 742 */
743 BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 743 BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
744 O_RDONLY | O_WRONLY | O_RDWR | 744 O_RDONLY | O_WRONLY | O_RDWR |
745 O_CREAT | O_EXCL | O_NOCTTY | 745 O_CREAT | O_EXCL | O_NOCTTY |
746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
747 __O_SYNC | O_DSYNC | FASYNC | 747 __O_SYNC | O_DSYNC | FASYNC |
748 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 748 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
750 __FMODE_EXEC | O_PATH | __O_TMPFILE 750 __FMODE_EXEC | O_PATH | __O_TMPFILE |
751 __FMODE_NONOTIFY
751 )); 752 ));
752 753
753 fasync_cache = kmem_cache_create("fasync_cache", 754 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 79b5af5e6a7b..cecd875653e4 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2023,11 +2023,8 @@ leave:
2023 dlm_lockres_drop_inflight_ref(dlm, res); 2023 dlm_lockres_drop_inflight_ref(dlm, res);
2024 spin_unlock(&res->spinlock); 2024 spin_unlock(&res->spinlock);
2025 2025
2026 if (ret < 0) { 2026 if (ret < 0)
2027 mlog_errno(ret); 2027 mlog_errno(ret);
2028 if (newlock)
2029 dlm_lock_put(newlock);
2030 }
2031 2028
2032 return ret; 2029 return ret;
2033} 2030}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b931e04e3388..914c121ec890 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
94 struct inode *inode, 94 struct inode *inode,
95 const char *symname); 95 const char *symname);
96 96
97static int ocfs2_double_lock(struct ocfs2_super *osb,
98 struct buffer_head **bh1,
99 struct inode *inode1,
100 struct buffer_head **bh2,
101 struct inode *inode2,
102 int rename);
103
104static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
97/* An orphan dir name is an 8 byte value, printed as a hex string */ 105/* An orphan dir name is an 8 byte value, printed as a hex string */
98#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 106#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
99 107
@@ -678,8 +686,10 @@ static int ocfs2_link(struct dentry *old_dentry,
678{ 686{
679 handle_t *handle; 687 handle_t *handle;
680 struct inode *inode = old_dentry->d_inode; 688 struct inode *inode = old_dentry->d_inode;
689 struct inode *old_dir = old_dentry->d_parent->d_inode;
681 int err; 690 int err;
682 struct buffer_head *fe_bh = NULL; 691 struct buffer_head *fe_bh = NULL;
692 struct buffer_head *old_dir_bh = NULL;
683 struct buffer_head *parent_fe_bh = NULL; 693 struct buffer_head *parent_fe_bh = NULL;
684 struct ocfs2_dinode *fe = NULL; 694 struct ocfs2_dinode *fe = NULL;
685 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 695 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +706,33 @@ static int ocfs2_link(struct dentry *old_dentry,
696 706
697 dquot_initialize(dir); 707 dquot_initialize(dir);
698 708
699 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 709 err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
710 &parent_fe_bh, dir, 0);
700 if (err < 0) { 711 if (err < 0) {
701 if (err != -ENOENT) 712 if (err != -ENOENT)
702 mlog_errno(err); 713 mlog_errno(err);
703 return err; 714 return err;
704 } 715 }
705 716
717 /* make sure both dirs have bhs
718 * get an extra ref on old_dir_bh if old==new */
719 if (!parent_fe_bh) {
720 if (old_dir_bh) {
721 parent_fe_bh = old_dir_bh;
722 get_bh(parent_fe_bh);
723 } else {
724 mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
725 err = -EIO;
726 goto out;
727 }
728 }
729
706 if (!dir->i_nlink) { 730 if (!dir->i_nlink) {
707 err = -ENOENT; 731 err = -ENOENT;
708 goto out; 732 goto out;
709 } 733 }
710 734
711 err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, 735 err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
712 old_dentry->d_name.len, &old_de_ino); 736 old_dentry->d_name.len, &old_de_ino);
713 if (err) { 737 if (err) {
714 err = -ENOENT; 738 err = -ENOENT;
@@ -801,10 +825,11 @@ out_unlock_inode:
801 ocfs2_inode_unlock(inode, 1); 825 ocfs2_inode_unlock(inode, 1);
802 826
803out: 827out:
804 ocfs2_inode_unlock(dir, 1); 828 ocfs2_double_unlock(old_dir, dir);
805 829
806 brelse(fe_bh); 830 brelse(fe_bh);
807 brelse(parent_fe_bh); 831 brelse(parent_fe_bh);
832 brelse(old_dir_bh);
808 833
809 ocfs2_free_dir_lookup_result(&lookup); 834 ocfs2_free_dir_lookup_result(&lookup);
810 835
@@ -1072,14 +1097,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
1072} 1097}
1073 1098
1074/* 1099/*
1075 * The only place this should be used is rename! 1100 * The only place this should be used is rename and link!
1076 * if they have the same id, then the 1st one is the only one locked. 1101 * if they have the same id, then the 1st one is the only one locked.
1077 */ 1102 */
1078static int ocfs2_double_lock(struct ocfs2_super *osb, 1103static int ocfs2_double_lock(struct ocfs2_super *osb,
1079 struct buffer_head **bh1, 1104 struct buffer_head **bh1,
1080 struct inode *inode1, 1105 struct inode *inode1,
1081 struct buffer_head **bh2, 1106 struct buffer_head **bh2,
1082 struct inode *inode2) 1107 struct inode *inode2,
1108 int rename)
1083{ 1109{
1084 int status; 1110 int status;
1085 int inode1_is_ancestor, inode2_is_ancestor; 1111 int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1153,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1127 } 1153 }
1128 /* lock id2 */ 1154 /* lock id2 */
1129 status = ocfs2_inode_lock_nested(inode2, bh2, 1, 1155 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
1130 OI_LS_RENAME1); 1156 rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
1131 if (status < 0) { 1157 if (status < 0) {
1132 if (status != -ENOENT) 1158 if (status != -ENOENT)
1133 mlog_errno(status); 1159 mlog_errno(status);
@@ -1136,7 +1162,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1136 } 1162 }
1137 1163
1138 /* lock id1 */ 1164 /* lock id1 */
1139 status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); 1165 status = ocfs2_inode_lock_nested(inode1, bh1, 1,
1166 rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
1140 if (status < 0) { 1167 if (status < 0) {
1141 /* 1168 /*
1142 * An error return must mean that no cluster locks 1169 * An error return must mean that no cluster locks
@@ -1252,7 +1279,7 @@ static int ocfs2_rename(struct inode *old_dir,
1252 1279
1253 /* if old and new are the same, this'll just do one lock. */ 1280 /* if old and new are the same, this'll just do one lock. */
1254 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 1281 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
1255 &new_dir_bh, new_dir); 1282 &new_dir_bh, new_dir, 1);
1256 if (status < 0) { 1283 if (status < 0) {
1257 mlog_errno(status); 1284 mlog_errno(status);
1258 goto bail; 1285 goto bail;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f90c0282c114..42efe13077b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
135#define FMODE_CAN_WRITE ((__force fmode_t)0x40000) 135#define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
136 136
137/* File was opened by fanotify and shouldn't generate fanotify events */ 137/* File was opened by fanotify and shouldn't generate fanotify events */
138#define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 138#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
139 139
140/* 140/*
141 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector 141 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c0c2bce6b0b7..d9d7e7e56352 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -37,6 +37,16 @@ struct anon_vma {
37 atomic_t refcount; 37 atomic_t refcount;
38 38
39 /* 39 /*
40 * Count of child anon_vmas and VMAs which point to this anon_vma.
41 *
42 * This counter is used for making decision about reusing anon_vma
43 * instead of forking new one. See comments in function anon_vma_clone.
44 */
45 unsigned degree;
46
47 struct anon_vma *parent; /* Parent of this anon_vma */
48
49 /*
40 * NOTE: the LSB of the rb_root.rb_node is set by 50 * NOTE: the LSB of the rb_root.rb_node is set by
41 * mm_take_all_locks() _after_ taking the above lock. So the 51 * mm_take_all_locks() _after_ taking the above lock. So the
42 * rb_root must only be read/written after taking the above lock 52 * rb_root must only be read/written after taking the above lock
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a219be961c0a..00048339c23e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping,
177 struct writeback_control *wbc, writepage_t writepage, 177 struct writeback_control *wbc, writepage_t writepage,
178 void *data); 178 void *data);
179int do_writepages(struct address_space *mapping, struct writeback_control *wbc); 179int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
180void set_page_dirty_balance(struct page *page);
181void writeback_set_ratelimit(void); 180void writeback_set_ratelimit(void);
182void tag_pages_for_writeback(struct address_space *mapping, 181void tag_pages_for_writeback(struct address_space *mapping,
183 pgoff_t start, pgoff_t end); 182 pgoff_t start, pgoff_t end);
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 7543b3e51331..e063effe0cc1 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -5,7 +5,7 @@
5 5
6/* 6/*
7 * FMODE_EXEC is 0x20 7 * FMODE_EXEC is 0x20
8 * FMODE_NONOTIFY is 0x1000000 8 * FMODE_NONOTIFY is 0x4000000
9 * These cannot be used by userspace O_* until internal and external open 9 * These cannot be used by userspace O_* until internal and external open
10 * flags are split. 10 * flags are split.
11 * -Eric Paris 11 * -Eric Paris
diff --git a/kernel/exit.c b/kernel/exit.c
index 1ea4369890a3..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1287static int wait_consider_task(struct wait_opts *wo, int ptrace, 1287static int wait_consider_task(struct wait_opts *wo, int ptrace,
1288 struct task_struct *p) 1288 struct task_struct *p)
1289{ 1289{
1290 /*
1291 * We can race with wait_task_zombie() from another thread.
1292 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 * can't confuse the checks below.
1294 */
1295 int exit_state = ACCESS_ONCE(p->exit_state);
1290 int ret; 1296 int ret;
1291 1297
1292 if (unlikely(p->exit_state == EXIT_DEAD)) 1298 if (unlikely(exit_state == EXIT_DEAD))
1293 return 0; 1299 return 0;
1294 1300
1295 ret = eligible_child(wo, p); 1301 ret = eligible_child(wo, p);
@@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1310 return 0; 1316 return 0;
1311 } 1317 }
1312 1318
1313 if (unlikely(p->exit_state == EXIT_TRACE)) { 1319 if (unlikely(exit_state == EXIT_TRACE)) {
1314 /* 1320 /*
1315 * ptrace == 0 means we are the natural parent. In this case 1321 * ptrace == 0 means we are the natural parent. In this case
1316 * we should clear notask_error, debugger will notify us. 1322 * we should clear notask_error, debugger will notify us.
@@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1337 } 1343 }
1338 1344
1339 /* slay zombie? */ 1345 /* slay zombie? */
1340 if (p->exit_state == EXIT_ZOMBIE) { 1346 if (exit_state == EXIT_ZOMBIE) {
1341 /* we don't reap group leaders with subthreads */ 1347 /* we don't reap group leaders with subthreads */
1342 if (!delay_group_leader(p)) { 1348 if (!delay_group_leader(p)) {
1343 /* 1349 /*
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
14 depends on !KMEMCHECK 14 depends on !KMEMCHECK
15 select PAGE_EXTENSION 15 select PAGE_EXTENSION
16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
18 ---help--- 17 ---help---
19 Unmap pages from the kernel linear mapping after free_pages(). 18 Unmap pages from the kernel linear mapping after free_pages().
20 This results in a large slowdown, but helps to find certain types 19 This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
27 that would result in incorrect warnings of memory corruption after 26 that would result in incorrect warnings of memory corruption after
28 a resume because free pages are not saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
29 28
30config WANT_PAGE_DEBUG_FLAGS
31 bool
32
33config PAGE_POISONING 29config PAGE_POISONING
34 bool 30 bool
35 select WANT_PAGE_DEBUG_FLAGS
36
37config PAGE_GUARD
38 bool
39 select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..851924fa5170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3044 mem_cgroup_swap_statistics(from, false); 3044 mem_cgroup_swap_statistics(from, false);
3045 mem_cgroup_swap_statistics(to, true); 3045 mem_cgroup_swap_statistics(to, true);
3046 /*
3047 * This function is only called from task migration context now.
3048 * It postpones page_counter and refcount handling till the end
3049 * of task migration(mem_cgroup_clear_mc()) for performance
3050 * improvement. But we cannot postpone css_get(to) because if
3051 * the process that has been moved to @to does swap-in, the
3052 * refcount of @to might be decreased to 0.
3053 *
3054 * We are in attach() phase, so the cgroup is guaranteed to be
3055 * alive, so we can just call css_get().
3056 */
3057 css_get(&to->css);
3058 return 0; 3046 return 0;
3059 } 3047 }
3060 return -EINVAL; 3048 return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4679 if (parent_css == NULL) { 4667 if (parent_css == NULL) {
4680 root_mem_cgroup = memcg; 4668 root_mem_cgroup = memcg;
4681 page_counter_init(&memcg->memory, NULL); 4669 page_counter_init(&memcg->memory, NULL);
4670 memcg->soft_limit = PAGE_COUNTER_MAX;
4682 page_counter_init(&memcg->memsw, NULL); 4671 page_counter_init(&memcg->memsw, NULL);
4683 page_counter_init(&memcg->kmem, NULL); 4672 page_counter_init(&memcg->kmem, NULL);
4684 } 4673 }
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4724 4713
4725 if (parent->use_hierarchy) { 4714 if (parent->use_hierarchy) {
4726 page_counter_init(&memcg->memory, &parent->memory); 4715 page_counter_init(&memcg->memory, &parent->memory);
4716 memcg->soft_limit = PAGE_COUNTER_MAX;
4727 page_counter_init(&memcg->memsw, &parent->memsw); 4717 page_counter_init(&memcg->memsw, &parent->memsw);
4728 page_counter_init(&memcg->kmem, &parent->kmem); 4718 page_counter_init(&memcg->kmem, &parent->kmem);
4729 4719
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4733 */ 4723 */
4734 } else { 4724 } else {
4735 page_counter_init(&memcg->memory, NULL); 4725 page_counter_init(&memcg->memory, NULL);
4726 memcg->soft_limit = PAGE_COUNTER_MAX;
4736 page_counter_init(&memcg->memsw, NULL); 4727 page_counter_init(&memcg->memsw, NULL);
4737 page_counter_init(&memcg->kmem, NULL); 4728 page_counter_init(&memcg->kmem, NULL);
4738 /* 4729 /*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4807 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4798 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4808 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4799 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4809 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4800 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4810 memcg->soft_limit = 0; 4801 memcg->soft_limit = PAGE_COUNTER_MAX;
4811} 4802}
4812 4803
4813#ifdef CONFIG_MMU 4804#ifdef CONFIG_MMU
diff --git a/mm/memory.c b/mm/memory.c
index d7e497e98f46..c6565f00fb38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2137,17 +2137,24 @@ reuse:
2137 if (!dirty_page) 2137 if (!dirty_page)
2138 return ret; 2138 return ret;
2139 2139
2140 /*
2141 * Yes, Virginia, this is actually required to prevent a race
2142 * with clear_page_dirty_for_io() from clearing the page dirty
2143 * bit after it clear all dirty ptes, but before a racing
2144 * do_wp_page installs a dirty pte.
2145 *
2146 * do_shared_fault is protected similarly.
2147 */
2148 if (!page_mkwrite) { 2140 if (!page_mkwrite) {
2149 wait_on_page_locked(dirty_page); 2141 struct address_space *mapping;
2150 set_page_dirty_balance(dirty_page); 2142 int dirtied;
2143
2144 lock_page(dirty_page);
2145 dirtied = set_page_dirty(dirty_page);
2146 VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
2147 mapping = dirty_page->mapping;
2148 unlock_page(dirty_page);
2149
2150 if (dirtied && mapping) {
2151 /*
2152 * Some device drivers do not set page.mapping
2153 * but still dirty their pages
2154 */
2155 balance_dirty_pages_ratelimited(mapping);
2156 }
2157
2151 /* file_update_time outside page_lock */ 2158 /* file_update_time outside page_lock */
2152 if (vma->vm_file) 2159 if (vma->vm_file)
2153 file_update_time(vma->vm_file); 2160 file_update_time(vma->vm_file);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..6f4335238e33 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
1541 bdi_start_background_writeback(bdi); 1541 bdi_start_background_writeback(bdi);
1542} 1542}
1543 1543
1544void set_page_dirty_balance(struct page *page)
1545{
1546 if (set_page_dirty(page)) {
1547 struct address_space *mapping = page_mapping(page);
1548
1549 if (mapping)
1550 balance_dirty_pages_ratelimited(mapping);
1551 }
1552}
1553
1554static DEFINE_PER_CPU(int, bdp_ratelimits); 1544static DEFINE_PER_CPU(int, bdp_ratelimits);
1555 1545
1556/* 1546/*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
2123 * page dirty in that case, but not all the buffers. This is a "bottom-up" 2113 * page dirty in that case, but not all the buffers. This is a "bottom-up"
2124 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. 2114 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
2125 * 2115 *
2126 * Most callers have locked the page, which pins the address_space in memory. 2116 * The caller must ensure this doesn't race with truncation. Most will simply
2127 * But zap_pte_range() does not lock the page, however in that case the 2117 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
2128 * mapping is pinned by the vma's ->vm_file reference. 2118 * the pte lock held, which also locks out truncation.
2129 *
2130 * We take care to handle the case where the page was truncated from the
2131 * mapping by re-checking page_mapping() inside tree_lock.
2132 */ 2119 */
2133int __set_page_dirty_nobuffers(struct page *page) 2120int __set_page_dirty_nobuffers(struct page *page)
2134{ 2121{
2135 if (!TestSetPageDirty(page)) { 2122 if (!TestSetPageDirty(page)) {
2136 struct address_space *mapping = page_mapping(page); 2123 struct address_space *mapping = page_mapping(page);
2137 struct address_space *mapping2;
2138 unsigned long flags; 2124 unsigned long flags;
2139 2125
2140 if (!mapping) 2126 if (!mapping)
2141 return 1; 2127 return 1;
2142 2128
2143 spin_lock_irqsave(&mapping->tree_lock, flags); 2129 spin_lock_irqsave(&mapping->tree_lock, flags);
2144 mapping2 = page_mapping(page); 2130 BUG_ON(page_mapping(page) != mapping);
2145 if (mapping2) { /* Race with truncate? */ 2131 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2146 BUG_ON(mapping2 != mapping); 2132 account_page_dirtied(page, mapping);
2147 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2133 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2148 account_page_dirtied(page, mapping); 2134 PAGECACHE_TAG_DIRTY);
2149 radix_tree_tag_set(&mapping->page_tree,
2150 page_index(page), PAGECACHE_TAG_DIRTY);
2151 }
2152 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2135 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2153 if (mapping->host) { 2136 if (mapping->host) {
2154 /* !PageAnon && !swapper_space */ 2137 /* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
2305 /* 2288 /*
2306 * We carefully synchronise fault handlers against 2289 * We carefully synchronise fault handlers against
2307 * installing a dirty pte and marking the page dirty 2290 * installing a dirty pte and marking the page dirty
2308 * at this point. We do this by having them hold the 2291 * at this point. We do this by having them hold the
2309 * page lock at some point after installing their 2292 * page lock while dirtying the page, and pages are
2310 * pte, but before marking the page dirty. 2293 * always locked coming in here, so we get the desired
2311 * Pages are always locked coming in here, so we get 2294 * exclusion.
2312 * the desired exclusion. See mm/memory.c:do_wp_page()
2313 * for more comments.
2314 */ 2295 */
2315 if (TestClearPageDirty(page)) { 2296 if (TestClearPageDirty(page)) {
2316 dec_zone_page_state(page, NR_FILE_DIRTY); 2297 dec_zone_page_state(page, NR_FILE_DIRTY);
diff --git a/mm/rmap.c b/mm/rmap.c
index c5bc241127b2..71cd5bd0c17d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 if (anon_vma) { 73 if (anon_vma) {
74 atomic_set(&anon_vma->refcount, 1); 74 atomic_set(&anon_vma->refcount, 1);
75 anon_vma->degree = 1; /* Reference for first vma */
76 anon_vma->parent = anon_vma;
75 /* 77 /*
76 * Initialise the anon_vma root to point to itself. If called 78 * Initialise the anon_vma root to point to itself. If called
77 * from fork, the root will be reset to the parents anon_vma. 79 * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
188 if (likely(!vma->anon_vma)) { 190 if (likely(!vma->anon_vma)) {
189 vma->anon_vma = anon_vma; 191 vma->anon_vma = anon_vma;
190 anon_vma_chain_link(vma, avc, anon_vma); 192 anon_vma_chain_link(vma, avc, anon_vma);
193 /* vma reference or self-parent link for new root */
194 anon_vma->degree++;
191 allocated = NULL; 195 allocated = NULL;
192 avc = NULL; 196 avc = NULL;
193 } 197 }
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
236/* 240/*
237 * Attach the anon_vmas from src to dst. 241 * Attach the anon_vmas from src to dst.
238 * Returns 0 on success, -ENOMEM on failure. 242 * Returns 0 on success, -ENOMEM on failure.
243 *
244 * If dst->anon_vma is NULL this function tries to find and reuse existing
245 * anon_vma which has no vmas and only one child anon_vma. This prevents
246 * degradation of anon_vma hierarchy to endless linear chain in case of
247 * constantly forking task. On the other hand, an anon_vma with more than one
248 * child isn't reused even if there was no alive vma, thus rmap walker has a
249 * good chance of avoiding scanning the whole hierarchy when it searches where
250 * page is mapped.
239 */ 251 */
240int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 252int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
241{ 253{
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
256 anon_vma = pavc->anon_vma; 268 anon_vma = pavc->anon_vma;
257 root = lock_anon_vma_root(root, anon_vma); 269 root = lock_anon_vma_root(root, anon_vma);
258 anon_vma_chain_link(dst, avc, anon_vma); 270 anon_vma_chain_link(dst, avc, anon_vma);
271
272 /*
273 * Reuse existing anon_vma if its degree lower than two,
274 * that means it has no vma and only one anon_vma child.
275 *
276 * Do not chose parent anon_vma, otherwise first child
277 * will always reuse it. Root anon_vma is never reused:
278 * it has self-parent reference and at least one child.
279 */
280 if (!dst->anon_vma && anon_vma != src->anon_vma &&
281 anon_vma->degree < 2)
282 dst->anon_vma = anon_vma;
259 } 283 }
284 if (dst->anon_vma)
285 dst->anon_vma->degree++;
260 unlock_anon_vma_root(root); 286 unlock_anon_vma_root(root);
261 return 0; 287 return 0;
262 288
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 if (!pvma->anon_vma) 306 if (!pvma->anon_vma)
281 return 0; 307 return 0;
282 308
309 /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
310 vma->anon_vma = NULL;
311
283 /* 312 /*
284 * First, attach the new VMA to the parent VMA's anon_vmas, 313 * First, attach the new VMA to the parent VMA's anon_vmas,
285 * so rmap can find non-COWed pages in child processes. 314 * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
288 if (error) 317 if (error)
289 return error; 318 return error;
290 319
320 /* An existing anon_vma has been reused, all done then. */
321 if (vma->anon_vma)
322 return 0;
323
291 /* Then add our own anon_vma. */ 324 /* Then add our own anon_vma. */
292 anon_vma = anon_vma_alloc(); 325 anon_vma = anon_vma_alloc();
293 if (!anon_vma) 326 if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
301 * lock any of the anon_vmas in this anon_vma tree. 334 * lock any of the anon_vmas in this anon_vma tree.
302 */ 335 */
303 anon_vma->root = pvma->anon_vma->root; 336 anon_vma->root = pvma->anon_vma->root;
337 anon_vma->parent = pvma->anon_vma;
304 /* 338 /*
305 * With refcounts, an anon_vma can stay around longer than the 339 * With refcounts, an anon_vma can stay around longer than the
306 * process it belongs to. The root anon_vma needs to be pinned until 340 * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
311 vma->anon_vma = anon_vma; 345 vma->anon_vma = anon_vma;
312 anon_vma_lock_write(anon_vma); 346 anon_vma_lock_write(anon_vma);
313 anon_vma_chain_link(vma, avc, anon_vma); 347 anon_vma_chain_link(vma, avc, anon_vma);
348 anon_vma->parent->degree++;
314 anon_vma_unlock_write(anon_vma); 349 anon_vma_unlock_write(anon_vma);
315 350
316 return 0; 351 return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
341 * Leave empty anon_vmas on the list - we'll need 376 * Leave empty anon_vmas on the list - we'll need
342 * to free them outside the lock. 377 * to free them outside the lock.
343 */ 378 */
344 if (RB_EMPTY_ROOT(&anon_vma->rb_root)) 379 if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
380 anon_vma->parent->degree--;
345 continue; 381 continue;
382 }
346 383
347 list_del(&avc->same_vma); 384 list_del(&avc->same_vma);
348 anon_vma_chain_free(avc); 385 anon_vma_chain_free(avc);
349 } 386 }
387 if (vma->anon_vma)
388 vma->anon_vma->degree--;
350 unlock_anon_vma_root(root); 389 unlock_anon_vma_root(root);
351 390
352 /* 391 /*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
357 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 396 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
358 struct anon_vma *anon_vma = avc->anon_vma; 397 struct anon_vma *anon_vma = avc->anon_vma;
359 398
399 BUG_ON(anon_vma->degree);
360 put_anon_vma(anon_vma); 400 put_anon_vma(anon_vma);
361 401
362 list_del(&avc->same_vma); 402 list_del(&avc->same_vma);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..ab2505c3ef54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2921 return false; 2921 return false;
2922 2922
2923 /* 2923 /*
2924 * There is a potential race between when kswapd checks its watermarks 2924 * The throttled processes are normally woken up in balance_pgdat() as
2925 * and a process gets throttled. There is also a potential race if 2925 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
2926 * processes get throttled, kswapd wakes, a large process exits therby 2926 * race between when kswapd checks the watermarks and a process gets
2927 * balancing the zones that causes kswapd to miss a wakeup. If kswapd 2927 * throttled. There is also a potential race if processes get
2928 * is going to sleep, no process should be sleeping on pfmemalloc_wait 2928 * throttled, kswapd wakes, a large process exits thereby balancing the
2929 * so wake them now if necessary. If necessary, processes will wake 2929 * zones, which causes kswapd to exit balance_pgdat() before reaching
2930 * kswapd and get throttled again 2930 * the wake up checks. If kswapd is going to sleep, no process should
2931 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
2932 * the wake up is premature, processes will wake kswapd and get
2933 * throttled again. The difference from wake ups in balance_pgdat() is
2934 * that here we are under prepare_to_wait().
2931 */ 2935 */
2932 if (waitqueue_active(&pgdat->pfmemalloc_wait)) { 2936 if (waitqueue_active(&pgdat->pfmemalloc_wait))
2933 wake_up(&pgdat->pfmemalloc_wait); 2937 wake_up_all(&pgdat->pfmemalloc_wait);
2934 return false;
2935 }
2936 2938
2937 return pgdat_balanced(pgdat, order, classzone_idx); 2939 return pgdat_balanced(pgdat, order, classzone_idx);
2938} 2940}