aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-01-12 23:42:54 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-01-12 23:42:54 -0500
commit099469502f62fbe0d7e4f0b83a2f22538367f734 (patch)
tree5229c3818b2e6e09d35026d49314047121130536 /fs
parent7c17d86a8502c2e30c2eea777ed1b830aa3b447b (diff)
parent35f1526845a9d804206883e19bd257d3dcef758f (diff)
Merge branch 'akpm' (aka "Andrew's patch-bomb, take two")
Andrew explains: - various misc stuff - Most of the rest of MM: memcg, threaded hugepages, others. - cpumask - kexec - kdump - some direct-io performance tweaking - radix-tree optimisations - new selftests code A note on this: often people will develop a new userspace-visible feature and will develop userspace code to exercise/test that feature. Then they merge the patch and the selftest code dies. Sometimes we paste it into the changelog. Sometimes the code gets thrown into Documentation/(!). This saddens me. So this patch creates a bare-bones framework which will henceforth allow me to ask people to include their test apps in the kernel tree so we can keep them alive. Then when people enhance or fix the feature, I can ask them to update the test app too. The infrastructure is terribly trivial at present - let's see how it evolves. - checkpoint/restart feature work. A note on this: this is a project by various mad Russians to perform c/r mainly from userspace, with various oddball helper code added into the kernel where the need is demonstrated. So rather than some large central lump of code, what we have is little bits and pieces popping up in various places which either expose something new or which permit something which is normally kernel-private to be modified. The overall project is an ongoing thing. I've judged that the size and scope of the thing means that we're more likely to be successful with it if we integrate the support into mainline piecemeal rather than allowing it all to develop out-of-tree. However I'm less confident than the developers that it will all eventually work! So what I'm asking them to do is to wrap each piece of new code inside CONFIG_CHECKPOINT_RESTORE. So if it all eventually comes to tears and the project as a whole fails, it should be a simple matter to go through and delete all trace of it. This lot pretty much wraps up the -rc1 merge for me.
* akpm: (96 commits) unlzo: fix input buffer free ramoops: update parameters only after successful init ramoops: fix use of rounddown_pow_of_two() c/r: prctl: add PR_SET_MM codes to set up mm_struct entries c/r: procfs: add start_data, end_data, start_brk members to /proc/$pid/stat v4 c/r: introduce CHECKPOINT_RESTORE symbol selftests: new x86 breakpoints selftest selftests: new very basic kernel selftests directory radix_tree: take radix_tree_path off stack radix_tree: remove radix_tree_indirect_to_ptr() dio: optimize cache misses in the submission path vfs: cache request_queue in struct block_device fs/direct-io.c: calculate fs_count correctly in get_more_blocks() drivers/parport/parport_pc.c: fix warnings panic: don't print redundant backtraces on oops sysctl: add the kernel.ns_last_pid control kdump: add udev events for memory online/offline include/linux/crash_dump.h needs elf.h kdump: fix crash_kexec()/smp_send_stop() race in panic() kdump: crashk_res init check for /sys/kernel/kexec_crash_size ...
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c3
-rw-r--r--fs/btrfs/disk-io.c5
-rw-r--r--fs/direct-io.c57
-rw-r--r--fs/eventpoll.c234
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c2
10 files changed, 269 insertions, 50 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index afe74dda632b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1139 mutex_lock_nested(&bdev->bd_mutex, for_part); 1139 mutex_lock_nested(&bdev->bd_mutex, for_part);
1140 if (!bdev->bd_openers) { 1140 if (!bdev->bd_openers) {
1141 bdev->bd_disk = disk; 1141 bdev->bd_disk = disk;
1142 bdev->bd_queue = disk->queue;
1142 bdev->bd_contains = bdev; 1143 bdev->bd_contains = bdev;
1143 if (!partno) { 1144 if (!partno) {
1144 struct backing_dev_info *bdi; 1145 struct backing_dev_info *bdi;
@@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1159 disk_put_part(bdev->bd_part); 1160 disk_put_part(bdev->bd_part);
1160 bdev->bd_part = NULL; 1161 bdev->bd_part = NULL;
1161 bdev->bd_disk = NULL; 1162 bdev->bd_disk = NULL;
1163 bdev->bd_queue = NULL;
1162 mutex_unlock(&bdev->bd_mutex); 1164 mutex_unlock(&bdev->bd_mutex);
1163 disk_unblock_events(disk); 1165 disk_unblock_events(disk);
1164 put_disk(disk); 1166 put_disk(disk);
@@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1232 disk_put_part(bdev->bd_part); 1234 disk_put_part(bdev->bd_part);
1233 bdev->bd_disk = NULL; 1235 bdev->bd_disk = NULL;
1234 bdev->bd_part = NULL; 1236 bdev->bd_part = NULL;
1237 bdev->bd_queue = NULL;
1235 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1238 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1236 if (bdev != bdev->bd_contains) 1239 if (bdev != bdev->bd_contains)
1237 __blkdev_put(bdev->bd_contains, mode, 1); 1240 __blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..d8525662ca7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
872 872
873#ifdef CONFIG_MIGRATION 873#ifdef CONFIG_MIGRATION
874static int btree_migratepage(struct address_space *mapping, 874static int btree_migratepage(struct address_space *mapping,
875 struct page *newpage, struct page *page) 875 struct page *newpage, struct page *page,
876 enum migrate_mode mode)
876{ 877{
877 /* 878 /*
878 * we can't safely write a btree page from here, 879 * we can't safely write a btree page from here,
@@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping,
887 if (page_has_private(page) && 888 if (page_has_private(page) &&
888 !try_to_release_page(page, GFP_KERNEL)) 889 !try_to_release_page(page, GFP_KERNEL))
889 return -EAGAIN; 890 return -EAGAIN;
890 return migrate_page(mapping, newpage, page); 891 return migrate_page(mapping, newpage, page, mode);
891} 892}
892#endif 893#endif
893 894
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/atomic.h> 38#include <linux/atomic.h>
39#include <linux/prefetch.h>
39 40
40/* 41/*
41 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
580{ 581{
581 int ret; 582 int ret;
582 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 583 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
584 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
583 unsigned long fs_count; /* Number of filesystem-sized blocks */ 585 unsigned long fs_count; /* Number of filesystem-sized blocks */
584 unsigned long dio_count;/* Number of dio_block-sized blocks */
585 unsigned long blkmask;
586 int create; 586 int create;
587 587
588 /* 588 /*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
593 if (ret == 0) { 593 if (ret == 0) {
594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
595 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 595 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
596 dio_count = sdio->final_block_in_request - sdio->block_in_file; 596 fs_endblk = (sdio->final_block_in_request - 1) >>
597 fs_count = dio_count >> sdio->blkfactor; 597 sdio->blkfactor;
598 blkmask = (1 << sdio->blkfactor) - 1; 598 fs_count = fs_endblk - fs_startblk + 1;
599 if (dio_count & blkmask)
600 fs_count++;
601 599
602 map_bh->b_state = 0; 600 map_bh->b_state = 0;
603 map_bh->b_size = fs_count << dio->inode->i_blkbits; 601 map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
1090 * individual fields and will generate much worse code. This is important 1088 * individual fields and will generate much worse code. This is important
1091 * for the whole file. 1089 * for the whole file.
1092 */ 1090 */
1093ssize_t 1091static inline ssize_t
1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1092do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1095 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1093 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1096 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1094 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1097 dio_submit_t submit_io, int flags) 1095 dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1100 size_t size; 1098 size_t size;
1101 unsigned long addr; 1099 unsigned long addr;
1102 unsigned blkbits = inode->i_blkbits; 1100 unsigned blkbits = inode->i_blkbits;
1103 unsigned bdev_blkbits = 0;
1104 unsigned blocksize_mask = (1 << blkbits) - 1; 1101 unsigned blocksize_mask = (1 << blkbits) - 1;
1105 ssize_t retval = -EINVAL; 1102 ssize_t retval = -EINVAL;
1106 loff_t end = offset; 1103 loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1113 if (rw & WRITE) 1110 if (rw & WRITE)
1114 rw = WRITE_ODIRECT; 1111 rw = WRITE_ODIRECT;
1115 1112
1116 if (bdev) 1113 /*
1117 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1114 * Avoid references to bdev if not absolutely needed to give
1115 * the early prefetch in the caller enough time.
1116 */
1118 1117
1119 if (offset & blocksize_mask) { 1118 if (offset & blocksize_mask) {
1120 if (bdev) 1119 if (bdev)
1121 blkbits = bdev_blkbits; 1120 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1122 blocksize_mask = (1 << blkbits) - 1; 1121 blocksize_mask = (1 << blkbits) - 1;
1123 if (offset & blocksize_mask) 1122 if (offset & blocksize_mask)
1124 goto out; 1123 goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1129 addr = (unsigned long)iov[seg].iov_base; 1128 addr = (unsigned long)iov[seg].iov_base;
1130 size = iov[seg].iov_len; 1129 size = iov[seg].iov_len;
1131 end += size; 1130 end += size;
1132 if ((addr & blocksize_mask) || (size & blocksize_mask)) { 1131 if (unlikely((addr & blocksize_mask) ||
1132 (size & blocksize_mask))) {
1133 if (bdev) 1133 if (bdev)
1134 blkbits = bdev_blkbits; 1134 blkbits = blksize_bits(
1135 bdev_logical_block_size(bdev));
1135 blocksize_mask = (1 << blkbits) - 1; 1136 blocksize_mask = (1 << blkbits) - 1;
1136 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1137 if ((addr & blocksize_mask) || (size & blocksize_mask))
1137 goto out; 1138 goto out;
1138 } 1139 }
1139 } 1140 }
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1316out: 1317out:
1317 return retval; 1318 return retval;
1318} 1319}
1320
1321ssize_t
1322__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1323 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1324 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1325 dio_submit_t submit_io, int flags)
1326{
1327 /*
1328 * The block device state is needed in the end to finally
1329 * submit everything. Since it's likely to be cache cold
1330 * prefetch it here as first thing to hide some of the
1331 * latency.
1332 *
1333 * Attempt to prefetch the pieces we likely need later.
1334 */
1335 prefetch(&bdev->bd_disk->part_tbl);
1336 prefetch(bdev->bd_queue);
1337 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1338
1339 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1340 nr_segs, get_block, end_io,
1341 submit_io, flags);
1342}
1343
1319EXPORT_SYMBOL(__blockdev_direct_IO); 1344EXPORT_SYMBOL(__blockdev_direct_IO);
1320 1345
1321static __init int dio_init(void) 1346static __init int dio_init(void)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
197 197
198 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
199 struct user_struct *user; 199 struct user_struct *user;
200
201 struct file *file;
202
203 /* used to optimize loop detection check */
204 int visited;
205 struct list_head visited_list_link;
200}; 206};
201 207
202/* Wait structure used by the poll hooks */ 208/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
255/* Slab cache used to allocate "struct eppoll_entry" */ 261/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly; 262static struct kmem_cache *pwq_cache __read_mostly;
257 263
264/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
265static LIST_HEAD(visited_list);
266
267/*
268 * List of files with newly added links, where we may need to limit the number
269 * of emanating paths. Protected by the epmutex.
270 */
271static LIST_HEAD(tfile_check_list);
272
258#ifdef CONFIG_SYSCTL 273#ifdef CONFIG_SYSCTL
259 274
260#include <linux/sysctl.h> 275#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
276}; 291};
277#endif /* CONFIG_SYSCTL */ 292#endif /* CONFIG_SYSCTL */
278 293
294static const struct file_operations eventpoll_fops;
295
296static inline int is_file_epoll(struct file *f)
297{
298 return f->f_op == &eventpoll_fops;
299}
279 300
280/* Setup the structure that is used as key for the RB tree */ 301/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd, 302static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
711 .llseek = noop_llseek, 732 .llseek = noop_llseek,
712}; 733};
713 734
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720/* 735/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll 736 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are 737 * interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
926 rb_insert_color(&epi->rbn, &ep->rbr); 941 rb_insert_color(&epi->rbn, &ep->rbr);
927} 942}
928 943
944
945
946#define PATH_ARR_SIZE 5
947/*
948 * These are the number of paths of length 1 to 5, that we are allowing to emanate
949 * from a single file of interest. For example, we allow 1000 paths of length
950 * 1, to emanate from each file of interest. This essentially represents the
951 * potential wakeup paths, which need to be limited in order to avoid massive
952 * uncontrolled wakeup storms. The common use case should be a single ep which
953 * is connected to n file sources. In this case each file source has 1 path
954 * of length 1. Thus, the numbers below should be more than sufficient. These
955 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
956 * and delete can't add additional paths. Protected by the epmutex.
957 */
958static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
959static int path_count[PATH_ARR_SIZE];
960
961static int path_count_inc(int nests)
962{
963 if (++path_count[nests] > path_limits[nests])
964 return -1;
965 return 0;
966}
967
968static void path_count_init(void)
969{
970 int i;
971
972 for (i = 0; i < PATH_ARR_SIZE; i++)
973 path_count[i] = 0;
974}
975
976static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
977{
978 int error = 0;
979 struct file *file = priv;
980 struct file *child_file;
981 struct epitem *epi;
982
983 list_for_each_entry(epi, &file->f_ep_links, fllink) {
984 child_file = epi->ep->file;
985 if (is_file_epoll(child_file)) {
986 if (list_empty(&child_file->f_ep_links)) {
987 if (path_count_inc(call_nests)) {
988 error = -1;
989 break;
990 }
991 } else {
992 error = ep_call_nested(&poll_loop_ncalls,
993 EP_MAX_NESTS,
994 reverse_path_check_proc,
995 child_file, child_file,
996 current);
997 }
998 if (error != 0)
999 break;
1000 } else {
1001 printk(KERN_ERR "reverse_path_check_proc: "
1002 "file is not an ep!\n");
1003 }
1004 }
1005 return error;
1006}
1007
1008/**
1009 * reverse_path_check - The tfile_check_list is list of file *, which have
1010 * links that are proposed to be newly added. We need to
1011 * make sure that those added links don't add too many
1012 * paths such that we will spend all our time waking up
1013 * eventpoll objects.
1014 *
1015 * Returns: Returns zero if the proposed links don't create too many paths,
1016 * -1 otherwise.
1017 */
1018static int reverse_path_check(void)
1019{
1020 int length = 0;
1021 int error = 0;
1022 struct file *current_file;
1023
1024 /* let's call this for all tfiles */
1025 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1026 length++;
1027 path_count_init();
1028 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1029 reverse_path_check_proc, current_file,
1030 current_file, current);
1031 if (error)
1032 break;
1033 }
1034 return error;
1035}
1036
929/* 1037/*
930 * Must be called with "mtx" held. 1038 * Must be called with "mtx" held.
931 */ 1039 */
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
987 */ 1095 */
988 ep_rbtree_insert(ep, epi); 1096 ep_rbtree_insert(ep, epi);
989 1097
1098 /* now check if we've created too many backpaths */
1099 error = -EINVAL;
1100 if (reverse_path_check())
1101 goto error_remove_epi;
1102
990 /* We have to drop the new item inside our item list to keep track of it */ 1103 /* We have to drop the new item inside our item list to keep track of it */
991 spin_lock_irqsave(&ep->lock, flags); 1104 spin_lock_irqsave(&ep->lock, flags);
992 1105
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 1124
1012 return 0; 1125 return 0;
1013 1126
1127error_remove_epi:
1128 spin_lock(&tfile->f_lock);
1129 if (ep_is_linked(&epi->fllink))
1130 list_del_init(&epi->fllink);
1131 spin_unlock(&tfile->f_lock);
1132
1133 rb_erase(&epi->rbn, &ep->rbr);
1134
1014error_unregister: 1135error_unregister:
1015 ep_unregister_pollwait(ep, epi); 1136 ep_unregister_pollwait(ep, epi);
1016 1137
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1275 int error = 0; 1396 int error = 0;
1276 struct file *file = priv; 1397 struct file *file = priv;
1277 struct eventpoll *ep = file->private_data; 1398 struct eventpoll *ep = file->private_data;
1399 struct eventpoll *ep_tovisit;
1278 struct rb_node *rbp; 1400 struct rb_node *rbp;
1279 struct epitem *epi; 1401 struct epitem *epi;
1280 1402
1281 mutex_lock_nested(&ep->mtx, call_nests + 1); 1403 mutex_lock_nested(&ep->mtx, call_nests + 1);
1404 ep->visited = 1;
1405 list_add(&ep->visited_list_link, &visited_list);
1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1406 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283 epi = rb_entry(rbp, struct epitem, rbn); 1407 epi = rb_entry(rbp, struct epitem, rbn);
1284 if (unlikely(is_file_epoll(epi->ffd.file))) { 1408 if (unlikely(is_file_epoll(epi->ffd.file))) {
1409 ep_tovisit = epi->ffd.file->private_data;
1410 if (ep_tovisit->visited)
1411 continue;
1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1412 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286 ep_loop_check_proc, epi->ffd.file, 1413 ep_loop_check_proc, epi->ffd.file,
1287 epi->ffd.file->private_data, current); 1414 ep_tovisit, current);
1288 if (error != 0) 1415 if (error != 0)
1289 break; 1416 break;
1417 } else {
1418 /*
1419 * If we've reached a file that is not associated with
1420 * an ep, then we need to check if the newly added
1421 * links are going to add too many wakeup paths. We do
1422 * this by adding it to the tfile_check_list, if it's
1423 * not already there, and calling reverse_path_check()
1424 * during ep_insert().
1425 */
1426 if (list_empty(&epi->ffd.file->f_tfile_llink))
1427 list_add(&epi->ffd.file->f_tfile_llink,
1428 &tfile_check_list);
1290 } 1429 }
1291 } 1430 }
1292 mutex_unlock(&ep->mtx); 1431 mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1307 */ 1446 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file) 1447static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{ 1448{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1449 int ret;
1450 struct eventpoll *ep_cur, *ep_next;
1451
1452 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current); 1453 ep_loop_check_proc, file, ep, current);
1454 /* clear visited list */
1455 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1456 visited_list_link) {
1457 ep_cur->visited = 0;
1458 list_del(&ep_cur->visited_list_link);
1459 }
1460 return ret;
1461}
1462
1463static void clear_tfile_check_list(void)
1464{
1465 struct file *file;
1466
1467 /* first clear the tfile_check_list */
1468 while (!list_empty(&tfile_check_list)) {
1469 file = list_first_entry(&tfile_check_list, struct file,
1470 f_tfile_llink);
1471 list_del_init(&file->f_tfile_llink);
1472 }
1473 INIT_LIST_HEAD(&tfile_check_list);
1312} 1474}
1313 1475
1314/* 1476/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
1316 */ 1478 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags) 1479SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{ 1480{
1319 int error; 1481 int error, fd;
1320 struct eventpoll *ep = NULL; 1482 struct eventpoll *ep = NULL;
1483 struct file *file;
1321 1484
1322 /* Check the EPOLL_* constant for consistency. */ 1485 /* Check the EPOLL_* constant for consistency. */
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1486 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1334 * Creates all the items needed to setup an eventpoll file. That is, 1497 * Creates all the items needed to setup an eventpoll file. That is,
1335 * a file structure and a free file descriptor. 1498 * a file structure and a free file descriptor.
1336 */ 1499 */
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1500 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1501 if (fd < 0) {
1502 error = fd;
1503 goto out_free_ep;
1504 }
1505 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC)); 1506 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0) 1507 if (IS_ERR(file)) {
1340 ep_free(ep); 1508 error = PTR_ERR(file);
1341 1509 goto out_free_fd;
1510 }
1511 fd_install(fd, file);
1512 ep->file = file;
1513 return fd;
1514
1515out_free_fd:
1516 put_unused_fd(fd);
1517out_free_ep:
1518 ep_free(ep);
1342 return error; 1519 return error;
1343} 1520}
1344 1521
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1404 /* 1581 /*
1405 * When we insert an epoll file descriptor, inside another epoll file 1582 * When we insert an epoll file descriptor, inside another epoll file
1405 * descriptor, there is the chance of creating closed loops, which are 1583 * descriptor, there is the chance of creating closed loops, which are
1407 * better be handled here, than in more critical paths. 1584 * better be handled here, than in more critical paths. While we are
1585 * checking for loops we also determine the list of files reachable
1586 * and hang them on the tfile_check_list, so we can check that we
1587 * haven't created too many possible wakeup paths.
1408 * 1588 *
1409 * We hold epmutex across the loop check and the insert in this case, in 1589 * We need to hold the epmutex across both ep_insert and ep_remove
1410 * order to prevent two separate inserts from racing and each doing the 1590 * b/c we want to make sure we are looking at a coherent view of
1411 * insert "at the same time" such that ep_loop_check passes on both 1591 * epoll network.
1412 * before either one does the insert, thereby creating a cycle.
1413 */ 1592 */
1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { 1593 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1415 mutex_lock(&epmutex); 1594 mutex_lock(&epmutex);
1416 did_lock_epmutex = 1; 1595 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1420 } 1596 }
1421 1597 if (op == EPOLL_CTL_ADD) {
1598 if (is_file_epoll(tfile)) {
1599 error = -ELOOP;
1600 if (ep_loop_check(ep, tfile) != 0)
1601 goto error_tgt_fput;
1602 } else
1603 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1604 }
1422 1605
1423 mutex_lock_nested(&ep->mtx, 0); 1606 mutex_lock_nested(&ep->mtx, 0);
1424 1607
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1437 error = ep_insert(ep, &epds, tfile, fd); 1620 error = ep_insert(ep, &epds, tfile, fd);
1438 } else 1621 } else
1439 error = -EEXIST; 1622 error = -EEXIST;
1623 clear_tfile_check_list();
1440 break; 1624 break;
1441 case EPOLL_CTL_DEL: 1625 case EPOLL_CTL_DEL:
1442 if (epi) 1626 if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1455 mutex_unlock(&ep->mtx); 1639 mutex_unlock(&ep->mtx);
1456 1640
1457error_tgt_fput: 1641error_tgt_fput:
1458 if (unlikely(did_lock_epmutex)) 1642 if (did_lock_epmutex)
1459 mutex_unlock(&epmutex); 1643 mutex_unlock(&epmutex);
1460 1644
1461 fput(tfile); 1645 fput(tfile);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
583} 583}
584 584
585static int hugetlbfs_migrate_page(struct address_space *mapping, 585static int hugetlbfs_migrate_page(struct address_space *mapping,
586 struct page *newpage, struct page *page) 586 struct page *newpage, struct page *page,
587 enum migrate_mode mode)
587{ 588{
588 int rc; 589 int rc;
589 590
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ee92538b063..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
332 332
333#ifdef CONFIG_MIGRATION 333#ifdef CONFIG_MIGRATION
334extern int nfs_migrate_page(struct address_space *, 334extern int nfs_migrate_page(struct address_space *,
335 struct page *, struct page *); 335 struct page *, struct page *, enum migrate_mode);
336#else 336#else
337#define nfs_migrate_page NULL 337#define nfs_migrate_page NULL
338#endif 338#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0c3885255f97..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1688,7 +1688,7 @@ out_error:
1688 1688
1689#ifdef CONFIG_MIGRATION 1689#ifdef CONFIG_MIGRATION
1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1691 struct page *page) 1691 struct page *page, enum migrate_mode mode)
1692{ 1692{
1693 /* 1693 /*
1694 * If PagePrivate is set, then the page is currently associated with 1694 * If PagePrivate is set, then the page is currently associated with
@@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1703 1703
1704 nfs_fscache_release_page(page, GFP_KERNEL); 1704 nfs_fscache_release_page(page, GFP_KERNEL);
1705 1705
1706 return migrate_page(mapping, newpage, page); 1706 return migrate_page(mapping, newpage, page, mode);
1707} 1707}
1708#endif 1708#endif
1709 1709
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1137 if (nr_pages < pipe->nrbufs) 1137 if (nr_pages < pipe->nrbufs)
1138 return -EBUSY; 1138 return -EBUSY;
1139 1139
1140 bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); 1140 bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
1141 if (unlikely(!bufs)) 1141 if (unlikely(!bufs))
1142 return -ENOMEM; 1142 return -ENOMEM;
1143 1143
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..9252ee3b71e3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
464 464
465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ 465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", 467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
468 pid_nr_ns(pid, ns), 468 pid_nr_ns(pid, ns),
469 tcomm, 469 tcomm,
470 state, 470 state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
511 task->policy, 511 task->policy,
512 (unsigned long long)delayacct_blkio_ticks(task), 512 (unsigned long long)delayacct_blkio_ticks(task),
513 cputime_to_clock_t(gtime), 513 cputime_to_clock_t(gtime),
514 cputime_to_clock_t(cgtime)); 514 cputime_to_clock_t(cgtime),
515 (mm && permitted) ? mm->start_data : 0,
516 (mm && permitted) ? mm->end_data : 0,
517 (mm && permitted) ? mm->start_brk : 0);
515 if (mm) 518 if (mm)
516 mmput(mm); 519 mmput(mm);
517 return 0; 520 return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb2..5485a5388ecb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask)
654 bool has_perms; 654 bool has_perms;
655 655
656 task = get_proc_task(inode); 656 task = get_proc_task(inode);
657 if (!task)
658 return -ESRCH;
657 has_perms = has_pid_permissions(pid, task, 1); 659 has_perms = has_pid_permissions(pid, task, 1);
658 put_task_struct(task); 660 put_task_struct(task);
659 661