aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavide Libenzi <davidel@xmailserver.org>2008-12-01 16:13:55 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2008-12-01 22:55:24 -0500
commit7ef9964e6d1b911b78709f144000aacadd0ebc21 (patch)
tree30667d0a2f8e53973ff48d2c02df48bbc6fe74aa
parentb7d271df873c5121a4ca1c70dea126b5920ec2f1 (diff)
epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <stable@kernel.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Reported-by: Vegard Nossum <vegardno@ifi.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/filesystems/proc.txt27
-rw-r--r--fs/eventpoll.c85
-rw-r--r--include/linux/sched.h4
-rw-r--r--kernel/sysctl.c10
4 files changed, 118 insertions, 8 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index bcceb99b81dd..bb1b0dd3bfcb 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -44,6 +44,7 @@ Table of Contents
44 2.14 /proc/<pid>/io - Display the IO accounting fields 44 2.14 /proc/<pid>/io - Display the IO accounting fields
45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings 45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
46 2.16 /proc/<pid>/mountinfo - Information about mounts 46 2.16 /proc/<pid>/mountinfo - Information about mounts
47 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
47 48
48------------------------------------------------------------------------------ 49------------------------------------------------------------------------------
49Preface 50Preface
@@ -2483,4 +2484,30 @@ For more information on mount propagation see:
2483 2484
2484 Documentation/filesystems/sharedsubtree.txt 2485 Documentation/filesystems/sharedsubtree.txt
2485 2486
24872.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
2488--------------------------------------------------------
2489
2490This directory contains configuration options for the epoll(7) interface.
2491
2492max_user_instances
2493------------------
2494
2495This is the maximum number of epoll file descriptors that a single user can
2496have open at a given time. The default value is 128, and should be enough
2497for normal users.
2498
2499max_user_watches
2500----------------
2501
2502Every epoll file descriptor can store a number of files to be monitored
2503for event readiness. Each one of these monitored files constitutes a "watch".
2504This configuration option sets the maximum number of "watches" that are
2505allowed for each user.
2506Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
2507on a 64bit one.
2508The current default value for max_user_watches is the 1/32 of the available
2509low memory, divided for the "watch" cost in bytes.
2510
2511
2486------------------------------------------------------------------------------ 2512------------------------------------------------------------------------------
2513
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aec5c13f6341..96355d505347 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -102,6 +102,8 @@
102 102
103#define EP_UNACTIVE_PTR ((void *) -1L) 103#define EP_UNACTIVE_PTR ((void *) -1L)
104 104
105#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
106
105struct epoll_filefd { 107struct epoll_filefd {
106 struct file *file; 108 struct file *file;
107 int fd; 109 int fd;
@@ -200,6 +202,9 @@ struct eventpoll {
200 * holding ->lock. 202 * holding ->lock.
201 */ 203 */
202 struct epitem *ovflist; 204 struct epitem *ovflist;
205
206 /* The user that created the eventpoll descriptor */
207 struct user_struct *user;
203}; 208};
204 209
205/* Wait structure used by the poll hooks */ 210/* Wait structure used by the poll hooks */
@@ -227,9 +232,17 @@ struct ep_pqueue {
227}; 232};
228 233
229/* 234/*
235 * Configuration options available inside /proc/sys/fs/epoll/
236 */
237/* Maximum number of epoll devices, per user */
238static int max_user_instances __read_mostly;
239/* Maximum number of epoll watched descriptors, per user */
240static int max_user_watches __read_mostly;
241
242/*
230 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 243 * This mutex is used to serialize ep_free() and eventpoll_release_file().
231 */ 244 */
232static struct mutex epmutex; 245static DEFINE_MUTEX(epmutex);
233 246
234/* Safe wake up implementation */ 247/* Safe wake up implementation */
235static struct poll_safewake psw; 248static struct poll_safewake psw;
@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly;
240/* Slab cache used to allocate "struct eppoll_entry" */ 253/* Slab cache used to allocate "struct eppoll_entry" */
241static struct kmem_cache *pwq_cache __read_mostly; 254static struct kmem_cache *pwq_cache __read_mostly;
242 255
256#ifdef CONFIG_SYSCTL
257
258#include <linux/sysctl.h>
259
260static int zero;
261
262ctl_table epoll_table[] = {
263 {
264 .procname = "max_user_instances",
265 .data = &max_user_instances,
266 .maxlen = sizeof(int),
267 .mode = 0644,
268 .proc_handler = &proc_dointvec_minmax,
269 .extra1 = &zero,
270 },
271 {
272 .procname = "max_user_watches",
273 .data = &max_user_watches,
274 .maxlen = sizeof(int),
275 .mode = 0644,
276 .proc_handler = &proc_dointvec_minmax,
277 .extra1 = &zero,
278 },
279 { .ctl_name = 0 }
280};
281#endif /* CONFIG_SYSCTL */
282
243 283
244/* Setup the structure that is used as key for the RB tree */ 284/* Setup the structure that is used as key for the RB tree */
245static inline void ep_set_ffd(struct epoll_filefd *ffd, 285static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
402 /* At this point it is safe to free the eventpoll item */ 442 /* At this point it is safe to free the eventpoll item */
403 kmem_cache_free(epi_cache, epi); 443 kmem_cache_free(epi_cache, epi);
404 444
445 atomic_dec(&ep->user->epoll_watches);
446
405 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", 447 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
406 current, ep, file)); 448 current, ep, file));
407 449
@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep)
449 491
450 mutex_unlock(&epmutex); 492 mutex_unlock(&epmutex);
451 mutex_destroy(&ep->mtx); 493 mutex_destroy(&ep->mtx);
494 atomic_dec(&ep->user->epoll_devs);
495 free_uid(ep->user);
452 kfree(ep); 496 kfree(ep);
453} 497}
454 498
@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file)
532 576
533static int ep_alloc(struct eventpoll **pep) 577static int ep_alloc(struct eventpoll **pep)
534{ 578{
535 struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); 579 int error;
580 struct user_struct *user;
581 struct eventpoll *ep;
536 582
537 if (!ep) 583 user = get_current_user();
538 return -ENOMEM; 584 error = -EMFILE;
585 if (unlikely(atomic_read(&user->epoll_devs) >=
586 max_user_instances))
587 goto free_uid;
588 error = -ENOMEM;
589 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
590 if (unlikely(!ep))
591 goto free_uid;
539 592
540 spin_lock_init(&ep->lock); 593 spin_lock_init(&ep->lock);
541 mutex_init(&ep->mtx); 594 mutex_init(&ep->mtx);
@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep)
544 INIT_LIST_HEAD(&ep->rdllist); 597 INIT_LIST_HEAD(&ep->rdllist);
545 ep->rbr = RB_ROOT; 598 ep->rbr = RB_ROOT;
546 ep->ovflist = EP_UNACTIVE_PTR; 599 ep->ovflist = EP_UNACTIVE_PTR;
600 ep->user = user;
547 601
548 *pep = ep; 602 *pep = ep;
549 603
550 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", 604 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
551 current, ep)); 605 current, ep));
552 return 0; 606 return 0;
607
608free_uid:
609 free_uid(user);
610 return error;
553} 611}
554 612
555/* 613/*
@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
703 struct epitem *epi; 761 struct epitem *epi;
704 struct ep_pqueue epq; 762 struct ep_pqueue epq;
705 763
706 error = -ENOMEM; 764 if (unlikely(atomic_read(&ep->user->epoll_watches) >=
765 max_user_watches))
766 return -ENOSPC;
707 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 767 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
708 goto error_return; 768 return -ENOMEM;
709 769
710 /* Item initialization follow here ... */ 770 /* Item initialization follow here ... */
711 INIT_LIST_HEAD(&epi->rdllink); 771 INIT_LIST_HEAD(&epi->rdllink);
@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
735 * install process. Namely an allocation for a wait queue failed due 795 * install process. Namely an allocation for a wait queue failed due
736 * high memory pressure. 796 * high memory pressure.
737 */ 797 */
798 error = -ENOMEM;
738 if (epi->nwait < 0) 799 if (epi->nwait < 0)
739 goto error_unregister; 800 goto error_unregister;
740 801
@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
765 826
766 spin_unlock_irqrestore(&ep->lock, flags); 827 spin_unlock_irqrestore(&ep->lock, flags);
767 828
829 atomic_inc(&ep->user->epoll_watches);
830
768 /* We have to call this outside the lock */ 831 /* We have to call this outside the lock */
769 if (pwake) 832 if (pwake)
770 ep_poll_safewake(&psw, &ep->poll_wait); 833 ep_poll_safewake(&psw, &ep->poll_wait);
@@ -789,7 +852,7 @@ error_unregister:
789 spin_unlock_irqrestore(&ep->lock, flags); 852 spin_unlock_irqrestore(&ep->lock, flags);
790 853
791 kmem_cache_free(epi_cache, epi); 854 kmem_cache_free(epi_cache, epi);
792error_return: 855
793 return error; 856 return error;
794} 857}
795 858
@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags)
1078 flags & O_CLOEXEC); 1141 flags & O_CLOEXEC);
1079 if (fd < 0) 1142 if (fd < 0)
1080 ep_free(ep); 1143 ep_free(ep);
1144 atomic_inc(&ep->user->epoll_devs);
1081 1145
1082error_return: 1146error_return:
1083 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1147 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
1299 1363
1300static int __init eventpoll_init(void) 1364static int __init eventpoll_init(void)
1301{ 1365{
1302 mutex_init(&epmutex); 1366 struct sysinfo si;
1367
1368 si_meminfo(&si);
1369 max_user_instances = 128;
1370 max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
1371 EP_ITEM_COST;
1303 1372
1304 /* Initialize the structure used to perform safe poll wait head wake ups */ 1373 /* Initialize the structure used to perform safe poll wait head wake ups */
1305 ep_poll_safewake_init(&psw); 1374 ep_poll_safewake_init(&psw);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 644ffbda17ca..55e30d114477 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,6 +630,10 @@ struct user_struct {
630 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 630 atomic_t inotify_watches; /* How many inotify watches does this user have? */
631 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 631 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
632#endif 632#endif
633#ifdef CONFIG_EPOLL
634 atomic_t epoll_devs; /* The number of epoll descriptors currently open */
635 atomic_t epoll_watches; /* The number of file descriptors currently watched */
636#endif
633#ifdef CONFIG_POSIX_MQUEUE 637#ifdef CONFIG_POSIX_MQUEUE
634 /* protected by mq_lock */ 638 /* protected by mq_lock */
635 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 639 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d902..3d56fe7570da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
176#ifdef CONFIG_INOTIFY_USER 176#ifdef CONFIG_INOTIFY_USER
177extern struct ctl_table inotify_table[]; 177extern struct ctl_table inotify_table[];
178#endif 178#endif
179#ifdef CONFIG_EPOLL
180extern struct ctl_table epoll_table[];
181#endif
179 182
180#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 183#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
181int sysctl_legacy_va_layout; 184int sysctl_legacy_va_layout;
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
1325 .child = inotify_table, 1328 .child = inotify_table,
1326 }, 1329 },
1327#endif 1330#endif
1331#ifdef CONFIG_EPOLL
1332 {
1333 .procname = "epoll",
1334 .mode = 0555,
1335 .child = epoll_table,
1336 },
1337#endif
1328#endif 1338#endif
1329 { 1339 {
1330 .ctl_name = KERN_SETUID_DUMPABLE, 1340 .ctl_name = KERN_SETUID_DUMPABLE,