diff options
author | Davide Libenzi <davidel@xmailserver.org> | 2008-12-01 16:13:55 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-12-01 22:55:24 -0500 |
commit | 7ef9964e6d1b911b78709f144000aacadd0ebc21 (patch) | |
tree | 30667d0a2f8e53973ff48d2c02df48bbc6fe74aa | |
parent | b7d271df873c5121a4ca1c70dea126b5920ec2f1 (diff) |
epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large
amount of kernel memory, well within its maximum number of fds. To
solve this problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:
max_user_instances = Maximum number of devices - per user
max_user_watches = Maximum number of "watched" fds - per user
The current default for "max_user_watches" limits the memory used by epoll
to store "watches" to 1/32 of the amount of low RAM. As an example, a
256MB 32bit machine will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, which should be
enough too.
This also changes userspace-visible behavior, because EPOLL_CTL_ADD can now
return a new error code (-ENOSPC). The EMFILE from epoll_create() was already
listed, so that should be ok.
[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/filesystems/proc.txt | 27 | ||||
-rw-r--r-- | fs/eventpoll.c | 85 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | kernel/sysctl.c | 10 |
4 files changed, 118 insertions, 8 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index bcceb99b81d..bb1b0dd3bfc 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -44,6 +44,7 @@ Table of Contents | |||
44 | 2.14 /proc/<pid>/io - Display the IO accounting fields | 44 | 2.14 /proc/<pid>/io - Display the IO accounting fields |
45 | 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings | 45 | 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings |
46 | 2.16 /proc/<pid>/mountinfo - Information about mounts | 46 | 2.16 /proc/<pid>/mountinfo - Information about mounts |
47 | 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface | ||
47 | 48 | ||
48 | ------------------------------------------------------------------------------ | 49 | ------------------------------------------------------------------------------ |
49 | Preface | 50 | Preface |
@@ -2483,4 +2484,30 @@ For more information on mount propagation see: | |||
2483 | 2484 | ||
2484 | Documentation/filesystems/sharedsubtree.txt | 2485 | Documentation/filesystems/sharedsubtree.txt |
2485 | 2486 | ||
2487 | 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface | ||
2488 | -------------------------------------------------------- | ||
2489 | |||
2490 | This directory contains configuration options for the epoll(7) interface. | ||
2491 | |||
2492 | max_user_instances | ||
2493 | ------------------ | ||
2494 | |||
2495 | This is the maximum number of epoll file descriptors that a single user can | ||
2496 | have open at a given time. The default value is 128, and should be enough | ||
2497 | for normal users. | ||
2498 | |||
2499 | max_user_watches | ||
2500 | ---------------- | ||
2501 | |||
2502 | Every epoll file descriptor can store a number of files to be monitored | ||
2503 | for event readiness. Each one of these monitored files constitutes a "watch". | ||
2504 | This configuration option sets the maximum number of "watches" that are | ||
2505 | allowed for each user. | ||
2506 | Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes | ||
2507 | on a 64bit one. | ||
2508 | The current default value for max_user_watches is 1/32 of the available | ||
2509 | low memory, divided by the "watch" cost in bytes. | ||
2510 | |||
2511 | |||
2486 | ------------------------------------------------------------------------------ | 2512 | ------------------------------------------------------------------------------ |
2513 | |||
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aec5c13f634..96355d50534 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -102,6 +102,8 @@ | |||
102 | 102 | ||
103 | #define EP_UNACTIVE_PTR ((void *) -1L) | 103 | #define EP_UNACTIVE_PTR ((void *) -1L) |
104 | 104 | ||
105 | #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) | ||
106 | |||
105 | struct epoll_filefd { | 107 | struct epoll_filefd { |
106 | struct file *file; | 108 | struct file *file; |
107 | int fd; | 109 | int fd; |
@@ -200,6 +202,9 @@ struct eventpoll { | |||
200 | * holding ->lock. | 202 | * holding ->lock. |
201 | */ | 203 | */ |
202 | struct epitem *ovflist; | 204 | struct epitem *ovflist; |
205 | |||
206 | /* The user that created the eventpoll descriptor */ | ||
207 | struct user_struct *user; | ||
203 | }; | 208 | }; |
204 | 209 | ||
205 | /* Wait structure used by the poll hooks */ | 210 | /* Wait structure used by the poll hooks */ |
@@ -227,9 +232,17 @@ struct ep_pqueue { | |||
227 | }; | 232 | }; |
228 | 233 | ||
229 | /* | 234 | /* |
235 | * Configuration options available inside /proc/sys/fs/epoll/ | ||
236 | */ | ||
237 | /* Maximum number of epoll devices, per user */ | ||
238 | static int max_user_instances __read_mostly; | ||
239 | /* Maximum number of epoll watched descriptors, per user */ | ||
240 | static int max_user_watches __read_mostly; | ||
241 | |||
242 | /* | ||
230 | * This mutex is used to serialize ep_free() and eventpoll_release_file(). | 243 | * This mutex is used to serialize ep_free() and eventpoll_release_file(). |
231 | */ | 244 | */ |
232 | static struct mutex epmutex; | 245 | static DEFINE_MUTEX(epmutex); |
233 | 246 | ||
234 | /* Safe wake up implementation */ | 247 | /* Safe wake up implementation */ |
235 | static struct poll_safewake psw; | 248 | static struct poll_safewake psw; |
@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; | |||
240 | /* Slab cache used to allocate "struct eppoll_entry" */ | 253 | /* Slab cache used to allocate "struct eppoll_entry" */ |
241 | static struct kmem_cache *pwq_cache __read_mostly; | 254 | static struct kmem_cache *pwq_cache __read_mostly; |
242 | 255 | ||
256 | #ifdef CONFIG_SYSCTL | ||
257 | |||
258 | #include <linux/sysctl.h> | ||
259 | |||
260 | static int zero; | ||
261 | |||
262 | ctl_table epoll_table[] = { | ||
263 | { | ||
264 | .procname = "max_user_instances", | ||
265 | .data = &max_user_instances, | ||
266 | .maxlen = sizeof(int), | ||
267 | .mode = 0644, | ||
268 | .proc_handler = &proc_dointvec_minmax, | ||
269 | .extra1 = &zero, | ||
270 | }, | ||
271 | { | ||
272 | .procname = "max_user_watches", | ||
273 | .data = &max_user_watches, | ||
274 | .maxlen = sizeof(int), | ||
275 | .mode = 0644, | ||
276 | .proc_handler = &proc_dointvec_minmax, | ||
277 | .extra1 = &zero, | ||
278 | }, | ||
279 | { .ctl_name = 0 } | ||
280 | }; | ||
281 | #endif /* CONFIG_SYSCTL */ | ||
282 | |||
243 | 283 | ||
244 | /* Setup the structure that is used as key for the RB tree */ | 284 | /* Setup the structure that is used as key for the RB tree */ |
245 | static inline void ep_set_ffd(struct epoll_filefd *ffd, | 285 | static inline void ep_set_ffd(struct epoll_filefd *ffd, |
@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) | |||
402 | /* At this point it is safe to free the eventpoll item */ | 442 | /* At this point it is safe to free the eventpoll item */ |
403 | kmem_cache_free(epi_cache, epi); | 443 | kmem_cache_free(epi_cache, epi); |
404 | 444 | ||
445 | atomic_dec(&ep->user->epoll_watches); | ||
446 | |||
405 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", | 447 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", |
406 | current, ep, file)); | 448 | current, ep, file)); |
407 | 449 | ||
@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) | |||
449 | 491 | ||
450 | mutex_unlock(&epmutex); | 492 | mutex_unlock(&epmutex); |
451 | mutex_destroy(&ep->mtx); | 493 | mutex_destroy(&ep->mtx); |
494 | atomic_dec(&ep->user->epoll_devs); | ||
495 | free_uid(ep->user); | ||
452 | kfree(ep); | 496 | kfree(ep); |
453 | } | 497 | } |
454 | 498 | ||
@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) | |||
532 | 576 | ||
533 | static int ep_alloc(struct eventpoll **pep) | 577 | static int ep_alloc(struct eventpoll **pep) |
534 | { | 578 | { |
535 | struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); | 579 | int error; |
580 | struct user_struct *user; | ||
581 | struct eventpoll *ep; | ||
536 | 582 | ||
537 | if (!ep) | 583 | user = get_current_user(); |
538 | return -ENOMEM; | 584 | error = -EMFILE; |
585 | if (unlikely(atomic_read(&user->epoll_devs) >= | ||
586 | max_user_instances)) | ||
587 | goto free_uid; | ||
588 | error = -ENOMEM; | ||
589 | ep = kzalloc(sizeof(*ep), GFP_KERNEL); | ||
590 | if (unlikely(!ep)) | ||
591 | goto free_uid; | ||
539 | 592 | ||
540 | spin_lock_init(&ep->lock); | 593 | spin_lock_init(&ep->lock); |
541 | mutex_init(&ep->mtx); | 594 | mutex_init(&ep->mtx); |
@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) | |||
544 | INIT_LIST_HEAD(&ep->rdllist); | 597 | INIT_LIST_HEAD(&ep->rdllist); |
545 | ep->rbr = RB_ROOT; | 598 | ep->rbr = RB_ROOT; |
546 | ep->ovflist = EP_UNACTIVE_PTR; | 599 | ep->ovflist = EP_UNACTIVE_PTR; |
600 | ep->user = user; | ||
547 | 601 | ||
548 | *pep = ep; | 602 | *pep = ep; |
549 | 603 | ||
550 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", | 604 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", |
551 | current, ep)); | 605 | current, ep)); |
552 | return 0; | 606 | return 0; |
607 | |||
608 | free_uid: | ||
609 | free_uid(user); | ||
610 | return error; | ||
553 | } | 611 | } |
554 | 612 | ||
555 | /* | 613 | /* |
@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
703 | struct epitem *epi; | 761 | struct epitem *epi; |
704 | struct ep_pqueue epq; | 762 | struct ep_pqueue epq; |
705 | 763 | ||
706 | error = -ENOMEM; | 764 | if (unlikely(atomic_read(&ep->user->epoll_watches) >= |
765 | max_user_watches)) | ||
766 | return -ENOSPC; | ||
707 | if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) | 767 | if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) |
708 | goto error_return; | 768 | return -ENOMEM; |
709 | 769 | ||
710 | /* Item initialization follow here ... */ | 770 | /* Item initialization follow here ... */ |
711 | INIT_LIST_HEAD(&epi->rdllink); | 771 | INIT_LIST_HEAD(&epi->rdllink); |
@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
735 | * install process. Namely an allocation for a wait queue failed due | 795 | * install process. Namely an allocation for a wait queue failed due |
736 | * high memory pressure. | 796 | * high memory pressure. |
737 | */ | 797 | */ |
798 | error = -ENOMEM; | ||
738 | if (epi->nwait < 0) | 799 | if (epi->nwait < 0) |
739 | goto error_unregister; | 800 | goto error_unregister; |
740 | 801 | ||
@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
765 | 826 | ||
766 | spin_unlock_irqrestore(&ep->lock, flags); | 827 | spin_unlock_irqrestore(&ep->lock, flags); |
767 | 828 | ||
829 | atomic_inc(&ep->user->epoll_watches); | ||
830 | |||
768 | /* We have to call this outside the lock */ | 831 | /* We have to call this outside the lock */ |
769 | if (pwake) | 832 | if (pwake) |
770 | ep_poll_safewake(&psw, &ep->poll_wait); | 833 | ep_poll_safewake(&psw, &ep->poll_wait); |
@@ -789,7 +852,7 @@ error_unregister: | |||
789 | spin_unlock_irqrestore(&ep->lock, flags); | 852 | spin_unlock_irqrestore(&ep->lock, flags); |
790 | 853 | ||
791 | kmem_cache_free(epi_cache, epi); | 854 | kmem_cache_free(epi_cache, epi); |
792 | error_return: | 855 | |
793 | return error; | 856 | return error; |
794 | } | 857 | } |
795 | 858 | ||
@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) | |||
1078 | flags & O_CLOEXEC); | 1141 | flags & O_CLOEXEC); |
1079 | if (fd < 0) | 1142 | if (fd < 0) |
1080 | ep_free(ep); | 1143 | ep_free(ep); |
1144 | atomic_inc(&ep->user->epoll_devs); | ||
1081 | 1145 | ||
1082 | error_return: | 1146 | error_return: |
1083 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", | 1147 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", |
@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, | |||
1299 | 1363 | ||
1300 | static int __init eventpoll_init(void) | 1364 | static int __init eventpoll_init(void) |
1301 | { | 1365 | { |
1302 | mutex_init(&epmutex); | 1366 | struct sysinfo si; |
1367 | |||
1368 | si_meminfo(&si); | ||
1369 | max_user_instances = 128; | ||
1370 | max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / | ||
1371 | EP_ITEM_COST; | ||
1303 | 1372 | ||
1304 | /* Initialize the structure used to perform safe poll wait head wake ups */ | 1373 | /* Initialize the structure used to perform safe poll wait head wake ups */ |
1305 | ep_poll_safewake_init(&psw); | 1374 | ep_poll_safewake_init(&psw); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 644ffbda17c..55e30d11447 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -630,6 +630,10 @@ struct user_struct { | |||
630 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ | 630 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ |
631 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ | 631 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ |
632 | #endif | 632 | #endif |
633 | #ifdef CONFIG_EPOLL | ||
634 | atomic_t epoll_devs; /* The number of epoll descriptors currently open */ | ||
635 | atomic_t epoll_watches; /* The number of file descriptors currently watched */ | ||
636 | #endif | ||
633 | #ifdef CONFIG_POSIX_MQUEUE | 637 | #ifdef CONFIG_POSIX_MQUEUE |
634 | /* protected by mq_lock */ | 638 | /* protected by mq_lock */ |
635 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ | 639 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d90..3d56fe7570d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[]; | |||
176 | #ifdef CONFIG_INOTIFY_USER | 176 | #ifdef CONFIG_INOTIFY_USER |
177 | extern struct ctl_table inotify_table[]; | 177 | extern struct ctl_table inotify_table[]; |
178 | #endif | 178 | #endif |
179 | #ifdef CONFIG_EPOLL | ||
180 | extern struct ctl_table epoll_table[]; | ||
181 | #endif | ||
179 | 182 | ||
180 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 183 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
181 | int sysctl_legacy_va_layout; | 184 | int sysctl_legacy_va_layout; |
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = { | |||
1325 | .child = inotify_table, | 1328 | .child = inotify_table, |
1326 | }, | 1329 | }, |
1327 | #endif | 1330 | #endif |
1331 | #ifdef CONFIG_EPOLL | ||
1332 | { | ||
1333 | .procname = "epoll", | ||
1334 | .mode = 0555, | ||
1335 | .child = epoll_table, | ||
1336 | }, | ||
1337 | #endif | ||
1328 | #endif | 1338 | #endif |
1329 | { | 1339 | { |
1330 | .ctl_name = KERN_SETUID_DUMPABLE, | 1340 | .ctl_name = KERN_SETUID_DUMPABLE, |