Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--	fs/eventpoll.c	175
1 file changed, 158 insertions(+), 17 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..ed38801b57a7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
  * This mutex is acquired by ep_free() during the epoll file
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
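
The failure mode this comment describes is visible from userspace once the -ELOOP check added later in this patch is in place. A minimal sketch, assuming a patched kernel (the fd names and the missing error handling on epoll_create1() are purely illustrative):

    #include <sys/epoll.h>
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        int a = epoll_create1(0);
        int b = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };

        ev.data.fd = b;
        if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev))        /* A watches B: accepted */
            perror("add B into A");

        ev.data.fd = a;
        if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev))        /* B watches A: would close the loop */
            printf("add A into B: %s\n", strerror(errno));  /* ELOOP on a patched kernel */
        return 0;
    }

Run sequentially, the second add is simply rejected; the global epmutex exists for the harder case where the two adds race on different CPUs.
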
@@ -145,11 +152,11 @@ struct epitem {
 
 /*
  * This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
+ * structure and represents the main data structure for the eventpoll
  * interface.
  */
 struct eventpoll {
-	/* Protect the this structure access */
+	/* Protect the access to this structure */
 	spinlock_t lock;
 
 	/*
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
 
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
 }
 
 /**
+ * ep_events_available - Checks if ready events might be available.
+ *
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Returns: Returns a value different than zero if ready events are available,
+ *          or zero otherwise.
+ */
+static inline int ep_events_available(struct eventpoll *ep)
+{
+	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+}
+
+/**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
  *                  the same nested call (by the meaning of same cookie) is
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 
 /*
  * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
+ * mechanism. It is called by the stored file descriptors when they
  * have events to report.
  */
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 		goto out_unlock;
 
 	/*
-	 * If we are trasfering events to userspace, we can hold no locks
+	 * If we are transferring events to userspace, we can hold no locks
 	 * (because we're accessing user memory, and because of linux f_op->poll()
-	 * semantics). All the events that happens during that period of time are
+	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
 	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
+/**
+ * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ *           event buffer.
+ *
+ * @ep: Pointer to the eventpoll context.
+ * @events: Pointer to the userspace buffer where the ready events should be
+ *          stored.
+ * @maxevents: Size (in terms of number of events) of the caller event buffer.
+ * @timeout: Maximum timeout for the ready events fetch operation, in
+ *           milliseconds. If the @timeout is zero, the function will not block,
+ *           while if the @timeout is less than zero, the function will block
+ *           until at least one event has been retrieved (or an error
+ *           occurred).
+ *
+ * Returns: Returns the number of ready events which have been fetched, or an
+ *          error code, in case of error.
+ */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
-	int res, eavail, timed_out = 0;
+	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
-	long slack;
+	long slack = 0;
 	wait_queue_t wait;
-	struct timespec end_time;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		ktime_get_ts(&end_time);
-		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
 	} else if (timeout == 0) {
+		/*
+		 * Avoid the unnecessary trip to the wait queue loop, if the
+		 * caller specified a non blocking operation.
+		 */
 		timed_out = 1;
+		spin_lock_irqsave(&ep->lock, flags);
+		goto check_events;
 	}
 
-retry:
+fetch_events:
 	spin_lock_irqsave(&ep->lock, flags);
 
-	res = 0;
-	if (list_empty(&ep->rdllist)) {
+	if (!ep_events_available(ep)) {
 		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
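
The kernel-doc above pins down the epoll_wait() timeout contract that ep_poll() implements. A small userspace sketch of the three cases (the helper name and the 100 ms value are arbitrary):

    #include <sys/epoll.h>

    /* Illustrative only: the three timeout modes documented for ep_poll(). */
    static int wait_modes(int epfd, struct epoll_event *evs, int maxevents)
    {
        int n;

        n = epoll_wait(epfd, evs, maxevents, 0);    /* 0: never blocks, reports what is already ready */
        n = epoll_wait(epfd, evs, maxevents, 100);  /* >0: blocks for at most ~100 ms */
        n = epoll_wait(epfd, evs, maxevents, -1);   /* <0: blocks until an event, a signal, or an error */
        return n;
    }

The timeout == 0 case is what the new check_events fast path in the following hunks serves without ever entering the wait loop.
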
@@ -1154,7 +1209,7 @@ retry:
 			 * to TASK_INTERRUPTIBLE before doing the checks.
 			 */
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&ep->rdllist) || timed_out)
+			if (ep_events_available(ep) || timed_out)
 				break;
 			if (signal_pending(current)) {
 				res = -EINTR;
@@ -1171,8 +1226,9 @@ retry:
 
 		set_current_state(TASK_RUNNING);
 	}
+check_events:
 	/* Is it worth to try to dig for events ? */
-	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+	eavail = ep_events_available(ep);
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
@@ -1183,11 +1239,67 @@ retry:
 	 */
 	if (!res && eavail &&
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
-		goto retry;
+		goto fetch_events;
 
 	return res;
 }
 
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct eventpoll *ep = file->private_data;
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					       ep_loop_check_proc, epi->ffd.file,
+					       epi->ffd.file->private_data, current);
+			if (error != 0)
+				break;
+		}
+	}
+	mutex_unlock(&ep->mtx);
+
+	return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 inside another epoll file (represented by @ep) does not
+ *                 create closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			      ep_loop_check_proc, file, ep, current);
+}
+
 /*
  * Open an eventpoll file descriptor.
  */
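
ep_loop_check_proc() is essentially a depth-first walk over the files watched by the epoll being inserted, bounded by EP_MAX_NESTS and aborted if the target epoll is reached again. A standalone sketch of the same idea (plain C with invented names, not kernel code):

    #define MAX_DEPTH 4    /* stands in for EP_MAX_NESTS */

    struct node {
        struct node **children;    /* the epoll files this epoll watches */
        int nchildren;
    };

    /*
     * Return 0 if linking 'file' underneath 'ep' creates neither a cycle back
     * to 'ep' nor a chain deeper than MAX_DEPTH, -1 otherwise.
     */
    static int loop_check(const struct node *file, const struct node *ep, int depth)
    {
        int i;

        if (file == ep || depth > MAX_DEPTH)
            return -1;
        for (i = 0; i < file->nchildren; i++)
            if (loop_check(file->children[i], ep, depth + 1))
                return -1;
        return 0;
    }

ep_loop_check(ep, tfile) plays the role of loop_check(tfile, ep, 0); a nonzero result becomes -ELOOP in the epoll_ctl() hunk below. The real code additionally serializes the walk through ep->mtx and the nested_calls bookkeeping, which the sketch leaves out.
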
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
+	int did_lock_epmutex = 0;
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 */
 	ep = file->private_data;
 
+	/*
+	 * When we insert an epoll file descriptor inside another epoll file
+	 * descriptor, there is the chance of creating closed loops, which are
+	 * better handled here than in more critical paths.
+	 *
+	 * We hold epmutex across the loop check and the insert in this case, in
+	 * order to prevent two separate inserts from racing and each doing the
+	 * insert "at the same time" such that ep_loop_check passes on both
+	 * before either one does the insert, thereby creating a cycle.
+	 */
+	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+		mutex_lock(&epmutex);
+		did_lock_epmutex = 1;
+		error = -ELOOP;
+		if (ep_loop_check(ep, tfile) != 0)
+			goto error_tgt_fput;
+	}
+
+
 	mutex_lock(&ep->mtx);
 
 	/*
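
The comment above is about concurrency, not only the sequential case: without epmutex held across check plus insert, two racing EPOLL_CTL_ADD calls could each pass ep_loop_check() and then both complete the insert. A sketch of that race from userspace (pthread-based, names invented, build with cc -pthread); on a patched kernel exactly one of the two adds fails with ELOOP:

    #include <sys/epoll.h>
    #include <pthread.h>
    #include <errno.h>
    #include <stdio.h>

    static void *adder(void *arg)
    {
        int *p = arg;    /* p[0]: epoll fd to add into, p[1]: epoll fd being added */
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = p[1] };

        if (epoll_ctl(p[0], EPOLL_CTL_ADD, p[1], &ev))
            printf("adding %d into %d failed: errno %d\n", p[1], p[0], errno);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;
        int a = epoll_create1(0), b = epoll_create1(0);
        int ab[2] = { a, b };    /* thread 1: put b inside a */
        int ba[2] = { b, a };    /* thread 2: put a inside b */

        pthread_create(&t1, NULL, adder, ab);
        pthread_create(&t2, NULL, adder, ba);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }

Whichever add is serialized first by epmutex succeeds; the loop check then sees the freshly created link and rejects the other one.
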
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
+	if (unlikely(did_lock_epmutex))
+		mutex_unlock(&epmutex);
+
 	fput(tfile);
 error_fput:
 	fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 	BUG_ON(max_user_watches < 0);
 
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
 