Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--  fs/eventpoll.c  175
1 files changed, 158 insertions, 17 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..ed38801b57a7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
  * This mutex is acquired by ep_free() during the epoll file
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
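The cycle this new comment guards against is easy to exercise from userspace. A minimal sketch (assuming a kernel that carries this patch; fd names are arbitrary and error handling is trimmed) adds two epoll instances to each other and expects the second EPOLL_CTL_ADD to be refused:

/* Hypothetical demo: build the A->B, B->A cycle the comment describes. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	ev.data.fd = b;
	if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev))	/* A watches B: fine */
		perror("add b into a");

	ev.data.fd = a;
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev))	/* B watches A: closes the loop */
		printf("second add rejected: %s\n", strerror(errno));

	return 0;
}

With the loop check in place the second call should fail and errno is expected to be ELOOP; on a kernel without this patch both calls succeed and the cycle is created.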
@@ -145,11 +152,11 @@ struct epitem {
 
 /*
  * This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
+ * structure and represents the main data structure for the eventpoll
  * interface.
  */
 struct eventpoll {
-	/* Protect the this structure access */
+	/* Protect the access to this structure */
 	spinlock_t lock;
 
 	/*
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
 
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
 }
 
 /**
+ * ep_events_available - Checks if ready events might be available.
+ *
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Returns: Returns a value different than zero if ready events are available,
+ *          or zero otherwise.
+ */
+static inline int ep_events_available(struct eventpoll *ep)
+{
+	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+}
+
+/**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
  *                  the same nested call (by the meaning of same cookie) is
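The new ep_events_available() helper folds the two places readiness can live into one test: the normal ready list, and the overflow list that collects events arriving while a transfer to userspace is in flight (ovflist equal to EP_UNACTIVE_PTR means no transfer is active). A simplified userspace model of that two-list test, with hypothetical types that only stand in for the kernel structures:

/* Simplified model: readiness is either on the ready list, or on an
 * overflow list that is currently active (not the "unused" sentinel). */
#include <stdbool.h>
#include <stddef.h>

struct event_node { struct event_node *next; };

struct demo_ep {
	struct event_node *rdllist;	/* normal ready list */
	struct event_node *ovflist;	/* side list used while handing events to userspace */
};

#define DEMO_UNACTIVE_PTR ((struct event_node *)-1L)

static bool demo_events_available(const struct demo_ep *ep)
{
	return ep->rdllist != NULL || ep->ovflist != DEMO_UNACTIVE_PTR;
}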
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 
 /*
  * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
+ * mechanism. It is called by the stored file descriptors when they
  * have events to report.
  */
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 		goto out_unlock;
 
 	/*
-	 * If we are trasfering events to userspace, we can hold no locks
+	 * If we are transferring events to userspace, we can hold no locks
 	 * (because we're accessing user memory, and because of linux f_op->poll()
-	 * semantics). All the events that happens during that period of time are
+	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
 	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
+/**
+ * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ *           event buffer.
+ *
+ * @ep: Pointer to the eventpoll context.
+ * @events: Pointer to the userspace buffer where the ready events should be
+ *          stored.
+ * @maxevents: Size (in terms of number of events) of the caller event buffer.
+ * @timeout: Maximum timeout for the ready events fetch operation, in
+ *           milliseconds. If the @timeout is zero, the function will not block,
+ *           while if the @timeout is less than zero, the function will block
+ *           until at least one event has been retrieved (or an error
+ *           occurred).
+ *
+ * Returns: Returns the number of ready events which have been fetched, or an
+ *          error code, in case of error.
+ */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
-	int res, eavail, timed_out = 0;
+	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
-	long slack;
+	long slack = 0;
 	wait_queue_t wait;
-	struct timespec end_time;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		ktime_get_ts(&end_time);
-		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
 	} else if (timeout == 0) {
+		/*
+		 * Avoid the unnecessary trip to the wait queue loop, if the
+		 * caller specified a non blocking operation.
+		 */
 		timed_out = 1;
+		spin_lock_irqsave(&ep->lock, flags);
+		goto check_events;
 	}
 
-retry:
+fetch_events:
 	spin_lock_irqsave(&ep->lock, flags);
 
-	res = 0;
-	if (list_empty(&ep->rdllist)) {
+	if (!ep_events_available(ep)) {
 		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
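The new ep_set_mstimeout() turns the relative millisecond timeout into an absolute expiry: the value is split into seconds and nanoseconds and added to the current monotonic time with an overflow-safe add, so a 2500 ms timeout becomes "now" plus {2 s, 500000000 ns}. A zero timeout now takes the spin_lock_irqsave()/goto check_events shortcut instead of entering the wait loop, and a negative timeout leaves "to" as NULL so the wait is unbounded. A userspace sketch of the same millisecond-to-deadline conversion (helper name and constants are made up; the kernel version uses ktime_get_ts() and timespec_add_safe()):

/* Sketch: turn a relative timeout in milliseconds into an absolute
 * CLOCK_MONOTONIC deadline. Overflow clamping is omitted here. */
#include <time.h>

#define DEMO_MSEC_PER_SEC  1000L
#define DEMO_NSEC_PER_MSEC 1000000L
#define DEMO_NSEC_PER_SEC  1000000000L

static struct timespec demo_set_mstimeout(long ms)
{
	struct timespec now, ts = {
		.tv_sec  = ms / DEMO_MSEC_PER_SEC,
		.tv_nsec = DEMO_NSEC_PER_MSEC * (ms % DEMO_MSEC_PER_SEC),
	};

	clock_gettime(CLOCK_MONOTONIC, &now);
	now.tv_sec  += ts.tv_sec;
	now.tv_nsec += ts.tv_nsec;
	if (now.tv_nsec >= DEMO_NSEC_PER_SEC) {	/* normalize the carry */
		now.tv_sec  += 1;
		now.tv_nsec -= DEMO_NSEC_PER_SEC;
	}
	return now;
}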
@@ -1154,7 +1209,7 @@ retry:
 			 * to TASK_INTERRUPTIBLE before doing the checks.
 			 */
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&ep->rdllist) || timed_out)
+			if (ep_events_available(ep) || timed_out)
 				break;
 			if (signal_pending(current)) {
 				res = -EINTR;
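This hunk keeps the usual lost-wakeup discipline: the task marks itself TASK_INTERRUPTIBLE first and only then re-checks the condition, now through the same ep_events_available() predicate used elsewhere, so a wakeup firing between the check and the sleep is still observed. The rough userspace analogue of "never sleep without re-checking the condition", using a condition variable (all names are hypothetical):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_cond = PTHREAD_COND_INITIALIZER;
static bool demo_ready;			/* stands in for "events available" */

static void demo_wait_for_events(void)
{
	pthread_mutex_lock(&demo_lock);
	while (!demo_ready)		/* re-check under the lock, in a loop */
		pthread_cond_wait(&demo_cond, &demo_lock);
	pthread_mutex_unlock(&demo_lock);
}

static void demo_post_event(void)
{
	pthread_mutex_lock(&demo_lock);
	demo_ready = true;
	pthread_cond_signal(&demo_cond);
	pthread_mutex_unlock(&demo_lock);
}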
@@ -1171,8 +1226,9 @@ retry:
 
 		set_current_state(TASK_RUNNING);
 	}
+check_events:
 	/* Is it worth to try to dig for events ? */
-	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+	eavail = ep_events_available(ep);
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
@@ -1183,11 +1239,67 @@ retry:
 	 */
 	if (!res && eavail &&
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
-		goto retry;
+		goto fetch_events;
 
 	return res;
 }
 
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct eventpoll *ep = file->private_data;
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					       ep_loop_check_proc, epi->ffd.file,
+					       epi->ffd.file->private_data, current);
+			if (error != 0)
+				break;
+		}
+	}
+	mutex_unlock(&ep->mtx);
+
+	return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 inside another epoll file (represented by @ep) does not
+ *                 create closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			      ep_loop_check_proc, file, ep, current);
+}
+
 /*
  * Open an eventpoll file descriptor.
  */
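ep_loop_check() starts a depth-first walk, through ep_call_nested() and bounded by EP_MAX_NESTS, over every file watched by the target epoll instance, descending into any watched file that is itself an epoll instance. The walk fails if the nesting bound is exceeded or if ep_call_nested() finds the same epoll instance already on the chain of nested calls, which is how a cycle shows up. A standalone model of that bounded walk over a hypothetical node graph (the kernel iterates struct epitem entries in the ep->rbr rbtree and uses the per-task cookie list rather than an on_path flag):

/* Model: bounded depth-first walk that fails on a cycle or a too-deep chain. */
#include <stdbool.h>
#include <stddef.h>

#define DEMO_MAX_NESTS 4

struct demo_node {
	struct demo_node **children;	/* nested "epoll" nodes */
	size_t nr_children;
	bool on_path;			/* stands in for the cookie check */
};

static int demo_loop_check(struct demo_node *node, int depth)
{
	size_t i;
	int error = 0;

	if (depth > DEMO_MAX_NESTS || node->on_path)
		return -1;

	node->on_path = true;
	for (i = 0; i < node->nr_children; i++) {
		error = demo_loop_check(node->children[i], depth + 1);
		if (error != 0)
			break;
	}
	node->on_path = false;

	return error;
}

Calling demo_loop_check(target, 0) before linking a new child corresponds, loosely, to the ep_loop_check(ep, tfile) call added to epoll_ctl() below.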
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
+	int did_lock_epmutex = 0;
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 */
 	ep = file->private_data;
 
+	/*
+	 * When we insert an epoll file descriptor, inside another epoll file
+	 * descriptor, there is the chance of creating closed loops, which are
+	 * better handled here, than in more critical paths.
+	 *
+	 * We hold epmutex across the loop check and the insert in this case, in
+	 * order to prevent two separate inserts from racing and each doing the
+	 * insert "at the same time" such that ep_loop_check passes on both
+	 * before either one does the insert, thereby creating a cycle.
+	 */
+	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+		mutex_lock(&epmutex);
+		did_lock_epmutex = 1;
+		error = -ELOOP;
+		if (ep_loop_check(ep, tfile) != 0)
+			goto error_tgt_fput;
+	}
+
+
 	mutex_lock(&ep->mtx);
 
 	/*
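Holding epmutex here makes the loop check and the subsequent link a single atomic step with respect to other EPOLL_CTL_ADD callers; without it, two tasks could each run ep_loop_check() before either inserts, and both would pass. A hypothetical userspace reproducer for that race, two threads adding each other's epoll fd at once (compile with -pthread); with the check and insert serialized, at most one add can succeed and the other is expected to fail with ELOOP:

#include <pthread.h>
#include <stdio.h>
#include <sys/epoll.h>

static int fda, fdb;

static void *add_one(void *arg)
{
	int *pair = arg;		/* pair[0] watches pair[1] */
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = pair[1] };

	if (epoll_ctl(pair[0], EPOLL_CTL_ADD, pair[1], &ev))
		perror("epoll_ctl");
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	int ab[2], ba[2];

	fda = epoll_create1(0);
	fdb = epoll_create1(0);
	ab[0] = fda; ab[1] = fdb;
	ba[0] = fdb; ba[1] = fda;

	pthread_create(&t1, NULL, add_one, ab);
	pthread_create(&t2, NULL, add_one, ba);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}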
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
+	if (unlikely(did_lock_epmutex))
+		mutex_unlock(&epmutex);
+
 	fput(tfile);
 error_fput:
 	fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 	BUG_ON(max_user_watches < 0);
 
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
 