aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Morton <akpm@linux-foundation.org>2012-11-08 18:53:35 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-11-09 00:41:46 -0500
commita80a6b85b428e6ce12a8363bb1f08d44c50f3252 (patch)
tree250a57516ef79c94119b27ceeab4ef7d3360e6c3
parentc24f9f195edf8c7f78eff1081cdadd26bd272ee3 (diff)
revert "epoll: support for disabling items, and a self-test app"
Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go though my understanding, the problem is the following : scenario in a multithreaded application: : : 1. Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : should EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking on whether : (events & ~EP_PRIVATE_BITS) == 0 : This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events' This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: "Paton J. Lewis" <palewis@adobe.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/eventpoll.c38
-rw-r--r--include/uapi/linux/eventpoll.h1
-rw-r--r--tools/testing/selftests/Makefile2
-rw-r--r--tools/testing/selftests/epoll/Makefile11
-rw-r--r--tools/testing/selftests/epoll/test_epoll.c344
5 files changed, 4 insertions, 392 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ 346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
347static inline int ep_op_has_event(int op) 347static inline int ep_op_has_event(int op)
348{ 348{
349 return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; 349 return op != EPOLL_CTL_DEL;
350} 350}
351 351
352/* Initialize the poll safe wake up structure */ 352/* Initialize the poll safe wake up structure */
@@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
676 return 0; 676 return 0;
677} 677}
678 678
679/*
680 * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
681 * had no event flags set, indicating that another thread may be currently
682 * handling that item's events (in the case that EPOLLONESHOT was being
683 * used). Otherwise a zero result indicates that the item has been disabled
684 * from receiving events. A disabled item may be re-enabled via
685 * EPOLL_CTL_MOD. Must be called with "mtx" held.
686 */
687static int ep_disable(struct eventpoll *ep, struct epitem *epi)
688{
689 int result = 0;
690 unsigned long flags;
691
692 spin_lock_irqsave(&ep->lock, flags);
693 if (epi->event.events & ~EP_PRIVATE_BITS) {
694 if (ep_is_linked(&epi->rdllink))
695 list_del_init(&epi->rdllink);
696 /* Ensure ep_poll_callback will not add epi back onto ready
697 list: */
698 epi->event.events &= EP_PRIVATE_BITS;
699 }
700 else
701 result = -EBUSY;
702 spin_unlock_irqrestore(&ep->lock, flags);
703
704 return result;
705}
706
707static void ep_free(struct eventpoll *ep) 679static void ep_free(struct eventpoll *ep)
708{ 680{
709 struct rb_node *rbp; 681 struct rb_node *rbp;
@@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1048 rb_insert_color(&epi->rbn, &ep->rbr); 1020 rb_insert_color(&epi->rbn, &ep->rbr);
1049} 1021}
1050 1022
1023
1024
1051#define PATH_ARR_SIZE 5 1025#define PATH_ARR_SIZE 5
1052/* 1026/*
1053 * These are the number paths of length 1 to 5, that we are allowing to emanate 1027 * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1813 } else 1787 } else
1814 error = -ENOENT; 1788 error = -ENOENT;
1815 break; 1789 break;
1816 case EPOLL_CTL_DISABLE:
1817 if (epi)
1818 error = ep_disable(ep, epi);
1819 else
1820 error = -ENOENT;
1821 break;
1822 } 1790 }
1823 mutex_unlock(&ep->mtx); 1791 mutex_unlock(&ep->mtx);
1824 1792
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8c99ce7202c5..2c267bcbb85c 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -25,7 +25,6 @@
25#define EPOLL_CTL_ADD 1 25#define EPOLL_CTL_ADD 1
26#define EPOLL_CTL_DEL 2 26#define EPOLL_CTL_DEL 2
27#define EPOLL_CTL_MOD 3 27#define EPOLL_CTL_MOD 3
28#define EPOLL_CTL_DISABLE 4
29 28
30/* 29/*
31 * Request the handling of system wakeup events so as to prevent system suspends 30 * Request the handling of system wakeup events so as to prevent system suspends
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 43480149119e..85baf11e2acd 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
1TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll 1TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug
2 2
3all: 3all:
4 for TARGET in $(TARGETS); do \ 4 for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
deleted file mode 100644
index 19806ed62f50..000000000000
--- a/tools/testing/selftests/epoll/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
1# Makefile for epoll selftests
2
3all: test_epoll
4%: %.c
5 gcc -pthread -g -o $@ $^
6
7run_tests: all
8 ./test_epoll
9
10clean:
11 $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
deleted file mode 100644
index f7525392ce84..000000000000
--- a/tools/testing/selftests/epoll/test_epoll.c
+++ /dev/null
@@ -1,344 +0,0 @@
1/*
2 * tools/testing/selftests/epoll/test_epoll.c
3 *
4 * Copyright 2012 Adobe Systems Incorporated
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * Paton J. Lewis <palewis@adobe.com>
12 *
13 */
14
15#include <errno.h>
16#include <fcntl.h>
17#include <pthread.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <unistd.h>
21#include <sys/epoll.h>
22#include <sys/socket.h>
23
24/*
25 * A pointer to an epoll_item_private structure will be stored in the epoll
26 * item's event structure so that we can get access to the epoll_item_private
27 * data after calling epoll_wait:
28 */
29struct epoll_item_private {
30 int index; /* Position of this struct within the epoll_items array. */
31 int fd;
32 uint32_t events;
33 pthread_mutex_t mutex; /* Guards the following variables... */
34 int stop;
35 int status; /* Stores any error encountered while handling item. */
36 /* The following variable allows us to test whether we have encountered
37 a problem while attempting to cancel and delete the associated
38 event. When the test program exits, 'deleted' should be exactly
39 one. If it is greater than one, then the failed test reflects a real
40 world situation where we would have tried to access the epoll item's
41 private data after deleting it: */
42 int deleted;
43};
44
45struct epoll_item_private *epoll_items;
46
47/*
48 * Delete the specified item from the epoll set. In a real-world secneario this
49 * is where we would free the associated data structure, but in this testing
50 * environment we retain the structure so that we can test for double-deletion:
51 */
52void delete_item(int index)
53{
54 __sync_fetch_and_add(&epoll_items[index].deleted, 1);
55}
56
57/*
58 * A pointer to a read_thread_data structure will be passed as the argument to
59 * each read thread:
60 */
61struct read_thread_data {
62 int stop;
63 int status; /* Indicates any error encountered by the read thread. */
64 int epoll_set;
65};
66
67/*
68 * The function executed by the read threads:
69 */
70void *read_thread_function(void *function_data)
71{
72 struct read_thread_data *thread_data =
73 (struct read_thread_data *)function_data;
74 struct epoll_event event_data;
75 struct epoll_item_private *item_data;
76 char socket_data;
77
78 /* Handle events until we encounter an error or this thread's 'stop'
79 condition is set: */
80 while (1) {
81 int result = epoll_wait(thread_data->epoll_set,
82 &event_data,
83 1, /* Number of desired events */
84 1000); /* Timeout in ms */
85 if (result < 0) {
86 /* Breakpoints signal all threads. Ignore that while
87 debugging: */
88 if (errno == EINTR)
89 continue;
90 thread_data->status = errno;
91 return 0;
92 } else if (thread_data->stop)
93 return 0;
94 else if (result == 0) /* Timeout */
95 continue;
96
97 /* We need the mutex here because checking for the stop
98 condition and re-enabling the epoll item need to be done
99 together as one atomic operation when EPOLL_CTL_DISABLE is
100 available: */
101 item_data = (struct epoll_item_private *)event_data.data.ptr;
102 pthread_mutex_lock(&item_data->mutex);
103
104 /* Remove the item from the epoll set if we want to stop
105 handling that event: */
106 if (item_data->stop)
107 delete_item(item_data->index);
108 else {
109 /* Clear the data that was written to the other end of
110 our non-blocking socket: */
111 do {
112 if (read(item_data->fd, &socket_data, 1) < 1) {
113 if ((errno == EAGAIN) ||
114 (errno == EWOULDBLOCK))
115 break;
116 else
117 goto error_unlock;
118 }
119 } while (item_data->events & EPOLLET);
120
121 /* The item was one-shot, so re-enable it: */
122 event_data.events = item_data->events;
123 if (epoll_ctl(thread_data->epoll_set,
124 EPOLL_CTL_MOD,
125 item_data->fd,
126 &event_data) < 0)
127 goto error_unlock;
128 }
129
130 pthread_mutex_unlock(&item_data->mutex);
131 }
132
133error_unlock:
134 thread_data->status = item_data->status = errno;
135 pthread_mutex_unlock(&item_data->mutex);
136 return 0;
137}
138
139/*
140 * A pointer to a write_thread_data structure will be passed as the argument to
141 * the write thread:
142 */
143struct write_thread_data {
144 int stop;
145 int status; /* Indicates any error encountered by the write thread. */
146 int n_fds;
147 int *fds;
148};
149
150/*
151 * The function executed by the write thread. It writes a single byte to each
152 * socket in turn until the stop condition for this thread is set. If writing to
153 * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
154 * the moment and just move on to the next socket in the list. We don't care
155 * about the order in which we deliver events to the epoll set. In fact we don't
156 * care about the data we're writing to the pipes at all; we just want to
157 * trigger epoll events:
158 */
159void *write_thread_function(void *function_data)
160{
161 const char data = 'X';
162 int index;
163 struct write_thread_data *thread_data =
164 (struct write_thread_data *)function_data;
165 while (!thread_data->stop)
166 for (index = 0;
167 !thread_data->stop && (index < thread_data->n_fds);
168 ++index)
169 if ((write(thread_data->fds[index], &data, 1) < 1) &&
170 (errno != EAGAIN) &&
171 (errno != EWOULDBLOCK)) {
172 thread_data->status = errno;
173 return;
174 }
175}
176
177/*
178 * Arguments are currently ignored:
179 */
180int main(int argc, char **argv)
181{
182 const int n_read_threads = 100;
183 const int n_epoll_items = 500;
184 int index;
185 int epoll_set = epoll_create1(0);
186 struct write_thread_data write_thread_data = {
187 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
188 };
189 struct read_thread_data *read_thread_data =
190 malloc(n_read_threads * sizeof(struct read_thread_data));
191 pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
192 pthread_t write_thread;
193
194 printf("-----------------\n");
195 printf("Runing test_epoll\n");
196 printf("-----------------\n");
197
198 epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
199
200 if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
201 read_thread_data == 0 || read_threads == 0)
202 goto error;
203
204 if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
205 printf("Error: please run this test on a multi-core system.\n");
206 goto error;
207 }
208
209 /* Create the socket pairs and epoll items: */
210 for (index = 0; index < n_epoll_items; ++index) {
211 int socket_pair[2];
212 struct epoll_event event_data;
213 if (socketpair(AF_UNIX,
214 SOCK_STREAM | SOCK_NONBLOCK,
215 0,
216 socket_pair) < 0)
217 goto error;
218 write_thread_data.fds[index] = socket_pair[0];
219 epoll_items[index].index = index;
220 epoll_items[index].fd = socket_pair[1];
221 if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
222 goto error;
223 /* We always use EPOLLONESHOT because this test is currently
224 structured to demonstrate the need for EPOLL_CTL_DISABLE,
225 which only produces useful information in the EPOLLONESHOT
226 case (without EPOLLONESHOT, calling epoll_ctl with
227 EPOLL_CTL_DISABLE will never return EBUSY). If support for
228 testing events without EPOLLONESHOT is desired, it should
229 probably be implemented in a separate unit test. */
230 epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
231 if (index < n_epoll_items / 2)
232 epoll_items[index].events |= EPOLLET;
233 epoll_items[index].stop = 0;
234 epoll_items[index].status = 0;
235 epoll_items[index].deleted = 0;
236 event_data.events = epoll_items[index].events;
237 event_data.data.ptr = &epoll_items[index];
238 if (epoll_ctl(epoll_set,
239 EPOLL_CTL_ADD,
240 epoll_items[index].fd,
241 &event_data) < 0)
242 goto error;
243 }
244
245 /* Create and start the read threads: */
246 for (index = 0; index < n_read_threads; ++index) {
247 read_thread_data[index].stop = 0;
248 read_thread_data[index].status = 0;
249 read_thread_data[index].epoll_set = epoll_set;
250 if (pthread_create(&read_threads[index],
251 NULL,
252 read_thread_function,
253 &read_thread_data[index]) != 0)
254 goto error;
255 }
256
257 if (pthread_create(&write_thread,
258 NULL,
259 write_thread_function,
260 &write_thread_data) != 0)
261 goto error;
262
263 /* Cancel all event pollers: */
264#ifdef EPOLL_CTL_DISABLE
265 for (index = 0; index < n_epoll_items; ++index) {
266 pthread_mutex_lock(&epoll_items[index].mutex);
267 ++epoll_items[index].stop;
268 if (epoll_ctl(epoll_set,
269 EPOLL_CTL_DISABLE,
270 epoll_items[index].fd,
271 NULL) == 0)
272 delete_item(index);
273 else if (errno != EBUSY) {
274 pthread_mutex_unlock(&epoll_items[index].mutex);
275 goto error;
276 }
277 /* EBUSY means events were being handled; allow the other thread
278 to delete the item. */
279 pthread_mutex_unlock(&epoll_items[index].mutex);
280 }
281#else
282 for (index = 0; index < n_epoll_items; ++index) {
283 pthread_mutex_lock(&epoll_items[index].mutex);
284 ++epoll_items[index].stop;
285 pthread_mutex_unlock(&epoll_items[index].mutex);
286 /* Wait in case a thread running read_thread_function is
287 currently executing code between epoll_wait and
288 pthread_mutex_lock with this item. Note that a longer delay
289 would make double-deletion less likely (at the expense of
290 performance), but there is no guarantee that any delay would
291 ever be sufficient. Note also that we delete all event
292 pollers at once for testing purposes, but in a real-world
293 environment we are likely to want to be able to cancel event
294 pollers at arbitrary times. Therefore we can't improve this
295 situation by just splitting this loop into two loops
296 (i.e. signal 'stop' for all items, sleep, and then delete all
297 items). We also can't fix the problem via EPOLL_CTL_DEL
298 because that command can't prevent the case where some other
299 thread is executing read_thread_function within the region
300 mentioned above: */
301 usleep(1);
302 pthread_mutex_lock(&epoll_items[index].mutex);
303 if (!epoll_items[index].deleted)
304 delete_item(index);
305 pthread_mutex_unlock(&epoll_items[index].mutex);
306 }
307#endif
308
309 /* Shut down the read threads: */
310 for (index = 0; index < n_read_threads; ++index)
311 __sync_fetch_and_add(&read_thread_data[index].stop, 1);
312 for (index = 0; index < n_read_threads; ++index) {
313 if (pthread_join(read_threads[index], NULL) != 0)
314 goto error;
315 if (read_thread_data[index].status)
316 goto error;
317 }
318
319 /* Shut down the write thread: */
320 __sync_fetch_and_add(&write_thread_data.stop, 1);
321 if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
322 goto error;
323
324 /* Check for final error conditions: */
325 for (index = 0; index < n_epoll_items; ++index) {
326 if (epoll_items[index].status != 0)
327 goto error;
328 if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
329 goto error;
330 }
331 for (index = 0; index < n_epoll_items; ++index)
332 if (epoll_items[index].deleted != 1) {
333 printf("Error: item data deleted %1d times.\n",
334 epoll_items[index].deleted);
335 goto error;
336 }
337
338 printf("[PASS]\n");
339 return 0;
340
341 error:
342 printf("[FAIL]\n");
343 return errno;
344}