aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaton J. Lewis <palewis@adobe.com>2012-10-04 20:13:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-05 14:05:00 -0400
commit03a7beb55b9fad363f0dd33e72ccf2d3e1c2a406 (patch)
treee89cb2a2db5645600f28699ebf3b4a98195a3fb3
parenta0a0a7a94c765f7219b57fa3b79389901bb0bc99 (diff)
epoll: support for disabling items, and a self-test app
Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item. If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the epoll item in a multi-threaded environment. Also added a new test_epoll self- test app to both demonstrate the need for this feature and test it. Signed-off-by: Paton J. Lewis <palewis@adobe.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Jason Baron <jbaron@redhat.com> Cc: Paul Holland <pholland@adobe.com> Cc: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/eventpoll.c38
-rw-r--r--include/linux/eventpoll.h1
-rw-r--r--tools/testing/selftests/Makefile2
-rw-r--r--tools/testing/selftests/epoll/Makefile11
-rw-r--r--tools/testing/selftests/epoll/test_epoll.c344
5 files changed, 392 insertions, 4 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649bfe62..da72250ddc1c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ 346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
347static inline int ep_op_has_event(int op) 347static inline int ep_op_has_event(int op)
348{ 348{
349 return op != EPOLL_CTL_DEL; 349 return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
350} 350}
351 351
352/* Initialize the poll safe wake up structure */ 352/* Initialize the poll safe wake up structure */
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
676 return 0; 676 return 0;
677} 677}
678 678
679/*
680 * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
681 * had no event flags set, indicating that another thread may be currently
682 * handling that item's events (in the case that EPOLLONESHOT was being
683 * used). Otherwise a zero result indicates that the item has been disabled
684 * from receiving events. A disabled item may be re-enabled via
685 * EPOLL_CTL_MOD. Must be called with "mtx" held.
686 */
687static int ep_disable(struct eventpoll *ep, struct epitem *epi)
688{
689 int result = 0;
690 unsigned long flags;
691
692 spin_lock_irqsave(&ep->lock, flags);
693 if (epi->event.events & ~EP_PRIVATE_BITS) {
694 if (ep_is_linked(&epi->rdllink))
695 list_del_init(&epi->rdllink);
696 /* Ensure ep_poll_callback will not add epi back onto ready
697 list: */
698 epi->event.events &= EP_PRIVATE_BITS;
699 }
700 else
701 result = -EBUSY;
702 spin_unlock_irqrestore(&ep->lock, flags);
703
704 return result;
705}
706
679static void ep_free(struct eventpoll *ep) 707static void ep_free(struct eventpoll *ep)
680{ 708{
681 struct rb_node *rbp; 709 struct rb_node *rbp;
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1020 rb_insert_color(&epi->rbn, &ep->rbr); 1048 rb_insert_color(&epi->rbn, &ep->rbr);
1021} 1049}
1022 1050
1023
1024
1025#define PATH_ARR_SIZE 5 1051#define PATH_ARR_SIZE 5
1026/* 1052/*
1027 * These are the number paths of length 1 to 5, that we are allowing to emanate 1053 * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1787 } else 1813 } else
1788 error = -ENOENT; 1814 error = -ENOENT;
1789 break; 1815 break;
1816 case EPOLL_CTL_DISABLE:
1817 if (epi)
1818 error = ep_disable(ep, epi);
1819 else
1820 error = -ENOENT;
1821 break;
1790 } 1822 }
1791 mutex_unlock(&ep->mtx); 1823 mutex_unlock(&ep->mtx);
1792 1824
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f4bb378ccf6a..41085d0f3955 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -25,6 +25,7 @@
25#define EPOLL_CTL_ADD 1 25#define EPOLL_CTL_ADD 1
26#define EPOLL_CTL_DEL 2 26#define EPOLL_CTL_DEL 2
27#define EPOLL_CTL_MOD 3 27#define EPOLL_CTL_MOD 3
28#define EPOLL_CTL_DISABLE 4
28 29
29/* 30/*
30 * Request the handling of system wakeup events so as to prevent system suspends 31 * Request the handling of system wakeup events so as to prevent system suspends
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 85baf11e2acd..43480149119e 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
1TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug 1TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll
2 2
3all: 3all:
4 for TARGET in $(TARGETS); do \ 4 for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644
index 000000000000..19806ed62f50
--- /dev/null
+++ b/tools/testing/selftests/epoll/Makefile
@@ -0,0 +1,11 @@
1# Makefile for epoll selftests
2
3all: test_epoll
4%: %.c
5 gcc -pthread -g -o $@ $^
6
7run_tests: all
8 ./test_epoll
9
10clean:
11 $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644
index 000000000000..e0fcff1e8331
--- /dev/null
+++ b/tools/testing/selftests/epoll/test_epoll.c
@@ -0,0 +1,344 @@
1/*
2 * tools/testing/selftests/epoll/test_epoll.c
3 *
4 * Copyright 2012 Adobe Systems Incorporated
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * Paton J. Lewis <palewis@adobe.com>
12 *
13 */
14
15#include <errno.h>
16#include <fcntl.h>
17#include <pthread.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <unistd.h>
21#include <sys/epoll.h>
22#include <sys/socket.h>
23
24/*
25 * A pointer to an epoll_item_private structure will be stored in the epoll
26 * item's event structure so that we can get access to the epoll_item_private
27 * data after calling epoll_wait:
28 */
29struct epoll_item_private {
30 int index; /* Position of this struct within the epoll_items array. */
31 int fd;
32 uint32_t events;
33 pthread_mutex_t mutex; /* Guards the following variables... */
34 int stop;
35 int status; /* Stores any error encountered while handling item. */
36 /* The following variable allows us to test whether we have encountered
37 a problem while attempting to cancel and delete the associated
38 event. When the test program exits, 'deleted' should be exactly
39 one. If it is greater than one, then the failed test reflects a real
40 world situation where we would have tried to access the epoll item's
41 private data after deleting it: */
42 int deleted;
43};
44
45struct epoll_item_private *epoll_items;
46
47/*
48 * Delete the specified item from the epoll set. In a real-world secneario this
49 * is where we would free the associated data structure, but in this testing
50 * environment we retain the structure so that we can test for double-deletion:
51 */
52void delete_item(int index)
53{
54 __sync_fetch_and_add(&epoll_items[index].deleted, 1);
55}
56
57/*
58 * A pointer to a read_thread_data structure will be passed as the argument to
59 * each read thread:
60 */
61struct read_thread_data {
62 int stop;
63 int status; /* Indicates any error encountered by the read thread. */
64 int epoll_set;
65};
66
67/*
68 * The function executed by the read threads:
69 */
70void *read_thread_function(void *function_data)
71{
72 struct read_thread_data *thread_data =
73 (struct read_thread_data *)function_data;
74 struct epoll_event event_data;
75 struct epoll_item_private *item_data;
76 char socket_data;
77
78 /* Handle events until we encounter an error or this thread's 'stop'
79 condition is set: */
80 while (1) {
81 int result = epoll_wait(thread_data->epoll_set,
82 &event_data,
83 1, /* Number of desired events */
84 1000); /* Timeout in ms */
85 if (result < 0) {
86 /* Breakpoints signal all threads. Ignore that while
87 debugging: */
88 if (errno == EINTR)
89 continue;
90 thread_data->status = errno;
91 return 0;
92 } else if (thread_data->stop)
93 return 0;
94 else if (result == 0) /* Timeout */
95 continue;
96
97 /* We need the mutex here because checking for the stop
98 condition and re-enabling the epoll item need to be done
99 together as one atomic operation when EPOLL_CTL_DISABLE is
100 available: */
101 item_data = (struct epoll_item_private *)event_data.data.ptr;
102 pthread_mutex_lock(&item_data->mutex);
103
104 /* Remove the item from the epoll set if we want to stop
105 handling that event: */
106 if (item_data->stop)
107 delete_item(item_data->index);
108 else {
109 /* Clear the data that was written to the other end of
110 our non-blocking socket: */
111 do {
112 if (read(item_data->fd, &socket_data, 1) < 1) {
113 if ((errno == EAGAIN) ||
114 (errno == EWOULDBLOCK))
115 break;
116 else
117 goto error_unlock;
118 }
119 } while (item_data->events & EPOLLET);
120
121 /* The item was one-shot, so re-enable it: */
122 event_data.events = item_data->events;
123 if (epoll_ctl(thread_data->epoll_set,
124 EPOLL_CTL_MOD,
125 item_data->fd,
126 &event_data) < 0)
127 goto error_unlock;
128 }
129
130 pthread_mutex_unlock(&item_data->mutex);
131 }
132
133error_unlock:
134 thread_data->status = item_data->status = errno;
135 pthread_mutex_unlock(&item_data->mutex);
136 return 0;
137}
138
139/*
140 * A pointer to a write_thread_data structure will be passed as the argument to
141 * the write thread:
142 */
143struct write_thread_data {
144 int stop;
145 int status; /* Indicates any error encountered by the write thread. */
146 int n_fds;
147 int *fds;
148};
149
150/*
151 * The function executed by the write thread. It writes a single byte to each
152 * socket in turn until the stop condition for this thread is set. If writing to
153 * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
154 * the moment and just move on to the next socket in the list. We don't care
155 * about the order in which we deliver events to the epoll set. In fact we don't
156 * care about the data we're writing to the pipes at all; we just want to
157 * trigger epoll events:
158 */
159void *write_thread_function(void *function_data)
160{
161 const char data = 'X';
162 int index;
163 struct write_thread_data *thread_data =
164 (struct write_thread_data *)function_data;
165 while (!write_thread_data->stop)
166 for (index = 0;
167 !thread_data->stop && (index < thread_data->n_fds);
168 ++index)
169 if ((write(thread_data->fds[index], &data, 1) < 1) &&
170 (errno != EAGAIN) &&
171 (errno != EWOULDBLOCK)) {
172 write_thread_data->status = errno;
173 return;
174 }
175}
176
177/*
178 * Arguments are currently ignored:
179 */
180int main(int argc, char **argv)
181{
182 const int n_read_threads = 100;
183 const int n_epoll_items = 500;
184 int index;
185 int epoll_set = epoll_create1(0);
186 struct write_thread_data write_thread_data = {
187 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
188 };
189 struct read_thread_data *read_thread_data =
190 malloc(n_read_threads * sizeof(struct read_thread_data));
191 pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
192 pthread_t write_thread;
193
194 printf("-----------------\n");
195 printf("Runing test_epoll\n");
196 printf("-----------------\n");
197
198 epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
199
200 if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
201 read_thread_data == 0 || read_threads == 0)
202 goto error;
203
204 if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
205 printf("Error: please run this test on a multi-core system.\n");
206 goto error;
207 }
208
209 /* Create the socket pairs and epoll items: */
210 for (index = 0; index < n_epoll_items; ++index) {
211 int socket_pair[2];
212 struct epoll_event event_data;
213 if (socketpair(AF_UNIX,
214 SOCK_STREAM | SOCK_NONBLOCK,
215 0,
216 socket_pair) < 0)
217 goto error;
218 write_thread_data.fds[index] = socket_pair[0];
219 epoll_items[index].index = index;
220 epoll_items[index].fd = socket_pair[1];
221 if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
222 goto error;
223 /* We always use EPOLLONESHOT because this test is currently
224 structured to demonstrate the need for EPOLL_CTL_DISABLE,
225 which only produces useful information in the EPOLLONESHOT
226 case (without EPOLLONESHOT, calling epoll_ctl with
227 EPOLL_CTL_DISABLE will never return EBUSY). If support for
228 testing events without EPOLLONESHOT is desired, it should
229 probably be implemented in a separate unit test. */
230 epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
231 if (index < n_epoll_items / 2)
232 epoll_items[index].events |= EPOLLET;
233 epoll_items[index].stop = 0;
234 epoll_items[index].status = 0;
235 epoll_items[index].deleted = 0;
236 event_data.events = epoll_items[index].events;
237 event_data.data.ptr = &epoll_items[index];
238 if (epoll_ctl(epoll_set,
239 EPOLL_CTL_ADD,
240 epoll_items[index].fd,
241 &event_data) < 0)
242 goto error;
243 }
244
245 /* Create and start the read threads: */
246 for (index = 0; index < n_read_threads; ++index) {
247 read_thread_data[index].stop = 0;
248 read_thread_data[index].status = 0;
249 read_thread_data[index].epoll_set = epoll_set;
250 if (pthread_create(&read_threads[index],
251 NULL,
252 read_thread_function,
253 &read_thread_data[index]) != 0)
254 goto error;
255 }
256
257 if (pthread_create(&write_thread,
258 NULL,
259 write_thread_function,
260 &write_thread_data) != 0)
261 goto error;
262
263 /* Cancel all event pollers: */
264#ifdef EPOLL_CTL_DISABLE
265 for (index = 0; index < n_epoll_items; ++index) {
266 pthread_mutex_lock(&epoll_items[index].mutex);
267 ++epoll_items[index].stop;
268 if (epoll_ctl(epoll_set,
269 EPOLL_CTL_DISABLE,
270 epoll_items[index].fd,
271 NULL) == 0)
272 delete_item(index);
273 else if (errno != EBUSY) {
274 pthread_mutex_unlock(&epoll_items[index].mutex);
275 goto error;
276 }
277 /* EBUSY means events were being handled; allow the other thread
278 to delete the item. */
279 pthread_mutex_unlock(&epoll_items[index].mutex);
280 }
281#else
282 for (index = 0; index < n_epoll_items; ++index) {
283 pthread_mutex_lock(&epoll_items[index].mutex);
284 ++epoll_items[index].stop;
285 pthread_mutex_unlock(&epoll_items[index].mutex);
286 /* Wait in case a thread running read_thread_function is
287 currently executing code between epoll_wait and
288 pthread_mutex_lock with this item. Note that a longer delay
289 would make double-deletion less likely (at the expense of
290 performance), but there is no guarantee that any delay would
291 ever be sufficient. Note also that we delete all event
292 pollers at once for testing purposes, but in a real-world
293 environment we are likely to want to be able to cancel event
294 pollers at arbitrary times. Therefore we can't improve this
295 situation by just splitting this loop into two loops
296 (i.e. signal 'stop' for all items, sleep, and then delete all
297 items). We also can't fix the problem via EPOLL_CTL_DEL
298 because that command can't prevent the case where some other
299 thread is executing read_thread_function within the region
300 mentioned above: */
301 usleep(1);
302 pthread_mutex_lock(&epoll_items[index].mutex);
303 if (!epoll_items[index].deleted)
304 delete_item(index);
305 pthread_mutex_unlock(&epoll_items[index].mutex);
306 }
307#endif
308
309 /* Shut down the read threads: */
310 for (index = 0; index < n_read_threads; ++index)
311 __sync_fetch_and_add(&read_thread_data[index].stop, 1);
312 for (index = 0; index < n_read_threads; ++index) {
313 if (pthread_join(read_threads[index], NULL) != 0)
314 goto error;
315 if (read_thread_data[index].status)
316 goto error;
317 }
318
319 /* Shut down the write thread: */
320 __sync_fetch_and_add(&write_thread_data.stop, 1);
321 if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
322 goto error;
323
324 /* Check for final error conditions: */
325 for (index = 0; index < n_epoll_items; ++index) {
326 if (epoll_items[index].status != 0)
327 goto error;
328 if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
329 goto error;
330 }
331 for (index = 0; index < n_epoll_items; ++index)
332 if (epoll_items[index].deleted != 1) {
333 printf("Error: item data deleted %1d times.\n",
334 epoll_items[index].deleted);
335 goto error;
336 }
337
338 printf("[PASS]\n");
339 return 0;
340
341 error:
342 printf("[FAIL]\n");
343 return errno;
344}