diff options
author | Paton J. Lewis <palewis@adobe.com> | 2012-10-04 20:13:39 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-05 14:05:00 -0400 |
commit | 03a7beb55b9fad363f0dd33e72ccf2d3e1c2a406 (patch) | |
tree | e89cb2a2db5645600f28699ebf3b4a98195a3fb3 | |
parent | a0a0a7a94c765f7219b57fa3b79389901bb0bc99 (diff) |
epoll: support for disabling items, and a self-test app
Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll
item. If epoll_ctl doesn't return -EBUSY in this case, it is then safe to
delete the epoll item in a multi-threaded environment. Also added a new
test_epoll self- test app to both demonstrate the need for this feature
and test it.
Signed-off-by: Paton J. Lewis <palewis@adobe.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Paul Holland <pholland@adobe.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | fs/eventpoll.c | 38 | ||||
-rw-r--r-- | include/linux/eventpoll.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/epoll/Makefile | 11 | ||||
-rw-r--r-- | tools/testing/selftests/epoll/test_epoll.c | 344 |
5 files changed, 392 insertions, 4 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cd96649bfe62..da72250ddc1c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) | |||
346 | /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ | 346 | /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ |
347 | static inline int ep_op_has_event(int op) | 347 | static inline int ep_op_has_event(int op) |
348 | { | 348 | { |
349 | return op != EPOLL_CTL_DEL; | 349 | return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; |
350 | } | 350 | } |
351 | 351 | ||
352 | /* Initialize the poll safe wake up structure */ | 352 | /* Initialize the poll safe wake up structure */ |
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) | |||
676 | return 0; | 676 | return 0; |
677 | } | 677 | } |
678 | 678 | ||
679 | /* | ||
680 | * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item | ||
681 | * had no event flags set, indicating that another thread may be currently | ||
682 | * handling that item's events (in the case that EPOLLONESHOT was being | ||
683 | * used). Otherwise a zero result indicates that the item has been disabled | ||
684 | * from receiving events. A disabled item may be re-enabled via | ||
685 | * EPOLL_CTL_MOD. Must be called with "mtx" held. | ||
686 | */ | ||
687 | static int ep_disable(struct eventpoll *ep, struct epitem *epi) | ||
688 | { | ||
689 | int result = 0; | ||
690 | unsigned long flags; | ||
691 | |||
692 | spin_lock_irqsave(&ep->lock, flags); | ||
693 | if (epi->event.events & ~EP_PRIVATE_BITS) { | ||
694 | if (ep_is_linked(&epi->rdllink)) | ||
695 | list_del_init(&epi->rdllink); | ||
696 | /* Ensure ep_poll_callback will not add epi back onto ready | ||
697 | list: */ | ||
698 | epi->event.events &= EP_PRIVATE_BITS; | ||
699 | } | ||
700 | else | ||
701 | result = -EBUSY; | ||
702 | spin_unlock_irqrestore(&ep->lock, flags); | ||
703 | |||
704 | return result; | ||
705 | } | ||
706 | |||
679 | static void ep_free(struct eventpoll *ep) | 707 | static void ep_free(struct eventpoll *ep) |
680 | { | 708 | { |
681 | struct rb_node *rbp; | 709 | struct rb_node *rbp; |
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | |||
1020 | rb_insert_color(&epi->rbn, &ep->rbr); | 1048 | rb_insert_color(&epi->rbn, &ep->rbr); |
1021 | } | 1049 | } |
1022 | 1050 | ||
1023 | |||
1024 | |||
1025 | #define PATH_ARR_SIZE 5 | 1051 | #define PATH_ARR_SIZE 5 |
1026 | /* | 1052 | /* |
1027 | * These are the number paths of length 1 to 5, that we are allowing to emanate | 1053 | * These are the number paths of length 1 to 5, that we are allowing to emanate |
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1787 | } else | 1813 | } else |
1788 | error = -ENOENT; | 1814 | error = -ENOENT; |
1789 | break; | 1815 | break; |
1816 | case EPOLL_CTL_DISABLE: | ||
1817 | if (epi) | ||
1818 | error = ep_disable(ep, epi); | ||
1819 | else | ||
1820 | error = -ENOENT; | ||
1821 | break; | ||
1790 | } | 1822 | } |
1791 | mutex_unlock(&ep->mtx); | 1823 | mutex_unlock(&ep->mtx); |
1792 | 1824 | ||
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f4bb378ccf6a..41085d0f3955 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h | |||
@@ -25,6 +25,7 @@ | |||
25 | #define EPOLL_CTL_ADD 1 | 25 | #define EPOLL_CTL_ADD 1 |
26 | #define EPOLL_CTL_DEL 2 | 26 | #define EPOLL_CTL_DEL 2 |
27 | #define EPOLL_CTL_MOD 3 | 27 | #define EPOLL_CTL_MOD 3 |
28 | #define EPOLL_CTL_DISABLE 4 | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Request the handling of system wakeup events so as to prevent system suspends | 31 | * Request the handling of system wakeup events so as to prevent system suspends |
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 85baf11e2acd..43480149119e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug | 1 | TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll |
2 | 2 | ||
3 | all: | 3 | all: |
4 | for TARGET in $(TARGETS); do \ | 4 | for TARGET in $(TARGETS); do \ |
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile new file mode 100644 index 000000000000..19806ed62f50 --- /dev/null +++ b/tools/testing/selftests/epoll/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | # Makefile for epoll selftests | ||
2 | |||
3 | all: test_epoll | ||
4 | %: %.c | ||
5 | gcc -pthread -g -o $@ $^ | ||
6 | |||
7 | run_tests: all | ||
8 | ./test_epoll | ||
9 | |||
10 | clean: | ||
11 | $(RM) test_epoll | ||
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c new file mode 100644 index 000000000000..e0fcff1e8331 --- /dev/null +++ b/tools/testing/selftests/epoll/test_epoll.c | |||
@@ -0,0 +1,344 @@ | |||
1 | /* | ||
2 | * tools/testing/selftests/epoll/test_epoll.c | ||
3 | * | ||
4 | * Copyright 2012 Adobe Systems Incorporated | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * Paton J. Lewis <palewis@adobe.com> | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <errno.h> | ||
16 | #include <fcntl.h> | ||
17 | #include <pthread.h> | ||
18 | #include <stdio.h> | ||
19 | #include <stdlib.h> | ||
20 | #include <unistd.h> | ||
21 | #include <sys/epoll.h> | ||
22 | #include <sys/socket.h> | ||
23 | |||
24 | /* | ||
25 | * A pointer to an epoll_item_private structure will be stored in the epoll | ||
26 | * item's event structure so that we can get access to the epoll_item_private | ||
27 | * data after calling epoll_wait: | ||
28 | */ | ||
29 | struct epoll_item_private { | ||
30 | int index; /* Position of this struct within the epoll_items array. */ | ||
31 | int fd; | ||
32 | uint32_t events; | ||
33 | pthread_mutex_t mutex; /* Guards the following variables... */ | ||
34 | int stop; | ||
35 | int status; /* Stores any error encountered while handling item. */ | ||
36 | /* The following variable allows us to test whether we have encountered | ||
37 | a problem while attempting to cancel and delete the associated | ||
38 | event. When the test program exits, 'deleted' should be exactly | ||
39 | one. If it is greater than one, then the failed test reflects a real | ||
40 | world situation where we would have tried to access the epoll item's | ||
41 | private data after deleting it: */ | ||
42 | int deleted; | ||
43 | }; | ||
44 | |||
45 | struct epoll_item_private *epoll_items; | ||
46 | |||
47 | /* | ||
48 | * Delete the specified item from the epoll set. In a real-world secneario this | ||
49 | * is where we would free the associated data structure, but in this testing | ||
50 | * environment we retain the structure so that we can test for double-deletion: | ||
51 | */ | ||
52 | void delete_item(int index) | ||
53 | { | ||
54 | __sync_fetch_and_add(&epoll_items[index].deleted, 1); | ||
55 | } | ||
56 | |||
57 | /* | ||
58 | * A pointer to a read_thread_data structure will be passed as the argument to | ||
59 | * each read thread: | ||
60 | */ | ||
61 | struct read_thread_data { | ||
62 | int stop; | ||
63 | int status; /* Indicates any error encountered by the read thread. */ | ||
64 | int epoll_set; | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * The function executed by the read threads: | ||
69 | */ | ||
70 | void *read_thread_function(void *function_data) | ||
71 | { | ||
72 | struct read_thread_data *thread_data = | ||
73 | (struct read_thread_data *)function_data; | ||
74 | struct epoll_event event_data; | ||
75 | struct epoll_item_private *item_data; | ||
76 | char socket_data; | ||
77 | |||
78 | /* Handle events until we encounter an error or this thread's 'stop' | ||
79 | condition is set: */ | ||
80 | while (1) { | ||
81 | int result = epoll_wait(thread_data->epoll_set, | ||
82 | &event_data, | ||
83 | 1, /* Number of desired events */ | ||
84 | 1000); /* Timeout in ms */ | ||
85 | if (result < 0) { | ||
86 | /* Breakpoints signal all threads. Ignore that while | ||
87 | debugging: */ | ||
88 | if (errno == EINTR) | ||
89 | continue; | ||
90 | thread_data->status = errno; | ||
91 | return 0; | ||
92 | } else if (thread_data->stop) | ||
93 | return 0; | ||
94 | else if (result == 0) /* Timeout */ | ||
95 | continue; | ||
96 | |||
97 | /* We need the mutex here because checking for the stop | ||
98 | condition and re-enabling the epoll item need to be done | ||
99 | together as one atomic operation when EPOLL_CTL_DISABLE is | ||
100 | available: */ | ||
101 | item_data = (struct epoll_item_private *)event_data.data.ptr; | ||
102 | pthread_mutex_lock(&item_data->mutex); | ||
103 | |||
104 | /* Remove the item from the epoll set if we want to stop | ||
105 | handling that event: */ | ||
106 | if (item_data->stop) | ||
107 | delete_item(item_data->index); | ||
108 | else { | ||
109 | /* Clear the data that was written to the other end of | ||
110 | our non-blocking socket: */ | ||
111 | do { | ||
112 | if (read(item_data->fd, &socket_data, 1) < 1) { | ||
113 | if ((errno == EAGAIN) || | ||
114 | (errno == EWOULDBLOCK)) | ||
115 | break; | ||
116 | else | ||
117 | goto error_unlock; | ||
118 | } | ||
119 | } while (item_data->events & EPOLLET); | ||
120 | |||
121 | /* The item was one-shot, so re-enable it: */ | ||
122 | event_data.events = item_data->events; | ||
123 | if (epoll_ctl(thread_data->epoll_set, | ||
124 | EPOLL_CTL_MOD, | ||
125 | item_data->fd, | ||
126 | &event_data) < 0) | ||
127 | goto error_unlock; | ||
128 | } | ||
129 | |||
130 | pthread_mutex_unlock(&item_data->mutex); | ||
131 | } | ||
132 | |||
133 | error_unlock: | ||
134 | thread_data->status = item_data->status = errno; | ||
135 | pthread_mutex_unlock(&item_data->mutex); | ||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * A pointer to a write_thread_data structure will be passed as the argument to | ||
141 | * the write thread: | ||
142 | */ | ||
143 | struct write_thread_data { | ||
144 | int stop; | ||
145 | int status; /* Indicates any error encountered by the write thread. */ | ||
146 | int n_fds; | ||
147 | int *fds; | ||
148 | }; | ||
149 | |||
150 | /* | ||
151 | * The function executed by the write thread. It writes a single byte to each | ||
152 | * socket in turn until the stop condition for this thread is set. If writing to | ||
153 | * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for | ||
154 | * the moment and just move on to the next socket in the list. We don't care | ||
155 | * about the order in which we deliver events to the epoll set. In fact we don't | ||
156 | * care about the data we're writing to the pipes at all; we just want to | ||
157 | * trigger epoll events: | ||
158 | */ | ||
159 | void *write_thread_function(void *function_data) | ||
160 | { | ||
161 | const char data = 'X'; | ||
162 | int index; | ||
163 | struct write_thread_data *thread_data = | ||
164 | (struct write_thread_data *)function_data; | ||
165 | while (!write_thread_data->stop) | ||
166 | for (index = 0; | ||
167 | !thread_data->stop && (index < thread_data->n_fds); | ||
168 | ++index) | ||
169 | if ((write(thread_data->fds[index], &data, 1) < 1) && | ||
170 | (errno != EAGAIN) && | ||
171 | (errno != EWOULDBLOCK)) { | ||
172 | write_thread_data->status = errno; | ||
173 | return; | ||
174 | } | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Arguments are currently ignored: | ||
179 | */ | ||
180 | int main(int argc, char **argv) | ||
181 | { | ||
182 | const int n_read_threads = 100; | ||
183 | const int n_epoll_items = 500; | ||
184 | int index; | ||
185 | int epoll_set = epoll_create1(0); | ||
186 | struct write_thread_data write_thread_data = { | ||
187 | 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int)) | ||
188 | }; | ||
189 | struct read_thread_data *read_thread_data = | ||
190 | malloc(n_read_threads * sizeof(struct read_thread_data)); | ||
191 | pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t)); | ||
192 | pthread_t write_thread; | ||
193 | |||
194 | printf("-----------------\n"); | ||
195 | printf("Runing test_epoll\n"); | ||
196 | printf("-----------------\n"); | ||
197 | |||
198 | epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private)); | ||
199 | |||
200 | if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 || | ||
201 | read_thread_data == 0 || read_threads == 0) | ||
202 | goto error; | ||
203 | |||
204 | if (sysconf(_SC_NPROCESSORS_ONLN) < 2) { | ||
205 | printf("Error: please run this test on a multi-core system.\n"); | ||
206 | goto error; | ||
207 | } | ||
208 | |||
209 | /* Create the socket pairs and epoll items: */ | ||
210 | for (index = 0; index < n_epoll_items; ++index) { | ||
211 | int socket_pair[2]; | ||
212 | struct epoll_event event_data; | ||
213 | if (socketpair(AF_UNIX, | ||
214 | SOCK_STREAM | SOCK_NONBLOCK, | ||
215 | 0, | ||
216 | socket_pair) < 0) | ||
217 | goto error; | ||
218 | write_thread_data.fds[index] = socket_pair[0]; | ||
219 | epoll_items[index].index = index; | ||
220 | epoll_items[index].fd = socket_pair[1]; | ||
221 | if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0) | ||
222 | goto error; | ||
223 | /* We always use EPOLLONESHOT because this test is currently | ||
224 | structured to demonstrate the need for EPOLL_CTL_DISABLE, | ||
225 | which only produces useful information in the EPOLLONESHOT | ||
226 | case (without EPOLLONESHOT, calling epoll_ctl with | ||
227 | EPOLL_CTL_DISABLE will never return EBUSY). If support for | ||
228 | testing events without EPOLLONESHOT is desired, it should | ||
229 | probably be implemented in a separate unit test. */ | ||
230 | epoll_items[index].events = EPOLLIN | EPOLLONESHOT; | ||
231 | if (index < n_epoll_items / 2) | ||
232 | epoll_items[index].events |= EPOLLET; | ||
233 | epoll_items[index].stop = 0; | ||
234 | epoll_items[index].status = 0; | ||
235 | epoll_items[index].deleted = 0; | ||
236 | event_data.events = epoll_items[index].events; | ||
237 | event_data.data.ptr = &epoll_items[index]; | ||
238 | if (epoll_ctl(epoll_set, | ||
239 | EPOLL_CTL_ADD, | ||
240 | epoll_items[index].fd, | ||
241 | &event_data) < 0) | ||
242 | goto error; | ||
243 | } | ||
244 | |||
245 | /* Create and start the read threads: */ | ||
246 | for (index = 0; index < n_read_threads; ++index) { | ||
247 | read_thread_data[index].stop = 0; | ||
248 | read_thread_data[index].status = 0; | ||
249 | read_thread_data[index].epoll_set = epoll_set; | ||
250 | if (pthread_create(&read_threads[index], | ||
251 | NULL, | ||
252 | read_thread_function, | ||
253 | &read_thread_data[index]) != 0) | ||
254 | goto error; | ||
255 | } | ||
256 | |||
257 | if (pthread_create(&write_thread, | ||
258 | NULL, | ||
259 | write_thread_function, | ||
260 | &write_thread_data) != 0) | ||
261 | goto error; | ||
262 | |||
263 | /* Cancel all event pollers: */ | ||
264 | #ifdef EPOLL_CTL_DISABLE | ||
265 | for (index = 0; index < n_epoll_items; ++index) { | ||
266 | pthread_mutex_lock(&epoll_items[index].mutex); | ||
267 | ++epoll_items[index].stop; | ||
268 | if (epoll_ctl(epoll_set, | ||
269 | EPOLL_CTL_DISABLE, | ||
270 | epoll_items[index].fd, | ||
271 | NULL) == 0) | ||
272 | delete_item(index); | ||
273 | else if (errno != EBUSY) { | ||
274 | pthread_mutex_unlock(&epoll_items[index].mutex); | ||
275 | goto error; | ||
276 | } | ||
277 | /* EBUSY means events were being handled; allow the other thread | ||
278 | to delete the item. */ | ||
279 | pthread_mutex_unlock(&epoll_items[index].mutex); | ||
280 | } | ||
281 | #else | ||
282 | for (index = 0; index < n_epoll_items; ++index) { | ||
283 | pthread_mutex_lock(&epoll_items[index].mutex); | ||
284 | ++epoll_items[index].stop; | ||
285 | pthread_mutex_unlock(&epoll_items[index].mutex); | ||
286 | /* Wait in case a thread running read_thread_function is | ||
287 | currently executing code between epoll_wait and | ||
288 | pthread_mutex_lock with this item. Note that a longer delay | ||
289 | would make double-deletion less likely (at the expense of | ||
290 | performance), but there is no guarantee that any delay would | ||
291 | ever be sufficient. Note also that we delete all event | ||
292 | pollers at once for testing purposes, but in a real-world | ||
293 | environment we are likely to want to be able to cancel event | ||
294 | pollers at arbitrary times. Therefore we can't improve this | ||
295 | situation by just splitting this loop into two loops | ||
296 | (i.e. signal 'stop' for all items, sleep, and then delete all | ||
297 | items). We also can't fix the problem via EPOLL_CTL_DEL | ||
298 | because that command can't prevent the case where some other | ||
299 | thread is executing read_thread_function within the region | ||
300 | mentioned above: */ | ||
301 | usleep(1); | ||
302 | pthread_mutex_lock(&epoll_items[index].mutex); | ||
303 | if (!epoll_items[index].deleted) | ||
304 | delete_item(index); | ||
305 | pthread_mutex_unlock(&epoll_items[index].mutex); | ||
306 | } | ||
307 | #endif | ||
308 | |||
309 | /* Shut down the read threads: */ | ||
310 | for (index = 0; index < n_read_threads; ++index) | ||
311 | __sync_fetch_and_add(&read_thread_data[index].stop, 1); | ||
312 | for (index = 0; index < n_read_threads; ++index) { | ||
313 | if (pthread_join(read_threads[index], NULL) != 0) | ||
314 | goto error; | ||
315 | if (read_thread_data[index].status) | ||
316 | goto error; | ||
317 | } | ||
318 | |||
319 | /* Shut down the write thread: */ | ||
320 | __sync_fetch_and_add(&write_thread_data.stop, 1); | ||
321 | if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status) | ||
322 | goto error; | ||
323 | |||
324 | /* Check for final error conditions: */ | ||
325 | for (index = 0; index < n_epoll_items; ++index) { | ||
326 | if (epoll_items[index].status != 0) | ||
327 | goto error; | ||
328 | if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0) | ||
329 | goto error; | ||
330 | } | ||
331 | for (index = 0; index < n_epoll_items; ++index) | ||
332 | if (epoll_items[index].deleted != 1) { | ||
333 | printf("Error: item data deleted %1d times.\n", | ||
334 | epoll_items[index].deleted); | ||
335 | goto error; | ||
336 | } | ||
337 | |||
338 | printf("[PASS]\n"); | ||
339 | return 0; | ||
340 | |||
341 | error: | ||
342 | printf("[FAIL]\n"); | ||
343 | return errno; | ||
344 | } | ||