author	Hans Verkuil <hans.verkuil@cisco.com>	2012-03-23 18:02:27 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-23 19:58:38 -0400
commit	626cf236608505d376e4799adb4f7eb00a8594af (patch)
tree	eb0421fec1a7fed05cd9ea785bd18b5f40c80971
parent	5cde7656d0dd222170eb0250bd1f70c9018fd438 (diff)
poll: add poll_requested_events() and poll_does_not_wait() functions
In some cases the poll() implementation in a driver has to do different things depending on the events the caller wants to poll for. An example is when a driver needs to start a DMA engine if the caller polls for POLLIN, but doesn't want to do that if POLLIN is not requested but instead only POLLOUT or POLLPRI is requested. This is something that can happen in the video4linux subsystem among others.

Unfortunately, the current epoll/poll/select implementation doesn't provide that information reliably. The poll_table_struct does have it: it has a key field with the event mask. But once a poll() call matches one or more bits of that mask, any following poll() calls are passed a NULL poll_table pointer.

Also, the eventpoll implementation always left the key field at ~0 instead of using the requested events mask. This was changed in eventpoll.c so the key field now contains the actual events that should be polled for, as set by the caller.

The solution to the NULL poll_table pointer is to set the qproc field to NULL in the poll_table once poll() matches the events, not the poll_table pointer itself. That way drivers can obtain the mask through the new poll_requested_events() inline. The poll_table pointer itself can still be NULL, since some kernel code invokes poll() internally without one (netfs_state_poll() in ./drivers/staging/pohmelfs/netfs.h). In that case poll_requested_events() returns ~0 (i.e. all events).

Very rarely drivers might want to know whether poll_wait() will actually wait. If another, earlier file descriptor in the set already matched the events the caller wanted to wait for, then the kernel will return from the select() call without waiting. This might be useful information in order to avoid doing expensive work. A new helper function, poll_does_not_wait(), is added that drivers can use to detect this situation. This is now used in sock_poll_wait() in include/net/sock.h, which was the only place in the kernel that needed this information.

Drivers should no longer access any of the poll_table internals, but use the poll_requested_events() and poll_does_not_wait() access functions instead. In order to enforce that, the poll_table fields are now prefixed with an underscore and a comment was added warning against using them directly. This required a change in unix_dgram_poll() in unix/af_unix.c, which used the key field to get the requested events. It's been replaced by a call to poll_requested_events().

Renaming qproc was especially important since the behavior of that field changes with this patch: the function pointer can now be NULL, which wasn't possible in the past. Any driver accessing the qproc or key fields directly will now fail to compile.

Some notes regarding the correctness of this patch: the driver's poll() function is called with a 'struct poll_table_struct *wait' argument. This pointer may or may not be NULL; drivers can never rely on it being one or the other, as that depends on whether or not an earlier file descriptor in the select()'s fdset matched the requested events. There are only three things a driver can do with the wait argument:

1) Obtain the key field:

	events = wait ? wait->key : ~0;

   This will still work, although it should be replaced with the new poll_requested_events() function (which does exactly the same). It will now even work better, since wait is no longer set to NULL unnecessarily.

2) Use the qproc callback. This could be deadly since qproc can now be NULL. Renaming qproc should prevent this from happening. There are no kernel drivers that actually access this callback directly, BTW.

3) Test whether wait == NULL to determine whether poll would return without waiting. This is no longer sufficient, as the correct test is now wait == NULL || wait->_qproc == NULL. However, the worst that can happen here is a slight performance hit in the case where wait != NULL and wait->_qproc == NULL: the driver will assume that poll_wait() will actually add the fd to the set of waiting file descriptors, while poll_wait() will not do that since it tests for wait->_qproc. This will not break anything, though. There is only one place in the whole kernel where this happens (sock_poll_wait() in include/net/sock.h) and that code will be replaced by a call to poll_does_not_wait() in the next patch.

Note that even if wait->_qproc != NULL drivers cannot rely on poll_wait() actually waiting. The next file descriptor from the set might match the event mask and thus any possible waits will never happen.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
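To make the driver-side usage described above concrete, here is a minimal sketch of a poll() handler that starts its DMA engine only when read events are actually requested. This is an illustration only, not part of the patch; the foo_* device, helpers and wait queue are hypothetical names:

	static unsigned int foo_poll(struct file *file, poll_table *wait)
	{
		struct foo_device *dev = file->private_data;	/* hypothetical device */
		unsigned long req_events = poll_requested_events(wait);
		unsigned int mask = 0;

		/*
		 * Start the (expensive) DMA engine only if the caller actually
		 * polls for read events; a POLLOUT/POLLPRI-only poll() must
		 * not trigger it.
		 */
		if ((req_events & (POLLIN | POLLRDNORM)) && !dev->dma_running)
			foo_start_dma(dev);			/* hypothetical helper */

		poll_wait(file, &dev->waitq, wait);

		if (foo_read_ready(dev))			/* hypothetical helper */
			mask |= POLLIN | POLLRDNORM;
		if (foo_write_ready(dev))			/* hypothetical helper */
			mask |= POLLOUT | POLLWRNORM;
		return mask;
	}

Because poll_requested_events() returns ~0 when the poll_table is NULL, such a handler behaves conservatively (DMA started) for in-kernel callers that pass no poll_table.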
-rw-r--r--	fs/eventpoll.c	18
-rw-r--r--	fs/select.c	40
-rw-r--r--	include/linux/poll.h	37
-rw-r--r--	include/net/sock.h	2
-rw-r--r--	net/unix/af_unix.c	2
5 files changed, 66 insertions, 33 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4d9d3a45e356..ca300071e79c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -699,9 +699,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
+	poll_table pt;
 
+	init_poll_funcptr(&pt, NULL);
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
 		else {
@@ -1097,6 +1100,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+	epq.pt._key = event->events;
 
 	/*
 	 * Attach the item to the poll hooks and get current event bits.
@@ -1191,6 +1195,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 {
 	int pwake = 0;
 	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * Set the new event interest mask before calling f_op->poll();
@@ -1198,13 +1205,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * f_op->poll() call and the new event set registering.
 	 */
 	epi->event.events = event->events;
+	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 
 	/*
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1239,6 +1247,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * We can loop without lock because we are passed a task private list.
@@ -1251,7 +1262,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 			epi->event.events;
 
 		/*
diff --git a/fs/select.c b/fs/select.c
index e782258d0de3..ecfd0b125ba2 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	entry->key = p->key;
+	entry->key = p->_key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
@@ -386,13 +386,11 @@ get_max:
 static inline void wait_key_set(poll_table *wait, unsigned long in,
 				unsigned long out, unsigned long bit)
 {
-	if (wait) {
-		wait->key = POLLEX_SET;
-		if (in & bit)
-			wait->key |= POLLIN_SET;
-		if (out & bit)
-			wait->key |= POLLOUT_SET;
-	}
+	wait->_key = POLLEX_SET;
+	if (in & bit)
+		wait->_key |= POLLIN_SET;
+	if (out & bit)
+		wait->_key |= POLLOUT_SET;
 }
 
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
@@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_initwait(&table);
 	wait = &table.pt;
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
-		wait = NULL;
+		wait->_qproc = NULL;
 		timed_out = 1;
 	}
 
@@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				if ((mask & POLLIN_SET) && (in & bit)) {
 					res_in |= bit;
 					retval++;
-					wait = NULL;
+					wait->_qproc = NULL;
 				}
 				if ((mask & POLLOUT_SET) && (out & bit)) {
 					res_out |= bit;
 					retval++;
-					wait = NULL;
+					wait->_qproc = NULL;
 				}
 				if ((mask & POLLEX_SET) && (ex & bit)) {
 					res_ex |= bit;
 					retval++;
-					wait = NULL;
+					wait->_qproc = NULL;
 				}
 			}
 		}
@@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				*rexp = res_ex;
 			cond_resched();
 		}
-		wait = NULL;
+		wait->_qproc = NULL;
 		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
@@ -720,7 +718,7 @@ struct poll_list {
  * interested in events matching the pollfd->events mask, and the result
  * matching that mask is both recorded in pollfd->revents and returned. The
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
- * if non-NULL.
+ * if pwait->_qproc is non-NULL.
  */
 static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
@@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 	if (file != NULL) {
 		mask = DEFAULT_POLLMASK;
 		if (file->f_op && file->f_op->poll) {
-			if (pwait)
-				pwait->key = pollfd->events |
-						POLLERR | POLLHUP;
+			pwait->_key = pollfd->events|POLLERR|POLLHUP;
 			mask = file->f_op->poll(file, pwait);
 		}
 		/* Mask out unneeded events. */
@@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
-		pt = NULL;
+		pt->_qproc = NULL;
 		timed_out = 1;
 	}
 
@@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 			for (; pfd != pfd_end; pfd++) {
 				/*
 				 * Fish for events. If we found one, record it
-				 * and kill the poll_table, so we don't
+				 * and kill poll_table->_qproc, so we don't
 				 * needlessly register any other waiters after
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
 				if (do_pollfd(pfd, pt)) {
 					count++;
-					pt = NULL;
+					pt->_qproc = NULL;
 				}
 			}
 		}
 		/*
 		 * All waiters have already been registered, so don't provide
-		 * a poll_table to them on the next loop iteration.
+		 * a poll_table->_qproc to them on the next loop iteration.
 		 */
-		pt = NULL;
+		pt->_qproc = NULL;
 		if (!count) {
 			count = wait->error;
 			if (signal_pending(current))
diff --git a/include/linux/poll.h b/include/linux/poll.h
index cf40010ce0cd..48fe8bc398d1 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -32,21 +32,46 @@ struct poll_table_struct;
  */
 typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
 
+/*
+ * Do not touch the structure directly, use the access functions
+ * poll_does_not_wait() and poll_requested_events() instead.
+ */
 typedef struct poll_table_struct {
-	poll_queue_proc qproc;
-	unsigned long key;
+	poll_queue_proc _qproc;
+	unsigned long _key;
 } poll_table;
 
 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
 {
-	if (p && wait_address)
-		p->qproc(filp, wait_address, p);
+	if (p && p->_qproc && wait_address)
+		p->_qproc(filp, wait_address, p);
+}
+
+/*
+ * Return true if it is guaranteed that poll will not wait. This is the case
+ * if the poll() of another file descriptor in the set got an event, so there
+ * is no need for waiting.
+ */
+static inline bool poll_does_not_wait(const poll_table *p)
+{
+	return p == NULL || p->_qproc == NULL;
+}
+
+/*
+ * Return the set of events that the application wants to poll for.
+ * This is useful for drivers that need to know whether a DMA transfer has
+ * to be started implicitly on poll(). You typically only want to do that
+ * if the application is actually polling for POLLIN and/or POLLOUT.
+ */
+static inline unsigned long poll_requested_events(const poll_table *p)
+{
+	return p ? p->_key : ~0UL;
 }
 
 static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 {
-	pt->qproc = qproc;
-	pt->key   = ~0UL; /* all events enabled */
+	pt->_qproc = qproc;
+	pt->_key   = ~0UL; /* all events enabled */
 }
 
 struct poll_table_entry {
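For drivers that used to test wait == NULL to skip expensive readiness work (point 3 of the commit message above), the equivalent check now goes through poll_does_not_wait(). A hedged sketch, again with hypothetical foo_* names:

	static unsigned int foo_poll(struct file *file, poll_table *wait)
	{
		struct foo_sock *fs = file->private_data;	/* hypothetical */
		unsigned int mask = foo_cheap_events(fs);	/* hypothetical */

		/*
		 * If an earlier fd in the set already had an event, poll_wait()
		 * is a no-op and the caller will not sleep on this fd, so the
		 * expensive writability test can be skipped; it will be redone
		 * on the next poll() call anyway.
		 */
		if (poll_does_not_wait(wait))
			return mask;

		poll_wait(file, &fs->waitq, wait);
		if (foo_expensive_writable(fs))			/* hypothetical */
			mask |= POLLOUT | POLLWRNORM;
		return mask;
	}

Note that a true result from poll_does_not_wait() only guarantees that waiting cannot happen; a false result does not guarantee that poll_wait() will actually block, since a later fd in the set may still match the requested events.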
diff --git a/include/net/sock.h b/include/net/sock.h
index 04bc0b30e9e9..a6ba1f8871fd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1854,7 +1854,7 @@ static inline bool wq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (p && wait_address) {
+	if (!poll_does_not_wait(p) && wait_address) {
 		poll_wait(filp, wait_address, p);
 		/*
 		 * We need to be sure we are in sync with the
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index eb4277c33188..d510353ef431 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2206,7 +2206,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);