aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tejun Heo <htejun@gmail.com> 2009-01-06 17:40:59 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-01-06 18:59:12 -0500
commit 5f820f648c92a5ecc771a96b3c29aa6e90013bba (patch)
tree 0445b45fa33072d37b32c6ef592a4d0c102e05cc
parent 67ec7d3ab779ad9001ef57a6b4cfdf80ac9f9acc (diff)
poll: allow f_op->poll to sleep

f_op->poll is the only vfs operation which is not allowed to sleep. It's because poll and select implementation used task state to synchronize against wake ups, which doesn't have to be the case anymore as wait/wake interface can now use custom wake up functions. The non-sleep restriction can be a bit tricky because ->poll is not called from an atomic context and the result of accidentally sleeping in ->poll only shows up as temporary busy looping when the timing is right or rather wrong.

This patch converts poll/select to use custom wake up function and use separate triggered variable to synchronize against wake up events. The only added overhead is an extra function call during wake up and negligible.

This patch removes the one non-sleep exception from vfs locking rules and is beneficial to userland filesystem implementations like FUSE, 9p or peculiar fs like spufs as it's very difficult for those to implement non-sleeping poll method.

While at it, make the following cosmetic changes to make poll.h and select.c checkpatch friendly.

* s/type * symbol/type *symbol/ : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c

Oleg: spotted missing barrier in poll_schedule_timeout()
Davide: spotted missing write barrier in pollwake()

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Brad Boyer <flar@allandria.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Roland McGrath <roland@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

-rw-r--r-- Documentation/filesystems/Locking 2
-rw-r--r-- drivers/media/video/v4l1-compat.c 4
-rw-r--r-- fs/select.c 76
-rw-r--r-- include/linux/poll.h 15
4 files changed, 76 insertions, 21 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index ccec55394380..cfbfa15a46ba 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -397,7 +397,7 @@ prototypes:
397}; 397};
398 398
399locking rules: 399locking rules:
400 All except ->poll() may block. 400 All may block.
401 BKL 401 BKL
402llseek: no (see below) 402llseek: no (see below)
403read: no 403read: no
diff --git a/drivers/media/video/v4l1-compat.c b/drivers/media/video/v4l1-compat.c
index d450cab20be4..b617bf05e2d7 100644
--- a/drivers/media/video/v4l1-compat.c
+++ b/drivers/media/video/v4l1-compat.c
@@ -203,7 +203,6 @@ static int poll_one(struct file *file, struct poll_wqueues *pwq)
203 table = &pwq->pt; 203 table = &pwq->pt;
204 for (;;) { 204 for (;;) {
205 int mask; 205 int mask;
206 set_current_state(TASK_INTERRUPTIBLE);
207 mask = file->f_op->poll(file, table); 206 mask = file->f_op->poll(file, table);
208 if (mask & POLLIN) 207 if (mask & POLLIN)
209 break; 208 break;
@@ -212,9 +211,8 @@ static int poll_one(struct file *file, struct poll_wqueues *pwq)
212 retval = -ERESTARTSYS; 211 retval = -ERESTARTSYS;
213 break; 212 break;
214 } 213 }
215 schedule(); 214 poll_schedule(pwq, TASK_INTERRUPTIBLE);
216 } 215 }
217 set_current_state(TASK_RUNNING);
218 poll_freewait(pwq); 216 poll_freewait(pwq);
219 return retval; 217 return retval;
220} 218}
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
109void poll_initwait(struct poll_wqueues *pwq) 109void poll_initwait(struct poll_wqueues *pwq)
110{ 110{
111 init_poll_funcptr(&pwq->pt, __pollwait); 111 init_poll_funcptr(&pwq->pt, __pollwait);
112 pwq->polling_task = current;
112 pwq->error = 0; 113 pwq->error = 0;
113 pwq->table = NULL; 114 pwq->table = NULL;
114 pwq->inline_index = 0; 115 pwq->inline_index = 0;
115} 116}
116
117EXPORT_SYMBOL(poll_initwait); 117EXPORT_SYMBOL(poll_initwait);
118 118
119static void free_poll_entry(struct poll_table_entry *entry) 119static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
142 free_page((unsigned long) old); 142 free_page((unsigned long) old);
143 } 143 }
144} 144}
145
146EXPORT_SYMBOL(poll_freewait); 145EXPORT_SYMBOL(poll_freewait);
147 146
148static struct poll_table_entry *poll_get_entry(poll_table *_p) 147static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
149{ 148{
150 struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
151 struct poll_table_page *table = p->table; 149 struct poll_table_page *table = p->table;
152 150
153 if (p->inline_index < N_INLINE_POLL_ENTRIES) 151 if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
159 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); 157 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
160 if (!new_table) { 158 if (!new_table) {
161 p->error = -ENOMEM; 159 p->error = -ENOMEM;
162 __set_current_state(TASK_RUNNING);
163 return NULL; 160 return NULL;
164 } 161 }
165 new_table->entry = new_table->entries; 162 new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
171 return table->entry++; 168 return table->entry++;
172} 169}
173 170
171static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
172{
173 struct poll_wqueues *pwq = wait->private;
174 DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
175
176 /*
177 * Although this function is called under waitqueue lock, LOCK
178 * doesn't imply write barrier and the users expect write
179 * barrier semantics on wakeup functions. The following
180 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
181 * and is paired with set_mb() in poll_schedule_timeout.
182 */
183 smp_wmb();
184 pwq->triggered = 1;
185
186 /*
187 * Perform the default wake up operation using a dummy
188 * waitqueue.
189 *
190 * TODO: This is hacky but there currently is no interface to
191 * pass in @sync. @sync is scheduled to be removed and once
192 * that happens, wake_up_process() can be used directly.
193 */
194 return default_wake_function(&dummy_wait, mode, sync, key);
195}
196
174/* Add a new entry */ 197/* Add a new entry */
175static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, 198static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
176 poll_table *p) 199 poll_table *p)
177{ 200{
178 struct poll_table_entry *entry = poll_get_entry(p); 201 struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
202 struct poll_table_entry *entry = poll_get_entry(pwq);
179 if (!entry) 203 if (!entry)
180 return; 204 return;
181 get_file(filp); 205 get_file(filp);
182 entry->filp = filp; 206 entry->filp = filp;
183 entry->wait_address = wait_address; 207 entry->wait_address = wait_address;
184 init_waitqueue_entry(&entry->wait, current); 208 init_waitqueue_func_entry(&entry->wait, pollwake);
209 entry->wait.private = pwq;
185 add_wait_queue(wait_address, &entry->wait); 210 add_wait_queue(wait_address, &entry->wait);
186} 211}
187 212
213int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
214 ktime_t *expires, unsigned long slack)
215{
216 int rc = -EINTR;
217
218 set_current_state(state);
219 if (!pwq->triggered)
220 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
221 __set_current_state(TASK_RUNNING);
222
223 /*
224 * Prepare for the next iteration.
225 *
226 * The following set_mb() serves two purposes. First, it's
227 * the counterpart rmb of the wmb in pollwake() such that data
228 * written before wake up is always visible after wake up.
229 * Second, the full barrier guarantees that triggered clearing
230 * doesn't pass event check of the next iteration. Note that
231 * this problem doesn't exist for the first iteration as
232 * add_wait_queue() has full barrier semantics.
233 */
234 set_mb(pwq->triggered, 0);
235
236 return rc;
237}
238EXPORT_SYMBOL(poll_schedule_timeout);
239
188/** 240/**
189 * poll_select_set_timeout - helper function to setup the timeout value 241 * poll_select_set_timeout - helper function to setup the timeout value
190 * @to: pointer to timespec variable for the final timeout 242 * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
340 for (;;) { 392 for (;;) {
341 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 393 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
342 394
343 set_current_state(TASK_INTERRUPTIBLE);
344
345 inp = fds->in; outp = fds->out; exp = fds->ex; 395 inp = fds->in; outp = fds->out; exp = fds->ex;
346 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 396 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
347 397
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
411 to = &expire; 461 to = &expire;
412 } 462 }
413 463
414 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 464 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
465 to, slack))
415 timed_out = 1; 466 timed_out = 1;
416 } 467 }
417 __set_current_state(TASK_RUNNING);
418 468
419 poll_freewait(&table); 469 poll_freewait(&table);
420 470
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
666 for (;;) { 716 for (;;) {
667 struct poll_list *walk; 717 struct poll_list *walk;
668 718
669 set_current_state(TASK_INTERRUPTIBLE);
670 for (walk = list; walk != NULL; walk = walk->next) { 719 for (walk = list; walk != NULL; walk = walk->next) {
671 struct pollfd * pfd, * pfd_end; 720 struct pollfd * pfd, * pfd_end;
672 721
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
709 to = &expire; 758 to = &expire;
710 } 759 }
711 760
712 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 761 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
713 timed_out = 1; 762 timed_out = 1;
714 } 763 }
715 __set_current_state(TASK_RUNNING);
716 return count; 764 return count;
717} 765}
718 766
diff --git a/include/linux/poll.h b/include/linux/poll.h
index badd98ab06f6..8c24ef8d9976 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
46} 46}
47 47
48struct poll_table_entry { 48struct poll_table_entry {
49 struct file * filp; 49 struct file *filp;
50 wait_queue_t wait; 50 wait_queue_t wait;
51 wait_queue_head_t * wait_address; 51 wait_queue_head_t *wait_address;
52}; 52};
53 53
54/* 54/*
@@ -56,7 +56,9 @@ struct poll_table_entry {
56 */ 56 */
57struct poll_wqueues { 57struct poll_wqueues {
58 poll_table pt; 58 poll_table pt;
59 struct poll_table_page * table; 59 struct poll_table_page *table;
60 struct task_struct *polling_task;
61 int triggered;
60 int error; 62 int error;
61 int inline_index; 63 int inline_index;
62 struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; 64 struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
@@ -64,6 +66,13 @@ struct poll_wqueues {
64 66
65extern void poll_initwait(struct poll_wqueues *pwq); 67extern void poll_initwait(struct poll_wqueues *pwq);
66extern void poll_freewait(struct poll_wqueues *pwq); 68extern void poll_freewait(struct poll_wqueues *pwq);
69extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
70 ktime_t *expires, unsigned long slack);
71
72static inline int poll_schedule(struct poll_wqueues *pwq, int state)
73{
74 return poll_schedule_timeout(pwq, state, NULL, 0);
75}
67 76
68/* 77/*
69 * Scaleable version of the fd_set. 78 * Scaleable version of the fd_set.