about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author Tejun Heo <htejun@gmail.com> 2009-01-06 17:40:59 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-01-06 18:59:12 -0500
commit 5f820f648c92a5ecc771a96b3c29aa6e90013bba (patch)
tree 0445b45fa33072d37b32c6ef592a4d0c102e05cc /fs
parent 67ec7d3ab779ad9001ef57a6b4cfdf80ac9f9acc (diff)
poll: allow f_op->poll to sleep
f_op->poll is the only vfs operation which is not allowed to sleep. This is because the poll and select implementations used task state to synchronize against wake ups, which no longer has to be the case now that the wait/wake interface can use custom wake up functions. The non-sleep restriction can be a bit tricky because ->poll is not called from an atomic context, and the result of accidentally sleeping in ->poll only shows up as temporary busy looping when the timing is right, or rather wrong.

This patch converts poll/select to use a custom wake up function and a separate triggered variable to synchronize against wake up events. The only added overhead is an extra function call during wake up, which is negligible.

This patch removes the one non-sleep exception from the vfs locking rules and is beneficial to userland filesystem implementations like FUSE and 9p, or peculiar filesystems like spufs, as it's very difficult for those to implement a non-sleeping poll method.

While at it, make the following cosmetic changes to make poll.h and select.c checkpatch friendly:

* s/type * symbol/type *symbol/ : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c

Oleg: spotted missing barrier in poll_schedule_timeout()
Davide: spotted missing write barrier in pollwake()

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Brad Boyer <flar@allandria.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Roland McGrath <roland@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- fs/select.c 76
1 file changed, 62 insertions, 14 deletions
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
109void poll_initwait(struct poll_wqueues *pwq) 109void poll_initwait(struct poll_wqueues *pwq)
110{ 110{
111 init_poll_funcptr(&pwq->pt, __pollwait); 111 init_poll_funcptr(&pwq->pt, __pollwait);
112 pwq->polling_task = current;
112 pwq->error = 0; 113 pwq->error = 0;
113 pwq->table = NULL; 114 pwq->table = NULL;
114 pwq->inline_index = 0; 115 pwq->inline_index = 0;
115} 116}
116
117EXPORT_SYMBOL(poll_initwait); 117EXPORT_SYMBOL(poll_initwait);
118 118
119static void free_poll_entry(struct poll_table_entry *entry) 119static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
142 free_page((unsigned long) old); 142 free_page((unsigned long) old);
143 } 143 }
144} 144}
145
146EXPORT_SYMBOL(poll_freewait); 145EXPORT_SYMBOL(poll_freewait);
147 146
148static struct poll_table_entry *poll_get_entry(poll_table *_p) 147static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
149{ 148{
150 struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
151 struct poll_table_page *table = p->table; 149 struct poll_table_page *table = p->table;
152 150
153 if (p->inline_index < N_INLINE_POLL_ENTRIES) 151 if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
159 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); 157 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
160 if (!new_table) { 158 if (!new_table) {
161 p->error = -ENOMEM; 159 p->error = -ENOMEM;
162 __set_current_state(TASK_RUNNING);
163 return NULL; 160 return NULL;
164 } 161 }
165 new_table->entry = new_table->entries; 162 new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
171 return table->entry++; 168 return table->entry++;
172} 169}
173 170
171static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
172{
173 struct poll_wqueues *pwq = wait->private;
174 DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
175
176 /*
177 * Although this function is called under waitqueue lock, LOCK
178 * doesn't imply write barrier and the users expect write
179 * barrier semantics on wakeup functions. The following
180 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
181 * and is paired with set_mb() in poll_schedule_timeout.
182 */
183 smp_wmb();
184 pwq->triggered = 1;
185
186 /*
187 * Perform the default wake up operation using a dummy
188 * waitqueue.
189 *
190 * TODO: This is hacky but there currently is no interface to
191 * pass in @sync. @sync is scheduled to be removed and once
192 * that happens, wake_up_process() can be used directly.
193 */
194 return default_wake_function(&dummy_wait, mode, sync, key);
195}
196
174/* Add a new entry */ 197/* Add a new entry */
175static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, 198static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
176 poll_table *p) 199 poll_table *p)
177{ 200{
178 struct poll_table_entry *entry = poll_get_entry(p); 201 struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
202 struct poll_table_entry *entry = poll_get_entry(pwq);
179 if (!entry) 203 if (!entry)
180 return; 204 return;
181 get_file(filp); 205 get_file(filp);
182 entry->filp = filp; 206 entry->filp = filp;
183 entry->wait_address = wait_address; 207 entry->wait_address = wait_address;
184 init_waitqueue_entry(&entry->wait, current); 208 init_waitqueue_func_entry(&entry->wait, pollwake);
209 entry->wait.private = pwq;
185 add_wait_queue(wait_address, &entry->wait); 210 add_wait_queue(wait_address, &entry->wait);
186} 211}
187 212
213int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
214 ktime_t *expires, unsigned long slack)
215{
216 int rc = -EINTR;
217
218 set_current_state(state);
219 if (!pwq->triggered)
220 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
221 __set_current_state(TASK_RUNNING);
222
223 /*
224 * Prepare for the next iteration.
225 *
226 * The following set_mb() serves two purposes. First, it's
227 * the counterpart rmb of the wmb in pollwake() such that data
228 * written before wake up is always visible after wake up.
229 * Second, the full barrier guarantees that triggered clearing
230 * doesn't pass event check of the next iteration. Note that
231 * this problem doesn't exist for the first iteration as
232 * add_wait_queue() has full barrier semantics.
233 */
234 set_mb(pwq->triggered, 0);
235
236 return rc;
237}
238EXPORT_SYMBOL(poll_schedule_timeout);
239
188/** 240/**
189 * poll_select_set_timeout - helper function to setup the timeout value 241 * poll_select_set_timeout - helper function to setup the timeout value
190 * @to: pointer to timespec variable for the final timeout 242 * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
340 for (;;) { 392 for (;;) {
341 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 393 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
342 394
343 set_current_state(TASK_INTERRUPTIBLE);
344
345 inp = fds->in; outp = fds->out; exp = fds->ex; 395 inp = fds->in; outp = fds->out; exp = fds->ex;
346 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 396 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
347 397
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
411 to = &expire; 461 to = &expire;
412 } 462 }
413 463
414 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 464 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
465 to, slack))
415 timed_out = 1; 466 timed_out = 1;
416 } 467 }
417 __set_current_state(TASK_RUNNING);
418 468
419 poll_freewait(&table); 469 poll_freewait(&table);
420 470
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
666 for (;;) { 716 for (;;) {
667 struct poll_list *walk; 717 struct poll_list *walk;
668 718
669 set_current_state(TASK_INTERRUPTIBLE);
670 for (walk = list; walk != NULL; walk = walk->next) { 719 for (walk = list; walk != NULL; walk = walk->next) {
671 struct pollfd * pfd, * pfd_end; 720 struct pollfd * pfd, * pfd_end;
672 721
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
709 to = &expire; 758 to = &expire;
710 } 759 }
711 760
712 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 761 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
713 timed_out = 1; 762 timed_out = 1;
714 } 763 }
715 __set_current_state(TASK_RUNNING);
716 return count; 764 return count;
717} 765}
718 766