diff options
author | Davide Libenzi <davidel@xmailserver.org> | 2009-06-30 14:41:11 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-30 21:55:58 -0400 |
commit | 133890103b9de08904f909995973e4b5c08a780e (patch) | |
tree | 0cda85a58dafafa0a197cf1a789124203f1e7a88 /fs | |
parent | f7c2df9b55212d5ec94169a4de11e44c683e0af4 (diff) |
eventfd: revised interface and cleanups
Change the eventfd interface to decouple the eventfd memory context from
the file pointer instance.
Without such a change, there is no clean, race-free way to handle the
POLLHUP event sent when the last instance of the file* goes away. Also,
the internal eventfd APIs now use the eventfd context instead of the
file*.
This patch is required by KVM's IRQfd code, which is still under
development.
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/aio.c | 24 | ||||
-rw-r--r-- | fs/eventfd.c | 122 |
2 files changed, 117 insertions, 29 deletions
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | |||
485 | { | 485 | { |
486 | assert_spin_locked(&ctx->ctx_lock); | 486 | assert_spin_locked(&ctx->ctx_lock); |
487 | 487 | ||
488 | if (req->ki_eventfd != NULL) | ||
489 | eventfd_ctx_put(req->ki_eventfd); | ||
488 | if (req->ki_dtor) | 490 | if (req->ki_dtor) |
489 | req->ki_dtor(req); | 491 | req->ki_dtor(req); |
490 | if (req->ki_iovec != &req->ki_inline_vec) | 492 | if (req->ki_iovec != &req->ki_inline_vec) |
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data) | |||
509 | /* Complete the fput(s) */ | 511 | /* Complete the fput(s) */ |
510 | if (req->ki_filp != NULL) | 512 | if (req->ki_filp != NULL) |
511 | __fput(req->ki_filp); | 513 | __fput(req->ki_filp); |
512 | if (req->ki_eventfd != NULL) | ||
513 | __fput(req->ki_eventfd); | ||
514 | 514 | ||
515 | /* Link the iocb into the context's free list */ | 515 | /* Link the iocb into the context's free list */ |
516 | spin_lock_irq(&ctx->ctx_lock); | 516 | spin_lock_irq(&ctx->ctx_lock); |
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data) | |||
528 | */ | 528 | */ |
529 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | 529 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) |
530 | { | 530 | { |
531 | int schedule_putreq = 0; | ||
532 | |||
533 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", | 531 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", |
534 | req, atomic_long_read(&req->ki_filp->f_count)); | 532 | req, atomic_long_read(&req->ki_filp->f_count)); |
535 | 533 | ||
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | |||
549 | * we would not be holding the last reference to the file*, so | 547 | * we would not be holding the last reference to the file*, so |
550 | * this function will be executed w/out any aio kthread wakeup. | 548 | * this function will be executed w/out any aio kthread wakeup. |
551 | */ | 549 | */ |
552 | if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) | 550 | if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { |
553 | schedule_putreq++; | ||
554 | else | ||
555 | req->ki_filp = NULL; | ||
556 | if (req->ki_eventfd != NULL) { | ||
557 | if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) | ||
558 | schedule_putreq++; | ||
559 | else | ||
560 | req->ki_eventfd = NULL; | ||
561 | } | ||
562 | if (unlikely(schedule_putreq)) { | ||
563 | get_ioctx(ctx); | 551 | get_ioctx(ctx); |
564 | spin_lock(&fput_lock); | 552 | spin_lock(&fput_lock); |
565 | list_add(&req->ki_list, &fput_head); | 553 | list_add(&req->ki_list, &fput_head); |
566 | spin_unlock(&fput_lock); | 554 | spin_unlock(&fput_lock); |
567 | queue_work(aio_wq, &fput_work); | 555 | queue_work(aio_wq, &fput_work); |
568 | } else | 556 | } else { |
557 | req->ki_filp = NULL; | ||
569 | really_put_req(ctx, req); | 558 | really_put_req(ctx, req); |
559 | } | ||
570 | return 1; | 560 | return 1; |
571 | } | 561 | } |
572 | 562 | ||
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
1622 | * an eventfd() fd, and will be signaled for each completed | 1612 | * an eventfd() fd, and will be signaled for each completed |
1623 | * event using the eventfd_signal() function. | 1613 | * event using the eventfd_signal() function. |
1624 | */ | 1614 | */ |
1625 | req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); | 1615 | req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); |
1626 | if (IS_ERR(req->ki_eventfd)) { | 1616 | if (IS_ERR(req->ki_eventfd)) { |
1627 | ret = PTR_ERR(req->ki_eventfd); | 1617 | ret = PTR_ERR(req->ki_eventfd); |
1628 | req->ki_eventfd = NULL; | 1618 | req->ki_eventfd = NULL; |
diff --git a/fs/eventfd.c b/fs/eventfd.c index 3f0e1974abdc..31d12de83a2a 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c | |||
@@ -14,35 +14,44 @@ | |||
14 | #include <linux/list.h> | 14 | #include <linux/list.h> |
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
16 | #include <linux/anon_inodes.h> | 16 | #include <linux/anon_inodes.h> |
17 | #include <linux/eventfd.h> | ||
18 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
19 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/kref.h> | ||
20 | #include <linux/eventfd.h> | ||
20 | 21 | ||
21 | struct eventfd_ctx { | 22 | struct eventfd_ctx { |
23 | struct kref kref; | ||
22 | wait_queue_head_t wqh; | 24 | wait_queue_head_t wqh; |
23 | /* | 25 | /* |
24 | * Every time that a write(2) is performed on an eventfd, the | 26 | * Every time that a write(2) is performed on an eventfd, the |
25 | * value of the __u64 being written is added to "count" and a | 27 | * value of the __u64 being written is added to "count" and a |
26 | * wakeup is performed on "wqh". A read(2) will return the "count" | 28 | * wakeup is performed on "wqh". A read(2) will return the "count" |
27 | * value to userspace, and will reset "count" to zero. The kernel | 29 | * value to userspace, and will reset "count" to zero. The kernel |
28 | * size eventfd_signal() also, adds to the "count" counter and | 30 | * side eventfd_signal() also, adds to the "count" counter and |
29 | * issue a wakeup. | 31 | * issue a wakeup. |
30 | */ | 32 | */ |
31 | __u64 count; | 33 | __u64 count; |
32 | unsigned int flags; | 34 | unsigned int flags; |
33 | }; | 35 | }; |
34 | 36 | ||
35 | /* | 37 | /** |
36 | * Adds "n" to the eventfd counter "count". Returns "n" in case of | 38 | * eventfd_signal - Adds @n to the eventfd counter. |
37 | * success, or a value lower then "n" in case of coutner overflow. | 39 | * @ctx: [in] Pointer to the eventfd context. |
38 | * This function is supposed to be called by the kernel in paths | 40 | * @n: [in] Value of the counter to be added to the eventfd internal counter. |
39 | * that do not allow sleeping. In this function we allow the counter | 41 | * The value cannot be negative. |
40 | * to reach the ULLONG_MAX value, and we signal this as overflow | 42 | * |
41 | * condition by returining a POLLERR to poll(2). | 43 | * This function is supposed to be called by the kernel in paths that do not |
44 | * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX | ||
45 | * value, and we signal this as overflow condition by returning a POLLERR | ||
46 | * to poll(2). | ||
47 | * | ||
48 | * Returns @n in case of success, a non-negative number lower than @n in case | ||
49 | * of overflow, or the following error codes: | ||
50 | * | ||
51 | * -EINVAL : The value of @n is negative. | ||
42 | */ | 52 | */ |
43 | int eventfd_signal(struct file *file, int n) | 53 | int eventfd_signal(struct eventfd_ctx *ctx, int n) |
44 | { | 54 | { |
45 | struct eventfd_ctx *ctx = file->private_data; | ||
46 | unsigned long flags; | 55 | unsigned long flags; |
47 | 56 | ||
48 | if (n < 0) | 57 | if (n < 0) |
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, int n) | |||
59 | } | 68 | } |
60 | EXPORT_SYMBOL_GPL(eventfd_signal); | 69 | EXPORT_SYMBOL_GPL(eventfd_signal); |
61 | 70 | ||
71 | static void eventfd_free(struct kref *kref) | ||
72 | { | ||
73 | struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); | ||
74 | |||
75 | kfree(ctx); | ||
76 | } | ||
77 | |||
78 | /** | ||
79 | * eventfd_ctx_get - Acquires a reference to the internal eventfd context. | ||
80 | * @ctx: [in] Pointer to the eventfd context. | ||
81 | * | ||
82 | * Returns: In case of success, returns a pointer to the eventfd context. | ||
83 | */ | ||
84 | struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) | ||
85 | { | ||
86 | kref_get(&ctx->kref); | ||
87 | return ctx; | ||
88 | } | ||
89 | EXPORT_SYMBOL_GPL(eventfd_ctx_get); | ||
90 | |||
91 | /** | ||
92 | * eventfd_ctx_put - Releases a reference to the internal eventfd context. | ||
93 | * @ctx: [in] Pointer to eventfd context. | ||
94 | * | ||
95 | * The eventfd context reference must have been previously acquired either | ||
96 | * with eventfd_ctx_get() or eventfd_ctx_fdget(). | ||
97 | */ | ||
98 | void eventfd_ctx_put(struct eventfd_ctx *ctx) | ||
99 | { | ||
100 | kref_put(&ctx->kref, eventfd_free); | ||
101 | } | ||
102 | EXPORT_SYMBOL_GPL(eventfd_ctx_put); | ||
103 | |||
62 | static int eventfd_release(struct inode *inode, struct file *file) | 104 | static int eventfd_release(struct inode *inode, struct file *file) |
63 | { | 105 | { |
64 | kfree(file->private_data); | 106 | struct eventfd_ctx *ctx = file->private_data; |
107 | |||
108 | wake_up_poll(&ctx->wqh, POLLHUP); | ||
109 | eventfd_ctx_put(ctx); | ||
65 | return 0; | 110 | return 0; |
66 | } | 111 | } |
67 | 112 | ||
@@ -185,6 +230,16 @@ static const struct file_operations eventfd_fops = { | |||
185 | .write = eventfd_write, | 230 | .write = eventfd_write, |
186 | }; | 231 | }; |
187 | 232 | ||
233 | /** | ||
234 | * eventfd_fget - Acquire a reference of an eventfd file descriptor. | ||
235 | * @fd: [in] Eventfd file descriptor. | ||
236 | * | ||
237 | * Returns a pointer to the eventfd file structure in case of success, or the | ||
238 | * following error pointer: | ||
239 | * | ||
240 | * -EBADF : Invalid @fd file descriptor. | ||
241 | * -EINVAL : The @fd file descriptor is not an eventfd file. | ||
242 | */ | ||
188 | struct file *eventfd_fget(int fd) | 243 | struct file *eventfd_fget(int fd) |
189 | { | 244 | { |
190 | struct file *file; | 245 | struct file *file; |
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd) | |||
201 | } | 256 | } |
202 | EXPORT_SYMBOL_GPL(eventfd_fget); | 257 | EXPORT_SYMBOL_GPL(eventfd_fget); |
203 | 258 | ||
259 | /** | ||
260 | * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. | ||
261 | * @fd: [in] Eventfd file descriptor. | ||
262 | * | ||
263 | * Returns a pointer to the internal eventfd context, otherwise the error | ||
264 | * pointers returned by the following functions: | ||
265 | * | ||
266 | * eventfd_fget | ||
267 | */ | ||
268 | struct eventfd_ctx *eventfd_ctx_fdget(int fd) | ||
269 | { | ||
270 | struct file *file; | ||
271 | struct eventfd_ctx *ctx; | ||
272 | |||
273 | file = eventfd_fget(fd); | ||
274 | if (IS_ERR(file)) | ||
275 | return (struct eventfd_ctx *) file; | ||
276 | ctx = eventfd_ctx_get(file->private_data); | ||
277 | fput(file); | ||
278 | |||
279 | return ctx; | ||
280 | } | ||
281 | EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); | ||
282 | |||
283 | /** | ||
284 | * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. | ||
285 | * @file: [in] Eventfd file pointer. | ||
286 | * | ||
287 | * Returns a pointer to the internal eventfd context, otherwise the error | ||
288 | * pointer: | ||
289 | * | ||
290 | * -EINVAL : The @file file pointer is not an eventfd file. | ||
291 | */ | ||
292 | struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) | ||
293 | { | ||
294 | if (file->f_op != &eventfd_fops) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | |||
297 | return eventfd_ctx_get(file->private_data); | ||
298 | } | ||
299 | EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); | ||
300 | |||
204 | SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) | 301 | SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) |
205 | { | 302 | { |
206 | int fd; | 303 | int fd; |
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) | |||
217 | if (!ctx) | 314 | if (!ctx) |
218 | return -ENOMEM; | 315 | return -ENOMEM; |
219 | 316 | ||
317 | kref_init(&ctx->kref); | ||
220 | init_waitqueue_head(&ctx->wqh); | 318 | init_waitqueue_head(&ctx->wqh); |
221 | ctx->count = count; | 319 | ctx->count = count; |
222 | ctx->flags = flags; | 320 | ctx->flags = flags; |