diff options
author | Al Viro <viro@ZenIV.linux.org.uk> | 2008-11-14 20:15:43 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-11-15 15:26:44 -0500 |
commit | 8f7b0ba1c853919b85b54774775f567f30006107 (patch) | |
tree | 1acd2b7ed5ed0de3eecfff9da5da4e779731f8a8 /fs | |
parent | 0d3b71009737511ea937ac405205fd8214b898bb (diff) |
Fix inotify watch removal/umount races
Inotify watch removals suck violently.
To kick the watch out we need (in this order) inode->inotify_mutex and
ih->mutex. That's fine if we have a hold on inode; however, for all
other cases we need to make damn sure we don't race with umount. We can
*NOT* just grab a reference to a watch - inotify_unmount_inodes() will
happily sail past it and we'll end with reference to inode potentially
outliving its superblock.
Ideally we just want to grab an active reference to superblock if we
can; that will make sure we won't go into inotify_unmount_inodes() until
we are done. Cleanup is just deactivate_super().
However, that leaves a messy case - what if we *are* racing with
umount() and active references to superblock can't be acquired anymore?
We can bump ->s_count, grab ->s_umount, which will almost certainly wait
until the superblock is shut down and the watch in question is pining
for fjords. That's fine, but there is a problem - we might have hit the
window between ->s_active getting to 0 / ->s_count dropping below S_BIAS (i.e.
the moment when superblock is past the point of no return and is heading
for shutdown) and the moment when deactivate_super() acquires
->s_umount.
We could just do drop_super(), yield() and retry, but that's rather
antisocial and this stuff is luser-triggerable. OTOH, having grabbed
->s_umount and having found that we'd got there first (i.e. that
->s_root is non-NULL) we know that we won't race with
inotify_unmount_inodes().
So we could grab a reference to watch and do the rest as above, just
with drop_super() instead of deactivate_super(), right? Wrong. We had
to drop ih->mutex before we could grab ->s_umount. So the watch
could've been gone already.
That still can be dealt with - we need to save watch->wd, do idr_find()
and compare its result with our pointer. If they match, we either have
the damn thing still alive or we'd lost not one but two races at once,
the watch had been killed and a new one got created with the same ->wd
at the same address. That couldn't have happened in inotify_destroy(),
but inotify_rm_wd() could run into that. Still, "new one got created"
is not a problem - we have every right to kill it or leave it alone,
whatever's more convenient.
So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
"grab it and kill it" check. If it's been our original watch, we are
fine, if it's a newcomer - nevermind, just pretend that we'd won the
race and kill the fscker anyway; we are safe since we know that its
superblock won't be going away.
And yes, this is far beyond mere "not very pretty"; so's the entire
concept of inotify to start with.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/inotify.c | 150 |
1 files changed, 144 insertions, 6 deletions
diff --git a/fs/inotify.c b/fs/inotify.c index 690e72595e6..7bbed1b8982 100644 --- a/fs/inotify.c +++ b/fs/inotify.c | |||
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch) | |||
106 | } | 106 | } |
107 | EXPORT_SYMBOL_GPL(get_inotify_watch); | 107 | EXPORT_SYMBOL_GPL(get_inotify_watch); |
108 | 108 | ||
109 | int pin_inotify_watch(struct inotify_watch *watch) | ||
110 | { | ||
111 | struct super_block *sb = watch->inode->i_sb; | ||
112 | spin_lock(&sb_lock); | ||
113 | if (sb->s_count >= S_BIAS) { | ||
114 | atomic_inc(&sb->s_active); | ||
115 | spin_unlock(&sb_lock); | ||
116 | atomic_inc(&watch->count); | ||
117 | return 1; | ||
118 | } | ||
119 | spin_unlock(&sb_lock); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
109 | /** | 123 | /** |
110 | * put_inotify_watch - decrements the ref count on a given watch. cleans up | 124 | * put_inotify_watch - decrements the ref count on a given watch. cleans up |
111 | * watch references if the count reaches zero. inotify_watch is freed by | 125 | * watch references if the count reaches zero. inotify_watch is freed by |
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch) | |||
124 | } | 138 | } |
125 | EXPORT_SYMBOL_GPL(put_inotify_watch); | 139 | EXPORT_SYMBOL_GPL(put_inotify_watch); |
126 | 140 | ||
141 | void unpin_inotify_watch(struct inotify_watch *watch) | ||
142 | { | ||
143 | struct super_block *sb = watch->inode->i_sb; | ||
144 | put_inotify_watch(watch); | ||
145 | deactivate_super(sb); | ||
146 | } | ||
147 | |||
127 | /* | 148 | /* |
128 | * inotify_handle_get_wd - returns the next WD for use by the given handle | 149 | * inotify_handle_get_wd - returns the next WD for use by the given handle |
129 | * | 150 | * |
@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch) | |||
479 | } | 500 | } |
480 | EXPORT_SYMBOL_GPL(inotify_init_watch); | 501 | EXPORT_SYMBOL_GPL(inotify_init_watch); |
481 | 502 | ||
503 | /* | ||
504 | * Watch removals suck violently. To kick the watch out we need (in this | ||
505 | * order) inode->inotify_mutex and ih->mutex. That's fine if we have | ||
506 | * a hold on inode; however, for all other cases we need to make damn sure | ||
507 | * we don't race with umount. We can *NOT* just grab a reference to a | ||
508 | * watch - inotify_unmount_inodes() will happily sail past it and we'll end | ||
509 | * with reference to inode potentially outliving its superblock. Ideally | ||
510 | * we just want to grab an active reference to superblock if we can; that | ||
511 | * will make sure we won't go into inotify_unmount_inodes() until we are | |
512 | * done. Cleanup is just deactivate_super(). However, that leaves a messy | ||
513 | * case - what if we *are* racing with umount() and active references to | ||
514 | * superblock can't be acquired anymore? We can bump ->s_count, grab | ||
515 | * ->s_umount, which will almost certainly wait until the superblock is shut | ||
516 | * down and the watch in question is pining for fjords. That's fine, but | ||
517 | * there is a problem - we might have hit the window between ->s_active | ||
518 | * getting to 0 / ->s_count dropping below S_BIAS (i.e. the moment when superblock | |
519 | * is past the point of no return and is heading for shutdown) and the | ||
520 | * moment when deactivate_super() acquires ->s_umount. We could just do | ||
521 | * drop_super(), yield() and retry, but that's rather antisocial and this | |
522 | * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having | ||
523 | * found that we'd got there first (i.e. that ->s_root is non-NULL) we know | ||
524 | * that we won't race with inotify_unmount_inodes(). So we could grab a | |
525 | * reference to watch and do the rest as above, just with drop_super() instead | ||
526 | * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we | ||
527 | * could grab ->s_umount. So the watch could've been gone already. | ||
528 | * | ||
529 | * That still can be dealt with - we need to save watch->wd, do idr_find() | ||
530 | * and compare its result with our pointer. If they match, we either have | ||
531 | * the damn thing still alive or we'd lost not one but two races at once, | ||
532 | * the watch had been killed and a new one got created with the same ->wd | ||
533 | * at the same address. That couldn't have happened in inotify_destroy(), | ||
534 | * but inotify_rm_wd() could run into that. Still, "new one got created" | ||
535 | * is not a problem - we have every right to kill it or leave it alone, | ||
536 | * whatever's more convenient. | ||
537 | * | ||
538 | * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as | ||
539 | * "grab it and kill it" check. If it's been our original watch, we are | ||
540 | * fine, if it's a newcomer - nevermind, just pretend that we'd won the | ||
541 | * race and kill the fscker anyway; we are safe since we know that its | ||
542 | * superblock won't be going away. | ||
543 | * | ||
544 | * And yes, this is far beyond mere "not very pretty"; so's the entire | ||
545 | * concept of inotify to start with. | ||
546 | */ | ||
547 | |||
548 | /** | ||
549 | * pin_to_kill - pin the watch down for removal | ||
550 | * @ih: inotify handle | ||
551 | * @watch: watch to kill | ||
552 | * | ||
553 | * Called with ih->mutex held, drops it. Possible return values: | ||
554 | * 0 - nothing to do, it has died | ||
555 | * 1 - remove it, drop the reference and deactivate_super() | ||
556 | * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid | ||
557 | * that variant, since it involved a lot of PITA, but that's the best that | ||
558 | * could've been done. | ||
559 | */ | ||
560 | static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) | ||
561 | { | ||
562 | struct super_block *sb = watch->inode->i_sb; | ||
563 | s32 wd = watch->wd; | ||
564 | |||
565 | spin_lock(&sb_lock); | ||
566 | if (sb->s_count >= S_BIAS) { | ||
567 | atomic_inc(&sb->s_active); | ||
568 | spin_unlock(&sb_lock); | ||
569 | get_inotify_watch(watch); | ||
570 | mutex_unlock(&ih->mutex); | ||
571 | return 1; /* the best outcome */ | ||
572 | } | ||
573 | sb->s_count++; | ||
574 | spin_unlock(&sb_lock); | ||
575 | mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ | ||
576 | down_read(&sb->s_umount); | ||
577 | if (likely(!sb->s_root)) { | ||
578 | /* fs is already shut down; the watch is dead */ | ||
579 | drop_super(sb); | ||
580 | return 0; | ||
581 | } | ||
582 | /* raced with the final deactivate_super() */ | ||
583 | mutex_lock(&ih->mutex); | ||
584 | if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { | ||
585 | /* the watch is dead */ | ||
586 | mutex_unlock(&ih->mutex); | ||
587 | drop_super(sb); | ||
588 | return 0; | ||
589 | } | ||
590 | /* still alive or freed and reused with the same sb and wd; kill */ | ||
591 | get_inotify_watch(watch); | ||
592 | mutex_unlock(&ih->mutex); | ||
593 | return 2; | ||
594 | } | ||
595 | |||
596 | static void unpin_and_kill(struct inotify_watch *watch, int how) | ||
597 | { | ||
598 | struct super_block *sb = watch->inode->i_sb; | ||
599 | put_inotify_watch(watch); | ||
600 | switch (how) { | ||
601 | case 1: | ||
602 | deactivate_super(sb); | ||
603 | break; | ||
604 | case 2: | ||
605 | drop_super(sb); | ||
606 | } | ||
607 | } | ||
608 | |||
482 | /** | 609 | /** |
483 | * inotify_destroy - clean up and destroy an inotify instance | 610 | * inotify_destroy - clean up and destroy an inotify instance |
484 | * @ih: inotify handle | 611 | * @ih: inotify handle |
@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih) | |||
490 | * pretty. We cannot do a simple iteration over the list, because we | 617 | * pretty. We cannot do a simple iteration over the list, because we |
491 | * do not know the inode until we iterate to the watch. But we need to | 618 | * do not know the inode until we iterate to the watch. But we need to |
492 | * hold inode->inotify_mutex before ih->mutex. The following works. | 619 | * hold inode->inotify_mutex before ih->mutex. The following works. |
620 | * | ||
621 | * AV: it had to become even uglier to start working ;-/ | ||
493 | */ | 622 | */ |
494 | while (1) { | 623 | while (1) { |
495 | struct inotify_watch *watch; | 624 | struct inotify_watch *watch; |
496 | struct list_head *watches; | 625 | struct list_head *watches; |
626 | struct super_block *sb; | ||
497 | struct inode *inode; | 627 | struct inode *inode; |
628 | int how; | ||
498 | 629 | ||
499 | mutex_lock(&ih->mutex); | 630 | mutex_lock(&ih->mutex); |
500 | watches = &ih->watches; | 631 | watches = &ih->watches; |
@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih) | |||
503 | break; | 634 | break; |
504 | } | 635 | } |
505 | watch = list_first_entry(watches, struct inotify_watch, h_list); | 636 | watch = list_first_entry(watches, struct inotify_watch, h_list); |
506 | get_inotify_watch(watch); | 637 | sb = watch->inode->i_sb; |
507 | mutex_unlock(&ih->mutex); | 638 | how = pin_to_kill(ih, watch); |
639 | if (!how) | ||
640 | continue; | ||
508 | 641 | ||
509 | inode = watch->inode; | 642 | inode = watch->inode; |
510 | mutex_lock(&inode->inotify_mutex); | 643 | mutex_lock(&inode->inotify_mutex); |
@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih) | |||
518 | 651 | ||
519 | mutex_unlock(&ih->mutex); | 652 | mutex_unlock(&ih->mutex); |
520 | mutex_unlock(&inode->inotify_mutex); | 653 | mutex_unlock(&inode->inotify_mutex); |
521 | put_inotify_watch(watch); | 654 | unpin_and_kill(watch, how); |
522 | } | 655 | } |
523 | 656 | ||
524 | /* free this handle: the put matching the get in inotify_init() */ | 657 | /* free this handle: the put matching the get in inotify_init() */ |
@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch) | |||
719 | int inotify_rm_wd(struct inotify_handle *ih, u32 wd) | 852 | int inotify_rm_wd(struct inotify_handle *ih, u32 wd) |
720 | { | 853 | { |
721 | struct inotify_watch *watch; | 854 | struct inotify_watch *watch; |
855 | struct super_block *sb; | ||
722 | struct inode *inode; | 856 | struct inode *inode; |
857 | int how; | ||
723 | 858 | ||
724 | mutex_lock(&ih->mutex); | 859 | mutex_lock(&ih->mutex); |
725 | watch = idr_find(&ih->idr, wd); | 860 | watch = idr_find(&ih->idr, wd); |
@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) | |||
727 | mutex_unlock(&ih->mutex); | 862 | mutex_unlock(&ih->mutex); |
728 | return -EINVAL; | 863 | return -EINVAL; |
729 | } | 864 | } |
730 | get_inotify_watch(watch); | 865 | sb = watch->inode->i_sb; |
866 | how = pin_to_kill(ih, watch); | ||
867 | if (!how) | ||
868 | return 0; | ||
869 | |||
731 | inode = watch->inode; | 870 | inode = watch->inode; |
732 | mutex_unlock(&ih->mutex); | ||
733 | 871 | ||
734 | mutex_lock(&inode->inotify_mutex); | 872 | mutex_lock(&inode->inotify_mutex); |
735 | mutex_lock(&ih->mutex); | 873 | mutex_lock(&ih->mutex); |
@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) | |||
740 | 878 | ||
741 | mutex_unlock(&ih->mutex); | 879 | mutex_unlock(&ih->mutex); |
742 | mutex_unlock(&inode->inotify_mutex); | 880 | mutex_unlock(&inode->inotify_mutex); |
743 | put_inotify_watch(watch); | 881 | unpin_and_kill(watch, how); |
744 | 882 | ||
745 | return 0; | 883 | return 0; |
746 | } | 884 | } |