diff options
author | Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | 2009-01-05 21:38:14 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-01-05 21:38:14 -0500 |
commit | 393418676a7602e1d7d3f6e560159c65c8cbd50e (patch) | |
tree | 26b166e426748a7a4bfe3784cd8258a895daba79 | |
parent | 3300beda523136f9f87821e4fba85c5c9e319645 (diff) |
ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()
We need to make sure we update the inode bitmap and clear
EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since
ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide
whether to initialize the inode bitmap each time it is called.
(introduced by commit c806e68f.)
ext4_read_inode_bitmap does:
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
and ext4_new_inode does
if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
ino, inode_bitmap_bh->b_data))
......
...
spin_lock(sb_bgl_lock(sbi, group));
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
i.e., on allocation we update the bitmap then we take the sb_bgl_lock
and clear the EXT4_BG_INODE_UNINIT flag. What can happen is a
parallel ext4_read_inode_bitmap can zero out the bitmap in between
the above ext4_set_bit_atomic and spin_lock(sb_bg_lock..)
The race results in below user visible errors
EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449
EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ...
EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ...
# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71
ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
-rw-r--r-- | fs/ext4/ialloc.c | 146 |
1 files changed, 86 insertions, 60 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b47427a21f1c..d4e544f30be2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -573,6 +573,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
573 | } | 573 | } |
574 | 574 | ||
575 | /* | 575 | /* |
576 | * claim the inode from the inode bitmap. If the group | ||
577 | * is uninit we need to take the groups's sb_bgl_lock | ||
578 | * and clear the uninit flag. The inode bitmap update | ||
579 | * and group desc uninit flag clear should be done | ||
580 | * after holding sb_bgl_lock so that ext4_read_inode_bitmap | ||
581 | * doesn't race with the ext4_claim_inode | ||
582 | */ | ||
583 | static int ext4_claim_inode(struct super_block *sb, | ||
584 | struct buffer_head *inode_bitmap_bh, | ||
585 | unsigned long ino, ext4_group_t group, int mode) | ||
586 | { | ||
587 | int free = 0, retval = 0, count; | ||
588 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
589 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | ||
590 | |||
591 | spin_lock(sb_bgl_lock(sbi, group)); | ||
592 | if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { | ||
593 | /* not a free inode */ | ||
594 | retval = 1; | ||
595 | goto err_ret; | ||
596 | } | ||
597 | ino++; | ||
598 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
599 | ino > EXT4_INODES_PER_GROUP(sb)) { | ||
600 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
601 | ext4_error(sb, __func__, | ||
602 | "reserved inode or inode > inodes count - " | ||
603 | "block_group = %u, inode=%lu", group, | ||
604 | ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
605 | return 1; | ||
606 | } | ||
607 | /* If we didn't allocate from within the initialized part of the inode | ||
608 | * table then we need to initialize up to this inode. */ | ||
609 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
610 | |||
611 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
612 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
613 | /* When marking the block group with | ||
614 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
615 | * on the value of bg_itable_unused even though | ||
616 | * mke2fs could have initialized the same for us. | ||
617 | * Instead we calculated the value below | ||
618 | */ | ||
619 | |||
620 | free = 0; | ||
621 | } else { | ||
622 | free = EXT4_INODES_PER_GROUP(sb) - | ||
623 | ext4_itable_unused_count(sb, gdp); | ||
624 | } | ||
625 | |||
626 | /* | ||
627 | * Check the relative inode number against the last used | ||
628 | * relative inode number in this group. if it is greater | ||
629 | * we need to update the bg_itable_unused count | ||
630 | * | ||
631 | */ | ||
632 | if (ino > free) | ||
633 | ext4_itable_unused_set(sb, gdp, | ||
634 | (EXT4_INODES_PER_GROUP(sb) - ino)); | ||
635 | } | ||
636 | count = ext4_free_inodes_count(sb, gdp) - 1; | ||
637 | ext4_free_inodes_set(sb, gdp, count); | ||
638 | if (S_ISDIR(mode)) { | ||
639 | count = ext4_used_dirs_count(sb, gdp) + 1; | ||
640 | ext4_used_dirs_set(sb, gdp, count); | ||
641 | } | ||
642 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
643 | err_ret: | ||
644 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
645 | return retval; | ||
646 | } | ||
647 | |||
648 | /* | ||
576 | * There are two policies for allocating an inode. If the new inode is | 649 | * There are two policies for allocating an inode. If the new inode is |
577 | * a directory, then a forward search is made for a block group with both | 650 | * a directory, then a forward search is made for a block group with both |
578 | * free space and a low directory-to-inode ratio; if that fails, then of | 651 | * free space and a low directory-to-inode ratio; if that fails, then of |
@@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
594 | struct ext4_super_block *es; | 667 | struct ext4_super_block *es; |
595 | struct ext4_inode_info *ei; | 668 | struct ext4_inode_info *ei; |
596 | struct ext4_sb_info *sbi; | 669 | struct ext4_sb_info *sbi; |
597 | int ret2, err = 0, count; | 670 | int ret2, err = 0; |
598 | struct inode *ret; | 671 | struct inode *ret; |
599 | ext4_group_t i; | 672 | ext4_group_t i; |
600 | int free = 0; | 673 | int free = 0; |
@@ -658,8 +731,13 @@ repeat_in_this_group: | |||
658 | if (err) | 731 | if (err) |
659 | goto fail; | 732 | goto fail; |
660 | 733 | ||
661 | if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), | 734 | BUFFER_TRACE(group_desc_bh, "get_write_access"); |
662 | ino, inode_bitmap_bh->b_data)) { | 735 | err = ext4_journal_get_write_access(handle, |
736 | group_desc_bh); | ||
737 | if (err) | ||
738 | goto fail; | ||
739 | if (!ext4_claim_inode(sb, inode_bitmap_bh, | ||
740 | ino, group, mode)) { | ||
663 | /* we won it */ | 741 | /* we won it */ |
664 | BUFFER_TRACE(inode_bitmap_bh, | 742 | BUFFER_TRACE(inode_bitmap_bh, |
665 | "call ext4_handle_dirty_metadata"); | 743 | "call ext4_handle_dirty_metadata"); |
@@ -668,10 +746,13 @@ repeat_in_this_group: | |||
668 | inode_bitmap_bh); | 746 | inode_bitmap_bh); |
669 | if (err) | 747 | if (err) |
670 | goto fail; | 748 | goto fail; |
749 | /* zero bit is inode number 1*/ | ||
750 | ino++; | ||
671 | goto got; | 751 | goto got; |
672 | } | 752 | } |
673 | /* we lost it */ | 753 | /* we lost it */ |
674 | ext4_handle_release_buffer(handle, inode_bitmap_bh); | 754 | ext4_handle_release_buffer(handle, inode_bitmap_bh); |
755 | ext4_handle_release_buffer(handle, group_desc_bh); | ||
675 | 756 | ||
676 | if (++ino < EXT4_INODES_PER_GROUP(sb)) | 757 | if (++ino < EXT4_INODES_PER_GROUP(sb)) |
677 | goto repeat_in_this_group; | 758 | goto repeat_in_this_group; |
@@ -691,22 +772,6 @@ repeat_in_this_group: | |||
691 | goto out; | 772 | goto out; |
692 | 773 | ||
693 | got: | 774 | got: |
694 | ino++; | ||
695 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
696 | ino > EXT4_INODES_PER_GROUP(sb)) { | ||
697 | ext4_error(sb, __func__, | ||
698 | "reserved inode or inode > inodes count - " | ||
699 | "block_group = %u, inode=%lu", group, | ||
700 | ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
701 | err = -EIO; | ||
702 | goto fail; | ||
703 | } | ||
704 | |||
705 | BUFFER_TRACE(group_desc_bh, "get_write_access"); | ||
706 | err = ext4_journal_get_write_access(handle, group_desc_bh); | ||
707 | if (err) | ||
708 | goto fail; | ||
709 | |||
710 | /* We may have to initialize the block bitmap if it isn't already */ | 775 | /* We may have to initialize the block bitmap if it isn't already */ |
711 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | 776 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && |
712 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 777 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
@@ -743,49 +808,10 @@ got: | |||
743 | if (err) | 808 | if (err) |
744 | goto fail; | 809 | goto fail; |
745 | } | 810 | } |
746 | |||
747 | spin_lock(sb_bgl_lock(sbi, group)); | ||
748 | /* If we didn't allocate from within the initialized part of the inode | ||
749 | * table then we need to initialize up to this inode. */ | ||
750 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
751 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
752 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
753 | |||
754 | /* When marking the block group with | ||
755 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
756 | * on the value of bg_itable_unused even though | ||
757 | * mke2fs could have initialized the same for us. | ||
758 | * Instead we calculated the value below | ||
759 | */ | ||
760 | |||
761 | free = 0; | ||
762 | } else { | ||
763 | free = EXT4_INODES_PER_GROUP(sb) - | ||
764 | ext4_itable_unused_count(sb, gdp); | ||
765 | } | ||
766 | |||
767 | /* | ||
768 | * Check the relative inode number against the last used | ||
769 | * relative inode number in this group. if it is greater | ||
770 | * we need to update the bg_itable_unused count | ||
771 | * | ||
772 | */ | ||
773 | if (ino > free) | ||
774 | ext4_itable_unused_set(sb, gdp, | ||
775 | (EXT4_INODES_PER_GROUP(sb) - ino)); | ||
776 | } | ||
777 | |||
778 | count = ext4_free_inodes_count(sb, gdp) - 1; | ||
779 | ext4_free_inodes_set(sb, gdp, count); | ||
780 | if (S_ISDIR(mode)) { | ||
781 | count = ext4_used_dirs_count(sb, gdp) + 1; | ||
782 | ext4_used_dirs_set(sb, gdp, count); | ||
783 | } | ||
784 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
785 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
786 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); | 811 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); |
787 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); | 812 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); |
788 | if (err) goto fail; | 813 | if (err) |
814 | goto fail; | ||
789 | 815 | ||
790 | percpu_counter_dec(&sbi->s_freeinodes_counter); | 816 | percpu_counter_dec(&sbi->s_freeinodes_counter); |
791 | if (S_ISDIR(mode)) | 817 | if (S_ISDIR(mode)) |