aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>2009-01-05 21:38:14 -0500
committerTheodore Ts'o <tytso@mit.edu>2009-01-05 21:38:14 -0500
commit393418676a7602e1d7d3f6e560159c65c8cbd50e (patch)
tree26b166e426748a7a4bfe3784cd8258a895daba79
parent3300beda523136f9f87821e4fba85c5c9e319645 (diff)
ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()
We need to make sure we update the inode bitmap and clear EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide whether to initialize the inode bitmap each time it is called. (introduced by commit c806e68f.) ext4_read_inode_bitmap does: spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); and ext4_new_inode does if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), ino, inode_bitmap_bh->b_data)) ...... ... spin_lock(sb_bgl_lock(sbi, group)); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); i.e., on allocation we update the bitmap then we take the sb_bgl_lock and clear the EXT4_BG_INODE_UNINIT flag. What can happen is a parallel ext4_read_inode_bitmap can zero out the bitmap in between the above ext4_set_bit_atomic and spin_lock(sb_bg_lock..) The race results in below user visible errors EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449 EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ... EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ... # ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71 ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: stable@kernel.org
-rw-r--r--fs/ext4/ialloc.c146
1 files changed, 86 insertions, 60 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b47427a21f1c..d4e544f30be2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -573,6 +573,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
573} 573}
574 574
575/* 575/*
576 * claim the inode from the inode bitmap. If the group
577 * is uninit we need to take the groups's sb_bgl_lock
578 * and clear the uninit flag. The inode bitmap update
579 * and group desc uninit flag clear should be done
580 * after holding sb_bgl_lock so that ext4_read_inode_bitmap
581 * doesn't race with the ext4_claim_inode
582 */
583static int ext4_claim_inode(struct super_block *sb,
584 struct buffer_head *inode_bitmap_bh,
585 unsigned long ino, ext4_group_t group, int mode)
586{
587 int free = 0, retval = 0, count;
588 struct ext4_sb_info *sbi = EXT4_SB(sb);
589 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
590
591 spin_lock(sb_bgl_lock(sbi, group));
592 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
593 /* not a free inode */
594 retval = 1;
595 goto err_ret;
596 }
597 ino++;
598 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
599 ino > EXT4_INODES_PER_GROUP(sb)) {
600 spin_unlock(sb_bgl_lock(sbi, group));
601 ext4_error(sb, __func__,
602 "reserved inode or inode > inodes count - "
603 "block_group = %u, inode=%lu", group,
604 ino + group * EXT4_INODES_PER_GROUP(sb));
605 return 1;
606 }
607 /* If we didn't allocate from within the initialized part of the inode
608 * table then we need to initialize up to this inode. */
609 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
610
611 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
612 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
613 /* When marking the block group with
614 * ~EXT4_BG_INODE_UNINIT we don't want to depend
615 * on the value of bg_itable_unused even though
616 * mke2fs could have initialized the same for us.
617 * Instead we calculated the value below
618 */
619
620 free = 0;
621 } else {
622 free = EXT4_INODES_PER_GROUP(sb) -
623 ext4_itable_unused_count(sb, gdp);
624 }
625
626 /*
627 * Check the relative inode number against the last used
628 * relative inode number in this group. if it is greater
629 * we need to update the bg_itable_unused count
630 *
631 */
632 if (ino > free)
633 ext4_itable_unused_set(sb, gdp,
634 (EXT4_INODES_PER_GROUP(sb) - ino));
635 }
636 count = ext4_free_inodes_count(sb, gdp) - 1;
637 ext4_free_inodes_set(sb, gdp, count);
638 if (S_ISDIR(mode)) {
639 count = ext4_used_dirs_count(sb, gdp) + 1;
640 ext4_used_dirs_set(sb, gdp, count);
641 }
642 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
643err_ret:
644 spin_unlock(sb_bgl_lock(sbi, group));
645 return retval;
646}
647
648/*
576 * There are two policies for allocating an inode. If the new inode is 649 * There are two policies for allocating an inode. If the new inode is
577 * a directory, then a forward search is made for a block group with both 650 * a directory, then a forward search is made for a block group with both
578 * free space and a low directory-to-inode ratio; if that fails, then of 651 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
594 struct ext4_super_block *es; 667 struct ext4_super_block *es;
595 struct ext4_inode_info *ei; 668 struct ext4_inode_info *ei;
596 struct ext4_sb_info *sbi; 669 struct ext4_sb_info *sbi;
597 int ret2, err = 0, count; 670 int ret2, err = 0;
598 struct inode *ret; 671 struct inode *ret;
599 ext4_group_t i; 672 ext4_group_t i;
600 int free = 0; 673 int free = 0;
@@ -658,8 +731,13 @@ repeat_in_this_group:
658 if (err) 731 if (err)
659 goto fail; 732 goto fail;
660 733
661 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 734 BUFFER_TRACE(group_desc_bh, "get_write_access");
662 ino, inode_bitmap_bh->b_data)) { 735 err = ext4_journal_get_write_access(handle,
736 group_desc_bh);
737 if (err)
738 goto fail;
739 if (!ext4_claim_inode(sb, inode_bitmap_bh,
740 ino, group, mode)) {
663 /* we won it */ 741 /* we won it */
664 BUFFER_TRACE(inode_bitmap_bh, 742 BUFFER_TRACE(inode_bitmap_bh,
665 "call ext4_handle_dirty_metadata"); 743 "call ext4_handle_dirty_metadata");
@@ -668,10 +746,13 @@ repeat_in_this_group:
668 inode_bitmap_bh); 746 inode_bitmap_bh);
669 if (err) 747 if (err)
670 goto fail; 748 goto fail;
749 /* zero bit is inode number 1*/
750 ino++;
671 goto got; 751 goto got;
672 } 752 }
673 /* we lost it */ 753 /* we lost it */
674 ext4_handle_release_buffer(handle, inode_bitmap_bh); 754 ext4_handle_release_buffer(handle, inode_bitmap_bh);
755 ext4_handle_release_buffer(handle, group_desc_bh);
675 756
676 if (++ino < EXT4_INODES_PER_GROUP(sb)) 757 if (++ino < EXT4_INODES_PER_GROUP(sb))
677 goto repeat_in_this_group; 758 goto repeat_in_this_group;
@@ -691,22 +772,6 @@ repeat_in_this_group:
691 goto out; 772 goto out;
692 773
693got: 774got:
694 ino++;
695 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
696 ino > EXT4_INODES_PER_GROUP(sb)) {
697 ext4_error(sb, __func__,
698 "reserved inode or inode > inodes count - "
699 "block_group = %u, inode=%lu", group,
700 ino + group * EXT4_INODES_PER_GROUP(sb));
701 err = -EIO;
702 goto fail;
703 }
704
705 BUFFER_TRACE(group_desc_bh, "get_write_access");
706 err = ext4_journal_get_write_access(handle, group_desc_bh);
707 if (err)
708 goto fail;
709
710 /* We may have to initialize the block bitmap if it isn't already */ 775 /* We may have to initialize the block bitmap if it isn't already */
711 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 776 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
712 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 777 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -743,49 +808,10 @@ got:
743 if (err) 808 if (err)
744 goto fail; 809 goto fail;
745 } 810 }
746
747 spin_lock(sb_bgl_lock(sbi, group));
748 /* If we didn't allocate from within the initialized part of the inode
749 * table then we need to initialize up to this inode. */
750 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
751 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
752 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
753
754 /* When marking the block group with
755 * ~EXT4_BG_INODE_UNINIT we don't want to depend
756 * on the value of bg_itable_unused even though
757 * mke2fs could have initialized the same for us.
758 * Instead we calculated the value below
759 */
760
761 free = 0;
762 } else {
763 free = EXT4_INODES_PER_GROUP(sb) -
764 ext4_itable_unused_count(sb, gdp);
765 }
766
767 /*
768 * Check the relative inode number against the last used
769 * relative inode number in this group. if it is greater
770 * we need to update the bg_itable_unused count
771 *
772 */
773 if (ino > free)
774 ext4_itable_unused_set(sb, gdp,
775 (EXT4_INODES_PER_GROUP(sb) - ino));
776 }
777
778 count = ext4_free_inodes_count(sb, gdp) - 1;
779 ext4_free_inodes_set(sb, gdp, count);
780 if (S_ISDIR(mode)) {
781 count = ext4_used_dirs_count(sb, gdp) + 1;
782 ext4_used_dirs_set(sb, gdp, count);
783 }
784 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
785 spin_unlock(sb_bgl_lock(sbi, group));
786 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 811 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
787 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 812 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
788 if (err) goto fail; 813 if (err)
814 goto fail;
789 815
790 percpu_counter_dec(&sbi->s_freeinodes_counter); 816 percpu_counter_dec(&sbi->s_freeinodes_counter);
791 if (S_ISDIR(mode)) 817 if (S_ISDIR(mode))