From e219cca082f52e7dfea41f3be264b7b5eb204227 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 22:37:59 -0500 Subject: jbd: don't give up looking for space so easily in __log_wait_for_space Commit be07c4ed introducd a regression because it assumed that if there were no transactions ready to be checkpointed, that no progress could be made on making space available in the journal, and so the journal should be aborted. This assumption is false; it could be the case that simply calling cleanup_journal_tail() will recover the necessary space, or, for small journals, the currently committing transaction could be responsible for chewing up the required space in the log, so we need to wait for the currently committing transaction to finish before trying to force a checkpoint operation. This patch fixes the bug reported by Meelis Roos at: http://bugzilla.kernel.org/show_bug.cgi?id=11937 Signed-off-by: "Theodore Ts'o" Cc: Duane Griffin Cc: Toshiyuki Okajima --- fs/jbd/checkpoint.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 1bd8d4acc6f2..61f32f3868cd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) */ void __log_wait_for_space(journal_t *journal) { - int nblocks; + int nblocks, space_left; assert_spin_locked(&journal->j_state_lock); nblocks = jbd_space_needed(journal); @@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we * were waiting for the checkpoint lock. If there are no - * outstanding transactions there is nothing to checkpoint and - * we can't make progress. Abort the journal in this case. + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); - if (__log_space_left(journal) < nblocks) { + space_left = __log_space_left(journal); + if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); if (chkpt) { log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); } else { - printk(KERN_ERR "%s: no transactions\n", - __func__); + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); -- cgit v1.2.2 From 8c3f25d8950c3e9fe6c9849f88679b3f2a071550 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 22:38:07 -0500 Subject: jbd2: don't give up looking for space so easily in __jbd2_log_wait_for_space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 23f8b79e introducd a regression because it assumed that if there were no transactions ready to be checkpointed, that no progress could be made on making space available in the journal, and so the journal should be aborted. This assumption is false; it could be the case that simply calling jbd2_cleanup_journal_tail() will recover the necessary space, or, for small journals, the currently committing transaction could be responsible for chewing up the required space in the log, so we need to wait for the currently committing transaction to finish before trying to force a checkpoint operation. This patch fixes a bug reported by Mihai Harpau at: https://bugzilla.redhat.com/show_bug.cgi?id=469582 This patch fixes a bug reported by François Valenduc at: http://bugzilla.kernel.org/show_bug.cgi?id=11840 Signed-off-by: "Theodore Ts'o" Cc: Duane Griffin Cc: Toshiyuki Okajima --- fs/jbd2/checkpoint.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9203c3332f17..9497718fe920 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) */ void __jbd2_log_wait_for_space(journal_t *journal) { - int nblocks; + int nblocks, space_left; assert_spin_locked(&journal->j_state_lock); nblocks = jbd_space_needed(journal); @@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we * were waiting for the checkpoint lock. If there are no - * outstanding transactions there is nothing to checkpoint and - * we can't make progress. Abort the journal in this case. + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); - if (__jbd2_log_space_left(journal) < nblocks) { + space_left = __jbd2_log_space_left(journal); + if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + jbd2_log_wait_commit(journal, tid); } else { - printk(KERN_ERR "%s: no transactions\n", - __func__); + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); -- cgit v1.2.2 From 2423840ded13e6d3b52d88aff8d033bb78fafd08 Mon Sep 17 00:00:00 2001 From: Sami Liedes Date: Sun, 2 Nov 2008 19:23:30 -0500 Subject: jbd2: deregister proc on failure in jbd2_journal_init_inode jbd2_journal_init_inode() does not call jbd2_stats_proc_exit() on all failure paths after calling jbd2_stats_proc_init(). This leaves dangling references to the fs in proc. This patch fixes a bug reported by Sami Leides at: http://bugzilla.kernel.org/show_bug.cgi?id=11493 Signed-off-by: Sami Liedes Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 783de118de92..e70d657a19f8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); + jbd2_stats_proc_exit(journal); kfree(journal); return NULL; } @@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); + jbd2_stats_proc_exit(journal); kfree(journal); return NULL; } -- cgit v1.2.2 From ae2d9fb18e575ed37ffc241ece4bf68f0be4ae32 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 4 Nov 2008 09:10:50 -0500 Subject: ext4: fix missing ext4_unlock_group in error path If we try to free a block which is already freed, the code was returning without first unlocking the group. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dfe17a134052..444ad998f72e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "Double free of blocks %d (%d %d)\n", block, entry->start_blk, entry->count); -- cgit v1.2.2 From d94e99a64c3beece22dbfb2b335771a59184eb0a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 4 Nov 2008 09:11:26 -0500 Subject: ext4: Convert to host order before using the values. Use le16_to_cpu to read the s_reserved_gdt_blocks values from super block. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 994859df010e..e27acd18b4b0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1458,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb) /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((sbi->s_es->s_reserved_gdt_blocks +1 ) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / - groups_per_flex; + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; sbi->s_flex_groups = kzalloc(flex_group_count * sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { -- cgit v1.2.2 From 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 3 Nov 2008 18:10:55 -0500 Subject: ext4: wait on all pending commits in ext4_sync_fs() In ext4_sync_fs, we only wait for a commit to finish if we started it, but there may be one already in progress which will not be synced. In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. To ensure all commits are synced, we flush all journal commits now when sync_fs'ing ext4. Signed-off-by: Arthur Jones Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" Cc: Eric Sandeen Cc: --- fs/ext4/super.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e27acd18b4b0..e4a241c65dbe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2884,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb) /* * Ext4 always journals updates to the superblock itself, so we don't * have to propagate any other updates to the superblock on disk at this - * point. Just start an async writeback to get the buffers on their way - * to the disk. - * - * This implicitly triggers the writebehind on sync(). + * point. (We can probably nuke this function altogether, and remove + * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) */ - static void ext4_write_super(struct super_block *sb) { if (mutex_trylock(&sb->s_lock) != 0) @@ -2899,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { - tid_t target; + int ret = 0; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); - } - return 0; + if (wait) + ret = ext4_force_commit(sb); + else + jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + return ret; } /* -- cgit v1.2.2 From ac51d83705c2a38c71f39cde99708b14e6212a60 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Nov 2008 16:49:36 -0500 Subject: ext4: calculate journal credits correctly This fixes a 2.6.27 regression which was introduced in commit a02908f1. We weren't passing the chunk parameter down to the two subections, ext4_indirect_trans_blocks() and ext4_ext_index_trans_blocks(), with the result that massively overestimate the amount of credits needed by ext4_da_writepages, especially in the non-extents case. This causes failures especially on /boot partitions, which tend to be small and non-extent using since GRUB doesn't handle extents. This patch fixes the bug reported by Joseph Fannin at: http://bugzilla.kernel.org/show_bug.cgi?id=11964 Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dbf6953845b..5a130b56f1cf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4580,9 +4580,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) - return ext4_indirect_trans_blocks(inode, nrblocks, 0); - return ext4_ext_index_trans_blocks(inode, nrblocks, 0); + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } + /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks -- cgit v1.2.2 From ed9b3e3379731e9f9d2f73f3d7fd9e7d2ce3df4a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 7 Nov 2008 09:06:45 -0500 Subject: ext4: Mark the buffer_heads as dirty and uptodate after prepare_write We need to make sure we mark the buffer_heads as dirty and uptodate so that block_write_full_page write them correctly. This fixes mmap corruptions that can occur in low memory situations. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5a130b56f1cf..be21a5ae33cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page, unlock_page(page); return 0; } + /* now mark the buffer_heads as dirty and uptodate */ + block_commit_write(page, 0, PAGE_CACHE_SIZE); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) -- cgit v1.2.2 From 23712a9c28b9f80a8cf70c8490358d5f562d2465 Mon Sep 17 00:00:00 2001 From: Frederic Bohe Date: Fri, 7 Nov 2008 09:21:01 -0500 Subject: ext4: add checksum calculation when clearing UNINIT flag in ext4_new_inode When initializing an uninitialized block group in ext4_new_inode(), its block group checksum must be re-calculated. This fixes a race when several threads try to allocate a new inode in an UNINIT'd group. There is some question whether we need to be initializing the block bitmap in ext4_new_inode() at all, but for now, if we are going to init the block group, let's eliminate the race. Signed-off-by: Frederic Bohe Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fe34d74cfb19..2a117e286e54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -718,6 +718,8 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_free_blocks_count = cpu_to_le16(free); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); } spin_unlock(sb_bgl_lock(sbi, group)); -- cgit v1.2.2 From b726e923ea4d216027e466aa602d914e4b4a63af Mon Sep 17 00:00:00 2001 From: Doug Nazar Date: Wed, 5 Nov 2008 06:16:28 -0500 Subject: Fix nfsd truncation of readdir results Commit 8d7c4203 "nfsd: fix failure to set eof in readdir in some situations" introduced a bug: on a directory in an exported ext3 filesystem with dir_index unset, a READDIR will only return about 250 entries, even if the directory was larger. Bisected it back to this commit; reverting it fixes the problem. It turns out that in this case ext3 reads a block at a time, then returns from readdir, which means we can end up with buf.full==0 but with more entries in the directory still to be read. Before 8d7c4203 (but after c002a6c797 "Optimise NFS readdir hack slightly"), this would cause us to return the READDIR result immediately, but with the eof bit unset. That could cause a performance regression (because the client would need more roundtrips to the server to read the whole directory), but no loss in correctness, since the cleared eof bit caused the client to send another readdir. After 8d7c4203, the setting of the eof bit made this a correctness problem. So, move nfserr_eof into the loop and remove the buf.full check so that we loop until buf.used==0. The following seems to do the right thing and reduces the network traffic since we don't return a READDIR result until the buffer is full. Tested on an empty directory & large directory; eof is properly sent and there are no more short buffers. Signed-off-by: Doug Nazar Cc: David Woodhouse Cc: Al Viro Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 848a03e83a42..4433c8f00163 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, return -ENOMEM; offset = *offsetp; - cdp->err = nfserr_eof; /* will be cleared on successful read */ while (1) { unsigned int reclen; + cdp->err = nfserr_eof; /* will be cleared on successful read */ buf.used = 0; buf.full = 0; @@ -1912,9 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, de = (struct buffered_dirent *)((char *)de + reclen); } offset = vfs_llseek(file, 0, SEEK_CUR); - cdp->err = nfserr_eof; - if (!buf.full) - break; } done: -- cgit v1.2.2 From 9ccbece546cf836f67f6d9bb4bf2f70f7476cb2c Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 30 Oct 2008 16:53:25 +1100 Subject: [XFS] Fix use-after-free with log and quotas Destroying the quota stuff on unmount can access the log - ie XFS_QM_DONE() ends up in xfs_dqunlock() which calls xfs_trans_unlocked_item() and then xfs_log_move_tail(). By this time the log has already been destroyed. Just move the cleanup of the quota code earlier in xfs_unmountfs() before the call to xfs_log_unmount(). Moving XFS_QM_DONE() up near XFS_QM_DQPURGEALL() seems like a good spot. SGI-PV: 987086 SGI-Modid: xfs-linux-melb:xfs-kern:32148a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig Signed-off-by: Peter Leckie --- fs/xfs/xfs_mount.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a4503f5e9497..15f5dd22fbb2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1245,6 +1245,9 @@ xfs_unmountfs( XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + if (mp->m_quotainfo) + XFS_QM_DONE(mp); + /* * Flush out the log synchronously so that we know for sure * that nothing is pinned. This is important because bflush() @@ -1297,8 +1300,6 @@ xfs_unmountfs( xfs_errortag_clearall(mp, 0); #endif xfs_free_perag(mp); - if (mp->m_quotainfo) - XFS_QM_DONE(mp); } STATIC void -- cgit v1.2.2 From 2cf7f0da3ae225848a2ee10d4e216448a770fd00 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 30 Oct 2008 16:59:06 +1100 Subject: [XFS] Wait for all I/O on truncate to zero file size It's possible to have outstanding xfs_ioend_t's queued when the file size is zero. This can happen in the direct I/O path when a direct I/O write fails due to ENOSPC. In this case the xfs_ioend_t will still be queued (ie xfs_end_io_direct() does not know that the I/O failed so can't force the xfs_ioend_t to be flushed synchronously). When we truncate a file on unlink we don't know to wait for these xfs_ioend_ts and we can have a use-after-free situation if the inode is reclaimed before the xfs_ioend_t is finally processed. As was suggested by Dave Chinner lets wait for all I/Os to complete when truncating the file size to zero. SGI-PV: 981668 SGI-Modid: xfs-linux-melb:xfs-kern:32216a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dbd9cef852ec..a391b955df01 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1414,7 +1414,7 @@ xfs_itruncate_start( mp = ip->i_mount; /* wait for the completion of any pending DIOs */ - if (new_size < ip->i_size) + if (new_size == 0 || new_size < ip->i_size) vn_iowait(ip); /* -- cgit v1.2.2 From 6f9f51adb6ac0a49fce49e01c47dcfc2810c6e9d Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:38:12 +1100 Subject: [XFS] Account for allocated blocks when expanding directories When we create a directory, we reserve a number of blocks for the maximum possible expansion of of the directory due to various btree splits, freespace allocation, etc. Unfortunately, each allocation is not reflected in the total number of blocks still available to the transaction, so the maximal reservation is used over and over again. This leads to problems where an allocation group has only enough blocks for *some* of the allocations required for the directory modification. After the first N allocations, the remaining blocks in the allocation group drops below the total reservation, and subsequent allocations fail because the allocator will not allow the allocation to proceed if the AG does not have the enough blocks available for the entire allocation total. This results in an ENOSPC occurring after an allocation has already occurred. This results in aborting the directory operation (leaving the directory in an inconsistent state) and cancelling a dirty transaction, which results in a filesystem shutdown. Avoid the problem by reflecting the number of blocks allocated in any directory expansion in the total number of blocks available to the modification in progress. This prevents a directory modification from being aborted part way through with an ENOSPC. SGI-PV: 988144 SGI-Modid: xfs-linux-melb:xfs-kern:32340a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_da_btree.c | 5 +++++ fs/xfs/xfs_dir2.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9e561a9cefca..a11a8390bf6c 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) int nmap, error, w, count, c, got, i, mapi; xfs_trans_t *tp; xfs_mount_t *mp; + xfs_drfsbno_t nblks; dp = args->dp; mp = dp->i_mount; w = args->whichfork; tp = args->trans; + nblks = dp->i_d.di_nblocks; + /* * For new directories adjust the file offset and block count. */ @@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) } if (mapp != &map) kmem_free(mapp); + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; *new_blkno = (xfs_dablk_t)bno; return 0; } diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 80e0dc51361c..1afb12278b8d 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -525,11 +525,13 @@ xfs_dir2_grow_inode( xfs_mount_t *mp; int nmap; /* number of bmap entries */ xfs_trans_t *tp; + xfs_drfsbno_t nblks; xfs_dir2_trace_args_s("grow_inode", args, space); dp = args->dp; tp = args->trans; mp = dp->i_mount; + nblks = dp->i_d.di_nblocks; /* * Set lowest possible block in the space requested. */ @@ -622,7 +624,11 @@ xfs_dir2_grow_inode( */ if (mapp != &map) kmem_free(mapp); + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + /* * Update file's size if this is the data space and it grew. */ -- cgit v1.2.2 From 8f330f5149ef41ff943b04d914406cc417f62784 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 10 Nov 2008 16:50:24 +1100 Subject: [XFS] handle memory allocation failures during log initialisation When there is no memory left in the system, xfs_buf_get_noaddr() can fail. If this happens at mount time during xlog_alloc_log() we fail to catch the error and oops. Catch the error from xfs_buf_get_noaddr(), and allow other memory allocations to fail and catch those errors too. Report the error to the console and fail the mount with ENOMEM. Tested by manually injecting errors into xfs_buf_get_noaddr() and xlog_alloc_log(). Version 2: o remove unnecessary casts of the returned pointer from kmem_zalloc() SGI-PV: 987246 Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0b02c6443551..3608a0f0a5f6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -563,6 +563,11 @@ xfs_log_mount( } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (!mp->m_log) { + cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); + error = ENOMEM; + goto out; + } /* * Initialize the AIL now we have a log. @@ -601,6 +606,7 @@ xfs_log_mount( return 0; error: xfs_log_unmount_dealloc(mp); +out: return error; } /* xfs_log_mount */ @@ -1217,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; - log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP); + log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + if (!log) + return NULL; log->l_mp = mp; log->l_targ = log_target; @@ -1249,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_log; XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); @@ -1275,13 +1285,17 @@ xlog_alloc_log(xfs_mount_t *mp, iclogsize = log->l_iclog_size; ASSERT(log->l_iclog_size >= 4096); for (i=0; i < log->l_iclog_bufs; i++) { - *iclogp = (xlog_in_core_t *) - kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + iclog = *iclogp; iclog->ic_prev = prev_iclog; prev_iclog = iclog; bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_iclog; if (!XFS_BUF_CPSEMA(bp)) ASSERT(0); XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); @@ -1323,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) { + sv_destroy(&iclog->ic_force_wait); + sv_destroy(&iclog->ic_write_wait); + xfs_buf_free(iclog->ic_bp); + xlog_trace_iclog_dealloc(iclog); + } + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + spinlock_destroy(&log->l_grant_lock); + xlog_trace_loggrant_dealloc(log); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); + return NULL; } /* xlog_alloc_log */ -- cgit v1.2.2 From 220ca310a53200b4bfbc7c4c6e365eea284ec44f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:40:09 +1100 Subject: [XFS] XFS: Check for valid transaction headers in recovery When we are about to add a new item to a transaction in recovery, we need to check that it is valid first. Currently we just assert that header magic number matches, but in production systems that is not present and we add a corrupted transaction to the list to be processed. This results in a kernel oops later when processing the corrupted transaction. Instead, if we detect a corrupted transaction, abort recovery and leave the user to clean up the mess that has occurred. SGI-PV: 988145 SGI-Modid: xfs-linux-melb:xfs-kern:32356a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin Signed-off-by: Eric Sandeen Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log_recover.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82d46ce69d5f..70e3ba32e6be 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1419,7 +1419,13 @@ xlog_recover_add_to_trans( return 0; item = trans->r_itemq; if (item == NULL) { - ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC); + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xlog_warn("XFS: xlog_recover_add_to_trans: " + "bad header magic number"); + ASSERT(0); + return XFS_ERROR(EIO); + } if (len == sizeof(xfs_trans_header_t)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); /* d, s, l */ -- cgit v1.2.2 From c3cb6827353102fee62f3b9401a03ee29b297e5b Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 23 Oct 2008 16:33:03 +0800 Subject: ocfs2: fix license in xattr This patch fixes the license in xattr.c and xattr.h. Signed-off-by: Tiger Yang Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 13 ++++--------- fs/ocfs2/xattr.h | 12 ++---------- 2 files changed, 6 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 802c41492214..2f8952e4e4c1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3,25 +3,20 @@ * * xattr.c * - * Copyright (C) 2008 Oracle. All rights reserved. + * Copyright (C) 2004, 2008 Oracle. All rights reserved. * * CREDITS: - * Lots of code in this file is taken from ext3. + * Lots of code in this file is copy from linux/fs/ext3/xattr.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * License version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index c25c7c62a059..e4e45c81a261 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -3,24 +3,16 @@ * * xattr.h * - * Function prototypes - * - * Copyright (C) 2008 Oracle. All rights reserved. + * Copyright (C) 2004, 2008 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * License version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_XATTR_H -- cgit v1.2.2 From 0030e001505d2d1503c083c917a747c033eaf8cd Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 23 Oct 2008 16:33:33 +0800 Subject: ocfs2: fix function declaration and definition in xattr Because we merged the xattr sources into one file, some functions no longer belong in the header file. Signed-off-by: Tiger Yang Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 28 +++++++++++++++++++++++----- fs/ocfs2/xattr.h | 26 ++++---------------------- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2f8952e4e4c1..420d8e30b184 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -132,6 +132,24 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, static int ocfs2_delete_xattr_index_block(struct inode *inode, struct buffer_head *xb_bh); +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) +{ + u16 len = sb->s_blocksize - + offsetof(struct ocfs2_xattr_header, xh_entries); + + return len / sizeof(struct ocfs2_xattr_entry); +} + static inline const char *ocfs2_xattr_prefix(int name_index) { struct xattr_handler *handler = NULL; @@ -832,11 +850,11 @@ cleanup: * Copy an extended attribute into the buffer provided. * Buffer is NULL to compute the size of buffer required. */ -int ocfs2_xattr_get(struct inode *inode, - int name_index, - const char *name, - void *buffer, - size_t buffer_size) +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) { int ret; struct ocfs2_dinode *di = NULL; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e4e45c81a261..1d8314c7656d 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -32,29 +32,11 @@ enum ocfs2_xattr_type { extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; - -extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); -extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *, - size_t, int); -extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh); extern struct xattr_handler *ocfs2_xattr_handlers[]; -static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) -{ - return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; -} - -static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) -{ - return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); -} - -static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) -{ - u16 len = sb->s_blocksize - - offsetof(struct ocfs2_xattr_header, xh_entries); +ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +int ocfs2_xattr_remove(struct inode *, struct buffer_head *); - return len / sizeof(struct ocfs2_xattr_entry); -} #endif /* OCFS2_XATTR_H */ -- cgit v1.2.2 From ceb1eba3dc2ad94b25764785ff7d2082c6094115 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 23 Oct 2008 16:34:13 +0800 Subject: ocfs2: remove duplicate definition in xattr Include/linux/xattr.h already has the definition about xattr prefix, so remove the duplicate definitions in xattr.c. Signed-off-by: Tiger Yang Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 420d8e30b184..a9da45bbb9ed 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4740,14 +4740,11 @@ out: /* * 'trusted' attributes support */ - -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { @@ -4784,18 +4781,14 @@ struct xattr_handler ocfs2_xattr_trusted_handler = { .set = ocfs2_xattr_trusted_set, }; - /* * 'user' attributes support */ - -#define XATTR_USER_PREFIX "user." - static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -- cgit v1.2.2 From c988fd045f1195e62c0970384903ab9da26a9359 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 23 Oct 2008 16:34:44 +0800 Subject: ocfs2: add handler_map array bounds checking Make the handler_map array as large as the possible value range to avoid a fencepost error. [ Utilize alternate method -- Joel ] Signed-off-by: Tiger Yang Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a9da45bbb9ed..e19980a71a3c 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -78,7 +78,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *ocfs2_xattr_handler_map[] = { +static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, }; -- cgit v1.2.2 From f6087fb799e097e7c9d912daa75701de9d62dc53 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 20 Oct 2008 18:20:43 -0700 Subject: ocfs2: Check xattr block signatures properly. The xattr.c code is currently memcmp()ing naking buffer pointers. Create the OCFS2_IS_VALID_XATTR_BLOCK() macro to match its peers and use that. In addition, failed signature checks were returning -EFAULT, which is completely wrong. Return -EIO. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 3 +++ fs/ocfs2/xattr.c | 38 ++++++++++++++++---------------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a21a465490c4..fef7ece32376 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -473,6 +473,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) (____gd)->bg_signature); \ } while (0) +#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ + (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e19980a71a3c..151ba6257fbb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -555,14 +555,12 @@ static int ocfs2_xattr_block_list(struct inode *inode, mlog_errno(ret); return ret; } - /*Verify the signature of xattr block*/ - if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, - strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { - ret = -EFAULT; - goto cleanup; - } xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ret = -EIO; + goto cleanup; + } if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; @@ -779,15 +777,14 @@ static int ocfs2_xattr_block_get(struct inode *inode, mlog_errno(ret); return ret; } - /*Verify the signature of xattr block*/ - if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, - strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { - ret = -EFAULT; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ret = -EIO; goto cleanup; } xs->xattr_bh = blk_bh; - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { xs->header = &xb->xb_attrs.xb_header; @@ -1527,10 +1524,9 @@ static int ocfs2_xattr_free_block(struct inode *inode, goto out; } - /*Verify the signature of xattr block*/ - if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, - strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { - ret = -EFAULT; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ret = -EIO; goto out; } @@ -1540,7 +1536,6 @@ static int ocfs2_xattr_free_block(struct inode *inode, goto out; } - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; blk = le64_to_cpu(xb->xb_blkno); bit = le16_to_cpu(xb->xb_suballoc_bit); bg_blkno = ocfs2_which_suballoc_group(blk, bit); @@ -1784,15 +1779,14 @@ static int ocfs2_xattr_block_find(struct inode *inode, mlog_errno(ret); return ret; } - /*Verify the signature of xattr block*/ - if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, - strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { - ret = -EFAULT; - goto cleanup; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ret = -EIO; + goto cleanup; } xs->xattr_bh = blk_bh; - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { xs->header = &xb->xb_attrs.xb_header; -- cgit v1.2.2 From b37c4d84e9d16fd5b6f31197f02ea0a112fc9e99 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 20 Oct 2008 18:24:03 -0700 Subject: ocfs2: Don't return -EFAULT from a corrupt xattr entry. If the xattr disk structures are corrupt, return -EIO, not -EFAULT. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 151ba6257fbb..41a6ca004ae3 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1239,7 +1239,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, free = min_offs - ((void *)last - xs->base) - sizeof(__u32); if (free < 0) - return -EFAULT; + return -EIO; if (!xs->not_found) { size_t size = 0; -- cgit v1.2.2 From bd60bd37ade4321ecce4ed4442f68c88febd76d5 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 20 Oct 2008 18:25:56 -0700 Subject: ocfs2: Check errors from ocfs2_xattr_update_xattr_search() The ocfs2_xattr_update_xattr_search() function can return an error when trying to read blocks off of disk. The caller needs to check this error before using those (possibly invalid) blocks. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 41a6ca004ae3..92df88a41e5d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2825,7 +2825,11 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, if (data_bh) ocfs2_journal_dirty(handle, data_bh); - ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - -- cgit v1.2.2 From eb6ff2397d1fdfc6a7629c99896338e5b5c508e5 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 20 Oct 2008 18:32:48 -0700 Subject: ocfs2: Specify appropriate journal access for new xattr buckets. There are a couple places that get an xattr bucket that may be reading an existing one or may be allocating a new one. They should specify the correct journal access mode depending. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 92df88a41e5d..fb450200bc88 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3231,7 +3231,9 @@ static int ocfs2_half_xattr_bucket(struct inode *inode, for (i = 0; i < blk_per_bucket; i++) { ret = ocfs2_journal_access(handle, inode, t_bhs[i], - OCFS2_JOURNAL_ACCESS_CREATE); + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -3393,6 +3395,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, for (i = 0; i < blk_per_bucket; i++) { ret = ocfs2_journal_access(handle, inode, t_bhs[i], + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : OCFS2_JOURNAL_ACCESS_WRITE); if (ret) goto out; -- cgit v1.2.2 From 54f443f4e7265a1333886dbace31cb6eb1991c72 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 20 Oct 2008 18:43:07 -0700 Subject: ocfs2: Don't repeat ocfs2_xattr_block_find() ocfs2_xattr_block_get() looks up the xattr in a startlingly familiar way; it's identical to the function ocfs2_xattr_block_find(). Let's just use the later in the former. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fb450200bc88..74d1faba23bb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -111,6 +111,10 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, int *block_off, int *new_offset); +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); static int ocfs2_xattr_index_block_find(struct inode *inode, struct buffer_head *root_bh, int name_index, @@ -760,46 +764,20 @@ static int ocfs2_xattr_block_get(struct inode *inode, size_t buffer_size, struct ocfs2_xattr_search *xs) { - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - struct buffer_head *blk_bh = NULL; struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; int ret = -ENODATA, name_offset, name_len, block_off, i; - if (!di->i_xattr_loc) - return ret; - memset(&xs->bucket, 0, sizeof(xs->bucket)); - ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); - if (ret < 0) { + ret = ocfs2_xattr_block_find(inode, name_index, name, xs); + if (ret) { mlog_errno(ret); - return ret; - } - - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; goto cleanup; } - xs->xattr_bh = blk_bh; - - if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { - xs->header = &xb->xb_attrs.xb_header; - xs->base = (void *)xs->header; - xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; - xs->here = xs->header->xh_entries; - - ret = ocfs2_xattr_find_entry(name_index, name, xs); - } else - ret = ocfs2_xattr_index_block_find(inode, blk_bh, - name_index, - name, xs); - - if (ret) - goto cleanup; + xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; size = le64_to_cpu(xs->here->xe_value_size); if (buffer) { ret = -ERANGE; @@ -838,7 +816,8 @@ cleanup: brelse(xs->bucket.bhs[i]); memset(&xs->bucket, 0, sizeof(xs->bucket)); - brelse(blk_bh); + brelse(xs->xattr_bh); + xs->xattr_bh = NULL; return ret; } -- cgit v1.2.2 From 63fd77573723841d5d44a79471258f1b261f4482 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 17 Oct 2008 12:44:36 +0800 Subject: ocfs2: Remove unused ocfs2_restore_xattr_block(). Since now ocfs2 supports empty xattr buckets, we will never remove the xattr index tree even if all the xattrs are removed, so this function will never be called. So remove it. Signed-off-by: Tao Ma Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 74d1faba23bb..789fb70462c9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1791,52 +1791,6 @@ cleanup: return ret; } -/* - * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree - * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header - * re-initialized. - */ -static int ocfs2_restore_xattr_block(struct inode *inode, - struct ocfs2_xattr_search *xs) -{ - int ret; - handle_t *handle; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_xattr_block *xb = - (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; - struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; - u16 xb_flags = le16_to_cpu(xb->xb_flags); - - BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) || - le16_to_cpu(el->l_next_free_rec) != 0); - - handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out; - } - - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - - offsetof(struct ocfs2_xattr_block, xb_attrs)); - - xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED); - - ocfs2_journal_dirty(handle, xs->xattr_bh); - -out_commit: - ocfs2_commit_trans(osb, handle); -out: - return ret; -} - /* * ocfs2_xattr_block_set() * @@ -1947,8 +1901,6 @@ out: } ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); - if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0) - ret = ocfs2_restore_xattr_block(inode, xs); end: -- cgit v1.2.2 From 8573f79d30077875e2b6e83849b5245bfbb08685 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 24 Oct 2008 22:24:17 +0800 Subject: ocfs2: Fix some typos in xattr annotations. Fix some typos in the xattr annotations. Signed-off-by: Tao Ma Reported-by: Coly Li Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2_fs.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index f24ce3d3f956..5f180cf7abbd 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -742,12 +742,12 @@ struct ocfs2_group_desc */ struct ocfs2_xattr_entry { __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ - __le16 xe_name_offset; /* byte offset from the 1st etnry in the local + __le16 xe_name_offset; /* byte offset from the 1st entry in the local xattr storage(inode, xattr block or xattr bucket). */ __u8 xe_name_len; /* xattr name len, does't include prefix. */ - __u8 xe_type; /* the low 7 bits indicates the name prefix's - * type and the highest 1 bits indicate whether + __u8 xe_type; /* the low 7 bits indicate the name prefix + * type and the highest bit indicates whether * the EA is stored in the local storage. */ __le64 xe_value_size; /* real xattr value length. */ }; @@ -766,9 +766,10 @@ struct ocfs2_xattr_header { xattr. */ __le16 xh_name_value_len; /* total length of name/value length in this bucket. */ - __le16 xh_num_buckets; /* bucket nums in one extent - record, only valid in the - first bucket. */ + __le16 xh_num_buckets; /* Number of xattr buckets + in this extent record, + only valid in the first + bucket. */ __le64 xh_csum; struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ }; @@ -776,8 +777,8 @@ struct ocfs2_xattr_header { /* * On disk structure for xattr value root. * - * It is used when one extended attribute's size is larger, and we will save it - * in an outside cluster. It will stored in a b-tree like file content. + * When an xattr's value is large enough, it is stored in an external + * b-tree like file data. The xattr value root points to this structure. */ struct ocfs2_xattr_value_root { /*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ -- cgit v1.2.2 From fa38e92cb34e27e60d0faf1035934eb9b44aa1d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Oct 2008 19:23:51 +0200 Subject: ocfs2: Fix check of return value of ocfs2_start_trans() On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM). Thus checks for !handle are wrong. Fix them to use IS_ERR(). Signed-off-by: Jan Kara Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7efe937a415f..3138a385fdbb 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -247,8 +247,8 @@ int ocfs2_update_inode_atime(struct inode *inode, mlog_entry_void(); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -312,8 +312,8 @@ static int ocfs2_simple_size_update(struct inode *inode, handle_t *handle = NULL; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -1055,8 +1055,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -1259,8 +1259,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode, } handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -1352,8 +1352,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, goto out; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } -- cgit v1.2.2 From 87cfa004321c62aec681713ea48e0b846336d9f4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Oct 2008 19:23:53 +0200 Subject: ocfs2: Fix checking of return value of new_inode() new_inode() does not return ERR_PTR() but NULL in case of failure. Correct checking of the return value. Signed-off-by: Jan Kara Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 485a6aa0ad39..f594f300d4cd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -378,8 +378,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } inode = new_inode(dir->i_sb); - if (IS_ERR(inode)) { - status = PTR_ERR(inode); + if (!inode) { + status = -ENOMEM; mlog(ML_ERROR, "new_inode failed!\n"); goto leave; } -- cgit v1.2.2 From b99835c1684918b9975851d71455c5c007d1715b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Oct 2008 19:23:54 +0200 Subject: ocfs2: Let inode be really deleted when ocfs2_mknod_locked() fails We forgot to set i_nlink to 0 when returning due to error from ocfs2_mknod_locked() and thus inode was not properly released via ocfs2_delete_inode() (e.g. claimed space was not released). Fix it. Signed-off-by: Jan Kara Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f594f300d4cd..f4967e634ffd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -491,8 +491,10 @@ leave: brelse(*new_fe_bh); *new_fe_bh = NULL; } - if (inode) + if (inode) { + clear_nlink(inode); iput(inode); + } } mlog_exit(status); -- cgit v1.2.2 From d32647993c211901fc4819ef3327f62d1859241b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 24 Oct 2008 07:57:28 +0800 Subject: ocfs2: Fix check of return value of ocfs2_start_trans() in xattr.c. On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM), so we should check whether handle is NULL. Fix them to use IS_ERR(). Jan has made the patch for other part in ocfs2(thank Jan for it), so this is just the fix for fs/ocfs2/xattr.c. Signed-off-by: Tao Ma Cc: Jan Kara Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 789fb70462c9..a371c01942b1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4092,7 +4092,7 @@ static int ocfs2_xattr_value_update_size(struct inode *inode, handle_t *handle = NULL; handle = ocfs2_start_trans(osb, 1); - if (handle == NULL) { + if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; @@ -4259,7 +4259,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, } handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (handle == NULL) { + if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; -- cgit v1.2.2 From ae0dff683076b2798763288c7ac2f09a18c4a998 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 22 Oct 2008 13:24:29 -0700 Subject: ocfs2: Set journal descriptor to NULL after journal shutdown Patch sets journal descriptor to NULL after the journal is shutdown. This ensures that jbd2_journal_release_jbd_inode(), which removes the jbd2 inode from txn lists, can be called safely from ocfs2_clear_inode() even after the journal has been shutdown. Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/inode.c | 6 ++++++ fs/ocfs2/journal.c | 1 + 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 4903688f72a9..7aa00d511874 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1106,6 +1106,12 @@ void ocfs2_clear_inode(struct inode *inode) oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; + + /* + * ip_jinode is used to track txns against this inode. We ensure that + * the journal is flushed before journal shutdown. Thus it is safe to + * have inodes get cleaned up after journal shutdown. + */ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, &oi->ip_jinode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 81e40677eecb..99fe9d584f3c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -690,6 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) /* Shutdown the kernel journal system */ jbd2_journal_destroy(journal->j_journal); + journal->j_journal = NULL; OCFS2_I(inode)->ip_open_count--; -- cgit v1.2.2 From 4c1bbf1ba631d7db61ce3462349a3f5d14ae3009 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 6 Oct 2008 16:59:55 +0800 Subject: ocfs2: return 0 in page_mkwrite to let VFS retry. In ocfs2_page_mkwrite, we return -EINVAL when we found the page mapping isn't updated, and it will cause the user space program get SIGBUS and exit. The reason is that during race writeable mmap, we will do unmap_mapping_range in ocfs2_data_downconvert_worker. The good thing is that if we reuturn 0 in page_mkwrite, VFS will retry fault and then call page_mkwrite again, so it is safe to return 0 here. Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3dc18d67557c..eea1d24713ea 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -113,7 +113,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, * ocfs2_write_begin_nolock(). */ if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - ret = -EINVAL; + /* + * the page has been umapped in ocfs2_data_downconvert_worker. + * So return 0 here and let VFS retry. + */ + ret = 0; goto out; } -- cgit v1.2.2 From 80bcaf3469b8aefd316d4ceb27d9af7cfbb0b913 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 27 Oct 2008 06:06:24 +0800 Subject: ocfs2/xattr: Proper hash collision handle in bucket division In ocfs2/xattr, we must make sure the xattrs which have the same hash value exist in the same bucket so that the search schema can work. But in the old implementation, when we want to extend a bucket, we just move half number of xattrs to the new bucket. This works in most cases, but if we are lucky enough we will move 2 xattrs into 2 different buckets. This means that an xattr from the previous bucket cannot be found anymore. This patch fix this problem by finding the right position during extending the bucket and extend an empty bucket if needed. Signed-off-by: Tao Ma Cc: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 144 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a371c01942b1..f3ea7efb48c6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3110,25 +3110,73 @@ static int ocfs2_read_xattr_bucket(struct inode *inode, } /* - * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk). + * Find the suitable pos when we divide a bucket into 2. + * We have to make sure the xattrs with the same hash value exist + * in the same bucket. + * + * If this ocfs2_xattr_header covers more than one hash value, find a + * place where the hash value changes. Try to find the most even split. + * The most common case is that all entries have different hash values, + * and the first check we make will find a place to split. + */ +static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh) +{ + struct ocfs2_xattr_entry *entries = xh->xh_entries; + int count = le16_to_cpu(xh->xh_count); + int delta, middle = count / 2; + + /* + * We start at the middle. Each step gets farther away in both + * directions. We therefore hit the change in hash value + * nearest to the middle. Note that this loop does not execute for + * count < 2. + */ + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (cmp_xe(&entries[middle - delta - 1], + &entries[middle - delta])) + return middle - delta; + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == count) + continue; + + /* Now try delta past middle */ + if (cmp_xe(&entries[middle + delta], + &entries[middle + delta + 1])) + return middle + delta + 1; + } + + /* Every entry had the same hash */ + return count; +} + +/* + * Move some xattrs in old bucket(blk) to new bucket(new_blk). * first_hash will record the 1st hash of the new bucket. + * + * Normally half of the xattrs will be moved. But we have to make + * sure that the xattrs with the same hash value are stored in the + * same bucket. If all the xattrs in this bucket have the same hash + * value, the new bucket will be initialized as an empty one and the + * first_hash will be initialized as (hash_value+1). */ -static int ocfs2_half_xattr_bucket(struct inode *inode, - handle_t *handle, - u64 blk, - u64 new_blk, - u32 *first_hash, - int new_bucket_head) +static int ocfs2_divide_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) { int ret, i; - u16 count, start, len, name_value_len, xe_len, name_offset; + int count, start, len, name_value_len = 0, xe_len, name_offset = 0; u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); struct buffer_head **s_bhs, **t_bhs = NULL; struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; int blocksize = inode->i_sb->s_blocksize; - mlog(0, "move half of xattrs from bucket %llu to %llu\n", + mlog(0, "move some of xattrs from bucket %llu to %llu\n", blk, new_blk); s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); @@ -3171,14 +3219,35 @@ static int ocfs2_half_xattr_bucket(struct inode *inode, } } + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + count = le16_to_cpu(xh->xh_count); + start = ocfs2_xattr_find_divide_pos(xh); + + if (start == count) { + xe = &xh->xh_entries[start-1]; + + /* + * initialized a new empty bucket here. + * The hash value is set as one larger than + * that of the last entry in the previous bucket. + */ + for (i = 0; i < blk_per_bucket; i++) + memset(t_bhs[i]->b_data, 0, blocksize); + + xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + xh->xh_free_start = cpu_to_le16(blocksize); + xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; + le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); + + goto set_num_buckets; + } + /* copy the whole bucket to the new first. */ for (i = 0; i < blk_per_bucket; i++) memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); /* update the new bucket. */ xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; - count = le16_to_cpu(xh->xh_count); - start = count / 2; /* * Calculate the total name/value len and xh_free_start for @@ -3235,6 +3304,7 @@ static int ocfs2_half_xattr_bucket(struct inode *inode, xh->xh_free_start = xe->xe_name_offset; } +set_num_buckets: /* set xh->xh_num_buckets for the new xh. */ if (new_bucket_head) xh->xh_num_buckets = cpu_to_le16(1); @@ -3252,9 +3322,13 @@ static int ocfs2_half_xattr_bucket(struct inode *inode, *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); /* - * Now only update the 1st block of the old bucket. - * Please note that the entry has been sorted already above. + * Now only update the 1st block of the old bucket. If we + * just added a new empty bucket, there is no need to modify + * it. */ + if (start == count) + goto out; + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; memset(&xh->xh_entries[start], 0, sizeof(struct ocfs2_xattr_entry) * (count - start)); @@ -3439,15 +3513,15 @@ out: } /* - * Move half of the xattrs in this cluster to the new cluster. + * Move some xattrs in this cluster to the new cluster. * This function should only be called when bucket size == cluster size. * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. */ -static int ocfs2_half_xattr_cluster(struct inode *inode, - handle_t *handle, - u64 prev_blk, - u64 new_blk, - u32 *first_hash) +static int ocfs2_divide_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) { u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int ret, credits = 2 * blk_per_bucket; @@ -3461,8 +3535,8 @@ static int ocfs2_half_xattr_cluster(struct inode *inode, } /* Move half of the xattr in start_blk to the next bucket. */ - return ocfs2_half_xattr_bucket(inode, handle, prev_blk, - new_blk, first_hash, 1); + return ocfs2_divide_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); } /* @@ -3524,9 +3598,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, last_blk, new_blk, v_start); else { - ret = ocfs2_half_xattr_cluster(inode, handle, - last_blk, new_blk, - v_start); + ret = ocfs2_divide_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); if ((*header_bh)->b_blocknr == last_blk && extend) *extend = 0; @@ -3743,8 +3817,8 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode, } /* Move half of the xattr in start_blk to the next bucket. */ - ret = ocfs2_half_xattr_bucket(inode, handle, start_blk, - start_blk + blk_per_bucket, NULL, 0); + ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, + start_blk + blk_per_bucket, NULL, 0); le16_add_cpu(&first_xh->xh_num_buckets, 1); ocfs2_journal_dirty(handle, first_bh); @@ -4435,11 +4509,21 @@ out: return ret; } -/* check whether the xattr bucket is filled up with the same hash value. */ +/* + * check whether the xattr bucket is filled up with the same hash value. + * If we want to insert the xattr with the same hash, return -ENOSPC. + * If we want to insert a xattr with different hash value, go ahead + * and ocfs2_divide_xattr_bucket will handle this. + */ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, - struct ocfs2_xattr_bucket *bucket) + struct ocfs2_xattr_bucket *bucket, + const char *name) { struct ocfs2_xattr_header *xh = bucket->xh; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) + return 0; if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == xh->xh_entries[0].xe_name_hash) { @@ -4562,7 +4646,9 @@ try_again: * one bucket's worth, so check it here whether we need to * add a new bucket for the insert. */ - ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket); + ret = ocfs2_check_xattr_bucket_collision(inode, + &xs->bucket, + xi->name); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.2 From c435400140d24fbcb3da6b1e006be831f9056cb6 Mon Sep 17 00:00:00 2001 From: Dmitri Monakhov Date: Mon, 27 Oct 2008 13:01:49 -0700 Subject: ocfs2: truncate outstanding block after direct io failure Signed-off-by: Dmitri Monakhov Cc: Jeff Moyer Cc: Mark Fasheh Cc: Joel Becker Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3138a385fdbb..e2570a3bc2b2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1866,6 +1866,13 @@ relock: written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { + /* + * direct write may have instantiated a few + * blocks outside i_size. Trim these off again. + * Don't need i_size_read because we hold i_mutex. + */ + if (*ppos + count > inode->i_size) + vmtruncate(inode, inode->i_size); ret = written; goto out_dio; } -- cgit v1.2.2 From de29c08528bae45e3fa1171d190f1340e37e0f70 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Oct 2008 14:45:30 -0700 Subject: ocfs2: fix printk related build warnings in xattr.c Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f3ea7efb48c6..70baffeb1812 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2336,7 +2336,8 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " - "in the rec is %u\n", num_clusters, p_blkno, first_hash); + "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno, + first_hash); ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, p_blkno, first_hash, num_clusters, xs); @@ -2360,7 +2361,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, memset(&bucket, 0, sizeof(bucket)); mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", - clusters, blkno); + clusters, (unsigned long long)blkno); for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, @@ -2378,7 +2379,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, if (i == 0) num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); - mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno, + mlog(0, "iterating xattr bucket %llu, first hash %u\n", + (unsigned long long)blkno, le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); if (func) { ret = func(inode, &bucket, para); @@ -2714,7 +2716,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno); + mlog(0, "allocate 1 cluster from %llu to xattr block\n", + (unsigned long long)blkno); xh_bh = sb_getblk(inode->i_sb, blkno); if (!xh_bh) { @@ -2883,8 +2886,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, mlog(0, "adjust xattr bucket in %llu, count = %u, " "xh_free_start = %u, xh_name_value_len = %u.\n", - blkno, le16_to_cpu(xh->xh_count), xh_free_start, - le16_to_cpu(xh->xh_name_value_len)); + (unsigned long long)blkno, le16_to_cpu(xh->xh_count), + xh_free_start, le16_to_cpu(xh->xh_name_value_len)); /* * sort all the entries by their offset. @@ -3000,7 +3003,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, prev_blkno += (num_clusters - 1) * bpc + bpc / 2; mlog(0, "move half of xattrs in cluster %llu to %llu\n", - prev_blkno, new_blkno); + (unsigned long long)prev_blkno, (unsigned long long)new_blkno); /* * We need to update the 1st half of the new cluster and @@ -3177,7 +3180,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, int blocksize = inode->i_sb->s_blocksize; mlog(0, "move some of xattrs from bucket %llu to %llu\n", - blk, new_blk); + (unsigned long long)blk, (unsigned long long)new_blk); s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); if (!s_bhs) @@ -3376,7 +3379,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, BUG_ON(s_blkno == t_blkno); mlog(0, "cp bucket %llu to %llu, target is %d\n", - s_blkno, t_blkno, t_is_new); + (unsigned long long)s_blkno, (unsigned long long)t_blkno, + t_is_new); s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, GFP_NOFS); @@ -3448,7 +3452,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode, struct ocfs2_xattr_header *xh; u64 to_blk_start = to_blk; - mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk); + mlog(0, "cp xattrs from cluster %llu to %llu\n", + (unsigned long long)src_blk, (unsigned long long)to_blk); /* * We need to update the new cluster and 1 more for the update of @@ -3579,7 +3584,8 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", - prev_blk, prev_clusters, new_blk); + (unsigned long long)prev_blk, prev_clusters, + (unsigned long long)new_blk); if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, @@ -3649,7 +3655,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " "previous xattr blkno = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, - prev_cpos, prev_blkno); + prev_cpos, (unsigned long long)prev_blkno); ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); @@ -3736,7 +3742,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } } mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", - num_bits, block, v_start); + num_bits, (unsigned long long)block, v_start); ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, num_bits, 0, meta_ac); if (ret < 0) { @@ -3781,7 +3787,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode, u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " - "from %llu, len = %u\n", start_blk, + "from %llu, len = %u\n", (unsigned long long)start_blk, (unsigned long long)first_bh->b_blocknr, num_clusters); BUG_ON(bucket >= num_buckets); -- cgit v1.2.2 From 6c1e183e12dbd78a897a859f13220406296fee31 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Sun, 2 Nov 2008 19:04:21 +0800 Subject: ocfs2: Check search result in ocfs2_xattr_block_get() ocfs2_xattr_block_get() calls ocfs2_xattr_search() to find an external xattr, but doesn't check the search result that is passed back via struct ocfs2_xattr_search. Add a check for search result, and pass back -ENODATA if the xattr search failed. This avoids a later NULL pointer error. Signed-off-by: Tiger Yang Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 70baffeb1812..054e2efb0b7e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -777,6 +777,11 @@ static int ocfs2_xattr_block_get(struct inode *inode, goto cleanup; } + if (xs->not_found) { + ret = -ENODATA; + goto cleanup; + } + xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; size = le64_to_cpu(xs->here->xe_value_size); if (buffer) { @@ -860,7 +865,7 @@ static int ocfs2_xattr_get(struct inode *inode, down_read(&oi->ip_xattr_sem); ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, buffer_size, &xis); - if (ret == -ENODATA) + if (ret == -ENODATA && di->i_xattr_loc) ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, buffer_size, &xbs); up_read(&oi->ip_xattr_sem); -- cgit v1.2.2