aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2009-04-13 17:40:14 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-13 18:04:33 -0400
commit316cb4ef3eb2ad6e35e15cc56d39c6cda58c093a (patch)
tree23084bf762a3dc0ec91c299c5f10206b132df621
parentcaefba1740d8016e6dfe8fda84f85bdcb8f8c85d (diff)
ext2: fix data corruption for racing writes
If two writers allocating blocks to a file race with each other (e.g. because writepages races with an ordinary write or two writepages race with each other), ext2_get_block() can be called on the same inode in parallel. Before we allocate new blocks, we have to recheck the block chain that we have obtained so far without holding truncate_mutex. Otherwise we could overwrite the indirect block pointer set by the other writer, leading to data loss. The test program below by Ying is able to reproduce the data loss with ext2 on BRD in a few minutes if the machine is under memory pressure: long kMemSize = 50 << 20; int kPageSize = 4096; int main(int argc, char **argv) { int status; int count = 0; int i; char *fname = "/mnt/test.mmap"; char *mem; unlink(fname); int fd = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); status = ftruncate(fd, kMemSize); mem = mmap(0, kMemSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); // Fill the memory with 1s. memset(mem, 1, kMemSize); sleep(2); for (i = 0; i < kMemSize; i++) { int byte_good = mem[i] != 0; if (!byte_good && ((i % kPageSize) == 0)) { //printf("%d ", i / kPageSize); count++; } } munmap(mem, kMemSize); close(fd); unlink(fname); if (count > 0) { printf("Running %d bad page\n", count); return 1; } return 0; } Cc: Ying Han <yinghan@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Jan Kara <jack@suse.cz> Cc: Mingming Cao <cmm@us.ibm.com> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/ext2/inode.c44
1 files changed, 33 insertions, 11 deletions
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
590 590
591 if (depth == 0) 591 if (depth == 0)
592 return (err); 592 return (err);
593reread:
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
595 593
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
596 /* Simplest case - block found, no allocation needed */ 595 /* Simplest case - block found, no allocation needed */
597 if (!partial) { 596 if (!partial) {
598 first_block = le32_to_cpu(chain[depth - 1].key); 597 first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
602 while (count < maxblocks && count <= blocks_to_boundary) { 601 while (count < maxblocks && count <= blocks_to_boundary) {
603 ext2_fsblk_t blk; 602 ext2_fsblk_t blk;
604 603
605 if (!verify_chain(chain, partial)) { 604 if (!verify_chain(chain, chain + depth - 1)) {
606 /* 605 /*
607 * Indirect block might be removed by 606 * Indirect block might be removed by
608 * truncate while we were reading it. 607 * truncate while we were reading it.
609 * Handling of that case: forget what we've 608 * Handling of that case: forget what we've
610 * got now, go to reread. 609 * got now, go to reread.
611 */ 610 */
611 err = -EAGAIN;
612 count = 0; 612 count = 0;
613 goto changed; 613 break;
614 } 614 }
615 blk = le32_to_cpu(*(chain[depth-1].p + count)); 615 blk = le32_to_cpu(*(chain[depth-1].p + count));
616 if (blk == first_block + count) 616 if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
618 else 618 else
619 break; 619 break;
620 } 620 }
621 goto got_it; 621 if (err != -EAGAIN)
622 goto got_it;
622 } 623 }
623 624
624 /* Next simple case - plain lookup or failed read of indirect block */ 625 /* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
626 goto cleanup; 627 goto cleanup;
627 628
628 mutex_lock(&ei->truncate_mutex); 629 mutex_lock(&ei->truncate_mutex);
630 /*
631 * If the indirect block is missing while we are reading
632 * the chain (ext2_get_branch() returns -EAGAIN err), or
633 * if the chain has been changed after we grab the mutex,
634 * (either because another process truncated this branch, or
635 * another get_block allocated this branch) re-grab the chain to see if
636 * the request block has been allocated or not.
637 *
638 * Since we already block the truncate/other get_block
639 * at this point, we will have the current copy of the chain when we
640 * splice the branch into the tree.
641 */
642 if (err == -EAGAIN || !verify_chain(chain, partial)) {
643 while (partial > chain) {
644 brelse(partial->bh);
645 partial--;
646 }
647 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
648 if (!partial) {
649 count++;
650 mutex_unlock(&ei->truncate_mutex);
651 if (err)
652 goto cleanup;
653 clear_buffer_new(bh_result);
654 goto got_it;
655 }
656 }
629 657
630 /* 658 /*
631 * Okay, we need to do block allocation. Lazily initialize the block 659 * Okay, we need to do block allocation. Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
683 partial--; 711 partial--;
684 } 712 }
685 return err; 713 return err;
686changed:
687 while (partial > chain) {
688 brelse(partial->bh);
689 partial--;
690 }
691 goto reread;
692} 714}
693 715
694int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 716int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)