aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@gmail.com>2014-05-23 00:03:34 -0400
committerChris Mason <clm@fb.com>2014-06-09 20:21:02 -0400
commitc125b8bff1d9f6c8c91ce4eb8bd5616058c7d510 (patch)
treeaea4d937f4e94d5535867c95835e68b46ca4588a /fs/btrfs
parent8321cf2596d283821acc466377c2b85bcd3422b7 (diff)
Btrfs: ensure readers see new data after a clone operation
We were cleaning the clone target file range from the page cache before we did replace the file extent items in the fs tree. This was racy, as right after cleaning the relevant range from the page cache and before replacing the file extent items, a read against that range could be performed by another task and populate again the page cache with stale data (stale after the cloning finishes). This would result in reads after the clone operation successfully finishes to get old data (and potentially for a very long time). Therefore evict the pages after replacing the file extent items, so that subsequent reads will always get the new data. Similarly, we were prone to races while cloning the file extent items because we weren't locking the target range and wait for any existing ordered extents against that range to complete. It was possible that after cloning the extent items, a write operation that was performed before the clone operation and overlaps the same range, would end up undoing all or part of the work the clone operation did (a worker task running inode.c:btrfs_finish_ordered_io). Therefore lock the target range in the io tree, wait for all pending ordered extents against that range to finish and then safely perform the cloning. The issue of reading stale data after the clone operation is easy to reproduce by running the following C program in a loop until it exits with return value 1. #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <pthread.h> #include <fcntl.h> #include <assert.h> #include <asm/types.h> #include <linux/ioctl.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/ioctl.h> #define SRC_FILE "/mnt/sdd/foo" #define DST_FILE "/mnt/sdd/bar" #define FILE_SIZE (16 * 1024) #define PATTERN_SRC 'X' #define PATTERN_DST 'Y' struct btrfs_ioctl_clone_range_args { __s64 src_fd; __u64 src_offset, src_length; __u64 dest_offset; }; #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; static int clone_done = 0; static int reader_ready = 0; static int stale_data = 0; static void *reader_loop(void *arg) { char buf[4096], want_buf[4096]; memset(want_buf, PATTERN_SRC, 4096); pthread_mutex_lock(&mutex); reader_ready = 1; pthread_mutex_unlock(&mutex); while (1) { int done, fd, ret; fd = open(DST_FILE, O_RDONLY); assert(fd != -1); pthread_mutex_lock(&mutex); done = clone_done; pthread_mutex_unlock(&mutex); ret = read(fd, buf, 4096); assert(ret == 4096); close(fd); if (done) { ret = memcmp(buf, want_buf, 4096); if (ret == 0) { printf("Found new content\n"); } else { printf("Found old content\n"); pthread_mutex_lock(&mutex); stale_data = 1; pthread_mutex_unlock(&mutex); } break; } } return NULL; } int main(int argc, char *argv[]) { pthread_t reader; int ret, i, fd; struct btrfs_ioctl_clone_range_args clone_args; int fd1, fd2; ret = remove(SRC_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting src file: %s\n", strerror(errno)); return 1; } ret = remove(DST_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno)); return 1; } fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_SRC; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_DST; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); sync(); ret = pthread_create(&reader, NULL, reader_loop, NULL); assert(ret == 0); while (1) { int r; pthread_mutex_lock(&mutex); r = reader_ready; pthread_mutex_unlock(&mutex); if (r) break; } fd1 = open(SRC_FILE, O_RDONLY); if (fd1 < 0) { fprintf(stderr, "Error open src file: %s\n", strerror(errno)); return 1; } fd2 = open(DST_FILE, O_RDWR); if (fd2 < 0) { fprintf(stderr, "Error open dst file: %s\n", strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(&mutex); clone_done = 1; pthread_mutex_unlock(&mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(&mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(&mutex); return ret; } Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ioctl.c36
1 files changed, 31 insertions, 5 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fba7a004e7e5..362720a3fea2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3410,15 +3410,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3410 goto out_unlock; 3410 goto out_unlock;
3411 } 3411 }
3412 3412
3413 /* truncate page cache pages from target inode range */ 3413 /*
3414 truncate_inode_pages_range(&inode->i_data, destoff, 3414 * Lock the target range too. Right after we replace the file extent
3415 PAGE_CACHE_ALIGN(destoff + len) - 1); 3415 * items in the fs tree (which now point to the cloned data), we might
3416 * have a worker replace them with extent items relative to a write
3417 * operation that was issued before this clone operation (i.e. confront
3418 * with inode.c:btrfs_finish_ordered_io).
3419 */
3420 if (same_inode) {
3421 u64 lock_start = min_t(u64, off, destoff);
3422 u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
3416 3423
3417 lock_extent_range(src, off, len); 3424 lock_extent_range(src, lock_start, lock_len);
3425 } else {
3426 lock_extent_range(src, off, len);
3427 lock_extent_range(inode, destoff, len);
3428 }
3418 3429
3419 ret = btrfs_clone(src, inode, off, olen, len, destoff); 3430 ret = btrfs_clone(src, inode, off, olen, len, destoff);
3420 3431
3421 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3432 if (same_inode) {
3433 u64 lock_start = min_t(u64, off, destoff);
3434 u64 lock_end = max_t(u64, off, destoff) + len - 1;
3435
3436 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
3437 } else {
3438 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
3439 unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
3440 destoff + len - 1);
3441 }
3442 /*
3443 * Truncate page cache pages so that future reads will see the cloned
3444 * data immediately and not the previous data.
3445 */
3446 truncate_inode_pages_range(&inode->i_data, destoff,
3447 PAGE_CACHE_ALIGN(destoff + len) - 1);
3422out_unlock: 3448out_unlock:
3423 if (!same_inode) { 3449 if (!same_inode) {
3424 if (inode < src) { 3450 if (inode < src) {