about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-01-27 05:20:58 -0500
committerFilipe Manana <fdmanana@suse.com>2016-02-03 14:27:09 -0500
commite0bd70c67bf996b360f706b6c643000f2e384681 (patch)
treef87a12b8fbc15086b5e6f64ee9530dade5290ea1 /fs
parente410e34fad913dd568ec28d2a9949694324c14db (diff)
Btrfs: fix invalid page accesses in extent_same (dedup) ioctl
In the extent_same ioctl we are getting the pages for the source and target ranges and unlocking them immediately after, which is incorrect because later we attempt to map them (with kmap_atomic) and access their contents at btrfs_cmp_data(). When we do such access the pages might have been relocated or removed from memory, which leads to an invalid memory access. This issue is detected on a kernel with CONFIG_DEBUG_PAGEALLOC=y which produces a trace like the following: [186736.677437] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [186736.680382] Modules linked in: btrfs dm_flakey dm_mod ppdev xor raid6_pq sha256_generic hmac drbg ansi_cprng acpi_cpufreq evdev sg aesni_intel aes_x86_64 parport_pc ablk_helper tpm_tis psmouse parport i2c_piix4 tpm cryptd i2c_core lrw processor button serio_raw pcspkr gf128mul glue_helper loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs] [186736.681319] CPU: 13 PID: 10222 Comm: duperemove Tainted: G W 4.4.0-rc6-btrfs-next-18+ #1 [186736.681319] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [186736.681319] task: ffff880132600400 ti: ffff880362284000 task.ti: ffff880362284000 [186736.681319] RIP: 0010:[<ffffffff81264d00>] [<ffffffff81264d00>] memcmp+0xb/0x22 [186736.681319] RSP: 0018:ffff880362287d70 EFLAGS: 00010287 [186736.681319] RAX: 000002c002468acf RBX: 0000000012345678 RCX: 0000000000000000 [186736.681319] RDX: 0000000000001000 RSI: 0005d129c5cf9000 RDI: 0005d129c5cf9000 [186736.681319] RBP: ffff880362287d70 R08: 0000000000000000 R09: 0000000000001000 [186736.681319] R10: ffff880000000000 R11: 0000000000000476 R12: 0000000000001000 [186736.681319] R13: ffff8802f91d4c88 R14: ffff8801f2a77830 R15: ffff880352e83e40 [186736.681319] FS: 00007f27b37fe700(0000) GS:ffff88043dda0000(0000) knlGS:0000000000000000 [186736.681319] CS: 0010 DS: 0000
ES: 0000 CR0: 0000000080050033 [186736.681319] CR2: 00007f27a406a000 CR3: 0000000217421000 CR4: 00000000001406e0 [186736.681319] Stack: [186736.681319] ffff880362287ea0 ffffffffa048d0bd 000000000009f000 0000000000001000 [186736.681319] 0100000000000000 ffff8801f2a77850 ffff8802f91d49b0 ffff880132600400 [186736.681319] 00000000000004f8 ffff8801c1efbe41 0000000000000000 0000000000000038 [186736.681319] Call Trace: [186736.681319] [<ffffffffa048d0bd>] btrfs_ioctl+0x24cb/0x2731 [btrfs] [186736.681319] [<ffffffff8108a8b0>] ? arch_local_irq_save+0x9/0xc [186736.681319] [<ffffffff8118b3d4>] ? rcu_read_unlock+0x3e/0x5d [186736.681319] [<ffffffff811822f8>] do_vfs_ioctl+0x42b/0x4ea [186736.681319] [<ffffffff8118b4f3>] ? __fget_light+0x62/0x71 [186736.681319] [<ffffffff8118240e>] SyS_ioctl+0x57/0x79 [186736.681319] [<ffffffff814872d7>] entry_SYSCALL_64_fastpath+0x12/0x6f [186736.681319] Code: 0a 3c 6e 74 0d 3c 79 74 04 3c 59 75 0c c6 06 01 eb 03 c6 06 00 31 c0 eb 05 b8 ea ff ff ff 5d c3 55 31 c9 48 89 e5 48 39 d1 74 13 <0f> b6 04 0f 44 0f b6 04 0e 48 ff c1 44 29 c0 74 ea eb 02 31 c0 (gdb) list *(btrfs_ioctl+0x24cb) 0x5e0e1 is in btrfs_ioctl (fs/btrfs/ioctl.c:2972). 2967 dst_addr = kmap_atomic(dst_page); 2968 2969 flush_dcache_page(src_page); 2970 flush_dcache_page(dst_page); 2971 2972 if (memcmp(addr, dst_addr, cmp_len)) 2973 ret = BTRFS_SAME_DATA_DIFFERS; 2974 2975 kunmap_atomic(addr); 2976 kunmap_atomic(dst_addr); So fix this by making sure we keep the pages locked and respect the same locking order as everywhere else: get and lock the pages first and then lock the range in the inode's io tree (like for example at __btrfs_buffered_write() and extent_readpages()). If an ordered extent is found after locking the range in the io tree, unlock the range, unlock the pages, wait for the ordered extent to complete and repeat the entire locking process until no overlapping ordered extents are found. Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/ioctl.c | 90
1 files changed, 76 insertions, 14 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 83c9ad3f2621..1d6767c4c092 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2811,7 +2811,6 @@ static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
2811 return NULL; 2811 return NULL;
2812 } 2812 }
2813 } 2813 }
2814 unlock_page(page);
2815 2814
2816 return page; 2815 return page;
2817} 2816}
@@ -2830,10 +2829,17 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
2830 return 0; 2829 return 0;
2831} 2830}
2832 2831
2833static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) 2832static int lock_extent_range(struct inode *inode, u64 off, u64 len,
2833 bool retry_range_locking)
2834{ 2834{
2835 /* do any pending delalloc/csum calc on src, one way or 2835 /*
2836 another, and lock file content */ 2836 * Do any pending delalloc/csum calculations on inode, one way or
2837 * another, and lock file content.
2838 * The locking order is:
2839 *
2840 * 1) pages
2841 * 2) range in the inode's io tree
2842 */
2837 while (1) { 2843 while (1) {
2838 struct btrfs_ordered_extent *ordered; 2844 struct btrfs_ordered_extent *ordered;
2839 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2845 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2851,8 +2857,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
2851 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2857 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
2852 if (ordered) 2858 if (ordered)
2853 btrfs_put_ordered_extent(ordered); 2859 btrfs_put_ordered_extent(ordered);
2860 if (!retry_range_locking)
2861 return -EAGAIN;
2854 btrfs_wait_ordered_range(inode, off, len); 2862 btrfs_wait_ordered_range(inode, off, len);
2855 } 2863 }
2864 return 0;
2856} 2865}
2857 2866
2858static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) 2867static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2877,15 +2886,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
2877 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 2886 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
2878} 2887}
2879 2888
2880static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, 2889static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
2881 struct inode *inode2, u64 loff2, u64 len) 2890 struct inode *inode2, u64 loff2, u64 len,
2891 bool retry_range_locking)
2882{ 2892{
2893 int ret;
2894
2883 if (inode1 < inode2) { 2895 if (inode1 < inode2) {
2884 swap(inode1, inode2); 2896 swap(inode1, inode2);
2885 swap(loff1, loff2); 2897 swap(loff1, loff2);
2886 } 2898 }
2887 lock_extent_range(inode1, loff1, len); 2899 ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
2888 lock_extent_range(inode2, loff2, len); 2900 if (ret)
2901 return ret;
2902 ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
2903 if (ret)
2904 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
2905 loff1 + len - 1);
2906 return ret;
2889} 2907}
2890 2908
2891struct cmp_pages { 2909struct cmp_pages {
@@ -2901,11 +2919,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
2901 2919
2902 for (i = 0; i < cmp->num_pages; i++) { 2920 for (i = 0; i < cmp->num_pages; i++) {
2903 pg = cmp->src_pages[i]; 2921 pg = cmp->src_pages[i];
2904 if (pg) 2922 if (pg) {
2923 unlock_page(pg);
2905 page_cache_release(pg); 2924 page_cache_release(pg);
2925 }
2906 pg = cmp->dst_pages[i]; 2926 pg = cmp->dst_pages[i];
2907 if (pg) 2927 if (pg) {
2928 unlock_page(pg);
2908 page_cache_release(pg); 2929 page_cache_release(pg);
2930 }
2909 } 2931 }
2910 kfree(cmp->src_pages); 2932 kfree(cmp->src_pages);
2911 kfree(cmp->dst_pages); 2933 kfree(cmp->dst_pages);
@@ -2966,6 +2988,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
2966 2988
2967 src_page = cmp->src_pages[i]; 2989 src_page = cmp->src_pages[i];
2968 dst_page = cmp->dst_pages[i]; 2990 dst_page = cmp->dst_pages[i];
2991 ASSERT(PageLocked(src_page));
2992 ASSERT(PageLocked(dst_page));
2969 2993
2970 addr = kmap_atomic(src_page); 2994 addr = kmap_atomic(src_page);
2971 dst_addr = kmap_atomic(dst_page); 2995 dst_addr = kmap_atomic(dst_page);
@@ -3078,14 +3102,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3078 goto out_unlock; 3102 goto out_unlock;
3079 } 3103 }
3080 3104
3105again:
3081 ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); 3106 ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
3082 if (ret) 3107 if (ret)
3083 goto out_unlock; 3108 goto out_unlock;
3084 3109
3085 if (same_inode) 3110 if (same_inode)
3086 lock_extent_range(src, same_lock_start, same_lock_len); 3111 ret = lock_extent_range(src, same_lock_start, same_lock_len,
3112 false);
3087 else 3113 else
3088 btrfs_double_extent_lock(src, loff, dst, dst_loff, len); 3114 ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
3115 false);
3116 /*
3117 * If one of the inodes has dirty pages in the respective range or
3118 * ordered extents, we need to flush dellaloc and wait for all ordered
3119 * extents in the range. We must unlock the pages and the ranges in the
3120 * io trees to avoid deadlocks when flushing delalloc (requires locking
3121 * pages) and when waiting for ordered extents to complete (they require
3122 * range locking).
3123 */
3124 if (ret == -EAGAIN) {
3125 /*
3126 * Ranges in the io trees already unlocked. Now unlock all
3127 * pages before waiting for all IO to complete.
3128 */
3129 btrfs_cmp_data_free(&cmp);
3130 if (same_inode) {
3131 btrfs_wait_ordered_range(src, same_lock_start,
3132 same_lock_len);
3133 } else {
3134 btrfs_wait_ordered_range(src, loff, len);
3135 btrfs_wait_ordered_range(dst, dst_loff, len);
3136 }
3137 goto again;
3138 }
3139 ASSERT(ret == 0);
3140 if (WARN_ON(ret)) {
3141 /* ranges in the io trees already unlocked */
3142 btrfs_cmp_data_free(&cmp);
3143 return ret;
3144 }
3089 3145
3090 /* pass original length for comparison so we stay within i_size */ 3146 /* pass original length for comparison so we stay within i_size */
3091 ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); 3147 ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3907,9 +3963,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3907 u64 lock_start = min_t(u64, off, destoff); 3963 u64 lock_start = min_t(u64, off, destoff);
3908 u64 lock_len = max_t(u64, off, destoff) + len - lock_start; 3964 u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
3909 3965
3910 lock_extent_range(src, lock_start, lock_len); 3966 ret = lock_extent_range(src, lock_start, lock_len, true);
3911 } else { 3967 } else {
3912 btrfs_double_extent_lock(src, off, inode, destoff, len); 3968 ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
3969 true);
3970 }
3971 ASSERT(ret == 0);
3972 if (WARN_ON(ret)) {
3973 /* ranges in the io trees already unlocked */
3974 goto out_unlock;
3913 } 3975 }
3914 3976
3915 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); 3977 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);