aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-12-01 12:04:09 -0500
committerChris Mason <clm@fb.com>2014-12-02 21:35:09 -0500
commit55507ce3612365a5173dfb080a4baf45d1ef8cd1 (patch)
tree4715caa9e49049d45f1b411297da4444c1af4421
parent04216820fe83d5e27322065ba989de27dbfc104d (diff)
Btrfs: fix race between writing free space cache and trimming
Trimming is completely transactionless, and the way it operates consists of hiding free space entries from a block group, perform the trim/discard and then make the free space entries visible again. Therefore while a free space entry is being trimmed, we can have free space cache writing running in parallel (as part of a transaction commit) which will miss the free space entry. This means that an unmount (or crash/reboot) after that transaction commit and mount again before another transaction starts/commits after the discard finishes, we will have some free space that won't be used again unless the free space cache is rebuilt. After the unmount, fsck (btrfsck, btrfs check) reports the issue like the following example: *** fsck.btrfs output *** checking extents checking free space cache There is no free space entry for 521764864-521781248 There is no free space entry for 521764864-1103101952 cache appears valid but isnt 29360128 Checking filesystem on /dev/sdc UUID: b4789e27-4774-4626-98e9-ae8dfbfb0fb5 found 1235681286 bytes used err is -22 (...) Another issue caused by this race is a crash while writing bitmap entries to the cache, because while the cache writeout task accesses the bitmaps, the trim task can be concurrently modifying the bitmap or worse might be freeing the bitmap. The later case results in the following crash: [55650.804460] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [55650.804835] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc loop parport_pc parport i2c_piix4 psmouse evdev pcspkr microcode processor i2ccore serio_raw thermal_sys button ext4 crc16 jbd2 mbcache sg sd_mod crc_t10dif sr_mod cdrom crct10dif_generic crct10dif_common ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring virtio scsi_mod e1000 [last unloaded: btrfs] [55650.806169] CPU: 1 PID: 31002 Comm: btrfs-transacti Tainted: G W 3.17.0-rc5-btrfs-next-1+ #1 [55650.806493] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [55650.806867] task: ffff8800b12f6410 ti: ffff880071538000 task.ti: ffff880071538000 [55650.807166] RIP: 0010:[<ffffffffa037cf45>] [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs] [55650.807514] RSP: 0018:ffff88007153bc30 EFLAGS: 00010246 [55650.807687] RAX: 000000005d1ec000 RBX: ffff8800a665df08 RCX: 0000000000000400 [55650.807885] RDX: ffff88005d1ec000 RSI: 6b6b6b6b6b6b6b6b RDI: ffff88005d1ec000 [55650.808017] RBP: ffff88007153bc58 R08: 00000000ddd51536 R09: 00000000000001e0 [55650.808017] R10: 0000000000000000 R11: 0000000000000037 R12: 6b6b6b6b6b6b6b6b [55650.808017] R13: ffff88007153bca8 R14: 6b6b6b6b6b6b6b6b R15: ffff88007153bc98 [55650.808017] FS: 0000000000000000(0000) GS:ffff88023ec80000(0000) knlGS:0000000000000000 [55650.808017] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [55650.808017] CR2: 0000000002273b88 CR3: 00000000b18f6000 CR4: 00000000000006e0 [55650.808017] Stack: [55650.808017] ffff88020e834e00 ffff880172d68db0 0000000000000000 ffff88019257c800 [55650.808017] ffff8801d42ea720 ffff88007153bd10 ffffffffa037d2fa ffff880224e99180 [55650.808017] ffff8801469a6188 ffff880224e99140 ffff880172d68c50 00000003000000b7 [55650.808017] Call Trace: [55650.808017] [<ffffffffa037d2fa>] __btrfs_write_out_cache+0x1ea/0x37f [btrfs] [55650.808017] [<ffffffffa037d959>] btrfs_write_out_cache+0xa1/0xd8 [btrfs] [55650.808017] [<ffffffffa033936b>] btrfs_write_dirty_block_groups+0x4b5/0x505 [btrfs] [55650.808017] [<ffffffffa03aa98e>] commit_cowonly_roots+0x15e/0x1f7 [btrfs] [55650.808017] [<ffffffff813eb9c7>] ? _raw_spin_lock+0xe/0x10 [55650.808017] [<ffffffffa0346e46>] btrfs_commit_transaction+0x411/0x882 [btrfs] [55650.808017] [<ffffffffa03432a4>] transaction_kthread+0xf2/0x1a4 [btrfs] [55650.808017] [<ffffffffa03431b2>] ? btrfs_cleanup_transaction+0x3d8/0x3d8 [btrfs] [55650.808017] [<ffffffff8105966b>] kthread+0xb7/0xbf [55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67 [55650.808017] [<ffffffff813ebeac>] ret_from_fork+0x7c/0xb0 [55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67 [55650.808017] Code: 4c 89 ef 8d 70 ff e8 d4 fc ff ff 41 8b 45 34 41 39 45 30 7d 5c 31 f6 4c 89 ef e8 80 f6 ff ff 49 8b 7d 00 4c 89 f6 b9 00 04 00 00 <f3> a5 4c 89 ef 41 8b 45 30 8d 70 ff e8 a3 fc ff ff 41 8b 45 34 [55650.808017] RIP [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs] [55650.808017] RSP <ffff88007153bc30> [55650.815725] ---[ end trace 1c032e96b149ff86 ]--- Fix this by serializing both tasks in such a way that cache writeout doesn't wait for the trim/discard of free space entries to finish and doesn't miss any free space entry. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/free-space-cache.c73
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c2
3 files changed, 71 insertions, 6 deletions
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0ddc114e2aed..2ee73c276acb 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -32,6 +32,12 @@
32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
34 34
35struct btrfs_trim_range {
36 u64 start;
37 u64 bytes;
38 struct list_head list;
39};
40
35static int link_free_space(struct btrfs_free_space_ctl *ctl, 41static int link_free_space(struct btrfs_free_space_ctl *ctl,
36 struct btrfs_free_space *info); 42 struct btrfs_free_space *info);
37static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -882,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
882 int ret; 888 int ret;
883 struct btrfs_free_cluster *cluster = NULL; 889 struct btrfs_free_cluster *cluster = NULL;
884 struct rb_node *node = rb_first(&ctl->free_space_offset); 890 struct rb_node *node = rb_first(&ctl->free_space_offset);
891 struct btrfs_trim_range *trim_entry;
885 892
886 /* Get the cluster for this block_group if it exists */ 893 /* Get the cluster for this block_group if it exists */
887 if (block_group && !list_empty(&block_group->cluster_list)) { 894 if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -917,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
917 cluster = NULL; 924 cluster = NULL;
918 } 925 }
919 } 926 }
927
928 /*
929 * Make sure we don't miss any range that was removed from our rbtree
930 * because trimming is running. Otherwise after a umount+mount (or crash
931 * after committing the transaction) we would leak free space and get
932 * an inconsistent free space cache report from fsck.
933 */
934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
935 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
936 trim_entry->bytes, NULL);
937 if (ret)
938 goto fail;
939 *entries += 1;
940 }
941
920 return 0; 942 return 0;
921fail: 943fail:
922 return -ENOSPC; 944 return -ENOSPC;
@@ -1136,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1136 1158
1137 io_ctl_set_generation(&io_ctl, trans->transid); 1159 io_ctl_set_generation(&io_ctl, trans->transid);
1138 1160
1161 mutex_lock(&ctl->cache_writeout_mutex);
1139 /* Write out the extent entries in the free space cache */ 1162 /* Write out the extent entries in the free space cache */
1140 ret = write_cache_extent_entries(&io_ctl, ctl, 1163 ret = write_cache_extent_entries(&io_ctl, ctl,
1141 block_group, &entries, &bitmaps, 1164 block_group, &entries, &bitmaps,
1142 &bitmap_list); 1165 &bitmap_list);
1143 if (ret) 1166 if (ret) {
1167 mutex_unlock(&ctl->cache_writeout_mutex);
1144 goto out_nospc; 1168 goto out_nospc;
1169 }
1145 1170
1146 /* 1171 /*
1147 * Some spaces that are freed in the current transaction are pinned, 1172 * Some spaces that are freed in the current transaction are pinned,
@@ -1149,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1149 * committed, we shouldn't lose them. 1174 * committed, we shouldn't lose them.
1150 */ 1175 */
1151 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1152 if (ret) 1177 if (ret) {
1178 mutex_unlock(&ctl->cache_writeout_mutex);
1153 goto out_nospc; 1179 goto out_nospc;
1180 }
1154 1181
1155 /* At last, we write out all the bitmaps. */ 1182 /*
1183 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1184 * locked while doing it because a concurrent trim can be manipulating
1185 * or freeing the bitmap.
1186 */
1156 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1188 mutex_unlock(&ctl->cache_writeout_mutex);
1157 if (ret) 1189 if (ret)
1158 goto out_nospc; 1190 goto out_nospc;
1159 1191
@@ -2296,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2296 ctl->start = block_group->key.objectid; 2328 ctl->start = block_group->key.objectid;
2297 ctl->private = block_group; 2329 ctl->private = block_group;
2298 ctl->op = &free_space_op; 2330 ctl->op = &free_space_op;
2331 INIT_LIST_HEAD(&ctl->trimming_ranges);
2332 mutex_init(&ctl->cache_writeout_mutex);
2299 2333
2300 /* 2334 /*
2301 * we only want to have 32k of ram per block group for keeping 2335 * we only want to have 32k of ram per block group for keeping
@@ -2912,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2912 2946
2913static int do_trimming(struct btrfs_block_group_cache *block_group, 2947static int do_trimming(struct btrfs_block_group_cache *block_group,
2914 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 *total_trimmed, u64 start, u64 bytes,
2915 u64 reserved_start, u64 reserved_bytes) 2949 u64 reserved_start, u64 reserved_bytes,
2950 struct btrfs_trim_range *trim_entry)
2916{ 2951{
2917 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_space_info *space_info = block_group->space_info;
2918 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info;
2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2919 int ret; 2955 int ret;
2920 int update = 0; 2956 int update = 0;
2921 u64 trimmed = 0; 2957 u64 trimmed = 0;
@@ -2935,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
2935 if (!ret) 2971 if (!ret)
2936 *total_trimmed += trimmed; 2972 *total_trimmed += trimmed;
2937 2973
2974 mutex_lock(&ctl->cache_writeout_mutex);
2938 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2976 list_del(&trim_entry->list);
2977 mutex_unlock(&ctl->cache_writeout_mutex);
2939 2978
2940 if (update) { 2979 if (update) {
2941 spin_lock(&space_info->lock); 2980 spin_lock(&space_info->lock);
@@ -2963,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2963 u64 bytes; 3002 u64 bytes;
2964 3003
2965 while (start < end) { 3004 while (start < end) {
3005 struct btrfs_trim_range trim_entry;
3006
3007 mutex_lock(&ctl->cache_writeout_mutex);
2966 spin_lock(&ctl->tree_lock); 3008 spin_lock(&ctl->tree_lock);
2967 3009
2968 if (ctl->free_space < minlen) { 3010 if (ctl->free_space < minlen) {
2969 spin_unlock(&ctl->tree_lock); 3011 spin_unlock(&ctl->tree_lock);
3012 mutex_unlock(&ctl->cache_writeout_mutex);
2970 break; 3013 break;
2971 } 3014 }
2972 3015
2973 entry = tree_search_offset(ctl, start, 0, 1); 3016 entry = tree_search_offset(ctl, start, 0, 1);
2974 if (!entry) { 3017 if (!entry) {
2975 spin_unlock(&ctl->tree_lock); 3018 spin_unlock(&ctl->tree_lock);
3019 mutex_unlock(&ctl->cache_writeout_mutex);
2976 break; 3020 break;
2977 } 3021 }
2978 3022
@@ -2981,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2981 node = rb_next(&entry->offset_index); 3025 node = rb_next(&entry->offset_index);
2982 if (!node) { 3026 if (!node) {
2983 spin_unlock(&ctl->tree_lock); 3027 spin_unlock(&ctl->tree_lock);
3028 mutex_unlock(&ctl->cache_writeout_mutex);
2984 goto out; 3029 goto out;
2985 } 3030 }
2986 entry = rb_entry(node, struct btrfs_free_space, 3031 entry = rb_entry(node, struct btrfs_free_space,
@@ -2989,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2989 3034
2990 if (entry->offset >= end) { 3035 if (entry->offset >= end) {
2991 spin_unlock(&ctl->tree_lock); 3036 spin_unlock(&ctl->tree_lock);
3037 mutex_unlock(&ctl->cache_writeout_mutex);
2992 break; 3038 break;
2993 } 3039 }
2994 3040
@@ -2998,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2998 bytes = min(extent_start + extent_bytes, end) - start; 3044 bytes = min(extent_start + extent_bytes, end) - start;
2999 if (bytes < minlen) { 3045 if (bytes < minlen) {
3000 spin_unlock(&ctl->tree_lock); 3046 spin_unlock(&ctl->tree_lock);
3047 mutex_unlock(&ctl->cache_writeout_mutex);
3001 goto next; 3048 goto next;
3002 } 3049 }
3003 3050
@@ -3005,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3005 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry);
3006 3053
3007 spin_unlock(&ctl->tree_lock); 3054 spin_unlock(&ctl->tree_lock);
3055 trim_entry.start = extent_start;
3056 trim_entry.bytes = extent_bytes;
3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3058 mutex_unlock(&ctl->cache_writeout_mutex);
3008 3059
3009 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 ret = do_trimming(block_group, total_trimmed, start, bytes,
3010 extent_start, extent_bytes); 3061 extent_start, extent_bytes, &trim_entry);
3011 if (ret) 3062 if (ret)
3012 break; 3063 break;
3013next: 3064next:
@@ -3036,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3036 3087
3037 while (offset < end) { 3088 while (offset < end) {
3038 bool next_bitmap = false; 3089 bool next_bitmap = false;
3090 struct btrfs_trim_range trim_entry;
3039 3091
3092 mutex_lock(&ctl->cache_writeout_mutex);
3040 spin_lock(&ctl->tree_lock); 3093 spin_lock(&ctl->tree_lock);
3041 3094
3042 if (ctl->free_space < minlen) { 3095 if (ctl->free_space < minlen) {
3043 spin_unlock(&ctl->tree_lock); 3096 spin_unlock(&ctl->tree_lock);
3097 mutex_unlock(&ctl->cache_writeout_mutex);
3044 break; 3098 break;
3045 } 3099 }
3046 3100
3047 entry = tree_search_offset(ctl, offset, 1, 0); 3101 entry = tree_search_offset(ctl, offset, 1, 0);
3048 if (!entry) { 3102 if (!entry) {
3049 spin_unlock(&ctl->tree_lock); 3103 spin_unlock(&ctl->tree_lock);
3104 mutex_unlock(&ctl->cache_writeout_mutex);
3050 next_bitmap = true; 3105 next_bitmap = true;
3051 goto next; 3106 goto next;
3052 } 3107 }
@@ -3055,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3055 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes);
3056 if (ret2 || start >= end) { 3111 if (ret2 || start >= end) {
3057 spin_unlock(&ctl->tree_lock); 3112 spin_unlock(&ctl->tree_lock);
3113 mutex_unlock(&ctl->cache_writeout_mutex);
3058 next_bitmap = true; 3114 next_bitmap = true;
3059 goto next; 3115 goto next;
3060 } 3116 }
@@ -3062,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3062 bytes = min(bytes, end - start); 3118 bytes = min(bytes, end - start);
3063 if (bytes < minlen) { 3119 if (bytes < minlen) {
3064 spin_unlock(&ctl->tree_lock); 3120 spin_unlock(&ctl->tree_lock);
3121 mutex_unlock(&ctl->cache_writeout_mutex);
3065 goto next; 3122 goto next;
3066 } 3123 }
3067 3124
@@ -3070,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3070 free_bitmap(ctl, entry); 3127 free_bitmap(ctl, entry);
3071 3128
3072 spin_unlock(&ctl->tree_lock); 3129 spin_unlock(&ctl->tree_lock);
3130 trim_entry.start = start;
3131 trim_entry.bytes = bytes;
3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3133 mutex_unlock(&ctl->cache_writeout_mutex);
3073 3134
3074 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 ret = do_trimming(block_group, total_trimmed, start, bytes,
3075 start, bytes); 3136 start, bytes, &trim_entry);
3076 if (ret) 3137 if (ret)
3077 break; 3138 break;
3078next: 3139next:
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
38 u64 start; 38 u64 start;
39 struct btrfs_free_space_op *op; 39 struct btrfs_free_space_op *op;
40 void *private; 40 void *private;
41 struct mutex cache_writeout_mutex;
42 struct list_head trimming_ranges;
41}; 43};
42 44
43struct btrfs_free_space_op { 45struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 4ebd5ebb1ea1..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
364 ctl->start = 0; 364 ctl->start = 0;
365 ctl->private = NULL; 365 ctl->private = NULL;
366 ctl->op = &free_ino_op; 366 ctl->op = &free_ino_op;
367 INIT_LIST_HEAD(&ctl->trimming_ranges);
368 mutex_init(&ctl->cache_writeout_mutex);
367 369
368 /* 370 /*
369 * Initially we allow to use 16K of ram to cache chunks of 371 * Initially we allow to use 16K of ram to cache chunks of