aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorJan Schmidt <list.btrfs@jan-o-sch.net>2013-04-24 12:57:33 -0400
committerJosef Bacik <jbacik@fusionio.com>2013-05-06 15:55:17 -0400
commitfc36ed7e0b13955ba66fc56dc5067e67ac105150 (patch)
treedea2b4ebd056c6e87e5dea53c8df20e0509b88db /fs/btrfs
parent6d49ba1b47b9c6822d08f90af6f1a2d8ca1cf533 (diff)
Btrfs: separate sequence numbers for delayed ref tracking and tree mod log
Sequence numbers for delayed refs have been introduced in the first version of the qgroup patch set. To solve the problem of find_all_roots on a busy file system, the tree mod log was introduced. The sequence numbers for that were simply shared between those two users. However, at one point in qgroup's quota accounting, there's a statement accessing the previous sequence number that still just does (seq - 1), as it would have in the very first version. To satisfy that requirement, this patch makes the sequence number counter 64 bit and splits it into a major part (used for qgroup sequence number counting) and a minor part (incremented for each tree modification in the log). This enables us to go exactly one major step backwards, as required for qgroups, while still incrementing the sequence counter for tree mod log insertions to keep track of their order. Keeping them in a single variable means there's no need to change all the code dealing with comparisons of two sequence numbers. The sequence number is reset to 0 on commit (not new in this patch), which ensures we won't overflow the two 32 bit counters. Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs from the tree mod log code may happen. Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.c47
-rw-r--r--fs/btrfs/ctree.h7
-rw-r--r--fs/btrfs/delayed-ref.c6
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c5
-rw-r--r--fs/btrfs/qgroup.c13
-rw-r--r--fs/btrfs/transaction.c2
7 files changed, 63 insertions, 19 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2bc34408872d..a17d9991c333 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -361,6 +361,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
361} 361}
362 362
363/* 363/*
364 * Increment the upper half of tree_mod_seq, set lower half zero.
365 *
366 * Must be called with fs_info->tree_mod_seq_lock held.
367 */
368static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
369{
370 u64 seq = atomic64_read(&fs_info->tree_mod_seq);
371 seq &= 0xffffffff00000000ull;
372 seq += 1ull << 32;
373 atomic64_set(&fs_info->tree_mod_seq, seq);
374 return seq;
375}
376
377/*
378 * Increment the lower half of tree_mod_seq.
379 *
380 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
381 * are generated should not technically require a spin lock here. (Rationale:
382 * incrementing the minor while a major-seq increment is between its
383 * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
384 * just returns a unique sequence number as usual.) We have decided to leave
385 * that requirement in here and rethink it once we notice it really imposes a
386 * problem on some workload.
387 */
388static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
389{
390 return atomic64_inc_return(&fs_info->tree_mod_seq);
391}
392
393/*
394 * return the last minor in the previous major tree_mod_seq number
395 */
396u64 btrfs_tree_mod_seq_prev(u64 seq)
397{
398 return (seq & 0xffffffff00000000ull) - 1ull;
399}
400
401/*
364 * This adds a new blocker to the tree mod log's blocker list if the @elem 402 * This adds a new blocker to the tree mod log's blocker list if the @elem
365 * passed does not already have a sequence number set. So when a caller expects 403 * passed does not already have a sequence number set. So when a caller expects
366 * to record tree modifications, it should ensure to set elem->seq to zero 404 * to record tree modifications, it should ensure to set elem->seq to zero
@@ -376,10 +414,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
376 tree_mod_log_write_lock(fs_info); 414 tree_mod_log_write_lock(fs_info);
377 spin_lock(&fs_info->tree_mod_seq_lock); 415 spin_lock(&fs_info->tree_mod_seq_lock);
378 if (!elem->seq) { 416 if (!elem->seq) {
379 elem->seq = btrfs_inc_tree_mod_seq(fs_info); 417 elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
380 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); 418 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
381 } 419 }
382 seq = btrfs_inc_tree_mod_seq(fs_info); 420 seq = btrfs_inc_tree_mod_seq_minor(fs_info);
383 spin_unlock(&fs_info->tree_mod_seq_lock); 421 spin_unlock(&fs_info->tree_mod_seq_lock);
384 tree_mod_log_write_unlock(fs_info); 422 tree_mod_log_write_unlock(fs_info);
385 423
@@ -524,7 +562,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
524 if (!tm) 562 if (!tm)
525 return -ENOMEM; 563 return -ENOMEM;
526 564
527 tm->seq = btrfs_inc_tree_mod_seq(fs_info); 565 spin_lock(&fs_info->tree_mod_seq_lock);
566 tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
567 spin_unlock(&fs_info->tree_mod_seq_lock);
568
528 return tm->seq; 569 return tm->seq;
529} 570}
530 571
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 37c4da3403d0..2c48f52aba40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1422,7 +1422,7 @@ struct btrfs_fs_info {
1422 1422
1423 /* this protects tree_mod_seq_list */ 1423 /* this protects tree_mod_seq_list */
1424 spinlock_t tree_mod_seq_lock; 1424 spinlock_t tree_mod_seq_lock;
1425 atomic_t tree_mod_seq; 1425 atomic64_t tree_mod_seq;
1426 struct list_head tree_mod_seq_list; 1426 struct list_head tree_mod_seq_list;
1427 struct seq_list tree_mod_seq_elem; 1427 struct seq_list tree_mod_seq_elem;
1428 1428
@@ -3332,10 +3332,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3332 struct seq_list *elem); 3332 struct seq_list *elem);
3333void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 3333void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3334 struct seq_list *elem); 3334 struct seq_list *elem);
3335static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) 3335u64 btrfs_tree_mod_seq_prev(u64 seq);
3336{
3337 return atomic_inc_return(&fs_info->tree_mod_seq);
3338}
3339int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); 3336int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
3340 3337
3341/* root-item.c */ 3338/* root-item.c */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 116abec7a29c..c219463fb1fd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -361,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
361 elem = list_first_entry(&fs_info->tree_mod_seq_list, 361 elem = list_first_entry(&fs_info->tree_mod_seq_list,
362 struct seq_list, list); 362 struct seq_list, list);
363 if (seq >= elem->seq) { 363 if (seq >= elem->seq) {
364 pr_debug("holding back delayed_ref %llu, lowest is " 364 pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
365 "%llu (%p)\n", seq, elem->seq, delayed_refs); 365 (u32)(seq >> 32), (u32)seq,
366 (u32)(elem->seq >> 32), (u32)elem->seq,
367 delayed_refs);
366 ret = 1; 368 ret = 1;
367 } 369 }
368 } 370 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e4488b57a7ae..92c44ed78de1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2157,7 +2157,7 @@ int open_ctree(struct super_block *sb,
2157 atomic_set(&fs_info->async_submit_draining, 0); 2157 atomic_set(&fs_info->async_submit_draining, 0);
2158 atomic_set(&fs_info->nr_async_bios, 0); 2158 atomic_set(&fs_info->nr_async_bios, 0);
2159 atomic_set(&fs_info->defrag_running, 0); 2159 atomic_set(&fs_info->defrag_running, 0);
2160 atomic_set(&fs_info->tree_mod_seq, 0); 2160 atomic64_set(&fs_info->tree_mod_seq, 0);
2161 fs_info->sb = sb; 2161 fs_info->sb = sb;
2162 fs_info->max_inline = 8192 * 1024; 2162 fs_info->max_inline = 8192 * 1024;
2163 fs_info->metadata_ratio = 0; 2163 fs_info->metadata_ratio = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f8a5652b0c43..1cae6631b3d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2541,9 +2541,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2541 !trans->delayed_ref_elem.seq) { 2541 !trans->delayed_ref_elem.seq) {
2542 /* list without seq or seq without list */ 2542 /* list without seq or seq without list */
2543 btrfs_err(fs_info, 2543 btrfs_err(fs_info,
2544 "qgroup accounting update error, list is%s empty, seq is %llu", 2544 "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2545 list_empty(&trans->qgroup_ref_list) ? "" : " not", 2545 list_empty(&trans->qgroup_ref_list) ? "" : " not",
2546 trans->delayed_ref_elem.seq); 2546 (u32)(trans->delayed_ref_elem.seq >> 32),
2547 (u32)trans->delayed_ref_elem.seq);
2547 BUG(); 2548 BUG();
2548 } 2549 }
2549 2550
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f175471da882..e5c56238b6c6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1242,9 +1242,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1242 case BTRFS_ADD_DELAYED_REF: 1242 case BTRFS_ADD_DELAYED_REF:
1243 case BTRFS_ADD_DELAYED_EXTENT: 1243 case BTRFS_ADD_DELAYED_EXTENT:
1244 sgn = 1; 1244 sgn = 1;
1245 seq = btrfs_tree_mod_seq_prev(node->seq);
1245 break; 1246 break;
1246 case BTRFS_DROP_DELAYED_REF: 1247 case BTRFS_DROP_DELAYED_REF:
1247 sgn = -1; 1248 sgn = -1;
1249 seq = node->seq;
1248 break; 1250 break;
1249 case BTRFS_UPDATE_DELAYED_HEAD: 1251 case BTRFS_UPDATE_DELAYED_HEAD:
1250 return 0; 1252 return 0;
@@ -1254,14 +1256,14 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1254 1256
1255 /* 1257 /*
1256 * the delayed ref sequence number we pass depends on the direction of 1258 * the delayed ref sequence number we pass depends on the direction of
1257 * the operation. for add operations, we pass (node->seq - 1) to skip 1259 * the operation. for add operations, we pass
1260 * btrfs_tree_mod_seq_prev(node->seq) to skip
1258 * the delayed ref's current sequence number, because we need the state 1261 * the delayed ref's current sequence number, because we need the state
1259 * of the tree before the add operation. for delete operations, we pass 1262 * of the tree before the add operation. for delete operations, we pass
1260 * (node->seq) to include the delayed ref's current sequence number, 1263 * (node->seq) to include the delayed ref's current sequence number,
1261 * because we need the state of the tree after the delete operation. 1264 * because we need the state of the tree after the delete operation.
1262 */ 1265 */
1263 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, 1266 ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
1264 sgn > 0 ? node->seq - 1 : node->seq, &roots);
1265 if (ret < 0) 1267 if (ret < 0)
1266 return ret; 1268 return ret;
1267 1269
@@ -1772,8 +1774,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1772{ 1774{
1773 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) 1775 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1774 return; 1776 return;
1775 printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", 1777 pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
1776 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", 1778 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1777 trans->delayed_ref_elem.seq); 1779 (u32)(trans->delayed_ref_elem.seq >> 32),
1780 (u32)trans->delayed_ref_elem.seq);
1778 BUG(); 1781 BUG();
1779} 1782}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 258fcebc7ccf..18d6fb7be265 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -162,7 +162,7 @@ loop:
162 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 162 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
163 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 163 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
164 "creating a fresh transaction\n"); 164 "creating a fresh transaction\n");
165 atomic_set(&fs_info->tree_mod_seq, 0); 165 atomic64_set(&fs_info->tree_mod_seq, 0);
166 166
167 spin_lock_init(&cur_trans->commit_lock); 167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 168 spin_lock_init(&cur_trans->delayed_refs.lock);