/*
* Copyright (C) 2007,2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *dst,
struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
struct btrfs_path *btrfs_alloc_path(void)
{
struct btrfs_path *path;
path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
return path;
}
/*
* set all locked nodes in the path to blocking locks. This should
* be done before scheduling
*/
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
if (p->locks[i] == BTRFS_READ_LOCK)
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
else if (p->locks[i] == BTRFS_WRITE_LOCK)
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
}
}
/*
* reset all the locked nodes in the patch to spinning locks.
*
* held is used to keep lockdep happy, when lockdep is enabled
* we set held to a blocking lock before we go around and
* retake all the spinlocks in the path. You can safely use NULL
* for held
*/
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
struct extent_buffer *held, int held_rw)
{
int i;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep really cares that we take all of these spinlocks
* in the right order. If any of the locks in the path are not
* currently blocking, it is going to complain. So, make really
* really sure by forcing the path to blocking before we clear
* the path blocking.
*/
if (held) {
btrfs_set_lock_blocking_rw(held, held_rw);
if (held_rw == BTRFS_WRITE_LOCK)
held_rw = BTRFS_WRITE_LOCK_BLOCKING;
else if (held_rw == BTRFS_READ_LOCK)
held_rw = BTRFS_READ_LOCK_BLOCKING;
}
btrfs_set_path_blocking(p);
#endif
for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
if (p->nodes[i] && p->locks[i]) {
btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
p->locks[i] = BTRFS_WRITE_LOCK;
else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
p->locks[i] = BTRFS_READ_LOCK;
}
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (held)
btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
if (!p)
return;
btrfs_release_path(p);
kmem_cache_free(btrfs_path_cachep, p);
}
/*
* path release drops references on the extent buffers in the path
* and it drops any locks held by this path
*
* It is safe to call this on paths that no locks or extent buffers held.
*/
noinline void btrfs_release_path(struct btrfs_path *p)
{
int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
p->slots[i] = 0;
if (!p->nodes[i])
continue;
if (p->locks[i]) {
btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
p->nodes[i] = NULL;
}
}
/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
* looping required.
*
* The extent buffer returned by this has a reference taken, so
* it won't disappear. It may stop being the root of the tree
* at any time because there are no locks held.
*/
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
rcu_read_lock();
eb = rcu_dereference(root->node);
/*
* RCU really hurts here, we could free up the root node because
* it was cow'ed but we may not get the new root node yet so do
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
if (atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
rcu_read_unlock();
synchronize_rcu();
}
return eb;
}
/* loop around taking references on and locking the root node of the
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/* loop around taking references on and locking the root node of the
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/* cowonly root (everything not a reference counted cow subvolume), just get
* put onto a simple dirty list. transaction.c walks this to make sure they
* get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
if (root->track_dirty && list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
spin_unlock(&root->fs_info->trans_lock);
}
/*
* used by snapshot creation to make a copy of a root for a tree with
* a given objectid. The buffer with the new root node is returned in
* cow_ret, and this func returns zero on success or a negative error code.
*/
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid)
{
struct extent_buffer *cow;
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
WARN_ON(root->ref_cows && trans->transid !=
root->fs_info->running_transaction->transid);
WARN_ON(root->ref_cows && trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
btrfs_item_key(buf, &disk_key, 0);
else
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
new_root_objectid, &disk_key, level,
buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
copy_extent_buffer(cow, buf, 0, 0, cow->len);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
btrfs_set_header_owner(cow, new_root_objectid);
write_extent_buffer(cow, root->fs_info->fsid,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
if (ret)
return ret;
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
enum mod_log_op {
MOD_LOG_KEY_REPLACE,
MOD_LOG_KEY_ADD,
MOD_LOG_KEY_REMOVE,
MOD_LOG_KEY_REMOVE_WHILE_FREEING,
MOD_LOG_KEY_REMOVE_WHILE_MOVING,
MOD_LOG_MOVE_KEYS,
MOD_LOG_ROOT_REPLACE,
};
struct tree_mod_move {
int dst_slot;
int nr_items;
};
struct tree_mod_root {
u64 logical;
u8 level;
};
struct tree_mod_elem {
struct rb_node node;
u64 index; /* shifted logical */
u64 seq;
enum mod_log_op op;
/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
int slot;
/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
u64 generation;
/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
struct btrfs_disk_key key;
u64 blockptr;
/* this is used for op == MOD_LOG_MOVE_KEYS */
struct tree_mod_move move;
/* this is used for op == MOD_LOG_ROOT_REPLACE */
struct tree_mod_root old_root;
};
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
read_lock(&fs_info->tree_mod_log_lock);
}
static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
read_unlock(&fs_info->tree_mod_log_lock);
}
static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
write_lock(&fs_info->tree_mod_log_lock);
}
static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
write_unlock(&fs_info->tree_mod_log_lock);
}
/*
* Increment the upper half of tree_mod_seq, set lower half zero.
*
* Must be called with fs_info->tree_mod_seq_lock held.
*/
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
u64 seq = atomic64_read(&fs_info->tree_mod_seq);
seq &= 0xffffffff00000000ull;
seq += 1ull << 32;
atomic64_set(&fs_info->tree_mod_seq, seq);
return seq;
}
/*
* Increment the lower half of tree_mod_seq.
*
* Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
* are generated should not technically require a spin lock here. (Rationale:
* incrementing the minor while incrementing the major seq number is between its
* atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
* just returns a unique sequence number as usual.) We have decided to leave
* that requirement in here and rethink it once we notice it really imposes a
* problem on some workload.
*/
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
* return the last minor in the previous major tree_mod_seq number
*/
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
return (seq & 0xffffffff00000000ull) - 1ull;
}
/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
* before calling btrfs_get_tree_mod_seq.
* Returns a fresh, unused tree log modification sequence number, even if no new
* blocker was added.
*/
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
u64 seq;
tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info);
return seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
struct seq_list *cur_elem;
struct tree_mod_elem *tm;
u64 min_seq = (u64)-1;
u64 seq_putting = elem->seq;
if (!seq_putting)
return;
spin_lock(&fs_info->tree_mod_seq_lock);
list_del(&elem->list);
elem->seq = 0;
list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
if (cur_elem->seq < min_seq) {
if (seq_putting > cur_elem->seq) {
/*
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
spin_unlock(&fs_info->tree_mod_seq_lock);
return;
}
min_seq = cur_elem->seq;
}
}
spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
tree_mod_log_write_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = container_of(node, struct tree_mod_elem, node);
if (tm->seq > min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
}
tree_mod_log_write_unlock(fs_info);
}
/*
* key order of the log:
* index -> sequence
*
* the index is the shifted logical of the *new* root node for root replace
* operations, or the shifted logical of the affected block for all other
* operations.
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
struct rb_root *tm_root;
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
BUG_ON(!tm || !tm->seq);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
while (*new) {
cur = container_of(*new, struct tree_mod_elem, node);
parent = *new;
if (cur->index < tm->index)
new = &((*new)->rb_left);
else if (cur->index > tm->index)
new = &((*new)->rb_right);
else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
else {
kfree(tm);
return -EEXIST;
}
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
return 0;
}
/*
* Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
* returns zero with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
* call tree_mod_log_write_unlock() to release.
*/
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
smp_mb();
if (list_empty(&(fs_info)->tree_mod_seq_list))
return 1;
if (eb && btrfs_header_level(eb) == 0)
return 1;
tree_mod_log_write_lock(fs_info);
if (list_empty(&fs_info->tree_mod_seq_list)) {
/*
* someone emptied the list while we were waiting for the lock.
* we must not add to the list when no blocker exists.
*/
tree_mod_log_write_unlock(fs_info);
return 1;
}
return 0;
}
/*
* This allocates memory and gets a tree modification sequence number.
*
* Returns <0 on error.
* Returns >0 (the added sequence number) on success.
*/
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
struct tree_mod_elem **tm_ret)
{
struct tree_mod_elem *tm;
/*
* once we switch from spin locks to something different, we should
* honor the flags parameter here.
*/
tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
return -ENOMEM;
spin_lock(&fs_info->tree_mod_seq_lock);
tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
return tm->seq;
}
static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int slot,
enum mod_log_op op, gfp_t flags)
{
int ret;
struct tree_mod_elem *tm;
ret = tree_mod_alloc(fs_info, flags, &tm);
if (ret < 0)
return ret;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
if (op != MOD_LOG_KEY_ADD) {
btrfs_node_key(eb, &tm->key, slot);
tm->blockptr = btrfs_node_blockptr(eb, slot);
}
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
return __tree_mod_log_insert(fs_info, tm);
}
static noinline int
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int slot,
enum mod_log_op op, gfp_t flags)
{
int ret;
if (tree_mod_dont_log(fs_info, eb))
return 0;
ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
tree_mod_log_write_unlock(fs_info);
return ret;
}
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
int slot, enum mod_log_op op)
{
return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}
static noinline int
tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int slot,
enum mod_log_op op)
{
return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
}
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
int nr_items, gfp_t flags)
{
struct tree_mod_elem *tm;
int ret;
int i;
if (tree_mod_dont_log(fs_info, eb))
return 0;
/*
* When we override something during the move, we log these removals.
* This can only happen when we move towards the beginning of the
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
MOD_LOG_KEY_REMOVE_WHILE_MOVING);
BUG_ON(ret < 0);
}
ret = tree_mod_alloc(fs_info, flags, &tm);
if (ret < 0)
goto out;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
tm->slot = src_slot;
tm->move.dst_slot = dst_slot;
tm->move.nr_items = nr_items;
tm->op = MOD_LOG_MOVE_KEYS;
ret = __tree_mod_log_insert(fs_info, tm);
out:
tree_mod_log_write_unlock(fs_info);
return ret;
}
static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
int i;
u32 nritems;
int ret;
if (btrfs_header_level(eb) == 0)
return;
nritems = btrfs_header_nritems(eb);
for (i = nritems - 1; i >= 0; i--) {
ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
MOD_LOG_KEY_REMOVE_WHILE_FREEING);
BUG_ON(ret < 0);
}
}
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
struct extent_buffer *new_root, gfp_t flags,
int log_removal)
{
struct tree_mod_elem *tm;
int ret;
if (tree_mod_dont_log(fs_info, NULL))
return 0;
if (log_removal)
__tree_mod_log_free_eb(fs_info, old_root);
ret = tree_mod_alloc(fs_info, flags, &tm);
if (ret < 0)
goto out;
tm->index = new_root->start >> PAGE_CACHE_SHIFT;
tm->old_root.logical = old_root->start;
tm->old_root.level = btrfs_header_level(old_root);
tm->generation = btrfs_header_generation(old_root);
tm->op = MOD_LOG_ROOT_REPLACE;
ret = __tree_mod_log_insert(fs_info, tm);
out:
tree_mod_log_write_unlock(fs_info);
return ret;
}
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
int smallest)
{
|