aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-09-29 15:18:18 -0400
committerChris Mason <chris.mason@oracle.com>2008-09-29 15:18:18 -0400
commitd352ac68148b69937d39ca5d48bcc4478e118dbf (patch)
tree7951dd7311999d9e77766acdc7f8e93de97874d8 /fs/btrfs
parent9a5e1ea1e1e539e244a54afffc330fc368376ab9 (diff)
Btrfs: add and improve comments
This improves the comments at the top of many functions. It didn't dive into the guts of functions because I was trying to avoid merging problems with the new allocator and back reference work. extent-tree.c and volumes.c were both skipped, and there is definitely more work todo in cleaning and commenting the code. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/TODO20
-rw-r--r--fs/btrfs/async-thread.c10
-rw-r--r--fs/btrfs/async-thread.h7
-rw-r--r--fs/btrfs/bit-radix.c130
-rw-r--r--fs/btrfs/bit-radix.h33
-rw-r--r--fs/btrfs/btrfs_inode.h54
-rw-r--r--fs/btrfs/crc32c.h18
-rw-r--r--fs/btrfs/ctree.c127
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/dir-item.c41
-rw-r--r--fs/btrfs/disk-io.c33
-rw-r--r--fs/btrfs/extent_io.c34
-rw-r--r--fs/btrfs/extent_map.c10
-rw-r--r--fs/btrfs/file.c44
-rw-r--r--fs/btrfs/inode.c189
-rw-r--r--fs/btrfs/locking.c13
-rw-r--r--fs/btrfs/ordered-data.c19
-rw-r--r--fs/btrfs/ref-cache.c26
-rw-r--r--fs/btrfs/ref-cache.h3
-rw-r--r--fs/btrfs/root-tree.c21
-rw-r--r--fs/btrfs/struct-funcs.c21
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/transaction.c67
-rw-r--r--fs/btrfs/tree-defrag.c4
25 files changed, 653 insertions, 277 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5c28557fba9..48b7909ca8d1 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
4obj-m := btrfs.o 4obj-m := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ 5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \ 6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o bit-radix.o inode.o file.o tree-defrag.o \ 7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
deleted file mode 100644
index d9b6d38c603a..000000000000
--- a/fs/btrfs/TODO
+++ /dev/null
@@ -1,20 +0,0 @@
1* cleanup, add more error checking, get rid of BUG_ONs
2* Fix ENOSPC handling
3* Make allocator smarter
4* add a block group to struct inode
5* Do actual block accounting
6* Check compat and incompat flags on the inode
7* Get rid of struct ctree_path, limiting tree levels held at one time
8* Add generation number to key pointer in nodes
9* Add generation number to inode
10* forbid cross subvolume renames and hardlinks
11* Release
12* Do real tree locking
13* Add extent mirroring (backup copies of blocks)
14* Add fancy interface to get access to incremental backups
15* Add fancy striped extents to make big reads faster
16* Use relocation to try and fix write errors
17* Make allocator much smarter
18* xattrs (directory streams for regular files)
19* Scrub & defrag
20
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4e780b279de6..04fb9702d14c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
231 231
232 /* 232 /*
233 * if we pick a busy task, move the task to the end of the list. 233 * if we pick a busy task, move the task to the end of the list.
234 * hopefully this will keep things somewhat evenly balanced 234 * hopefully this will keep things somewhat evenly balanced.
235 * Do the move in batches based on the sequence number. This groups
236 * requests submitted at roughly the same time onto the same worker.
235 */ 237 */
236 next = workers->worker_list.next; 238 next = workers->worker_list.next;
237 worker = list_entry(next, struct btrfs_worker_thread, worker_list); 239 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
238 atomic_inc(&worker->num_pending); 240 atomic_inc(&worker->num_pending);
239 worker->sequence++; 241 worker->sequence++;
242
240 if (worker->sequence % workers->idle_thresh == 0) 243 if (worker->sequence % workers->idle_thresh == 0)
241 list_move_tail(next, &workers->worker_list); 244 list_move_tail(next, &workers->worker_list);
242 return worker; 245 return worker;
243} 246}
244 247
248/*
249 * selects a worker thread to take the next job. This will either find
250 * an idle worker, start a new worker up to the max count, or just return
251 * one of the existing busy workers.
252 */
245static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) 253static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
246{ 254{
247 struct btrfs_worker_thread *worker; 255 struct btrfs_worker_thread *worker;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 43e44d115dd1..4ec9a2ee0f9d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -63,14 +63,17 @@ struct btrfs_workers {
63 /* once a worker has this many requests or fewer, it is idle */ 63 /* once a worker has this many requests or fewer, it is idle */
64 int idle_thresh; 64 int idle_thresh;
65 65
66 /* list with all the work threads */ 66 /* list with all the work threads. The workers on the idle thread
67 * may be actively servicing jobs, but they haven't yet hit the
68 * idle thresh limit above.
69 */
67 struct list_head worker_list; 70 struct list_head worker_list;
68 struct list_head idle_list; 71 struct list_head idle_list;
69 72
70 /* lock for finding the next worker thread to queue on */ 73 /* lock for finding the next worker thread to queue on */
71 spinlock_t lock; 74 spinlock_t lock;
72 75
73 /* extra name for this worker */ 76 /* extra name for this worker, used for current->name */
74 char *name; 77 char *name;
75}; 78};
76 79
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
deleted file mode 100644
index e8bf876db393..000000000000
--- a/fs/btrfs/bit-radix.c
+++ /dev/null
@@ -1,130 +0,0 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "bit-radix.h"
20
21#define BIT_ARRAY_BYTES 256
22#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
23
24extern struct kmem_cache *btrfs_bit_radix_cachep;
25int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
26{
27 unsigned long *bits;
28 unsigned long slot;
29 int bit_slot;
30 int ret;
31
32 slot = bit / BIT_RADIX_BITS_PER_ARRAY;
33 bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
34
35 bits = radix_tree_lookup(radix, slot);
36 if (!bits) {
37 bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
38 if (!bits)
39 return -ENOMEM;
40 memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
41 bits[0] = slot;
42 ret = radix_tree_insert(radix, slot, bits);
43 if (ret)
44 return ret;
45 }
46 ret = test_and_set_bit(bit_slot, bits + 1);
47 if (ret < 0)
48 ret = 1;
49 return ret;
50}
51
52int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
53{
54 unsigned long *bits;
55 unsigned long slot;
56 int bit_slot;
57
58 slot = bit / BIT_RADIX_BITS_PER_ARRAY;
59 bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
60
61 bits = radix_tree_lookup(radix, slot);
62 if (!bits)
63 return 0;
64 return test_bit(bit_slot, bits + 1);
65}
66
67int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
68{
69 unsigned long *bits;
70 unsigned long slot;
71 int bit_slot;
72 int i;
73 int empty = 1;
74
75 slot = bit / BIT_RADIX_BITS_PER_ARRAY;
76 bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
77
78 bits = radix_tree_lookup(radix, slot);
79 if (!bits)
80 return 0;
81 clear_bit(bit_slot, bits + 1);
82 for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
83 if (bits[i]) {
84 empty = 0;
85 break;
86 }
87 }
88 if (empty) {
89 bits = radix_tree_delete(radix, slot);
90 BUG_ON(!bits);
91 kmem_cache_free(btrfs_bit_radix_cachep, bits);
92 }
93 return 0;
94}
95
96int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
97 unsigned long start, int nr)
98{
99 unsigned long *bits;
100 unsigned long *gang[4];
101 int found;
102 int ret;
103 int i;
104 int total_found = 0;
105 unsigned long slot;
106
107 slot = start / BIT_RADIX_BITS_PER_ARRAY;
108 ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
109 ARRAY_SIZE(gang));
110 found = start % BIT_RADIX_BITS_PER_ARRAY;
111 for (i = 0; i < ret && nr > 0; i++) {
112 bits = gang[i];
113 while(nr > 0) {
114 found = find_next_bit(bits + 1,
115 BIT_RADIX_BITS_PER_ARRAY,
116 found);
117 if (found < BIT_RADIX_BITS_PER_ARRAY) {
118 *retbits = bits[0] *
119 BIT_RADIX_BITS_PER_ARRAY + found;
120 retbits++;
121 nr--;
122 total_found++;
123 found++;
124 } else
125 break;
126 }
127 found = 0;
128 }
129 return total_found;
130}
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
deleted file mode 100644
index c100f54d5c32..000000000000
--- a/fs/btrfs/bit-radix.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BIT_RADIX__
20#define __BIT_RADIX__
21#include <linux/radix-tree.h>
22
23int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
24int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
25int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
26int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
27 unsigned long start, int nr);
28
29static inline void init_bit_radix(struct radix_tree_root *radix)
30{
31 INIT_RADIX_TREE(radix, GFP_NOFS);
32}
33#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0577fda2168a..0b2e623cf421 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,27 +25,58 @@
25 25
26/* in memory btrfs inode */ 26/* in memory btrfs inode */
27struct btrfs_inode { 27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
28 struct btrfs_root *root; 29 struct btrfs_root *root;
30
31 /* the block group preferred for allocations. This pointer is buggy
32 * and needs to be replaced with a bytenr instead
33 */
29 struct btrfs_block_group_cache *block_group; 34 struct btrfs_block_group_cache *block_group;
35
36 /* key used to find this inode on disk. This is used by the code
37 * to read in roots of subvolumes
38 */
30 struct btrfs_key location; 39 struct btrfs_key location;
40
41 /* the extent_tree has caches of all the extent mappings to disk */
31 struct extent_map_tree extent_tree; 42 struct extent_map_tree extent_tree;
43
44 /* the io_tree does range state (DIRTY, LOCKED etc) */
32 struct extent_io_tree io_tree; 45 struct extent_io_tree io_tree;
46
47 /* special utility tree used to record which mirrors have already been
48 * tried when checksums fail for a given block
49 */
33 struct extent_io_tree io_failure_tree; 50 struct extent_io_tree io_failure_tree;
51
52 /* held while inserting checksums to avoid races */
34 struct mutex csum_mutex; 53 struct mutex csum_mutex;
54
55 /* held while inesrting or deleting extents from files */
35 struct mutex extent_mutex; 56 struct mutex extent_mutex;
57
58 /* held while logging the inode in tree-log.c */
36 struct mutex log_mutex; 59 struct mutex log_mutex;
37 struct inode vfs_inode; 60
61 /* used to order data wrt metadata */
38 struct btrfs_ordered_inode_tree ordered_tree; 62 struct btrfs_ordered_inode_tree ordered_tree;
39 63
64 /* standard acl pointers */
40 struct posix_acl *i_acl; 65 struct posix_acl *i_acl;
41 struct posix_acl *i_default_acl; 66 struct posix_acl *i_default_acl;
42 67
43 /* for keeping track of orphaned inodes */ 68 /* for keeping track of orphaned inodes */
44 struct list_head i_orphan; 69 struct list_head i_orphan;
45 70
71 /* list of all the delalloc inodes in the FS. There are times we need
72 * to write all the delalloc pages to disk, and this list is used
73 * to walk them all.
74 */
46 struct list_head delalloc_inodes; 75 struct list_head delalloc_inodes;
47 76
48 /* full 64 bit generation number */ 77 /* full 64 bit generation number, struct vfs_inode doesn't have a big
78 * enough field for this.
79 */
49 u64 generation; 80 u64 generation;
50 81
51 /* 82 /*
@@ -57,10 +88,25 @@ struct btrfs_inode {
57 */ 88 */
58 u64 logged_trans; 89 u64 logged_trans;
59 90
60 /* trans that last made a change that should be fully fsync'd */ 91 /*
92 * trans that last made a change that should be fully fsync'd. This
93 * gets reset to zero each time the inode is logged
94 */
61 u64 log_dirty_trans; 95 u64 log_dirty_trans;
96
97 /* total number of bytes pending delalloc, used by stat to calc the
98 * real block usage of the file
99 */
62 u64 delalloc_bytes; 100 u64 delalloc_bytes;
101
102 /*
103 * the size of the file stored in the metadata on disk. data=ordered
104 * means the in-memory i_size might be larger than the size on disk
105 * because not all the blocks are written yet.
106 */
63 u64 disk_i_size; 107 u64 disk_i_size;
108
109 /* flags field from the on disk inode */
64 u32 flags; 110 u32 flags;
65 111
66 /* 112 /*
@@ -68,6 +114,8 @@ struct btrfs_inode {
68 * number for new files that are created 114 * number for new files that are created
69 */ 115 */
70 u64 index_cnt; 116 u64 index_cnt;
117
118 struct inode vfs_inode;
71}; 119};
72 120
73static inline struct btrfs_inode *BTRFS_I(struct inode *inode) 121static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index 4f0fefed132a..1eaf11d334fd 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -1,3 +1,21 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
1#ifndef __BTRFS_CRC32C__ 19#ifndef __BTRFS_CRC32C__
2#define __BTRFS_CRC32C__ 20#define __BTRFS_CRC32C__
3#include <asm/byteorder.h> 21#include <asm/byteorder.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 50e81f43e6d4..ff3261ff2e19 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void)
54 return path; 54 return path;
55} 55}
56 56
57/* this also releases the path */
57void btrfs_free_path(struct btrfs_path *p) 58void btrfs_free_path(struct btrfs_path *p)
58{ 59{
59 btrfs_release_path(NULL, p); 60 btrfs_release_path(NULL, p);
60 kmem_cache_free(btrfs_path_cachep, p); 61 kmem_cache_free(btrfs_path_cachep, p);
61} 62}
62 63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
63void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) 70void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
64{ 71{
65 int i; 72 int i;
@@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
77 } 84 }
78} 85}
79 86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
80struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
81{ 98{
82 struct extent_buffer *eb; 99 struct extent_buffer *eb;
@@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
87 return eb; 104 return eb;
88} 105}
89 106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
90struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) 111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
91{ 112{
92 struct extent_buffer *eb; 113 struct extent_buffer *eb;
@@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
108 return eb; 129 return eb;
109} 130}
110 131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
111static void add_root_to_dirty_list(struct btrfs_root *root) 136static void add_root_to_dirty_list(struct btrfs_root *root)
112{ 137{
113 if (root->track_dirty && list_empty(&root->dirty_list)) { 138 if (root->track_dirty && list_empty(&root->dirty_list)) {
@@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
116 } 141 }
117} 142}
118 143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
119int btrfs_copy_root(struct btrfs_trans_handle *trans, 149int btrfs_copy_root(struct btrfs_trans_handle *trans,
120 struct btrfs_root *root, 150 struct btrfs_root *root,
121 struct extent_buffer *buf, 151 struct extent_buffer *buf,
@@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
167 return 0; 197 return 0;
168} 198}
169 199
200/*
201 * does the dirty work in cow of a single block. The parent block
202 * (if supplied) is updated to point to the new cow copy. The new
203 * buffer is marked dirty and returned locked. If you modify the block
204 * it needs to be marked dirty again.
205 *
206 * search_start -- an allocation hint for the new block
207 *
208 * empty_size -- a hint that you plan on doing more cow. This is the size in bytes
209 * the allocator should try to find free next to the block it returns. This is
210 * just a hint and may be ignored by the allocator.
211 *
212 * prealloc_dest -- if you have already reserved a destination for the cow,
213 * this uses that block instead of allocating a new one. btrfs_alloc_reserved_extent
214 * is used to finish the allocation.
215 */
170int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, 216int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, 217 struct btrfs_root *root,
172 struct extent_buffer *buf, 218 struct extent_buffer *buf,
@@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
311 return 0; 357 return 0;
312} 358}
313 359
360/*
361 * cows a single block, see __btrfs_cow_block for the real work.
362 * This version of it has extra checks so that a block isn't cow'd more than
363 * once per transaction, as long as it hasn't been written yet
364 */
314int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, 365int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
315 struct btrfs_root *root, struct extent_buffer *buf, 366 struct btrfs_root *root, struct extent_buffer *buf,
316 struct extent_buffer *parent, int parent_slot, 367 struct extent_buffer *parent, int parent_slot,
@@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
347 return ret; 398 return ret;
348} 399}
349 400
401/*
402 * helper function for defrag to decide if two blocks pointed to by a
403 * node are actually close by
404 */
350static int close_blocks(u64 blocknr, u64 other, u32 blocksize) 405static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
351{ 406{
352 if (blocknr < other && other - (blocknr + blocksize) < 32768) 407 if (blocknr < other && other - (blocknr + blocksize) < 32768)
@@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
381} 436}
382 437
383 438
439/*
440 * this is used by the defrag code to go through all the
441 * leaves pointed to by a node and reallocate them so that
442 * disk order is close to key order
443 */
384int btrfs_realloc_node(struct btrfs_trans_handle *trans, 444int btrfs_realloc_node(struct btrfs_trans_handle *trans,
385 struct btrfs_root *root, struct extent_buffer *parent, 445 struct btrfs_root *root, struct extent_buffer *parent,
386 int start_slot, int cache_only, u64 *last_ret, 446 int start_slot, int cache_only, u64 *last_ret,
@@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
521 return btrfs_item_offset_nr(leaf, nr - 1); 581 return btrfs_item_offset_nr(leaf, nr - 1);
522} 582}
523 583
584/*
585 * extra debugging checks to make sure all the items in a key are
586 * well formed and in the proper order
587 */
524static int check_node(struct btrfs_root *root, struct btrfs_path *path, 588static int check_node(struct btrfs_root *root, struct btrfs_path *path,
525 int level) 589 int level)
526{ 590{
@@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
561 return 0; 625 return 0;
562} 626}
563 627
628/*
629 * extra checking to make sure all the items in a leaf are
630 * well formed and in the proper order
631 */
564static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, 632static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
565 int level) 633 int level)
566{ 634{
@@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
782 return -1; 850 return -1;
783} 851}
784 852
853/* given a node and slot number, this reads the blocks it points to. The
854 * extent buffer is returned with a reference taken (but unlocked).
855 * NULL is returned on error.
856 */
785static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, 857static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
786 struct extent_buffer *parent, int slot) 858 struct extent_buffer *parent, int slot)
787{ 859{
@@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
798 btrfs_node_ptr_generation(parent, slot)); 870 btrfs_node_ptr_generation(parent, slot));
799} 871}
800 872
873/*
874 * node level balancing, used to make sure nodes are in proper order for
875 * item deletion. We balance from the top down, so we have to make sure
876 * that a deletion won't leave an node completely empty later on.
877 */
801static noinline int balance_level(struct btrfs_trans_handle *trans, 878static noinline int balance_level(struct btrfs_trans_handle *trans,
802 struct btrfs_root *root, 879 struct btrfs_root *root,
803 struct btrfs_path *path, int level) 880 struct btrfs_path *path, int level)
@@ -1024,7 +1101,10 @@ enospc:
1024 return ret; 1101 return ret;
1025} 1102}
1026 1103
1027/* returns zero if the push worked, non-zero otherwise */ 1104/* Node balancing for insertion. Here we only split or push nodes around
1105 * when they are completely full. This is also done top down, so we
1106 * have to be pessimistic.
1107 */
1028static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, 1108static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1029 struct btrfs_root *root, 1109 struct btrfs_root *root,
1030 struct btrfs_path *path, int level) 1110 struct btrfs_path *path, int level)
@@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1150} 1230}
1151 1231
1152/* 1232/*
1153 * readahead one full node of leaves 1233 * readahead one full node of leaves, finding things that are close
1234 * to the block in 'slot', and triggering ra on them.
1154 */ 1235 */
1155static noinline void reada_for_search(struct btrfs_root *root, 1236static noinline void reada_for_search(struct btrfs_root *root,
1156 struct btrfs_path *path, 1237 struct btrfs_path *path,
@@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root,
1226 } 1307 }
1227} 1308}
1228 1309
1310/*
1311 * when we walk down the tree, it is usually safe to unlock the higher layers in
1312 * the tree. The exceptions are when our path goes through slot 0, because operations
1313 * on the tree might require changing key pointers higher up in the tree.
1314 *
1315 * callers might also have set path->keep_locks, which tells this code to
1316 * keep the lock if the path points to the last slot in the block. This is
1317 * part of walking through the tree, and selecting the next slot in the higher
1318 * block.
1319 *
1320 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
1321 * so if lowest_unlock is 1, level 0 won't be unlocked
1322 */
1229static noinline void unlock_up(struct btrfs_path *path, int level, 1323static noinline void unlock_up(struct btrfs_path *path, int level,
1230 int lowest_unlock) 1324 int lowest_unlock)
1231{ 1325{
@@ -2705,6 +2799,12 @@ again:
2705 return ret; 2799 return ret;
2706} 2800}
2707 2801
2802/*
2803 * make the item pointed to by the path smaller. new_size indicates
2804 * how small to make it, and from_end tells us if we just chop bytes
2805 * off the end of the item or if we shift the item to chop bytes off
2806 * the front.
2807 */
2708int btrfs_truncate_item(struct btrfs_trans_handle *trans, 2808int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2709 struct btrfs_root *root, 2809 struct btrfs_root *root,
2710 struct btrfs_path *path, 2810 struct btrfs_path *path,
@@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2818 return ret; 2918 return ret;
2819} 2919}
2820 2920
2921/*
2922 * make the item pointed to by the path bigger, data_size is the new size.
2923 */
2821int btrfs_extend_item(struct btrfs_trans_handle *trans, 2924int btrfs_extend_item(struct btrfs_trans_handle *trans,
2822 struct btrfs_root *root, struct btrfs_path *path, 2925 struct btrfs_root *root, struct btrfs_path *path,
2823 u32 data_size) 2926 u32 data_size)
@@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
2897} 3000}
2898 3001
2899/* 3002/*
2900 * Given a key and some data, insert an item into the tree. 3003 * Given a key and some data, insert items into the tree.
2901 * This does all the path init required, making room in the tree if needed. 3004 * This does all the path init required, making room in the tree if needed.
2902 */ 3005 */
2903int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3006int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3046/* 3149/*
3047 * delete the pointer from a given node. 3150 * delete the pointer from a given node.
3048 * 3151 *
3049 * If the delete empties a node, the node is removed from the tree, 3152 * the tree should have been previously balanced so the deletion does not
3050 * continuing all the way the root if required. The root is converted into 3153 * empty a node.
3051 * a leaf if all the nodes are emptied.
3052 */ 3154 */
3053static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3155static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3054 struct btrfs_path *path, int level, int slot) 3156 struct btrfs_path *path, int level, int slot)
@@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3233 * search the tree again to find a leaf with lesser keys 3335 * search the tree again to find a leaf with lesser keys
3234 * returns 0 if it found something or 1 if there are no lesser leaves. 3336 * returns 0 if it found something or 1 if there are no lesser leaves.
3235 * returns < 0 on io errors. 3337 * returns < 0 on io errors.
3338 *
3339 * This may release the path, and so you may lose any locks held at the
3340 * time you call it.
3236 */ 3341 */
3237int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 3342int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3238{ 3343{
@@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3265/* 3370/*
3266 * A helper function to walk down the tree starting at min_key, and looking 3371 * A helper function to walk down the tree starting at min_key, and looking
3267 * for nodes or leaves that are either in cache or have a minimum 3372 * for nodes or leaves that are either in cache or have a minimum
3268 * transaction id. This is used by the btree defrag code, but could 3373 * transaction id. This is used by the btree defrag code, and tree logging
3269 * also be used to search for blocks that have changed since a given
3270 * transaction id.
3271 * 3374 *
3272 * This does not cow, but it does stuff the starting key it finds back 3375 * This does not cow, but it does stuff the starting key it finds back
3273 * into min_key, so you can call btrfs_search_slot with cow=1 on the 3376 * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3279 * This honors path->lowest_level to prevent descent past a given level 3382 * This honors path->lowest_level to prevent descent past a given level
3280 * of the tree. 3383 * of the tree.
3281 * 3384 *
3385 * min_trans indicates the oldest transaction that you are interested
3386 * in walking through. Any nodes or leaves older than min_trans are
3387 * skipped over (without reading them).
3388 *
3282 * returns zero if something useful was found, < 0 on error and 1 if there 3389 * returns zero if something useful was found, < 0 on error and 1 if there
3283 * was nothing in the tree that matched the search criteria. 3390 * was nothing in the tree that matched the search criteria.
3284 */ 3391 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0079b60b18f3..ded1643c0273 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,7 +27,6 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <asm/kmap_types.h> 29#include <asm/kmap_types.h>
30#include "bit-radix.h"
31#include "extent_io.h" 30#include "extent_io.h"
32#include "extent_map.h" 31#include "extent_map.h"
33#include "async-thread.h" 32#include "async-thread.h"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e4f30090d640..5040b71f1900 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,14 @@
21#include "hash.h" 21#include "hash.h"
22#include "transaction.h" 22#include "transaction.h"
23 23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
24static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle 32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
25 *trans, 33 *trans,
26 struct btrfs_root *root, 34 struct btrfs_root *root,
@@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
55 return (struct btrfs_dir_item *)ptr; 63 return (struct btrfs_dir_item *)ptr;
56} 64}
57 65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
58int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, 70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
59 struct btrfs_root *root, const char *name, 71 struct btrfs_root *root, const char *name,
60 u16 name_len, const void *data, u16 data_len, 72 u16 name_len, const void *data, u16 data_len,
@@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
109 return ret; 121 return ret;
110} 122}
111 123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
112int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root 131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
113 *root, const char *name, int name_len, u64 dir, 132 *root, const char *name, int name_len, u64 dir,
114 struct btrfs_key *location, u8 type, u64 index) 133 struct btrfs_key *location, u8 type, u64 index)
@@ -184,6 +203,11 @@ out:
184 return 0; 203 return 0;
185} 204}
186 205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
187struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, 211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
188 struct btrfs_root *root, 212 struct btrfs_root *root,
189 struct btrfs_path *path, u64 dir, 213 struct btrfs_path *path, u64 dir,
@@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
222 return btrfs_match_dir_item_name(root, path, name, name_len); 246 return btrfs_match_dir_item_name(root, path, name, name_len);
223} 247}
224 248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
225struct btrfs_dir_item * 257struct btrfs_dir_item *
226btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, 258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
227 struct btrfs_root *root, 259 struct btrfs_root *root,
@@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 return btrfs_match_dir_item_name(root, path, name, name_len); 314 return btrfs_match_dir_item_name(root, path, name, name_len);
283} 315}
284 316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
285struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, 322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
286 struct btrfs_path *path, 323 struct btrfs_path *path,
287 const char *name, int name_len) 324 const char *name, int name_len)
@@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
313 return NULL; 350 return NULL;
314} 351}
315 352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
316int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, 357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
317 struct btrfs_root *root, 358 struct btrfs_root *root,
318 struct btrfs_path *path, 359 struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45b4f7285275..5ee10d3136f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
55static struct extent_io_ops btree_extent_io_ops; 55static struct extent_io_ops btree_extent_io_ops;
56static void end_workqueue_fn(struct btrfs_work *work); 56static void end_workqueue_fn(struct btrfs_work *work);
57 57
58/*
59 * end_io_wq structs are used to do processing in task context when an IO is
60 * complete. This is used during reads to verify checksums, and it is used
61 * by writes to insert metadata for new file extents after IO is complete.
62 */
58struct end_io_wq { 63struct end_io_wq {
59 struct bio *bio; 64 struct bio *bio;
60 bio_end_io_t *end_io; 65 bio_end_io_t *end_io;
@@ -66,6 +71,11 @@ struct end_io_wq {
66 struct btrfs_work work; 71 struct btrfs_work work;
67}; 72};
68 73
74/*
75 * async submit bios are used to offload expensive checksumming
76 * onto the worker threads. They checksum file and metadata bios
77 * just before they are sent down the IO stack.
78 */
69struct async_submit_bio { 79struct async_submit_bio {
70 struct inode *inode; 80 struct inode *inode;
71 struct bio *bio; 81 struct bio *bio;
@@ -76,6 +86,10 @@ struct async_submit_bio {
76 struct btrfs_work work; 86 struct btrfs_work work;
77}; 87};
78 88
89/*
90 * extents on the btree inode are pretty simple, there's one extent
91 * that covers the entire device
92 */
79struct extent_map *btree_get_extent(struct inode *inode, struct page *page, 93struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
80 size_t page_offset, u64 start, u64 len, 94 size_t page_offset, u64 start, u64 len,
81 int create) 95 int create)
@@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result)
151 *(__le32 *)result = ~cpu_to_le32(crc); 165 *(__le32 *)result = ~cpu_to_le32(crc);
152} 166}
153 167
168/*
169 * compute the csum for a btree block, and either verify it or write it
170 * into the csum field of the block.
171 */
154static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 172static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
155 int verify) 173 int verify)
156{ 174{
@@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
204 return 0; 222 return 0;
205} 223}
206 224
225/*
226 * we can't consider a given block up to date unless the transid of the
227 * block matches the transid in the parent node's pointer. This is how we
228 * detect blocks that either didn't get written at all or got written
229 * in the wrong place.
230 */
207static int verify_parent_transid(struct extent_io_tree *io_tree, 231static int verify_parent_transid(struct extent_io_tree *io_tree,
208 struct extent_buffer *eb, u64 parent_transid) 232 struct extent_buffer *eb, u64 parent_transid)
209{ 233{
@@ -228,9 +252,12 @@ out:
228 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, 252 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
229 GFP_NOFS); 253 GFP_NOFS);
230 return ret; 254 return ret;
231
232} 255}
233 256
257/*
258 * helper to read a given tree block, doing retries as required when
259 * the checksums don't match and we have alternate mirrors to try.
260 */
234static int btree_read_extent_buffer_pages(struct btrfs_root *root, 261static int btree_read_extent_buffer_pages(struct btrfs_root *root,
235 struct extent_buffer *eb, 262 struct extent_buffer *eb,
236 u64 start, u64 parent_transid) 263 u64 start, u64 parent_transid)
@@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
260 return -EIO; 287 return -EIO;
261} 288}
262 289
290/*
291 * checksum a dirty tree block before IO. This has extra checks to make
292 * sure we only fill in the checksum field in the first page of a multi-page block
293 */
263int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 294int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
264{ 295{
265 struct extent_io_tree *tree; 296 struct extent_io_tree *tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8bd1b402f3fd..563b2d12f4f2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
914} 914}
915EXPORT_SYMBOL(wait_on_extent_writeback); 915EXPORT_SYMBOL(wait_on_extent_writeback);
916 916
917/*
918 * either insert or lock state struct between start and end use mask to tell
919 * us if waiting is desired.
920 */
917int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 921int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
918{ 922{
919 int err; 923 int err;
@@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
982} 986}
983EXPORT_SYMBOL(set_range_writeback); 987EXPORT_SYMBOL(set_range_writeback);
984 988
989/*
990 * find the first offset in the io tree with 'bits' set. zero is
991 * returned if we find something, and *start_ret and *end_ret are
992 * set to reflect the state struct that was found.
993 *
994 * If nothing was found, 1 is returned, < 0 on error
995 */
985int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 996int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
986 u64 *start_ret, u64 *end_ret, int bits) 997 u64 *start_ret, u64 *end_ret, int bits)
987{ 998{
@@ -1017,6 +1028,10 @@ out:
1017} 1028}
1018EXPORT_SYMBOL(find_first_extent_bit); 1029EXPORT_SYMBOL(find_first_extent_bit);
1019 1030
1031/* find the first state struct with 'bits' set after 'start', and
1032 * return it. tree->lock must be held. NULL will returned if
1033 * nothing was found after 'start'
1034 */
1020struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1035struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1021 u64 start, int bits) 1036 u64 start, int bits)
1022{ 1037{
@@ -1046,8 +1061,14 @@ out:
1046} 1061}
1047EXPORT_SYMBOL(find_first_extent_bit_state); 1062EXPORT_SYMBOL(find_first_extent_bit_state);
1048 1063
1049u64 find_lock_delalloc_range(struct extent_io_tree *tree, 1064/*
1050 u64 *start, u64 *end, u64 max_bytes) 1065 * find a contiguous range of bytes in the file marked as delalloc, not
1066 * more than 'max_bytes'. start and end are used to return the range,
1067 *
1068 * 1 is returned if we find something, 0 if nothing was in the tree
1069 */
1070static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1071 u64 *start, u64 *end, u64 max_bytes)
1051{ 1072{
1052 struct rb_node *node; 1073 struct rb_node *node;
1053 struct extent_state *state; 1074 struct extent_state *state;
@@ -1130,6 +1151,11 @@ out:
1130 return found; 1151 return found;
1131} 1152}
1132 1153
1154/*
1155 * count the number of bytes in the tree that have a given bit(s)
1156 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1157 * cached. The total number found is returned.
1158 */
1133u64 count_range_bits(struct extent_io_tree *tree, 1159u64 count_range_bits(struct extent_io_tree *tree,
1134 u64 *start, u64 search_end, u64 max_bytes, 1160 u64 *start, u64 search_end, u64 max_bytes,
1135 unsigned long bits) 1161 unsigned long bits)
@@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1245} 1271}
1246EXPORT_SYMBOL(unlock_range); 1272EXPORT_SYMBOL(unlock_range);
1247 1273
1274/*
1275 * set the private field for a given byte offset in the tree. If there isn't
1276 * an extent_state there already, this does nothing.
1277 */
1248int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1278int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1249{ 1279{
1250 struct rb_node *node; 1280 struct rb_node *node;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 78ced11d18c7..74b2a29880d3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
114 return NULL; 114 return NULL;
115} 115}
116 116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
117static struct rb_node *__tree_search(struct rb_root *root, u64 offset, 121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
118 struct rb_node **prev_ret, 122 struct rb_node **prev_ret,
119 struct rb_node **next_ret) 123 struct rb_node **next_ret)
@@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
160 return NULL; 164 return NULL;
161} 165}
162 166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
163static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) 171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
164{ 172{
165 struct rb_node *prev; 173 struct rb_node *prev;
@@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
170 return ret; 178 return ret;
171} 179}
172 180
181/* check to see if two extent_map structs are adjacent and safe to merge */
173static int mergable_maps(struct extent_map *prev, struct extent_map *next) 182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
174{ 183{
175 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) 184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@@ -250,6 +259,7 @@ out:
250} 259}
251EXPORT_SYMBOL(add_extent_mapping); 260EXPORT_SYMBOL(add_extent_mapping);
252 261
262/* simple helper to do math around the end of an extent, handling wrap */
253static u64 range_end(u64 start, u64 len) 263static u64 range_end(u64 start, u64 len)
254{ 264{
255 if (start + len < start) 265 if (start + len < start)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1b7e51a9db0f..3088a1184483 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,9 @@
41#include "compat.h" 41#include "compat.h"
42 42
43 43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
44static int noinline btrfs_copy_from_user(loff_t pos, int num_pages, 47static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
45 int write_bytes, 48 int write_bytes,
46 struct page **prepared_pages, 49 struct page **prepared_pages,
@@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
72 return page_fault ? -EFAULT : 0; 75 return page_fault ? -EFAULT : 0;
73} 76}
74 77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
75static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) 81static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
76{ 82{
77 size_t i; 83 size_t i;
78 for (i = 0; i < num_pages; i++) { 84 for (i = 0; i < num_pages; i++) {
79 if (!pages[i]) 85 if (!pages[i])
80 break; 86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
81 ClearPageChecked(pages[i]); 91 ClearPageChecked(pages[i]);
82 unlock_page(pages[i]); 92 unlock_page(pages[i]);
83 mark_page_accessed(pages[i]); 93 mark_page_accessed(pages[i]);
@@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
85 } 95 }
86} 96}
87 97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make room,
100 * otherwise things are inserted as required into the btree
101 */
88static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, 102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root, struct inode *inode, 103 struct btrfs_root *root, struct inode *inode,
90 u64 offset, size_t size, 104 u64 offset, size_t size,
@@ -228,6 +242,14 @@ fail:
228 return err; 242 return err;
229} 243}
230 244
245/*
246 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of
248 * any next extents (if required).
249 *
250 * this also makes the decision about creating an inline extent vs
251 * doing real data extents, marking pages dirty and delalloc as required.
252 */
231static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, 253static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
232 struct btrfs_root *root, 254 struct btrfs_root *root,
233 struct file *file, 255 struct file *file,
@@ -362,6 +384,10 @@ out_unlock:
362 return err; 384 return err;
363} 385}
364 386
387/*
388 * this drops all the extents in the cache that intersect the range
389 * [start, end]. Existing extents are split as required.
390 */
365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 391int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
366 int skip_pinned) 392 int skip_pinned)
367{ 393{
@@ -536,6 +562,9 @@ out:
536 * If an extent intersects the range but is not entirely inside the range 562 * If an extent intersects the range but is not entirely inside the range
537 * it is either truncated or split. Anything entirely inside the range 563 * it is either truncated or split. Anything entirely inside the range
538 * is deleted from the tree. 564 * is deleted from the tree.
565 *
566 * inline_limit is used to tell this code which offsets in the file to keep
567 * if they contain inline extents.
539 */ 568 */
540int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, 569int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
541 struct btrfs_root *root, struct inode *inode, 570 struct btrfs_root *root, struct inode *inode,
@@ -796,7 +825,9 @@ out:
796} 825}
797 826
798/* 827/*
799 * this gets pages into the page cache and locks them down 828 * this gets pages into the page cache and locks them down, it also properly
829 * waits for data=ordered extents to finish before allowing the pages to be
830 * modified.
800 */ 831 */
801static int noinline prepare_pages(struct btrfs_root *root, struct file *file, 832static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
802 struct page **pages, size_t num_pages, 833 struct page **pages, size_t num_pages,
@@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
1034 return 0; 1065 return 0;
1035} 1066}
1036 1067
1068/*
1069 * fsync call for both files and directories. This logs the inode into
1070 * the tree log instead of forcing full commits whenever possible.
1071 *
1072 * It needs to call filemap_fdatawait so that all ordered extent updates are
1073 * in the metadata btree are up to date for copying to the log.
1074 *
1075 * It drops the inode mutex before doing the tree log commit. This is an
1076 * important optimization for directories because holding the mutex prevents
1077 * new operations on the dir while we write to disk.
1078 */
1037int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1079int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1038{ 1080{
1039 struct inode *inode = dentry->d_inode; 1081 struct inode *inode = dentry->d_inode;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 404704d26822..f3abecc2d14c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83 83
84static void btrfs_truncate(struct inode *inode); 84static void btrfs_truncate(struct inode *inode);
85 85
86/*
87 * a very lame attempt at stopping writes when the FS is 85% full. There
88 * are countless ways this is incorrect, but it is better than nothing.
89 */
86int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, 90int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
87 int for_del) 91 int for_del)
88{ 92{
@@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
108 return ret; 112 return ret;
109} 113}
110 114
115/*
116 * when extent_io.c finds a delayed allocation range in the file,
117 * the call backs end up in this code. The basic idea is to
118 * allocate extents on disk for the range, and create ordered data structs
119 * in ram to track those extents.
120 */
111static int cow_file_range(struct inode *inode, u64 start, u64 end) 121static int cow_file_range(struct inode *inode, u64 start, u64 end)
112{ 122{
113 struct btrfs_root *root = BTRFS_I(inode)->root; 123 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -185,6 +195,13 @@ out:
185 return ret; 195 return ret;
186} 196}
187 197
198/*
199 * when nowcow writeback call back. This checks for snapshots or COW copies
200 * of the extents that exist in the file, and COWs the file as required.
201 *
202 * If no cow copies or snapshots exist, we write directly to the existing
203 * blocks on disk
204 */
188static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 205static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
189{ 206{
190 u64 extent_start; 207 u64 extent_start;
@@ -291,6 +308,9 @@ out:
291 return err; 308 return err;
292} 309}
293 310
311/*
312 * extent_io.c call back to do delayed allocation processing
313 */
294static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 314static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
295{ 315{
296 struct btrfs_root *root = BTRFS_I(inode)->root; 316 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
305 return ret; 325 return ret;
306} 326}
307 327
328/*
329 * extent_io.c set_bit_hook, used to track delayed allocation
330 * bytes in this file, and to maintain the list of inodes that
331 * have pending delalloc work to be done.
332 */
308int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 333int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
309 unsigned long old, unsigned long bits) 334 unsigned long old, unsigned long bits)
310{ 335{
@@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
323 return 0; 348 return 0;
324} 349}
325 350
351/*
352 * extent_io.c clear_bit_hook, see set_bit_hook for why
353 */
326int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 354int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
327 unsigned long old, unsigned long bits) 355 unsigned long old, unsigned long bits)
328{ 356{
@@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
349 return 0; 377 return 0;
350} 378}
351 379
380/*
381 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
382 * we don't create bios that span stripes or chunks
383 */
352int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 384int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
353 size_t size, struct bio *bio) 385 size_t size, struct bio *bio)
354{ 386{
@@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
371 return 0; 403 return 0;
372} 404}
373 405
406/*
407 * in order to insert checksums into the metadata in large chunks,
408 * we wait until bio submission time. All the pages in the bio are
409 * checksummed and sums are attached onto the ordered extent record.
410 *
411 * At IO completion time the cums attached on the ordered extent record
412 * are inserted into the btree
413 */
374int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 414int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
375 int mirror_num) 415 int mirror_num)
376{ 416{
@@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
383 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 423 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
384} 424}
385 425
426/*
427 * extent_io.c submission hook. This does the right thing for csum calculation on write,
428 * or reading the csums from the tree before a read
429 */
386int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 430int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
387 int mirror_num) 431 int mirror_num)
388{ 432{
@@ -408,6 +452,10 @@ mapit:
408 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 452 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
409} 453}
410 454
455/*
456 * given a list of ordered sums record them in the inode. This happens
457 * at IO completion time based on sums calculated at bio submission time.
458 */
411static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 459static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
412 struct inode *inode, u64 file_offset, 460 struct inode *inode, u64 file_offset,
413 struct list_head *list) 461 struct list_head *list)
@@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
430 GFP_NOFS); 478 GFP_NOFS);
431} 479}
432 480
481/* see btrfs_writepage_start_hook for details on why this is required */
433struct btrfs_writepage_fixup { 482struct btrfs_writepage_fixup {
434 struct page *page; 483 struct page *page;
435 struct btrfs_work work; 484 struct btrfs_work work;
436}; 485};
437 486
438/* see btrfs_writepage_start_hook for details on why this is required */
439void btrfs_writepage_fixup_worker(struct btrfs_work *work) 487void btrfs_writepage_fixup_worker(struct btrfs_work *work)
440{ 488{
441 struct btrfs_writepage_fixup *fixup; 489 struct btrfs_writepage_fixup *fixup;
@@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
522 return -EAGAIN; 570 return -EAGAIN;
523} 571}
524 572
573/* as ordered data IO finishes, this gets called so we can finish
574 * an ordered extent if the range of bytes in the file it covers are
575 * fully written.
576 */
525static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 577static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
526{ 578{
527 struct btrfs_root *root = BTRFS_I(inode)->root; 579 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
631 return btrfs_finish_ordered_io(page->mapping->host, start, end); 683 return btrfs_finish_ordered_io(page->mapping->host, start, end);
632} 684}
633 685
686/*
687 * When IO fails, either with EIO or csum verification fails, we
688 * try other mirrors that might have a good copy of the data. This
689 * io_failure_record is used to record state as we go through all the
690 * mirrors. If another mirror has good data, the page is set up to date
691 * and things continue. If a good mirror can't be found, the original
692 * bio end_io callback is called to indicate things have failed.
693 */
634struct io_failure_record { 694struct io_failure_record {
635 struct page *page; 695 struct page *page;
636 u64 start; 696 u64 start;
@@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
725 return 0; 785 return 0;
726} 786}
727 787
788/*
789 * each time an IO finishes, we do a fast check in the IO failure tree
790 * to see if we need to process or clean up an io_failure_record
791 */
728int btrfs_clean_io_failures(struct inode *inode, u64 start) 792int btrfs_clean_io_failures(struct inode *inode, u64 start)
729{ 793{
730 u64 private; 794 u64 private;
@@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
753 return 0; 817 return 0;
754} 818}
755 819
820/*
821 * when reads are done, we need to check csums to verify the data is correct
822 * if there's a match, we allow the bio to finish. If not, we go through
823 * the io_failure_record routines to find good copies
824 */
756int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 825int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
757 struct extent_state *state) 826 struct extent_state *state)
758{ 827{
@@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
990 btrfs_free_path(path); 1059 btrfs_free_path(path);
991} 1060}
992 1061
1062/*
1063 * read an inode from the btree into the in-memory inode
1064 */
993void btrfs_read_locked_inode(struct inode *inode) 1065void btrfs_read_locked_inode(struct inode *inode)
994{ 1066{
995 struct btrfs_path *path; 1067 struct btrfs_path *path;
@@ -1083,6 +1155,9 @@ make_bad:
1083 make_bad_inode(inode); 1155 make_bad_inode(inode);
1084} 1156}
1085 1157
1158/*
1159 * given a leaf and an inode, copy the inode fields into the leaf
1160 */
1086static void fill_inode_item(struct btrfs_trans_handle *trans, 1161static void fill_inode_item(struct btrfs_trans_handle *trans,
1087 struct extent_buffer *leaf, 1162 struct extent_buffer *leaf,
1088 struct btrfs_inode_item *item, 1163 struct btrfs_inode_item *item,
@@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
1118 BTRFS_I(inode)->block_group->key.objectid); 1193 BTRFS_I(inode)->block_group->key.objectid);
1119} 1194}
1120 1195
1196/*
1197 * copy everything in the in-memory inode into the btree.
1198 */
1121int noinline btrfs_update_inode(struct btrfs_trans_handle *trans, 1199int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
1122 struct btrfs_root *root, 1200 struct btrfs_root *root,
1123 struct inode *inode) 1201 struct inode *inode)
@@ -1151,6 +1229,11 @@ failed:
1151} 1229}
1152 1230
1153 1231
1232/*
1233 * unlink helper that gets used here in inode.c and in the tree logging
1234 * recovery code. It remove a link in a directory with a given name, and
1235 * also drops the back refs in the inode to the directory
1236 */
1154int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 1237int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1155 struct btrfs_root *root, 1238 struct btrfs_root *root,
1156 struct inode *dir, struct inode *inode, 1239 struct inode *dir, struct inode *inode,
@@ -1309,7 +1392,7 @@ fail:
1309/* 1392/*
1310 * this can truncate away extent items, csum items and directory items. 1393 * this can truncate away extent items, csum items and directory items.
1311 * It starts at a high offset and removes keys until it can't find 1394 * It starts at a high offset and removes keys until it can't find
1312 * any higher than i_size. 1395 * any higher than new_size
1313 * 1396 *
1314 * csum items that cross the new i_size are truncated to the new size 1397 * csum items that cross the new i_size are truncated to the new size
1315 * as well. 1398 * as well.
@@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)
2123 btrfs_end_transaction(trans, root); 2206 btrfs_end_transaction(trans, root);
2124} 2207}
2125 2208
2209/*
2210 * find the highest existing sequence number in a directory
2211 * and then set the in-memory index_cnt variable to reflect
2212 * free sequence numbers
2213 */
2126static int btrfs_set_inode_index_count(struct inode *inode) 2214static int btrfs_set_inode_index_count(struct inode *inode)
2127{ 2215{
2128 struct btrfs_root *root = BTRFS_I(inode)->root; 2216 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2175,6 +2263,10 @@ out:
2175 return ret; 2263 return ret;
2176} 2264}
2177 2265
2266/*
2267 * helper to find a free sequence number in a given directory. This current
2268 * code is very simple, later versions will do smarter things in the btree
2269 */
2178static int btrfs_set_inode_index(struct inode *dir, struct inode *inode, 2270static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
2179 u64 *index) 2271 u64 *index)
2180{ 2272{
@@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)
2305 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 2397 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
2306} 2398}
2307 2399
2400/*
2401 * utility function to add 'inode' into 'parent_inode' with
2402 * a give name and a given sequence number.
2403 * if 'add_backref' is true, also insert a backref from the
2404 * inode to the parent directory.
2405 */
2308int btrfs_add_link(struct btrfs_trans_handle *trans, 2406int btrfs_add_link(struct btrfs_trans_handle *trans,
2309 struct inode *parent_inode, struct inode *inode, 2407 struct inode *parent_inode, struct inode *inode,
2310 const char *name, int name_len, int add_backref, u64 index) 2408 const char *name, int name_len, int add_backref, u64 index)
@@ -2611,6 +2709,10 @@ out_unlock:
2611 return err; 2709 return err;
2612} 2710}
2613 2711
2712/* helper for btfs_get_extent. Given an existing extent in the tree,
2713 * and an extent that you want to insert, deal with overlap and insert
2714 * the new extent into the tree.
2715 */
2614static int merge_extent_mapping(struct extent_map_tree *em_tree, 2716static int merge_extent_mapping(struct extent_map_tree *em_tree,
2615 struct extent_map *existing, 2717 struct extent_map *existing,
2616 struct extent_map *em, 2718 struct extent_map *em,
@@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2627 return add_extent_mapping(em_tree, em); 2729 return add_extent_mapping(em_tree, em);
2628} 2730}
2629 2731
2732/*
2733 * a bit scary, this does extent mapping from logical file offset to the disk.
2734 * the ugly parts come from merging extents from the disk with the
2735 * in-ram representation. This gets more complex because of the data=ordered code,
2736 * where the in-ram extents might be locked pending data=ordered completion.
2737 *
2738 * This also copies inline extents directly into the page.
2739 */
2630struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2740struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2631 size_t pg_offset, u64 start, u64 len, 2741 size_t pg_offset, u64 start, u64 len,
2632 int create) 2742 int create)
@@ -2869,76 +2979,11 @@ out:
2869 return em; 2979 return em;
2870} 2980}
2871 2981
2872#if 0 /* waiting for O_DIRECT reads */
2873static int btrfs_get_block(struct inode *inode, sector_t iblock,
2874 struct buffer_head *bh_result, int create)
2875{
2876 struct extent_map *em;
2877 u64 start = (u64)iblock << inode->i_blkbits;
2878 struct btrfs_multi_bio *multi = NULL;
2879 struct btrfs_root *root = BTRFS_I(inode)->root;
2880 u64 len;
2881 u64 logical;
2882 u64 map_length;
2883 int ret = 0;
2884
2885 em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
2886
2887 if (!em || IS_ERR(em))
2888 goto out;
2889
2890 if (em->start > start || em->start + em->len <= start) {
2891 goto out;
2892 }
2893
2894 if (em->block_start == EXTENT_MAP_INLINE) {
2895 ret = -EINVAL;
2896 goto out;
2897 }
2898
2899 len = em->start + em->len - start;
2900 len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
2901
2902 if (em->block_start == EXTENT_MAP_HOLE ||
2903 em->block_start == EXTENT_MAP_DELALLOC) {
2904 bh_result->b_size = len;
2905 goto out;
2906 }
2907
2908 logical = start - em->start;
2909 logical = em->block_start + logical;
2910
2911 map_length = len;
2912 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2913 logical, &map_length, &multi, 0);
2914 BUG_ON(ret);
2915 bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
2916 bh_result->b_size = min(map_length, len);
2917
2918 bh_result->b_bdev = multi->stripes[0].dev->bdev;
2919 set_buffer_mapped(bh_result);
2920 kfree(multi);
2921out:
2922 free_extent_map(em);
2923 return ret;
2924}
2925#endif
2926
2927static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 2982static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
2928 const struct iovec *iov, loff_t offset, 2983 const struct iovec *iov, loff_t offset,
2929 unsigned long nr_segs) 2984 unsigned long nr_segs)
2930{ 2985{
2931 return -EINVAL; 2986 return -EINVAL;
2932#if 0
2933 struct file *file = iocb->ki_filp;
2934 struct inode *inode = file->f_mapping->host;
2935
2936 if (rw == WRITE)
2937 return -EINVAL;
2938
2939 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2940 offset, nr_segs, btrfs_get_block, NULL);
2941#endif
2942} 2987}
2943 2988
2944static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 2989static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
3202 } 3247 }
3203} 3248}
3204 3249
3250/*
3251 * create a new subvolume directory/inode (helper for the ioctl).
3252 */
3205int btrfs_create_subvol_root(struct btrfs_root *new_root, 3253int btrfs_create_subvol_root(struct btrfs_root *new_root,
3206 struct btrfs_trans_handle *trans, u64 new_dirid, 3254 struct btrfs_trans_handle *trans, u64 new_dirid,
3207 struct btrfs_block_group_cache *block_group) 3255 struct btrfs_block_group_cache *block_group)
@@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
3223 return btrfs_update_inode(trans, new_root, inode); 3271 return btrfs_update_inode(trans, new_root, inode);
3224} 3272}
3225 3273
3274/* helper function for file defrag and space balancing. This
3275 * forces readahead on a given range of bytes in an inode
3276 */
3226unsigned long btrfs_force_ra(struct address_space *mapping, 3277unsigned long btrfs_force_ra(struct address_space *mapping,
3227 struct file_ra_state *ra, struct file *file, 3278 struct file_ra_state *ra, struct file *file,
3228 pgoff_t offset, pgoff_t last_index) 3279 pgoff_t offset, pgoff_t last_index)
@@ -3424,6 +3475,10 @@ out_unlock:
3424 return ret; 3475 return ret;
3425} 3476}
3426 3477
3478/*
3479 * some fairly slow code that needs optimization. This walks the list
3480 * of all the inodes with pending delalloc and forces them to disk.
3481 */
3427int btrfs_start_delalloc_inodes(struct btrfs_root *root) 3482int btrfs_start_delalloc_inodes(struct btrfs_root *root)
3428{ 3483{
3429 struct list_head *head = &root->fs_info->delalloc_inodes; 3484 struct list_head *head = &root->fs_info->delalloc_inodes;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0cc314c10d66..e30aa6e2958f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,6 +25,15 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree searches,
35 * and we should give up if they are in more expensive code.
36 */
28int btrfs_tree_lock(struct extent_buffer *eb) 37int btrfs_tree_lock(struct extent_buffer *eb)
29{ 38{
30 int i; 39 int i;
@@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb)
57 return mutex_is_locked(&eb->mutex); 66 return mutex_is_locked(&eb->mutex);
58} 67}
59 68
69/*
70 * btrfs_search_slot uses this to decide if it should drop its locks
71 * before doing something expensive like allocating free blocks for cow.
72 */
60int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 73int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
61{ 74{
62 int i; 75 int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 951eacff2420..dcc1730dd837 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -26,7 +26,6 @@
26#include "btrfs_inode.h" 26#include "btrfs_inode.h"
27#include "extent_io.h" 27#include "extent_io.h"
28 28
29
30static u64 entry_end(struct btrfs_ordered_extent *entry) 29static u64 entry_end(struct btrfs_ordered_extent *entry)
31{ 30{
32 if (entry->file_offset + entry->len < entry->file_offset) 31 if (entry->file_offset + entry->len < entry->file_offset)
@@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
34 return entry->file_offset + entry->len; 33 return entry->file_offset + entry->len;
35} 34}
36 35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
37static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, 39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
38 struct rb_node *node) 40 struct rb_node *node)
39{ 41{
@@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
58 return NULL; 60 return NULL;
59} 61}
60 62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
61static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, 67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
62 struct rb_node **prev_ret) 68 struct rb_node **prev_ret)
63{ 69{
@@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
108 return NULL; 114 return NULL;
109} 115}
110 116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
111static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) 120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
112{ 121{
113 if (file_offset < entry->file_offset || 122 if (file_offset < entry->file_offset ||
@@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
116 return 1; 125 return 1;
117} 126}
118 127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
119static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, 132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
120 u64 file_offset) 133 u64 file_offset)
121{ 134{
@@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode,
305 return 0; 318 return 0;
306} 319}
307 320
321/*
322 * wait for all the ordered extents in a root. This is done when balancing
323 * space between drives.
324 */
308int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) 325int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
309{ 326{
310 struct list_head splice; 327 struct list_head splice;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 30fcb7aea5b5..a50ebb67055d 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,6 +21,16 @@
21#include "ref-cache.h" 21#include "ref-cache.h"
22#include "transaction.h" 22#include "transaction.h"
23 23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
24struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, 34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
25 int nr_extents) 35 int nr_extents)
26{ 36{
@@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
40 return ref; 50 return ref;
41} 51}
42 52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
43void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) 57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
44{ 58{
45 if (!ref) 59 if (!ref)
@@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
135 return 0; 149 return 0;
136} 150}
137 151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
138struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, 156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
139 u64 bytenr) 157 u64 bytenr)
140{ 158{
@@ -160,6 +178,10 @@ again:
160 return NULL; 178 return NULL;
161} 179}
162 180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
163int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, 185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
164 int shared) 186 int shared)
165{ 187{
@@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
184 return ret; 206 return ret;
185} 207}
186 208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
187int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) 213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
188{ 214{
189 struct btrfs_leaf_ref_tree *tree; 215 struct btrfs_leaf_ref_tree *tree;
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 617564787f52..16f3183d7c59 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -19,8 +19,11 @@
19#define __REFCACHE__ 19#define __REFCACHE__
20 20
21struct btrfs_extent_info { 21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
22 u64 bytenr; 23 u64 bytenr;
23 u64 num_bytes; 24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
24 u64 objectid; 27 u64 objectid;
25 u64 offset; 28 u64 offset;
26}; 29};
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0091c01abb06..eb7f7655e9d5 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,8 +22,10 @@
22#include "print-tree.h" 22#include "print-tree.h"
23 23
24/* 24/*
25 * returns 0 on finding something, 1 if no more roots are there 25 * search forward for a root, starting with objectid 'search_start'
26 * and < 0 on error 26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
27 */ 29 */
28int btrfs_search_root(struct btrfs_root *root, u64 search_start, 30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
29 u64 *found_objectid) 31 u64 *found_objectid)
@@ -66,6 +68,11 @@ out:
66 return ret; 68 return ret;
67} 69}
68 70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
69int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
70 struct btrfs_root_item *item, struct btrfs_key *key) 77 struct btrfs_root_item *item, struct btrfs_key *key)
71{ 78{
@@ -104,6 +111,9 @@ out:
104 return ret; 111 return ret;
105} 112}
106 113
114/*
115 * copy the data in 'item' into the btree
116 */
107int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root 117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
108 *root, struct btrfs_key *key, struct btrfs_root_item 118 *root, struct btrfs_key *key, struct btrfs_root_item
109 *item) 119 *item)
@@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
147 return ret; 157 return ret;
148} 158}
149 159
160/*
161 * at mount time we want to find all the old transaction snapshots that were in
162 * the process of being deleted if we crashed. This is any root item with an offset
163 * lower than the latest root. They need to be queued for deletion to finish
164 * what was happening when we crashed.
165 */
150int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 166int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
151 struct btrfs_root *latest) 167 struct btrfs_root *latest)
152{ 168{
@@ -227,6 +243,7 @@ err:
227 return ret; 243 return ret;
228} 244}
229 245
246/* drop the root item for 'key' from 'root' */
230int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 247int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
231 struct btrfs_key *key) 248 struct btrfs_key *key)
232{ 249{
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index ad03a32d1116..cdedbe144d45 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,6 +17,27 @@
17 */ 17 */
18 18
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 */
40
20#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ 41#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
21u##bits btrfs_##name(struct extent_buffer *eb, \ 42u##bits btrfs_##name(struct extent_buffer *eb, \
22 type *s) \ 43 type *s) \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8399d6d05d63..2e6039825b7b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = {
519 .fs_flags = FS_REQUIRES_DEV, 519 .fs_flags = FS_REQUIRES_DEV,
520}; 520};
521 521
522/*
523 * used by btrfsctl to scan devices when no FS is mounted
524 */
522static long btrfs_control_ioctl(struct file *file, unsigned int cmd, 525static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
523 unsigned long arg) 526 unsigned long arg)
524{ 527{
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 444abe0796ae..11266d68a6c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
46 } 46 }
47} 47}
48 48
49/*
50 * either allocate a new transaction or hop into the existing one
51 */
49static noinline int join_transaction(struct btrfs_root *root) 52static noinline int join_transaction(struct btrfs_root *root)
50{ 53{
51 struct btrfs_transaction *cur_trans; 54 struct btrfs_transaction *cur_trans;
@@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root)
85 return 0; 88 return 0;
86} 89}
87 90
91/*
92 * this does all the record keeping required to make sure that a
93 * reference counted root is properly recorded in a given transaction.
94 * This is required to make sure the old root from before we joined the transaction
95 * is deleted when the transaction commits
96 */
88noinline int btrfs_record_root_in_trans(struct btrfs_root *root) 97noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
89{ 98{
90 struct btrfs_dirty_root *dirty; 99 struct btrfs_dirty_root *dirty;
@@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
127 return 0; 136 return 0;
128} 137}
129 138
139/* wait for commit against the current transaction to become unblocked
140 * when this is done, it is safe to start a new transaction, but the current
141 * transaction might not be fully on disk.
142 */
130static void wait_current_trans(struct btrfs_root *root) 143static void wait_current_trans(struct btrfs_root *root)
131{ 144{
132 struct btrfs_transaction *cur_trans; 145 struct btrfs_transaction *cur_trans;
@@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
198 return start_transaction(r, num_blocks, 2); 211 return start_transaction(r, num_blocks, 2);
199} 212}
200 213
201 214/* wait for a transaction commit to be fully complete */
202static noinline int wait_for_commit(struct btrfs_root *root, 215static noinline int wait_for_commit(struct btrfs_root *root,
203 struct btrfs_transaction *commit) 216 struct btrfs_transaction *commit)
204{ 217{
@@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root,
218 return 0; 231 return 0;
219} 232}
220 233
234/*
235 * rate limit against the drop_snapshot code. This helps to slow down new operations
236 * if the drop_snapshot code isn't able to keep up.
237 */
221static void throttle_on_drops(struct btrfs_root *root) 238static void throttle_on_drops(struct btrfs_root *root)
222{ 239{
223 struct btrfs_fs_info *info = root->fs_info; 240 struct btrfs_fs_info *info = root->fs_info;
@@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
302 return __btrfs_end_transaction(trans, root, 1); 319 return __btrfs_end_transaction(trans, root, 1);
303} 320}
304 321
305 322/*
323 * when btree blocks are allocated, they have some corresponding bits set for
324 * them in one of two extent_io trees. This is used to make sure all of
325 * those extents are on disk for transaction or log commit
326 */
306int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 327int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
307 struct extent_io_tree *dirty_pages) 328 struct extent_io_tree *dirty_pages)
308{ 329{
@@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
393 &trans->transaction->dirty_pages); 414 &trans->transaction->dirty_pages);
394} 415}
395 416
417/*
418 * this is used to update the root pointer in the tree of tree roots.
419 *
420 * But, in the case of the extent allocation tree, updating the root
421 * pointer may allocate blocks which may change the root of the extent
422 * allocation tree.
423 *
424 * So, this loops and repeats and makes sure the cowonly root didn't
425 * change while the root pointer was being updated in the metadata.
426 */
396static int update_cowonly_root(struct btrfs_trans_handle *trans, 427static int update_cowonly_root(struct btrfs_trans_handle *trans,
397 struct btrfs_root *root) 428 struct btrfs_root *root)
398{ 429{
@@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
418 return 0; 449 return 0;
419} 450}
420 451
452/*
453 * update all the cowonly tree roots on disk
454 */
421int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 455int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
422 struct btrfs_root *root) 456 struct btrfs_root *root)
423{ 457{
@@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
433 return 0; 467 return 0;
434} 468}
435 469
470/*
471 * dead roots are old snapshots that need to be deleted. This allocates
472 * a dirty root struct and adds it into the list of dead roots that need to
473 * be deleted
474 */
436int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) 475int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
437{ 476{
438 struct btrfs_dirty_root *dirty; 477 struct btrfs_dirty_root *dirty;
@@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
449 return 0; 488 return 0;
450} 489}
451 490
491/*
492 * at transaction commit time we need to schedule the old roots for
493 * deletion via btrfs_drop_snapshot. This runs through all the
494 * reference counted roots that were modified in the current
495 * transaction and puts them into the drop list
496 */
452static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, 497static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
453 struct radix_tree_root *radix, 498 struct radix_tree_root *radix,
454 struct list_head *list) 499 struct list_head *list)
@@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
541 return err; 586 return err;
542} 587}
543 588
589/*
590 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
591 * otherwise every leaf in the btree is read and defragged.
592 */
544int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 593int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
545{ 594{
546 struct btrfs_fs_info *info = root->fs_info; 595 struct btrfs_fs_info *info = root->fs_info;
@@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
570 return 0; 619 return 0;
571} 620}
572 621
622/*
623 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
624 * all of them
625 */
573static noinline int drop_dirty_roots(struct btrfs_root *tree_root, 626static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
574 struct list_head *list) 627 struct list_head *list)
575{ 628{
@@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
664 return ret; 717 return ret;
665} 718}
666 719
720/*
721 * new snapshots need to be created at a very specific time in the
722 * transaction commit. This does the actual creation
723 */
667static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, 724static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
668 struct btrfs_fs_info *fs_info, 725 struct btrfs_fs_info *fs_info,
669 struct btrfs_pending_snapshot *pending) 726 struct btrfs_pending_snapshot *pending)
@@ -734,6 +791,9 @@ fail:
734 return ret; 791 return ret;
735} 792}
736 793
794/*
795 * create all the snapshots we've scheduled for creation
796 */
737static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, 797static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
738 struct btrfs_fs_info *fs_info) 798 struct btrfs_fs_info *fs_info)
739{ 799{
@@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
944 return ret; 1004 return ret;
945} 1005}
946 1006
1007/*
1008 * interface function to delete all the snapshots we have scheduled for deletion
1009 */
947int btrfs_clean_old_snapshots(struct btrfs_root *root) 1010int btrfs_clean_old_snapshots(struct btrfs_root *root)
948{ 1011{
949 struct list_head dirty_roots; 1012 struct list_head dirty_roots;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b3bb5bbad76e..6f57d0889b1e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,6 +23,10 @@
23#include "transaction.h" 23#include "transaction.h"
24#include "locking.h" 24#include "locking.h"
25 25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read things
27 * from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
26int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 30int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
27 struct btrfs_root *root, int cache_only) 31 struct btrfs_root *root, int cache_only)
28{ 32{