Diffstat (limited to 'fs')
-rw-r--r--  fs/aio.c | 11
-rw-r--r--  fs/autofs4/waitq.c | 2
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/btrfs/Kconfig | 19
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/backref.c | 1131
-rw-r--r--  fs/btrfs/backref.h | 5
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/check-integrity.c | 3068
-rw-r--r--  fs/btrfs/check-integrity.h | 36
-rw-r--r--  fs/btrfs/ctree.c | 42
-rw-r--r--  fs/btrfs/ctree.h | 239
-rw-r--r--  fs/btrfs/delayed-inode.c | 45
-rw-r--r--  fs/btrfs/delayed-ref.c | 153
-rw-r--r--  fs/btrfs/delayed-ref.h | 104
-rw-r--r--  fs/btrfs/disk-io.c | 124
-rw-r--r--  fs/btrfs/disk-io.h | 6
-rw-r--r--  fs/btrfs/export.c | 2
-rw-r--r--  fs/btrfs/extent-tree.c | 465
-rw-r--r--  fs/btrfs/extent_io.c | 6
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/file.c | 11
-rw-r--r--  fs/btrfs/free-space-cache.c | 417
-rw-r--r--  fs/btrfs/inode-map.c | 4
-rw-r--r--  fs/btrfs/inode.c | 66
-rw-r--r--  fs/btrfs/ioctl.c | 268
-rw-r--r--  fs/btrfs/ioctl.h | 54
-rw-r--r--  fs/btrfs/locking.c | 53
-rw-r--r--  fs/btrfs/relocation.c | 20
-rw-r--r--  fs/btrfs/scrub.c | 12
-rw-r--r--  fs/btrfs/super.c | 190
-rw-r--r--  fs/btrfs/transaction.c | 20
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/btrfs/ulist.c | 220
-rw-r--r--  fs/btrfs/ulist.h | 68
-rw-r--r--  fs/btrfs/volumes.c | 993
-rw-r--r--  fs/btrfs/volumes.h | 54
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/ceph/dir.c | 76
-rw-r--r--  fs/ceph/export.c | 6
-rw-r--r--  fs/ceph/inode.c | 3
-rw-r--r--  fs/ceph/mds_client.c | 4
-rw-r--r--  fs/ceph/super.c | 16
-rw-r--r--  fs/ceph/super.h | 1
-rw-r--r--  fs/ceph/xattr.c | 22
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 11
-rw-r--r--  fs/direct-io.c | 57
-rw-r--r--  fs/eventpoll.c | 234
-rw-r--r--  fs/fuse/dev.c | 57
-rw-r--r--  fs/fuse/dir.c | 58
-rw-r--r--  fs/fuse/file.c | 58
-rw-r--r--  fs/fuse/fuse_i.h | 10
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/gfs2/glock.h | 7
-rw-r--r--  fs/gfs2/incore.h | 60
-rw-r--r--  fs/gfs2/inode.c | 4
-rw-r--r--  fs/gfs2/lock_dlm.c | 993
-rw-r--r--  fs/gfs2/main.c | 10
-rw-r--r--  fs/gfs2/ops_fstype.c | 31
-rw-r--r--  fs/gfs2/recovery.c | 11
-rw-r--r--  fs/gfs2/rgrp.c | 2
-rw-r--r--  fs/gfs2/sys.c | 33
-rw-r--r--  fs/gfs2/sys.h | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 3
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/ioprio.c | 24
-rw-r--r--  fs/lockd/mon.c | 2
-rw-r--r--  fs/mpage.c | 4
-rw-r--r--  fs/namei.c | 28
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 202
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 12
-rw-r--r--  fs/nfs/blocklayout/extents.c | 176
-rw-r--r--  fs/nfs/callback.h | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 4
-rw-r--r--  fs/nfs/client.c | 2
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 2
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  fs/nfsd/Kconfig | 10
-rw-r--r--  fs/nfsd/Makefile | 1
-rw-r--r--  fs/nfsd/export.c | 12
-rw-r--r--  fs/nfsd/fault_inject.c | 91
-rw-r--r--  fs/nfsd/fault_inject.h | 28
-rw-r--r--  fs/nfsd/nfs4idmap.c | 11
-rw-r--r--  fs/nfsd/nfs4proc.c | 7
-rw-r--r--  fs/nfsd/nfs4recover.c | 22
-rw-r--r--  fs/nfsd/nfs4state.c | 328
-rw-r--r--  fs/nfsd/nfs4xdr.c | 3
-rw-r--r--  fs/nfsd/nfsctl.c | 10
-rw-r--r--  fs/nfsd/nfsd.h | 20
-rw-r--r--  fs/nfsd/state.h | 3
-rw-r--r--  fs/nfsd/vfs.c | 17
-rw-r--r--  fs/notify/mark.c | 8
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/pipe.c | 2
-rw-r--r--  fs/proc/array.c | 9
-rw-r--r--  fs/proc/base.c | 152
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/qnx4/inode.c | 62
-rw-r--r--  fs/squashfs/cache.c | 30
-rw-r--r--  fs/squashfs/inode.c | 4
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 1
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/ubifs/debug.c | 90
-rw-r--r--  fs/ubifs/debug.h | 75
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/replay.c | 8
-rw-r--r--  fs/ubifs/tnc.c | 55
-rw-r--r--  fs/ubifs/tnc_misc.c | 10
-rw-r--r--  fs/xfs/xfs_aops.c | 29
-rw-r--r--  fs/xfs/xfs_attr.c | 4
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 9
-rw-r--r--  fs/xfs/xfs_bmap.c | 116
-rw-r--r--  fs/xfs/xfs_dfrag.c | 43
-rw-r--r--  fs/xfs/xfs_file.c | 184
-rw-r--r--  fs/xfs/xfs_fs_subr.c | 2
-rw-r--r--  fs/xfs/xfs_iget.c | 24
-rw-r--r--  fs/xfs/xfs_inode.c | 193
-rw-r--r--  fs/xfs/xfs_inode.h | 114
-rw-r--r--  fs/xfs/xfs_inode_item.c | 8
-rw-r--r--  fs/xfs/xfs_iomap.c | 46
-rw-r--r--  fs/xfs/xfs_iops.c | 46
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 8
-rw-r--r--  fs/xfs/xfs_super.c | 8
-rw-r--r--  fs/xfs/xfs_sync.c | 9
-rw-r--r--  fs/xfs/xfs_trace.h | 29
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 44
131 files changed, 9750 insertions(+), 2416 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd212..969beb0e2231 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
         batch->count = total;
 }

-static void kiocb_batch_free(struct kiocb_batch *batch)
+static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
 {
         struct kiocb *req, *n;

+        if (list_empty(&batch->head))
+                return;
+
+        spin_lock_irq(&ctx->ctx_lock);
         list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
                 list_del(&req->ki_batch);
+                list_del(&req->ki_list);
                 kmem_cache_free(kiocb_cachep, req);
+                ctx->reqs_active--;
         }
+        spin_unlock_irq(&ctx->ctx_lock);
 }

 /*
@@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
         }
         blk_finish_plug(&plug);

-        kiocb_batch_free(&batch);
+        kiocb_batch_free(ctx, &batch);
         put_ioctx(ctx);
         return i ? i : ret;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 9ef5b2914407..da8876d38a7b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -76,7 +76,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
                 data += wr;
                 bytes -= wr;
         }
-        mutex_lock(&sbi->pipe_mutex);
+        mutex_unlock(&sbi->pipe_mutex);

         set_fs(fs);

diff --git a/fs/block_dev.c b/fs/block_dev.c
index afe74dda632b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         mutex_lock_nested(&bdev->bd_mutex, for_part);
         if (!bdev->bd_openers) {
                 bdev->bd_disk = disk;
+                bdev->bd_queue = disk->queue;
                 bdev->bd_contains = bdev;
                 if (!partno) {
                         struct backing_dev_info *bdi;
@@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                 disk_put_part(bdev->bd_part);
                                 bdev->bd_part = NULL;
                                 bdev->bd_disk = NULL;
+                                bdev->bd_queue = NULL;
                                 mutex_unlock(&bdev->bd_mutex);
                                 disk_unblock_events(disk);
                                 put_disk(disk);
@@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         disk_put_part(bdev->bd_part);
         bdev->bd_disk = NULL;
         bdev->bd_part = NULL;
+        bdev->bd_queue = NULL;
         bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
         if (bdev != bdev->bd_contains)
                 __blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
           Linux website <http://acl.bestbits.at/>.

           If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+        bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+        depends on BTRFS_FS
+        help
+          Adds code that examines all block write requests (including
+          writes of the super block). The goal is to verify that the
+          state of the filesystem on disk is always consistent, i.e.,
+          after a power-loss or kernel panic event the filesystem is
+          in a consistent state.
+
+          If the integrity check tool is included and activated in
+          the mount options, plenty of kernel memory is used, and
+          plenty of additional CPU cycles are spent. Enabling this
+          functionality is not intended for normal use.
+
+          In most cases, unless you are a btrfs developer who needs
+          to verify the integrity of (super)-block write requests
+          during the run of a regression test, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
            export.o tree-log.o free-space-cache.o zlib.o lzo.o \
            compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-           reada.o backref.o
+           reada.o backref.o ulist.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..b9a843226de8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "backref.h" 21#include "backref.h"
22#include "ulist.h"
23#include "transaction.h"
24#include "delayed-ref.h"
22 25
23struct __data_ref { 26/*
27 * this structure records all encountered refs on the way up to the root
28 */
29struct __prelim_ref {
24 struct list_head list; 30 struct list_head list;
25 u64 inum; 31 u64 root_id;
26 u64 root; 32 struct btrfs_key key;
27 u64 extent_data_item_offset; 33 int level;
34 int count;
35 u64 parent;
36 u64 wanted_disk_byte;
28}; 37};
29 38
30struct __shared_ref { 39static int __add_prelim_ref(struct list_head *head, u64 root_id,
31 struct list_head list; 40 struct btrfs_key *key, int level, u64 parent,
41 u64 wanted_disk_byte, int count)
42{
43 struct __prelim_ref *ref;
44
45 /* in case we're adding delayed refs, we're holding the refs spinlock */
46 ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
47 if (!ref)
48 return -ENOMEM;
49
50 ref->root_id = root_id;
51 if (key)
52 ref->key = *key;
53 else
54 memset(&ref->key, 0, sizeof(ref->key));
55
56 ref->level = level;
57 ref->count = count;
58 ref->parent = parent;
59 ref->wanted_disk_byte = wanted_disk_byte;
60 list_add_tail(&ref->list, head);
61
62 return 0;
63}
64
65static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
66 struct ulist *parents,
67 struct extent_buffer *eb, int level,
68 u64 wanted_objectid, u64 wanted_disk_byte)
69{
70 int ret;
71 int slot;
72 struct btrfs_file_extent_item *fi;
73 struct btrfs_key key;
32 u64 disk_byte; 74 u64 disk_byte;
33}; 75
76add_parent:
77 ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
78 if (ret < 0)
79 return ret;
80
81 if (level != 0)
82 return 0;
83
84 /*
85 * if the current leaf is full with EXTENT_DATA items, we must
86 * check the next one if that holds a reference as well.
87 * ref->count cannot be used to skip this check.
88 * repeat this until we don't find any additional EXTENT_DATA items.
89 */
90 while (1) {
91 ret = btrfs_next_leaf(root, path);
92 if (ret < 0)
93 return ret;
94 if (ret)
95 return 0;
96
97 eb = path->nodes[0];
98 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
99 btrfs_item_key_to_cpu(eb, &key, slot);
100 if (key.objectid != wanted_objectid ||
101 key.type != BTRFS_EXTENT_DATA_KEY)
102 return 0;
103 fi = btrfs_item_ptr(eb, slot,
104 struct btrfs_file_extent_item);
105 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
106 if (disk_byte == wanted_disk_byte)
107 goto add_parent;
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * resolve an indirect backref in the form (root_id, key, level)
116 * to a logical address
117 */
118static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
119 struct __prelim_ref *ref,
120 struct ulist *parents)
121{
122 struct btrfs_path *path;
123 struct btrfs_root *root;
124 struct btrfs_key root_key;
125 struct btrfs_key key = {0};
126 struct extent_buffer *eb;
127 int ret = 0;
128 int root_level;
129 int level = ref->level;
130
131 path = btrfs_alloc_path();
132 if (!path)
133 return -ENOMEM;
134
135 root_key.objectid = ref->root_id;
136 root_key.type = BTRFS_ROOT_ITEM_KEY;
137 root_key.offset = (u64)-1;
138 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
139 if (IS_ERR(root)) {
140 ret = PTR_ERR(root);
141 goto out;
142 }
143
144 rcu_read_lock();
145 root_level = btrfs_header_level(root->node);
146 rcu_read_unlock();
147
148 if (root_level + 1 == level)
149 goto out;
150
151 path->lowest_level = level;
152 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
153 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
154 "%d for key (%llu %u %llu)\n",
155 (unsigned long long)ref->root_id, level, ref->count, ret,
156 (unsigned long long)ref->key.objectid, ref->key.type,
157 (unsigned long long)ref->key.offset);
158 if (ret < 0)
159 goto out;
160
161 eb = path->nodes[level];
162 if (!eb) {
163 WARN_ON(1);
164 ret = 1;
165 goto out;
166 }
167
168 if (level == 0) {
169 if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
170 ret = btrfs_next_leaf(root, path);
171 if (ret)
172 goto out;
173 eb = path->nodes[0];
174 }
175
176 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
177 }
178
179 /* the last two parameters will only be used for level == 0 */
180 ret = add_all_parents(root, path, parents, eb, level, key.objectid,
181 ref->wanted_disk_byte);
182out:
183 btrfs_free_path(path);
184 return ret;
185}
186
187/*
188 * resolve all indirect backrefs from the list
189 */
190static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
191 struct list_head *head)
192{
193 int err;
194 int ret = 0;
195 struct __prelim_ref *ref;
196 struct __prelim_ref *ref_safe;
197 struct __prelim_ref *new_ref;
198 struct ulist *parents;
199 struct ulist_node *node;
200
201 parents = ulist_alloc(GFP_NOFS);
202 if (!parents)
203 return -ENOMEM;
204
205 /*
206 * _safe allows us to insert directly after the current item without
207 * iterating over the newly inserted items.
208 * we're also allowed to re-assign ref during iteration.
209 */
210 list_for_each_entry_safe(ref, ref_safe, head, list) {
211 if (ref->parent) /* already direct */
212 continue;
213 if (ref->count == 0)
214 continue;
215 err = __resolve_indirect_ref(fs_info, ref, parents);
216 if (err) {
217 if (ret == 0)
218 ret = err;
219 continue;
220 }
221
222 /* we put the first parent into the ref at hand */
223 node = ulist_next(parents, NULL);
224 ref->parent = node ? node->val : 0;
225
226 /* additional parents require new refs being added here */
227 while ((node = ulist_next(parents, node))) {
228 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
229 if (!new_ref) {
230 ret = -ENOMEM;
231 break;
232 }
233 memcpy(new_ref, ref, sizeof(*ref));
234 new_ref->parent = node->val;
235 list_add(&new_ref->list, &ref->list);
236 }
237 ulist_reinit(parents);
238 }
239
240 ulist_free(parents);
241 return ret;
242}
243
244/*
245 * merge two lists of backrefs and adjust counts accordingly
246 *
247 * mode = 1: merge identical keys, if key is set
248 * mode = 2: merge identical parents
249 */
250static int __merge_refs(struct list_head *head, int mode)
251{
252 struct list_head *pos1;
253
254 list_for_each(pos1, head) {
255 struct list_head *n2;
256 struct list_head *pos2;
257 struct __prelim_ref *ref1;
258
259 ref1 = list_entry(pos1, struct __prelim_ref, list);
260
261 if (mode == 1 && ref1->key.type == 0)
262 continue;
263 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
264 pos2 = n2, n2 = pos2->next) {
265 struct __prelim_ref *ref2;
266
267 ref2 = list_entry(pos2, struct __prelim_ref, list);
268
269 if (mode == 1) {
270 if (memcmp(&ref1->key, &ref2->key,
271 sizeof(ref1->key)) ||
272 ref1->level != ref2->level ||
273 ref1->root_id != ref2->root_id)
274 continue;
275 ref1->count += ref2->count;
276 } else {
277 if (ref1->parent != ref2->parent)
278 continue;
279 ref1->count += ref2->count;
280 }
281 list_del(&ref2->list);
282 kfree(ref2);
283 }
284
285 }
286 return 0;
287}
288
289/*
290 * add all currently queued delayed refs from this head whose seq nr is
291 * smaller or equal that seq to the list
292 */
293static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
294 struct btrfs_key *info_key,
295 struct list_head *prefs)
296{
297 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
298 struct rb_node *n = &head->node.rb_node;
299 int sgn;
300 int ret;
301
302 if (extent_op && extent_op->update_key)
303 btrfs_disk_key_to_cpu(info_key, &extent_op->key);
304
305 while ((n = rb_prev(n))) {
306 struct btrfs_delayed_ref_node *node;
307 node = rb_entry(n, struct btrfs_delayed_ref_node,
308 rb_node);
309 if (node->bytenr != head->node.bytenr)
310 break;
311 WARN_ON(node->is_head);
312
313 if (node->seq > seq)
314 continue;
315
316 switch (node->action) {
317 case BTRFS_ADD_DELAYED_EXTENT:
318 case BTRFS_UPDATE_DELAYED_HEAD:
319 WARN_ON(1);
320 continue;
321 case BTRFS_ADD_DELAYED_REF:
322 sgn = 1;
323 break;
324 case BTRFS_DROP_DELAYED_REF:
325 sgn = -1;
326 break;
327 default:
328 BUG_ON(1);
329 }
330 switch (node->type) {
331 case BTRFS_TREE_BLOCK_REF_KEY: {
332 struct btrfs_delayed_tree_ref *ref;
333
334 ref = btrfs_delayed_node_to_tree_ref(node);
335 ret = __add_prelim_ref(prefs, ref->root, info_key,
336 ref->level + 1, 0, node->bytenr,
337 node->ref_mod * sgn);
338 break;
339 }
340 case BTRFS_SHARED_BLOCK_REF_KEY: {
341 struct btrfs_delayed_tree_ref *ref;
342
343 ref = btrfs_delayed_node_to_tree_ref(node);
344 ret = __add_prelim_ref(prefs, ref->root, info_key,
345 ref->level + 1, ref->parent,
346 node->bytenr,
347 node->ref_mod * sgn);
348 break;
349 }
350 case BTRFS_EXTENT_DATA_REF_KEY: {
351 struct btrfs_delayed_data_ref *ref;
352 struct btrfs_key key;
353
354 ref = btrfs_delayed_node_to_data_ref(node);
355
356 key.objectid = ref->objectid;
357 key.type = BTRFS_EXTENT_DATA_KEY;
358 key.offset = ref->offset;
359 ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
360 node->bytenr,
361 node->ref_mod * sgn);
362 break;
363 }
364 case BTRFS_SHARED_DATA_REF_KEY: {
365 struct btrfs_delayed_data_ref *ref;
366 struct btrfs_key key;
367
368 ref = btrfs_delayed_node_to_data_ref(node);
369
370 key.objectid = ref->objectid;
371 key.type = BTRFS_EXTENT_DATA_KEY;
372 key.offset = ref->offset;
373 ret = __add_prelim_ref(prefs, ref->root, &key, 0,
374 ref->parent, node->bytenr,
375 node->ref_mod * sgn);
376 break;
377 }
378 default:
379 WARN_ON(1);
380 }
381 BUG_ON(ret);
382 }
383
384 return 0;
385}
386
387/*
388 * add all inline backrefs for bytenr to the list
389 */
390static int __add_inline_refs(struct btrfs_fs_info *fs_info,
391 struct btrfs_path *path, u64 bytenr,
392 struct btrfs_key *info_key, int *info_level,
393 struct list_head *prefs)
394{
395 int ret;
396 int slot;
397 struct extent_buffer *leaf;
398 struct btrfs_key key;
399 unsigned long ptr;
400 unsigned long end;
401 struct btrfs_extent_item *ei;
402 u64 flags;
403 u64 item_size;
404
405 /*
406 * enumerate all inline refs
407 */
408 leaf = path->nodes[0];
409 slot = path->slots[0] - 1;
410
411 item_size = btrfs_item_size_nr(leaf, slot);
412 BUG_ON(item_size < sizeof(*ei));
413
414 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
415 flags = btrfs_extent_flags(leaf, ei);
416
417 ptr = (unsigned long)(ei + 1);
418 end = (unsigned long)ei + item_size;
419
420 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
421 struct btrfs_tree_block_info *info;
422 struct btrfs_disk_key disk_key;
423
424 info = (struct btrfs_tree_block_info *)ptr;
425 *info_level = btrfs_tree_block_level(leaf, info);
426 btrfs_tree_block_key(leaf, info, &disk_key);
427 btrfs_disk_key_to_cpu(info_key, &disk_key);
428 ptr += sizeof(struct btrfs_tree_block_info);
429 BUG_ON(ptr > end);
430 } else {
431 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
432 }
433
434 while (ptr < end) {
435 struct btrfs_extent_inline_ref *iref;
436 u64 offset;
437 int type;
438
439 iref = (struct btrfs_extent_inline_ref *)ptr;
440 type = btrfs_extent_inline_ref_type(leaf, iref);
441 offset = btrfs_extent_inline_ref_offset(leaf, iref);
442
443 switch (type) {
444 case BTRFS_SHARED_BLOCK_REF_KEY:
445 ret = __add_prelim_ref(prefs, 0, info_key,
446 *info_level + 1, offset,
447 bytenr, 1);
448 break;
449 case BTRFS_SHARED_DATA_REF_KEY: {
450 struct btrfs_shared_data_ref *sdref;
451 int count;
452
453 sdref = (struct btrfs_shared_data_ref *)(iref + 1);
454 count = btrfs_shared_data_ref_count(leaf, sdref);
455 ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
456 bytenr, count);
457 break;
458 }
459 case BTRFS_TREE_BLOCK_REF_KEY:
460 ret = __add_prelim_ref(prefs, offset, info_key,
461 *info_level + 1, 0, bytenr, 1);
462 break;
463 case BTRFS_EXTENT_DATA_REF_KEY: {
464 struct btrfs_extent_data_ref *dref;
465 int count;
466 u64 root;
467
468 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
469 count = btrfs_extent_data_ref_count(leaf, dref);
470 key.objectid = btrfs_extent_data_ref_objectid(leaf,
471 dref);
472 key.type = BTRFS_EXTENT_DATA_KEY;
473 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
474 root = btrfs_extent_data_ref_root(leaf, dref);
475 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
476 count);
477 break;
478 }
479 default:
480 WARN_ON(1);
481 }
482 BUG_ON(ret);
483 ptr += btrfs_extent_inline_ref_size(type);
484 }
485
486 return 0;
487}
488
489/*
490 * add all non-inline backrefs for bytenr to the list
491 */
492static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
493 struct btrfs_path *path, u64 bytenr,
494 struct btrfs_key *info_key, int info_level,
495 struct list_head *prefs)
496{
497 struct btrfs_root *extent_root = fs_info->extent_root;
498 int ret;
499 int slot;
500 struct extent_buffer *leaf;
501 struct btrfs_key key;
502
503 while (1) {
504 ret = btrfs_next_item(extent_root, path);
505 if (ret < 0)
506 break;
507 if (ret) {
508 ret = 0;
509 break;
510 }
511
512 slot = path->slots[0];
513 leaf = path->nodes[0];
514 btrfs_item_key_to_cpu(leaf, &key, slot);
515
516 if (key.objectid != bytenr)
517 break;
518 if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
519 continue;
520 if (key.type > BTRFS_SHARED_DATA_REF_KEY)
521 break;
522
523 switch (key.type) {
524 case BTRFS_SHARED_BLOCK_REF_KEY:
525 ret = __add_prelim_ref(prefs, 0, info_key,
526 info_level + 1, key.offset,
527 bytenr, 1);
528 break;
529 case BTRFS_SHARED_DATA_REF_KEY: {
530 struct btrfs_shared_data_ref *sdref;
531 int count;
532
533 sdref = btrfs_item_ptr(leaf, slot,
534 struct btrfs_shared_data_ref);
535 count = btrfs_shared_data_ref_count(leaf, sdref);
536 ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
537 bytenr, count);
538 break;
539 }
540 case BTRFS_TREE_BLOCK_REF_KEY:
541 ret = __add_prelim_ref(prefs, key.offset, info_key,
542 info_level + 1, 0, bytenr, 1);
543 break;
544 case BTRFS_EXTENT_DATA_REF_KEY: {
545 struct btrfs_extent_data_ref *dref;
546 int count;
547 u64 root;
548
549 dref = btrfs_item_ptr(leaf, slot,
550 struct btrfs_extent_data_ref);
551 count = btrfs_extent_data_ref_count(leaf, dref);
552 key.objectid = btrfs_extent_data_ref_objectid(leaf,
553 dref);
554 key.type = BTRFS_EXTENT_DATA_KEY;
555 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
556 root = btrfs_extent_data_ref_root(leaf, dref);
557 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
558 bytenr, count);
559 break;
560 }
561 default:
562 WARN_ON(1);
563 }
564 BUG_ON(ret);
565 }
566
567 return ret;
568}
569
570/*
571 * this adds all existing backrefs (inline backrefs, backrefs and delayed
572 * refs) for the given bytenr to the refs list, merges duplicates and resolves
573 * indirect refs to their parent bytenr.
574 * When roots are found, they're added to the roots list
575 *
576 * FIXME some caching might speed things up
577 */
578static int find_parent_nodes(struct btrfs_trans_handle *trans,
579 struct btrfs_fs_info *fs_info, u64 bytenr,
580 u64 seq, struct ulist *refs, struct ulist *roots)
581{
582 struct btrfs_key key;
583 struct btrfs_path *path;
584 struct btrfs_key info_key = { 0 };
585 struct btrfs_delayed_ref_root *delayed_refs = NULL;
586 struct btrfs_delayed_ref_head *head = NULL;
587 int info_level = 0;
588 int ret;
589 struct list_head prefs_delayed;
590 struct list_head prefs;
591 struct __prelim_ref *ref;
592
593 INIT_LIST_HEAD(&prefs);
594 INIT_LIST_HEAD(&prefs_delayed);
595
596 key.objectid = bytenr;
597 key.type = BTRFS_EXTENT_ITEM_KEY;
598 key.offset = (u64)-1;
599
600 path = btrfs_alloc_path();
601 if (!path)
602 return -ENOMEM;
603
604 /*
605 * grab both a lock on the path and a lock on the delayed ref head.
606 * We need both to get a consistent picture of how the refs look
607 * at a specified point in time
608 */
609again:
610 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
611 if (ret < 0)
612 goto out;
613 BUG_ON(ret == 0);
614
615 /*
616 * look if there are updates for this ref queued and lock the head
617 */
618 delayed_refs = &trans->transaction->delayed_refs;
619 spin_lock(&delayed_refs->lock);
620 head = btrfs_find_delayed_ref_head(trans, bytenr);
621 if (head) {
622 if (!mutex_trylock(&head->mutex)) {
623 atomic_inc(&head->node.refs);
624 spin_unlock(&delayed_refs->lock);
625
626 btrfs_release_path(path);
627
628 /*
629 * Mutex was contended, block until it's
630 * released and try again
631 */
632 mutex_lock(&head->mutex);
633 mutex_unlock(&head->mutex);
634 btrfs_put_delayed_ref(&head->node);
635 goto again;
636 }
637 ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
638 if (ret)
639 goto out;
640 }
641 spin_unlock(&delayed_refs->lock);
642
643 if (path->slots[0]) {
644 struct extent_buffer *leaf;
645 int slot;
646
647 leaf = path->nodes[0];
648 slot = path->slots[0] - 1;
649 btrfs_item_key_to_cpu(leaf, &key, slot);
650 if (key.objectid == bytenr &&
651 key.type == BTRFS_EXTENT_ITEM_KEY) {
652 ret = __add_inline_refs(fs_info, path, bytenr,
653 &info_key, &info_level, &prefs);
654 if (ret)
655 goto out;
656 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
657 info_level, &prefs);
658 if (ret)
659 goto out;
660 }
661 }
662 btrfs_release_path(path);
663
664 /*
665 * when adding the delayed refs above, the info_key might not have
666 * been known yet. Go over the list and replace the missing keys
667 */
668 list_for_each_entry(ref, &prefs_delayed, list) {
669 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
670 memcpy(&ref->key, &info_key, sizeof(ref->key));
671 }
672 list_splice_init(&prefs_delayed, &prefs);
673
674 ret = __merge_refs(&prefs, 1);
675 if (ret)
676 goto out;
677
678 ret = __resolve_indirect_refs(fs_info, &prefs);
679 if (ret)
680 goto out;
681
682 ret = __merge_refs(&prefs, 2);
683 if (ret)
684 goto out;
685
686 while (!list_empty(&prefs)) {
687 ref = list_first_entry(&prefs, struct __prelim_ref, list);
688 list_del(&ref->list);
689 if (ref->count < 0)
690 WARN_ON(1);
691 if (ref->count && ref->root_id && ref->parent == 0) {
692 /* no parent == root of tree */
693 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
694 BUG_ON(ret < 0);
695 }
696 if (ref->count && ref->parent) {
697 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
698 BUG_ON(ret < 0);
699 }
700 kfree(ref);
701 }
702
703out:
704 if (head)
705 mutex_unlock(&head->mutex);
706 btrfs_free_path(path);
707 while (!list_empty(&prefs)) {
708 ref = list_first_entry(&prefs, struct __prelim_ref, list);
709 list_del(&ref->list);
710 kfree(ref);
711 }
712 while (!list_empty(&prefs_delayed)) {
713 ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
714 list);
715 list_del(&ref->list);
716 kfree(ref);
717 }
718
719 return ret;
720}
721
722/*
723 * Finds all leafs with a reference to the specified combination of bytenr and
724 * offset. key_list_head will point to a list of corresponding keys (caller must
725 * free each list element). The leafs will be stored in the leafs ulist, which
726 * must be freed with ulist_free.
727 *
728 * returns 0 on success, <0 on error
729 */
730static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
731 struct btrfs_fs_info *fs_info, u64 bytenr,
732 u64 num_bytes, u64 seq, struct ulist **leafs)
733{
734 struct ulist *tmp;
735 int ret;
736
737 tmp = ulist_alloc(GFP_NOFS);
738 if (!tmp)
739 return -ENOMEM;
740 *leafs = ulist_alloc(GFP_NOFS);
741 if (!*leafs) {
742 ulist_free(tmp);
743 return -ENOMEM;
744 }
745
746 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
747 ulist_free(tmp);
748
749 if (ret < 0 && ret != -ENOENT) {
750 ulist_free(*leafs);
751 return ret;
752 }
753
754 return 0;
755}
756
757/*
758 * walk all backrefs for a given extent to find all roots that reference this
759 * extent. Walking a backref means finding all extents that reference this
760 * extent and in turn walk the backrefs of those, too. Naturally this is a
761 * recursive process, but here it is implemented in an iterative fashion: We
762 * find all referencing extents for the extent in question and put them on a
763 * list. In turn, we find all referencing extents for those, further appending
764 * to the list. The way we iterate the list allows adding more elements after
765 * the current while iterating. The process stops when we reach the end of the
766 * list. Found roots are added to the roots list.
767 *
768 * returns 0 on success, < 0 on error.
769 */
770int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
771 struct btrfs_fs_info *fs_info, u64 bytenr,
772 u64 num_bytes, u64 seq, struct ulist **roots)
773{
774 struct ulist *tmp;
775 struct ulist_node *node = NULL;
776 int ret;
777
778 tmp = ulist_alloc(GFP_NOFS);
779 if (!tmp)
780 return -ENOMEM;
781 *roots = ulist_alloc(GFP_NOFS);
782 if (!*roots) {
783 ulist_free(tmp);
784 return -ENOMEM;
785 }
786
787 while (1) {
788 ret = find_parent_nodes(trans, fs_info, bytenr, seq,
789 tmp, *roots);
790 if (ret < 0 && ret != -ENOENT) {
791 ulist_free(tmp);
792 ulist_free(*roots);
793 return ret;
794 }
795 node = ulist_next(tmp, node);
796 if (!node)
797 break;
798 bytenr = node->val;
799 }
800
801 ulist_free(tmp);
802 return 0;
803}
804
34 805
35static int __inode_info(u64 inum, u64 ioff, u8 key_type, 806static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path, 807 struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
         btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
         if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
             found_key->objectid > logical ||
-            found_key->objectid + found_key->offset <= logical)
+            found_key->objectid + found_key->offset <= logical) {
+                pr_debug("logical %llu is not within any extent\n",
+                         (unsigned long long)logical);
                 return -ENOENT;
+        }

         eb = path->nodes[0];
         item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
         flags = btrfs_extent_flags(eb, ei);

+        pr_debug("logical %llu is at position %llu within the extent (%llu "
+                 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+                 (unsigned long long)logical,
+                 (unsigned long long)(logical - found_key->objectid),
+                 (unsigned long long)found_key->objectid,
+                 (unsigned long long)found_key->offset,
+                 (unsigned long long)flags, item_size);
         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
         if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
287 return 0; 1068 return 0;
288} 1069}
289 1070
290static int __data_list_add(struct list_head *head, u64 inum, 1071static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
291 u64 extent_data_item_offset, u64 root) 1072 struct btrfs_path *path, u64 logical,
292{ 1073 u64 orig_extent_item_objectid,
293 struct __data_ref *ref; 1074 u64 extent_item_pos, u64 root,
294 1075 iterate_extent_inodes_t *iterate, void *ctx)
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{ 1076{
413 u64 disk_byte; 1077 u64 disk_byte;
414 struct btrfs_key key; 1078 struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
416 struct extent_buffer *eb; 1080 struct extent_buffer *eb;
417 int slot; 1081 int slot;
418 int nritems; 1082 int nritems;
419 int ret; 1083 int ret = 0;
420 int found = 0; 1084 int extent_type;
1085 u64 data_offset;
1086 u64 data_len;
421 1087
422 eb = read_tree_block(fs_info->tree_root, logical, 1088 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0); 1089 fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
435 if (key.type != BTRFS_EXTENT_DATA_KEY) 1101 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue; 1102 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 1103 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) { 1104 extent_type = btrfs_file_extent_type(eb, fi);
439 free_extent_buffer(eb); 1105 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
440 return -EIO; 1106 continue;
441 } 1107 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1108 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) { 1109 if (disk_byte != orig_extent_item_objectid)
444 if (found) 1110 continue;
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459 1111
460 if (!found) { 1112 data_offset = btrfs_file_extent_offset(eb, fi);
461 printk(KERN_ERR "btrfs: failed to follow shared data backref " 1113 data_len = btrfs_file_extent_num_bytes(eb, fi);
462 "to parent %llu\n", logical); 1114
463 WARN_ON(1); 1115 if (extent_item_pos < data_offset ||
464 ret = -EIO; 1116 extent_item_pos >= data_offset + data_len)
1117 continue;
1118
1119 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
1120 "root %llu\n", orig_extent_item_objectid,
1121 key.objectid, key.offset, root);
1122 ret = iterate(key.objectid,
1123 key.offset + (extent_item_pos - data_offset),
1124 root, ctx);
1125 if (ret) {
1126 pr_debug("stopping iteration because ret=%d\n", ret);
1127 break;
1128 }
465 } 1129 }
466 1130
467 free_extent_buffer(eb); 1131 free_extent_buffer(eb);
1132
468 return ret; 1133 return ret;
469} 1134}
470 1135
471/* 1136/*
472 * calls iterate() for every inode that references the extent identified by 1137 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it 1138 * the given parameters.
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops. 1139 * when the iterator function returns a non-zero value, iteration stops.
1140 * path is guaranteed to be in released state when iterate() is called.
476 */ 1141 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 1142int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path, 1143 struct btrfs_path *path,
479 u64 extent_item_objectid, 1144 u64 extent_item_objectid, u64 extent_item_pos,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx) 1145 iterate_extent_inodes_t *iterate, void *ctx)
482{ 1146{
483 unsigned long ptr = 0;
484 int last;
485 int ret; 1147 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs); 1148 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); 1149 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d; 1150 struct btrfs_trans_handle *trans;
497 struct __shared_ref *ref_s; 1151 struct ulist *refs;
498 1152 struct ulist *roots;
499 eb = path->nodes[0]; 1153 struct ulist_node *ref_node = NULL;
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 1154 struct ulist_node *root_node = NULL;
501 item_size = btrfs_item_size_nr(eb, path->slots[0]); 1155 struct seq_list seq_elem;
502 1156 struct btrfs_delayed_ref_root *delayed_refs;
503 /* first we iterate the inline refs, ... */ 1157
504 do { 1158 trans = btrfs_join_transaction(fs_info->extent_root);
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size, 1159 if (IS_ERR(trans))
506 &eiref, &type); 1160 return PTR_ERR(trans);
507 if (last == -ENOENT) { 1161
508 ret = 0; 1162 pr_debug("resolving all inodes for extent %llu\n",
509 break; 1163 extent_item_objectid);
510 } 1164
511 if (last < 0) { 1165 delayed_refs = &trans->transaction->delayed_refs;
512 ret = last; 1166 spin_lock(&delayed_refs->lock);
513 break; 1167 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
514 } 1168 spin_unlock(&delayed_refs->lock);
1169
1170 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1171 extent_item_pos, seq_elem.seq,
1172 &refs);
515 1173
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1174 if (ret)
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset); 1175 goto out;
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524 1176
525 /* ... then we proceed to in-tree references and ... */ 1177 while (!ret && (ref_node = ulist_next(refs, ref_node))) {
526 while (!ret) { 1178 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
527 ++path->slots[0]; 1179 seq_elem.seq, &roots);
528 if (path->slots[0] > btrfs_header_nritems(eb)) { 1180 if (ret)
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break; 1181 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1182 while (!ret && (root_node = ulist_next(roots, root_node))) {
541 dref = btrfs_item_ptr(eb, path->slots[0], 1183 pr_debug("root %llu references leaf %llu\n",
542 struct btrfs_extent_data_ref); 1184 root_node->val, ref_node->val);
543 ret = __data_list_add_eb(&data_refs, eb, dref); 1185 ret = iterate_leaf_refs(fs_info, path, ref_node->val,
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1186 extent_item_objectid,
545 ret = __shared_list_add(&shared_refs, key.offset); 1187 extent_item_pos, root_node->val,
1188 iterate, ctx);
546 } 1189 }
547 } 1190 }
548 1191
549 btrfs_release_path(path); 1192 ulist_free(refs);
550 1193 ulist_free(roots);
551 /* 1194out:
552 * ... only at the very end we can process the refs we found. this is 1195 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
553 * because the iterator function we call is allowed to make tree lookups 1196 btrfs_end_transaction(trans, fs_info->extent_root);
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret; 1197 return ret;
582} 1198}
583 1199
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
                                 iterate_extent_inodes_t *iterate, void *ctx)
 {
         int ret;
-        u64 offset;
+        u64 extent_item_pos;
         struct btrfs_key found_key;

         ret = extent_from_logical(fs_info, logical, path,
                                   &found_key);
+        btrfs_release_path(path);
         if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                 ret = -EINVAL;
         if (ret < 0)
                 return ret;

-        offset = logical - found_key.objectid;
+        extent_item_pos = logical - found_key.objectid;
         ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                        offset, iterate, ctx);
+                                        extent_item_pos, iterate, ctx);

         return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
         for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
                 name_len = btrfs_inode_ref_name_len(eb, iref);
                 /* path must be released before calling iterate()! */
+                pr_debug("following ref at offset %u for inode %llu in "
+                         "tree %llu\n", cur,
+                         (unsigned long long)found_key.objectid,
+                         (unsigned long long)fs_root->objectid);
                 ret = iterate(parent, iref, eb, ctx);
                 if (ret) {
                         free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                 return PTR_ERR(fspath);

         if (fspath > fspath_min) {
+                pr_debug("path resolved: %s\n", fspath);
                 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                 ++ipath->fspath->elem_cnt;
                 ipath->fspath->bytes_left = fspath - fspath_min;
         } else {
+                pr_debug("missed path, not enough space. missing bytes: %lu, "
+                         "constructed so far: %s\n",
+                         (unsigned long)(fspath_min - fspath), fspath_min);
                 ++ipath->fspath->elem_missed;
                 ipath->fspath->bytes_missing += fspath_min - fspath;
                 ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__

 #include "ioctl.h"
+#include "ulist.h"

 struct inode_fs_paths {
         struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,

 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);

+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                         struct btrfs_fs_info *fs_info, u64 bytenr,
+                         u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
                                   struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
         /* held while logging the inode in tree-log.c */
         struct mutex log_mutex;

+        /* held while doing delalloc reservations */
+        struct mutex delalloc_mutex;
+
         /* used to order data wrt metadata */
         struct btrfs_ordered_inode_tree ordered_tree;

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..ad0b3ba735b7
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19/*
20 * This module can be used to catch cases when the btrfs kernel
21 * code executes write requests to the disk that bring the file
22 * system in an inconsistent state. In such a state, a power-loss
23 * or kernel panic event would cause that the data on disk is
24 * lost or at least damaged.
25 *
26 * Code is added that examines all block write requests during
27 * runtime (including writes of the super block). Three rules
28 * are verified and an error is printed on violation of the
29 * rules:
30 * 1. It is not allowed to write a disk block which is
31 * currently referenced by the super block (either directly
32 * or indirectly).
33 * 2. When a super block is written, it is verified that all
34 * referenced (directly or indirectly) blocks fulfill the
35 * following requirements:
36 * 2a. All referenced blocks have either been present when
37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been
39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where
41 * these blocks are located was received and completed.
42 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number.
44 *
45 * One issue that was found using this module was that the log
46 * tree on disk became temporarily corrupted because disk blocks
47 * that had been in use for the log tree had been freed and
48 * reused too early, while being referenced by the written super
49 * block.
50 *
51 * The search term in the kernel log that can be used to filter
52 * on the existence of detected integrity issues is
53 * "btrfs: attempt".
54 *
55 * The integrity check is enabled via mount options. These
56 * mount options are only supported if the integrity check
57 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
58 *
59 * Example #1, apply integrity checks to all metadata:
60 * mount /dev/sdb1 /mnt -o check_int
61 *
62 * Example #2, apply integrity checks to all metadata and
63 * to data extents:
64 * mount /dev/sdb1 /mnt -o check_int_data
65 *
66 * Example #3, apply integrity checks to all metadata and dump
67 * the tree that the super block references to kernel messages
68 * each time after a super block was written:
69 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
70 *
71 * If the integrity check tool is included and activated in
72 * the mount options, plenty of kernel memory is used, and
73 * plenty of additional CPU cycles are spent. Enabling this
74 * functionality is not intended for normal use. In most
75 * cases, unless you are a btrfs developer who needs to verify
76 * the integrity of (super)-block write requests, do not
77 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
78 * include and compile the integrity check tool.
79 */
80
81#include <linux/sched.h>
82#include <linux/slab.h>
83#include <linux/buffer_head.h>
84#include <linux/mutex.h>
85#include <linux/crc32c.h>
86#include <linux/genhd.h>
87#include <linux/blkdev.h>
88#include "ctree.h"
89#include "disk-io.h"
90#include "transaction.h"
91#include "extent_io.h"
92#include "disk-io.h"
93#include "volumes.h"
94#include "print-tree.h"
95#include "locking.h"
96#include "check-integrity.h"
97
98#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
99#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
100#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
101#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
102#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
103#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
104#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
105#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
106 * excluding " [...]" */
107#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
108
109#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
110
111/*
112 * The definition of the bitmask fields for the print_mask.
113 * They are specified with the mount option check_integrity_print_mask.
114 */
115#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
116#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
117#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
118#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
119#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
120#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
121#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
122#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
123#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
124#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
125#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
126#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
127#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
128
129struct btrfsic_dev_state;
130struct btrfsic_state;
131
132struct btrfsic_block {
133 u32 magic_num; /* only used for debug purposes */
134 unsigned int is_metadata:1; /* if it is meta-data, not data-data */
135 unsigned int is_superblock:1; /* if it is one of the superblocks */
136 unsigned int is_iodone:1; /* if is done by lower subsystem */
137 unsigned int iodone_w_error:1; /* error was indicated to endio */
138 unsigned int never_written:1; /* block was added because it was
139 * referenced, not because it was
140 * written */
141 unsigned int mirror_num:2; /* large enough to hold
142 * BTRFS_SUPER_MIRROR_MAX */
143 struct btrfsic_dev_state *dev_state;
144 u64 dev_bytenr; /* key, physical byte num on disk */
145 u64 logical_bytenr; /* logical byte num on disk */
146 u64 generation;
147 struct btrfs_disk_key disk_key; /* extra info to print in case of
148 * issues, will not always be correct */
149 struct list_head collision_resolving_node; /* list node */
150 struct list_head all_blocks_node; /* list node */
151
152 /* the following two lists contain block_link items */
153 struct list_head ref_to_list; /* list */
154 struct list_head ref_from_list; /* list */
155 struct btrfsic_block *next_in_same_bio;
156 void *orig_bio_bh_private;
157 union {
158 bio_end_io_t *bio;
159 bh_end_io_t *bh;
160 } orig_bio_bh_end_io;
161 int submit_bio_bh_rw;
162 u64 flush_gen; /* only valid if !never_written */
163};
164
165/*
166 * Elements of this type are allocated dynamically and required because
167 * each block object can refer to and can be ref from multiple blocks.
168 * The key to lookup them in the hashtable is the dev_bytenr of
169 * the block ref to plus the one from the block refered from.
170 * The fact that they are searchable via a hashtable and that a
171 * ref_cnt is maintained is not required for the btrfs integrity
172 * check algorithm itself, it is only used to make the output more
173 * beautiful in case that an error is detected (an error is defined
174 * as a write operation to a block while that block is still referenced).
175 */
176struct btrfsic_block_link {
177 u32 magic_num; /* only used for debug purposes */
178 u32 ref_cnt;
179 struct list_head node_ref_to; /* list node */
180 struct list_head node_ref_from; /* list node */
181 struct list_head collision_resolving_node; /* list node */
182 struct btrfsic_block *block_ref_to;
183 struct btrfsic_block *block_ref_from;
184 u64 parent_generation;
185};
186
187struct btrfsic_dev_state {
188 u32 magic_num; /* only used for debug purposes */
189 struct block_device *bdev;
190 struct btrfsic_state *state;
191 struct list_head collision_resolving_node; /* list node */
192 struct btrfsic_block dummy_block_for_bio_bh_flush;
193 u64 last_flush_gen;
194 char name[BDEVNAME_SIZE];
195};
196
197struct btrfsic_block_hashtable {
198 struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
199};
200
201struct btrfsic_block_link_hashtable {
202 struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
203};
204
205struct btrfsic_dev_state_hashtable {
206 struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
207};
208
209struct btrfsic_block_data_ctx {
210 u64 start; /* virtual bytenr */
211 u64 dev_bytenr; /* physical bytenr on device */
212 u32 len;
213 struct btrfsic_dev_state *dev;
214 char *data;
215 struct buffer_head *bh; /* do not use if set to NULL */
216};
217
218/* This structure is used to implement recursion without occupying
219 * kernel stack space, refer to btrfsic_process_metablock() */
220struct btrfsic_stack_frame {
221 u32 magic;
222 u32 nr;
223 int error;
224 int i;
225 int limit_nesting;
226 int num_copies;
227 int mirror_num;
228 struct btrfsic_block *block;
229 struct btrfsic_block_data_ctx *block_ctx;
230 struct btrfsic_block *next_block;
231 struct btrfsic_block_data_ctx next_block_ctx;
232 struct btrfs_header *hdr;
233 struct btrfsic_stack_frame *prev;
234};
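/*
 * Stack frames are allocated from the heap and chained via ->prev:
 * btrfsic_process_metablock() descends into a child tree block by
 * switching to a newly allocated frame and ascends by following
 * ->prev, so the nesting depth of the btrfs tree does not translate
 * into kernel stack usage.
 */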
235
236/* Some state per mounted filesystem */
237struct btrfsic_state {
238 u32 print_mask;
239 int include_extent_data;
240 int csum_size;
241 struct list_head all_blocks_list;
242 struct btrfsic_block_hashtable block_hashtable;
243 struct btrfsic_block_link_hashtable block_link_hashtable;
244 struct btrfs_root *root;
245 u64 max_superblock_generation;
246 struct btrfsic_block *latest_superblock;
247};
248
249static void btrfsic_block_init(struct btrfsic_block *b);
250static struct btrfsic_block *btrfsic_block_alloc(void);
251static void btrfsic_block_free(struct btrfsic_block *b);
252static void btrfsic_block_link_init(struct btrfsic_block_link *n);
253static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
254static void btrfsic_block_link_free(struct btrfsic_block_link *n);
255static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
256static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
257static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
258static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
259static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
260 struct btrfsic_block_hashtable *h);
261static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
262static struct btrfsic_block *btrfsic_block_hashtable_lookup(
263 struct block_device *bdev,
264 u64 dev_bytenr,
265 struct btrfsic_block_hashtable *h);
266static void btrfsic_block_link_hashtable_init(
267 struct btrfsic_block_link_hashtable *h);
268static void btrfsic_block_link_hashtable_add(
269 struct btrfsic_block_link *l,
270 struct btrfsic_block_link_hashtable *h);
271static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
272static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
273 struct block_device *bdev_ref_to,
274 u64 dev_bytenr_ref_to,
275 struct block_device *bdev_ref_from,
276 u64 dev_bytenr_ref_from,
277 struct btrfsic_block_link_hashtable *h);
278static void btrfsic_dev_state_hashtable_init(
279 struct btrfsic_dev_state_hashtable *h);
280static void btrfsic_dev_state_hashtable_add(
281 struct btrfsic_dev_state *ds,
282 struct btrfsic_dev_state_hashtable *h);
283static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
284static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
285 struct block_device *bdev,
286 struct btrfsic_dev_state_hashtable *h);
287static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
288static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
289static int btrfsic_process_superblock(struct btrfsic_state *state,
290 struct btrfs_fs_devices *fs_devices);
291static int btrfsic_process_metablock(struct btrfsic_state *state,
292 struct btrfsic_block *block,
293 struct btrfsic_block_data_ctx *block_ctx,
294 struct btrfs_header *hdr,
295 int limit_nesting, int force_iodone_flag);
296static int btrfsic_create_link_to_next_block(
297 struct btrfsic_state *state,
298 struct btrfsic_block *block,
299 struct btrfsic_block_data_ctx
300 *block_ctx, u64 next_bytenr,
301 int limit_nesting,
302 struct btrfsic_block_data_ctx *next_block_ctx,
303 struct btrfsic_block **next_blockp,
304 int force_iodone_flag,
305 int *num_copiesp, int *mirror_nump,
306 struct btrfs_disk_key *disk_key,
307 u64 parent_generation);
308static int btrfsic_handle_extent_data(struct btrfsic_state *state,
309 struct btrfsic_block *block,
310 struct btrfsic_block_data_ctx *block_ctx,
311 u32 item_offset, int force_iodone_flag);
312static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
313 struct btrfsic_block_data_ctx *block_ctx_out,
314 int mirror_num);
315static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
316 u32 len, struct block_device *bdev,
317 struct btrfsic_block_data_ctx *block_ctx_out);
318static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
319static int btrfsic_read_block(struct btrfsic_state *state,
320 struct btrfsic_block_data_ctx *block_ctx);
321static void btrfsic_dump_database(struct btrfsic_state *state);
322static int btrfsic_test_for_metadata(struct btrfsic_state *state,
323 const u8 *data, unsigned int size);
324static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
325 u64 dev_bytenr, u8 *mapped_data,
326 unsigned int len, struct bio *bio,
327 int *bio_is_patched,
328 struct buffer_head *bh,
329 int submit_bio_bh_rw);
330static int btrfsic_process_written_superblock(
331 struct btrfsic_state *state,
332 struct btrfsic_block *const block,
333 struct btrfs_super_block *const super_hdr);
334static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
335static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
336static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
337 const struct btrfsic_block *block,
338 int recursion_level);
339static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
340 struct btrfsic_block *const block,
341 int recursion_level);
342static void btrfsic_print_add_link(const struct btrfsic_state *state,
343 const struct btrfsic_block_link *l);
344static void btrfsic_print_rem_link(const struct btrfsic_state *state,
345 const struct btrfsic_block_link *l);
346static char btrfsic_get_block_type(const struct btrfsic_state *state,
347 const struct btrfsic_block *block);
348static void btrfsic_dump_tree(const struct btrfsic_state *state);
349static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
350 const struct btrfsic_block *block,
351 int indent_level);
352static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
353 struct btrfsic_state *state,
354 struct btrfsic_block_data_ctx *next_block_ctx,
355 struct btrfsic_block *next_block,
356 struct btrfsic_block *from_block,
357 u64 parent_generation);
358static struct btrfsic_block *btrfsic_block_lookup_or_add(
359 struct btrfsic_state *state,
360 struct btrfsic_block_data_ctx *block_ctx,
361 const char *additional_string,
362 int is_metadata,
363 int is_iodone,
364 int never_written,
365 int mirror_num,
366 int *was_created);
367static int btrfsic_process_superblock_dev_mirror(
368 struct btrfsic_state *state,
369 struct btrfsic_dev_state *dev_state,
370 struct btrfs_device *device,
371 int superblock_mirror_num,
372 struct btrfsic_dev_state **selected_dev_state,
373 struct btrfs_super_block *selected_super);
374static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375 struct block_device *bdev);
376static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
377 u64 bytenr,
378 struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
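/*
 * The block hashtable is keyed on the physical location of a block:
 * bits 16 and up of the device bytenr are XORed with the block_device
 * pointer and masked to the table size, so a block can be looked up
 * with just the (bdev, dev_bytenr) pair available in the write path.
 */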
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
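/*
 * Scan every superblock mirror of every device, select the superblock
 * with the highest generation and then walk the root, chunk and log
 * trees (one pass each) starting from every mirror of their root
 * blocks in order to build the initial block database.
 */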
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
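/*
 * Read one superblock copy through the buffer cache, skip it if the
 * bytenr, magic or device UUID do not match, register it as an
 * initial superblock block and remember it via *selected_dev_state
 * and selected_super if its generation is the highest seen so far.
 */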
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
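/*
 * Iteratively walk one tree block: in leaves, BTRFS_ROOT_ITEM_KEY
 * items are followed (and BTRFS_EXTENT_DATA_KEY items if extent data
 * is included); in nodes, every key pointer is followed. Descending
 * allocates a new stack frame, errors and finished frames propagate
 * back through ->prev.
 */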
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058 btrfsic_release_block_ctx(
1059 &sf->
1060 next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
1184
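/*
 * Map one mirror of the referenced block, look it up in or add it to
 * the block database and record the link from the referencing block.
 * The block content is read and *next_blockp is set (so the caller
 * descends into it) only if the link was newly created and the
 * nesting limit is not yet exhausted; *mirror_nump is advanced so
 * that consecutive calls cover all copies.
 */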
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
1335
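/*
 * For a regular (non-inline, non-hole) file extent item, walk the
 * referenced data range in BTRFSIC_BLOCK_SIZE chunks and link every
 * mirror of every chunk to the leaf containing the extent item.
 */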
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393 "\tdisk_bytenr = %llu, num_bytes %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
1460
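/*
 * Translate a logical bytenr into the physical location of the
 * requested mirror via btrfs_map_block() and fill the block data
 * context with the device state and device bytenr of the first
 * stripe.
 */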
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472 bytenr, &length, &multi, mirror_num);
1473
1474 device = multi->stripes[0].dev;
1475 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477 block_ctx_out->start = bytenr;
1478 block_ctx_out->len = len;
1479 block_ctx_out->data = NULL;
1480 block_ctx_out->bh = NULL;
1481
1482 if (0 == ret)
1483 kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
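/*
 * Read one block through the buffer cache. Only 4KiB-aligned device
 * bytenrs and lengths of at most 4096 bytes are handled here;
 * dev_bytenr >> 12 is the index of the 4KiB buffer that is read.
 */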
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530 "btrfsic: read_block() with too huge size %d\n",
1531 block_ctx->len);
1532 return -1;
1533 }
1534
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
1543
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
1617 * Test whether the disk block contains a tree block (leaf or node)
1618 * (note that this test fails for the super block)
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
1641
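/*
 * Called from the write submission path for each written buffer. The
 * buffer is classified as metadata or data via
 * btrfsic_test_for_metadata(). For a block already in the database,
 * all outgoing references are dropped and the overwrite rules are
 * checked; the new content is then parsed to rebuild the references,
 * and bi_end_io/b_end_io are patched so that I/O completion and
 * flush/FUA state can be tracked.
 */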
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
1777		 * the block itself even if it is not referenced anymore
1778		 * because it still carries valuable information
1779 * like whether it was ever written and IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812 /* the following is required in case of writes to mirrors,
1813		 * use the same device state that was used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840 block->orig_bio_bh_end_io.bio =
1841 chained_block->orig_bio_bh_end_io.
1842 bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955 /* the following is required in case of writes to mirrors,
1956		 * use the same device state that was used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
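The function above intercepts I/O completion by saving the bio's original bi_private/bi_end_io in the first tracked block and chaining every further block of the same bio behind it, so one replacement callback can finish them all and then hand control back. A minimal user-space sketch of that save-and-restore chaining follows; the names are illustrative only, not the kernel bio API.

#include <stdio.h>

struct fake_bio;
typedef void (*end_io_fn)(struct fake_bio *, int);

struct fake_bio {
	void *bi_private;
	end_io_fn bi_end_io;
};

struct tracked_block {
	const char *name;
	int is_iodone;
	void *orig_private;			/* saved bi_private */
	end_io_fn orig_end_io;			/* saved bi_end_io */
	struct tracked_block *next_in_same_bio;
};

/* replacement completion handler: finish every chained block, then
 * restore and invoke whatever the submitter originally installed */
static void intercepted_end_io(struct fake_bio *bio, int err)
{
	struct tracked_block *block = bio->bi_private;

	bio->bi_private = block->orig_private;
	bio->bi_end_io = block->orig_end_io;

	while (block) {
		struct tracked_block *next = block->next_in_same_bio;

		block->is_iodone = 1;
		printf("%s done (err=%d)\n", block->name, err);
		block = next;
	}

	bio->bi_end_io(bio, err);
}

/* the first block of a bio saves the original state; later blocks copy
 * the saved state from the block already installed and chain behind it */
static void patch_bio(struct fake_bio *bio, struct tracked_block *block,
		      int *bio_is_patched)
{
	if (!*bio_is_patched) {
		block->orig_private = bio->bi_private;
		block->orig_end_io = bio->bi_end_io;
		block->next_in_same_bio = NULL;
		bio->bi_private = block;
		bio->bi_end_io = intercepted_end_io;
		*bio_is_patched = 1;
	} else {
		struct tracked_block *chained = bio->bi_private;

		block->orig_private = chained->orig_private;
		block->orig_end_io = chained->orig_end_io;
		block->next_in_same_bio = chained;
		bio->bi_private = block;
	}
}

static void submitter_end_io(struct fake_bio *bio, int err)
{
	printf("original callback (private=%p, err=%d)\n", bio->bi_private, err);
}

int main(void)
{
	struct fake_bio bio = { NULL, submitter_end_io };
	struct tracked_block a = { "block A", 0, NULL, NULL, NULL };
	struct tracked_block b = { "block B", 0, NULL, NULL, NULL };
	int patched = 0;

	patch_bio(&bio, &a, &patched);
	patch_bio(&bio, &b, &patched);
	bio.bi_end_io(&bio, 0);		/* simulate completion of the bio */
	return 0;
}
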
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
2042	/* mutex is not held! This is not safe if IO is not yet completed
2043 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
2087
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
2124
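Both completion handlers above drive the flush-generation bookkeeping: a completed FLUSH bumps the device's last_flush_gen, a completed FUA write clears the block's own requirement, and a block only counts as durable once its flush_gen is not newer than the device's. A small stand-alone sketch of that rule, with assumed names and plain C:

#include <stdio.h>
#include <stdint.h>

struct dev_state { uint64_t last_flush_gen; };
struct block_state { uint64_t flush_gen; };

/* a freshly written block only becomes durable after the next FLUSH */
static void on_write_submitted(struct block_state *b, const struct dev_state *d)
{
	b->flush_gen = d->last_flush_gen + 1;
}

/* a completed FLUSH empties the write cache for everything before it */
static void on_flush_done(struct dev_state *d)
{
	d->last_flush_gen++;
}

/* a completed FUA write bypassed the cache, so it needs no flush at all */
static void on_fua_done(struct block_state *b)
{
	b->flush_gen = 0;
}

static int is_durable(const struct block_state *b, const struct dev_state *d)
{
	return b->flush_gen <= d->last_flush_gen;
}

int main(void)
{
	struct dev_state dev = { 0 };
	struct block_state blk = { 0 };

	on_write_submitted(&blk, &dev);
	printf("after write: %d\n", is_durable(&blk, &dev));	/* 0 */
	on_flush_done(&dev);
	printf("after FLUSH: %d\n", is_durable(&blk, &dev));	/* 1 */
	on_fua_done(&blk);					/* stays durable */
	return 0;
}
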
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
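The generation comparison at the top of btrfsic_process_written_superblock() only promotes a written superblock to latest_superblock when it carries a strictly newer generation (or is the very first one seen); otherwise the stale mirror is only reported. A compact sketch of the same bookkeeping, with invented names:

#include <stdio.h>
#include <stdint.h>

struct written_sb { uint64_t generation; int mirror_num; };

static uint64_t max_superblock_generation;
static const struct written_sb *latest_superblock;

static void note_written_superblock(const struct written_sb *sb)
{
	if (sb->generation > max_superblock_generation ||
	    max_superblock_generation == 0) {
		max_superblock_generation = sb->generation;
		latest_superblock = sb;
	} else {
		printf("mirror %d carries old gen %llu <= %llu\n",
		       sb->mirror_num,
		       (unsigned long long)sb->generation,
		       (unsigned long long)max_superblock_generation);
	}
}

int main(void)
{
	struct written_sb first = { 7, 1 }, stale = { 6, 2 };

	note_written_superblock(&first);	/* becomes the latest */
	note_written_superblock(&stale);	/* reported as old */
	return 0;
}
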
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
2295		 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
2307	 * Recursion is acceptable here because the amount of used stack
2308 * space is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
2386			       (unsigned long long)l->block_ref_to->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
2400
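As the comment in btrfsic_check_all_ref_blocks() explains, stale links can make the reference graph cyclic because the checker never learns about block frees, so the walk is bounded by recursion depth instead of by a visited set. A self-contained sketch of such a depth-limited check over a small hand-built graph; the structures are illustrative, not the btrfsic ones:

#include <stdio.h>

#define MAX_DEPTH 8		/* stands in for 3 + BTRFS_MAX_LEVEL */

struct node {
	const char *name;
	int is_iodone;
	const struct node *refs[4];	/* blocks this block refers to */
	int nr_refs;
};

/* -1 if any reachable block is not yet safe to reference, 0 otherwise;
 * stale links may form cycles, so recursion depth is capped rather than
 * tracking visited nodes */
static int check_all_refs(const struct node *n, int depth)
{
	int i, ret = 0;

	if (depth >= MAX_DEPTH)
		return 0;	/* treat as a stale/cyclic link, not an error */

	for (i = 0; i < n->nr_refs; i++) {
		const struct node *to = n->refs[i];

		if (!to->is_iodone) {
			printf("%s refers to %s which is not yet iodone\n",
			       n->name, to->name);
			ret = -1;
		} else if (check_all_refs(to, depth + 1) == -1) {
			ret = -1;
		}
	}
	return ret;
}

int main(void)
{
	struct node leaf = { "leaf", 1, { NULL }, 0 };
	struct node root = { "root", 1, { &leaf }, 1 };

	/* a stale back-link makes the graph cyclic; the depth cap copes */
	leaf.refs[leaf.nr_refs++] = &root;

	return check_all_refs(&root, 0) ? 1 : 0;
}
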
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
2418	 * Recursion is acceptable here because the amount of used stack space
2419 * is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
2528	 * It would be better to fill an on-stack buffer with a complete line and
2529 * dump it at once when it is time to print a newline character.
2530 */
2531
2532 /*
2533	 * Recursion is acceptable here because the amount of used stack space
2534 * is very small and the max recursion depth is limited.
2535 */
2536 indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546	printk("%s", buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579		printk("%s", buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604 printk(KERN_INFO
2605			       "btrfsic: error, kmalloc failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
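btrfsic_block_lookup_or_add() keys tracked blocks by (block device, physical byte offset) and reports through was_created whether a fresh entry had to be allocated. A user-space sketch of that lookup-or-insert pattern over a chained hash table; the hash function and types are simplified assumptions for illustration:

#include <stdint.h>
#include <stdlib.h>

#define TABLE_SIZE 1024

struct tracked_block {
	const void *dev;		/* stands in for the block_device */
	uint64_t dev_bytenr;		/* physical byte offset on that device */
	struct tracked_block *next;	/* hash chain */
};

static struct tracked_block *table[TABLE_SIZE];

static unsigned int hash(const void *dev, uint64_t bytenr)
{
	return (unsigned int)(((uintptr_t)dev >> 4) ^ (bytenr >> 12)) % TABLE_SIZE;
}

static struct tracked_block *block_lookup(const void *dev, uint64_t bytenr)
{
	struct tracked_block *b = table[hash(dev, bytenr)];

	while (b && !(b->dev == dev && b->dev_bytenr == bytenr))
		b = b->next;
	return b;
}

/* return the existing entry or allocate a new one; *was_created says which */
static struct tracked_block *block_lookup_or_add(const void *dev,
						 uint64_t bytenr,
						 int *was_created)
{
	struct tracked_block *b = block_lookup(dev, bytenr);
	unsigned int h;

	if (b) {
		*was_created = 0;
		return b;
	}
	b = calloc(1, sizeof(*b));
	if (!b)
		return NULL;
	b->dev = dev;
	b->dev_bytenr = bytenr;
	h = hash(dev, bytenr);
	b->next = table[h];
	table[h] = b;
	*was_created = 1;
	return b;
}

int main(void)
{
	int dummy_dev, created;

	block_lookup_or_add(&dummy_dev, 65536, &created);	/* created == 1 */
	block_lookup_or_add(&dummy_dev, 65536, &created);	/* created == 0 */
	return 0;
}
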
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725 printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, bh->b_blocknr,
2781 (unsigned long long)dev_bytenr, bh->b_size,
2782 bh->b_data, bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL,
2785 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO
2790			       "submit_bh(rw=0x%x) FLUSH, bdev=%p\n",
2791 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask &
2794 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2795 BTRFSIC_PRINT_MASK_VERBOSE)))
2796 printk(KERN_INFO
2797 "btrfsic_submit_bh(%s) with FLUSH"
2798 " but dummy block already in use"
2799 " (ignored)!\n",
2800 dev_state->name);
2801 } else {
2802 struct btrfsic_block *const block =
2803 &dev_state->dummy_block_for_bio_bh_flush;
2804
2805 block->is_iodone = 0;
2806 block->never_written = 0;
2807 block->iodone_w_error = 0;
2808 block->flush_gen = dev_state->last_flush_gen + 1;
2809 block->submit_bio_bh_rw = rw;
2810 block->orig_bio_bh_private = bh->b_private;
2811 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2812 block->next_in_same_bio = NULL;
2813 bh->b_private = block;
2814 bh->b_end_io = btrfsic_bh_end_io;
2815 }
2816 }
2817 mutex_unlock(&btrfsic_mutex);
2818 return submit_bh(rw, bh);
2819}
2820
2821void btrfsic_submit_bio(int rw, struct bio *bio)
2822{
2823 struct btrfsic_dev_state *dev_state;
2824
2825 if (!btrfsic_is_initialized) {
2826 submit_bio(rw, bio);
2827 return;
2828 }
2829
2830 mutex_lock(&btrfsic_mutex);
2831 /* since btrfsic_submit_bio() is also called before
2832 * btrfsic_mount(), this might return NULL */
2833 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2834 if (NULL != dev_state &&
2835 (rw & WRITE) && NULL != bio->bi_io_vec) {
2836 unsigned int i;
2837 u64 dev_bytenr;
2838 int bio_is_patched;
2839
2840 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0;
2842 if (dev_state->state->print_mask &
2843 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2844 printk(KERN_INFO
2845 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2846 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2847 rw, bio->bi_vcnt, bio->bi_sector,
2848 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev);
2850
2851 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data;
2853
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u,"
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr,
2868 mapped_data,
2869 bio->bi_io_vec[i].bv_len,
2870 bio, &bio_is_patched,
2871 NULL, rw);
2872 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 }
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO
2879			       "submit_bio(rw=0x%x) FLUSH, bdev=%p\n",
2880 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask &
2883 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2884 BTRFSIC_PRINT_MASK_VERBOSE)))
2885 printk(KERN_INFO
2886 "btrfsic_submit_bio(%s) with FLUSH"
2887 " but dummy block already in use"
2888 " (ignored)!\n",
2889 dev_state->name);
2890 } else {
2891 struct btrfsic_block *const block =
2892 &dev_state->dummy_block_for_bio_bh_flush;
2893
2894 block->is_iodone = 0;
2895 block->never_written = 0;
2896 block->iodone_w_error = 0;
2897 block->flush_gen = dev_state->last_flush_gen + 1;
2898 block->submit_bio_bh_rw = rw;
2899 block->orig_bio_bh_private = bio->bi_private;
2900 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2901 block->next_in_same_bio = NULL;
2902 bio->bi_private = block;
2903 bio->bi_end_io = btrfsic_bio_end_io;
2904 }
2905 }
2906 mutex_unlock(&btrfsic_mutex);
2907
2908 submit_bio(rw, bio);
2909}
2910
2911int btrfsic_mount(struct btrfs_root *root,
2912 struct btrfs_fs_devices *fs_devices,
2913 int including_extent_data, u32 print_mask)
2914{
2915 int ret;
2916 struct btrfsic_state *state;
2917 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device;
2919
2920 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2923 return -1;
2924 }
2925
2926 if (!btrfsic_is_initialized) {
2927 mutex_init(&btrfsic_mutex);
2928 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2929 btrfsic_is_initialized = 1;
2930 }
2931 mutex_lock(&btrfsic_mutex);
2932 state->root = root;
2933 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0;
2936 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2939 state->max_superblock_generation = 0;
2940 state->latest_superblock = NULL;
2941
2942 list_for_each_entry(device, dev_head, dev_list) {
2943 struct btrfsic_dev_state *ds;
2944 char *p;
2945
2946 if (!device->bdev || !device->name)
2947 continue;
2948
2949 ds = btrfsic_dev_state_alloc();
2950 if (NULL == ds) {
2951 printk(KERN_INFO
2952 "btrfs check-integrity: kmalloc() failed!\n");
2953 mutex_unlock(&btrfsic_mutex);
2954 return -1;
2955 }
2956 ds->bdev = device->bdev;
2957 ds->state = state;
2958 bdevname(ds->bdev, ds->name);
2959 ds->name[BDEVNAME_SIZE - 1] = '\0';
2960 for (p = ds->name; *p != '\0'; p++);
2961 while (p > ds->name && *p != '/')
2962 p--;
2963 if (*p == '/')
2964 p++;
2965 strlcpy(ds->name, p, sizeof(ds->name));
2966 btrfsic_dev_state_hashtable_add(ds,
2967 &btrfsic_dev_state_hashtable);
2968 }
2969
2970 ret = btrfsic_process_superblock(state, fs_devices);
2971 if (0 != ret) {
2972 mutex_unlock(&btrfsic_mutex);
2973 btrfsic_unmount(root, fs_devices);
2974 return ret;
2975 }
2976
2977 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2978 btrfsic_dump_database(state);
2979 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2980 btrfsic_dump_tree(state);
2981
2982 mutex_unlock(&btrfsic_mutex);
2983 return 0;
2984}
2985
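The name handling in btrfsic_mount() walks to the end of the device name returned by bdevname() and then back to the last '/', so only the final path component is kept for log messages. The same trimming as a stand-alone helper; memmove is used here because source and destination overlap:

#include <stdio.h>
#include <string.h>

/* keep only the text after the last '/' */
static void trim_to_basename(char *name)
{
	char *p = name;

	while (*p != '\0')		/* find the end of the string */
		p++;
	while (p > name && *p != '/')	/* walk back to the last '/' */
		p--;
	if (*p == '/')
		p++;
	memmove(name, p, strlen(p) + 1);
}

int main(void)
{
	char name[64] = "/dev/sdb1";

	trim_to_basename(name);
	printf("%s\n", name);		/* prints "sdb1" */
	return 0;
}
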
2986void btrfsic_unmount(struct btrfs_root *root,
2987 struct btrfs_fs_devices *fs_devices)
2988{
2989 struct list_head *elem_all;
2990 struct list_head *tmp_all;
2991 struct btrfsic_state *state;
2992 struct list_head *dev_head = &fs_devices->devices;
2993 struct btrfs_device *device;
2994
2995 if (!btrfsic_is_initialized)
2996 return;
2997
2998 mutex_lock(&btrfsic_mutex);
2999
3000 state = NULL;
3001 list_for_each_entry(device, dev_head, dev_list) {
3002 struct btrfsic_dev_state *ds;
3003
3004 if (!device->bdev || !device->name)
3005 continue;
3006
3007 ds = btrfsic_dev_state_hashtable_lookup(
3008 device->bdev,
3009 &btrfsic_dev_state_hashtable);
3010 if (NULL != ds) {
3011 state = ds->state;
3012 btrfsic_dev_state_hashtable_remove(ds);
3013 btrfsic_dev_state_free(ds);
3014 }
3015 }
3016
3017 if (NULL == state) {
3018 printk(KERN_INFO
3019 "btrfsic: error, cannot find state information"
3020 " on umount!\n");
3021 mutex_unlock(&btrfsic_mutex);
3022 return;
3023 }
3024
3025 /*
3026 * Don't care about keeping the lists' state up to date,
3027 * just free all memory that was allocated dynamically.
3028 * Free the blocks and the block_links.
3029 */
3030 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3031 struct btrfsic_block *const b_all =
3032 list_entry(elem_all, struct btrfsic_block,
3033 all_blocks_node);
3034 struct list_head *elem_ref_to;
3035 struct list_head *tmp_ref_to;
3036
3037 list_for_each_safe(elem_ref_to, tmp_ref_to,
3038 &b_all->ref_to_list) {
3039 struct btrfsic_block_link *const l =
3040 list_entry(elem_ref_to,
3041 struct btrfsic_block_link,
3042 node_ref_to);
3043
3044 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3045 btrfsic_print_rem_link(state, l);
3046
3047 l->ref_cnt--;
3048 if (0 == l->ref_cnt)
3049 btrfsic_block_link_free(l);
3050 }
3051
3052 if (b_all->is_iodone)
3053 btrfsic_block_free(b_all);
3054 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block"
3056 " @%llu (%s/%llu/%d) on umount which is"
3057 " not yet iodone!\n",
3058 btrfsic_get_block_type(state, b_all),
3059 (unsigned long long)b_all->logical_bytenr,
3060 b_all->dev_state->name,
3061 (unsigned long long)b_all->dev_bytenr,
3062 b_all->mirror_num);
3063 }
3064
3065 mutex_unlock(&btrfsic_mutex);
3066
3067 kfree(state);
3068}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
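The header keeps callers unconditional: when CONFIG_BTRFS_FS_CHECK_INTEGRITY is disabled, the wrappers are plain macro aliases for submit_bh/submit_bio, so the call sites compile to the ordinary submission path with no overhead. A generic sketch of that compile-time fallback pattern, with names invented for the example:

#include <stdio.h>

static int plain_submit(int rw, const char *what)
{
	printf("submitting %s (rw=%d)\n", what, rw);
	return 0;
}

#ifdef CONFIG_EXAMPLE_CHECKER
/* feature enabled: the wrapper may inspect the request before forwarding */
static int checked_submit(int rw, const char *what)
{
	printf("checker: inspecting %s\n", what);
	return plain_submit(rw, what);
}
#else
/* feature disabled: the wrapper *is* the plain call, zero overhead */
#define checked_submit plain_submit
#endif

int main(void)
{
	return checked_submit(1, "superblock");	/* callers never change */
}
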
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..27ebe61d3ccc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
89/* orhpan objectid for tracking unlinked/truncated files */ 92/* orhpan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remappings between two formats in future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
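The hunk above widens the block group flags to explicit 1ULL shifts and reserves bit 48 (BTRFS_AVAIL_ALLOC_BIT_SINGLE) so the in-memory "extended" profile form can record that SINGLE chunks exist, while the TYPE and PROFILE masks split the field into its two halves. A small sketch using the same values to show how the masks are meant to be applied; the constants are copied from the hunk with the BTRFS_ prefix dropped:

#include <stdint.h>
#include <stdio.h>

#define BLOCK_GROUP_DATA	(1ULL << 0)
#define BLOCK_GROUP_SYSTEM	(1ULL << 1)
#define BLOCK_GROUP_METADATA	(1ULL << 2)
#define BLOCK_GROUP_RAID0	(1ULL << 3)
#define BLOCK_GROUP_RAID1	(1ULL << 4)
#define BLOCK_GROUP_DUP		(1ULL << 5)
#define BLOCK_GROUP_RAID10	(1ULL << 6)
#define AVAIL_ALLOC_BIT_SINGLE	(1ULL << 48)

#define BLOCK_GROUP_TYPE_MASK	 (BLOCK_GROUP_DATA | BLOCK_GROUP_SYSTEM | \
				  BLOCK_GROUP_METADATA)
#define BLOCK_GROUP_PROFILE_MASK (BLOCK_GROUP_RAID0 | BLOCK_GROUP_RAID1 | \
				  BLOCK_GROUP_DUP | BLOCK_GROUP_RAID10)

int main(void)
{
	/* extended form: metadata chunks exist, currently as SINGLE only */
	uint64_t avail_metadata_alloc_bits =
		BLOCK_GROUP_METADATA | AVAIL_ALLOC_BIT_SINGLE;

	printf("type bits:        0x%llx\n", (unsigned long long)
	       (avail_metadata_alloc_bits & BLOCK_GROUP_TYPE_MASK));
	printf("profile bits:     0x%llx\n", (unsigned long long)
	       (avail_metadata_alloc_bits & BLOCK_GROUP_PROFILE_MASK));
	printf("single available: %s\n",
	       (avail_metadata_alloc_bits & AVAIL_ALLOC_BIT_SINGLE) ? "yes" : "no");
	return 0;
}
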
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
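The btrfs_disk_balance_args_to_cpu()/btrfs_cpu_balance_args_to_disk() helpers added in this hunk convert every field between the little-endian on-disk layout and native byte order, zeroing the destination first so the reserved tail stays clean. A stand-alone sketch of the same round-trip for a two-field struct; it assumes glibc's <endian.h> helpers, which is an assumption of this example rather than anything the kernel code uses:

#include <endian.h>	/* htole64()/le64toh() */
#include <stdint.h>
#include <string.h>

struct disk_args {	/* fixed little-endian on-disk layout */
	uint64_t devid;
	uint64_t flags;
	uint64_t unused[2];
};

struct cpu_args {	/* native byte order, used in memory */
	uint64_t devid;
	uint64_t flags;
};

static void disk_args_to_cpu(struct cpu_args *cpu, const struct disk_args *disk)
{
	memset(cpu, 0, sizeof(*cpu));
	cpu->devid = le64toh(disk->devid);
	cpu->flags = le64toh(disk->flags);
}

static void cpu_args_to_disk(struct disk_args *disk, const struct cpu_args *cpu)
{
	memset(disk, 0, sizeof(*disk));	/* keeps the unused tail zeroed */
	disk->devid = htole64(cpu->devid);
	disk->flags = htole64(cpu->flags);
}

int main(void)
{
	struct cpu_args c = { .devid = 2, .flags = 1 };
	struct disk_args d;

	cpu_args_to_disk(&d, &c);
	disk_args_to_cpu(&c, &d);	/* round-trips on any host endianness */
	return 0;
}
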
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2196 return btrfs_item_size(eb, e) - offset; 2364 return btrfs_item_size(eb, e) - offset;
2197} 2365}
2198 2366
2199static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 2367static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2200{ 2368{
2201 return sb->s_fs_info; 2369 return sb->s_fs_info;
2202} 2370}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
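[Annotator note] The ctree.h hunk above adds profile_is_valid(), which accepts a balance/allocation profile only if, after masking off the block-group type bits (and the SINGLE bit when the profile is extended), zero or exactly one RAID-profile bit remains. The closing test uses the usual two's-complement trick: ~flags + 1 is -flags, and flags & -flags isolates the lowest set bit. A minimal stand-alone sketch of that check (user-space, placeholder values, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Sketch of the "zero or exactly one bit set" test used by profile_is_valid();
 * ~x + 1 == -x isolates the lowest set bit of x. */
static int zero_or_one_bit(uint64_t flags)
{
	return (flags & (~flags + 1)) == flags;
}

int main(void)
{
	printf("%d %d %d\n",
	       zero_or_one_bit(0),		/* 1: no profile bit (single)  */
	       zero_or_one_bit(1 << 3),		/* 1: exactly one profile bit  */
	       zero_or_one_bit(0x18));		/* 0: two bits, not reduced    */
	return 0;
}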
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs. work. 720 * how block rsvs. work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
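[Annotator note] The delayed-inode.c changes above thread trace_btrfs_space_reservation() through the reserve and release paths, so every metadata reservation taken for a delayed item or delayed inode shows up in the trace stream paired with a matching release; the reserve call in btrfs_insert_delayed_dir_index() is also moved after the item's key is filled in, so the traced objectid is the real one. A minimal user-space sketch of the invariant the tracepoints make visible; the function below is a hypothetical stand-in, not the kernel API:

#include <stdio.h>
#include <stdint.h>

static int64_t outstanding;	/* bytes currently reserved, per the trace */

/* Hypothetical stand-in for trace_btrfs_space_reservation(): last arg 1 means
 * reserve, 0 means release. A trace ending with outstanding != 0 is a leak. */
static void trace_space(const char *type, uint64_t objectid, uint64_t bytes,
			int reserve)
{
	outstanding += reserve ? (int64_t)bytes : -(int64_t)bytes;
	printf("%s objectid=%llu bytes=%llu %s (outstanding=%lld)\n", type,
	       (unsigned long long)objectid, (unsigned long long)bytes,
	       reserve ? "reserve" : "release", (long long)outstanding);
}

int main(void)
{
	trace_space("delayed_item", 257, 4096, 1);	/* reserve on insert */
	trace_space("delayed_item", 257, 4096, 0);	/* release on flush  */
	return outstanding != 0;
}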
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
152 * find an head entry based on bytenr. This returns the delayed ref 157 * find an head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
187 211
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
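[Annotator note] find_ref_head() above grows a return_bigger argument: when no head matches bytenr exactly, the next bigger entry is returned, wrapping around to rb_first() if the search ran off the end of the tree. That lets btrfs_find_ref_cluster() drop its old backward-walking loop and simply ask for "start + 1 or bigger". A small user-space sketch of the same lookup rule, with a sorted array standing in for the rbtree (illustrative only):

#include <stdio.h>
#include <stddef.h>

/* Sketch: a sorted array plays the role of the delayed-ref head rbtree. */
static const unsigned long heads[] = { 10, 40, 70 };
static const size_t nheads = sizeof(heads) / sizeof(heads[0]);

static unsigned long find_head(unsigned long bytenr, int return_bigger)
{
	for (size_t i = 0; i < nheads; i++) {
		if (heads[i] == bytenr)
			return heads[i];		/* exact match */
		if (heads[i] > bytenr && return_bigger)
			return heads[i];		/* next bigger entry */
	}
	/* ran past the last entry: wrap around to the smallest head */
	return return_bigger ? heads[0] : 0;
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       find_head(40, 1),	/* exact match: 40        */
	       find_head(41, 1),	/* next bigger: 70        */
	       find_head(99, 1));	/* wrap to first head: 10 */
	return 0;
}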
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
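[Annotator note] The seq machinery added to delayed-ref.h follows a simple protocol: a backref walker takes delayed_refs->lock, calls btrfs_get_delayed_seq() to pin the current seq on seq_head, and later calls btrfs_put_delayed_seq() to unpin it and wake seq_wait; while anything is pinned, btrfs_check_delayed_seq() holds back any ref whose seq is at or above the oldest pinned value, so the walk sees a stable picture. A user-space sketch of the hold-back decision for a single pinned walker (placeholder values, not kernel API):

#include <stdio.h>
#include <stdint.h>

/* Sketch mirroring btrfs_check_delayed_seq() against the oldest pinned seq.
 * pinned == 0 models "no backref walk in progress"; refs with seq 0 are
 * never held back, matching the check in run_clustered_refs(). */
static int must_hold_back(uint64_t ref_seq, uint64_t pinned)
{
	if (!pinned || !ref_seq)
		return 0;		/* nothing pinned, or unsequenced ref */
	return ref_seq >= pinned;	/* newer than the walk: hold it back */
}

int main(void)
{
	uint64_t pinned = 5;	/* a walker pinned seq 5 */
	printf("seq 4 -> %s\n", must_hold_back(4, pinned) ? "hold" : "run");
	printf("seq 7 -> %s\n", must_hold_back(7, pinned) ? "hold" : "run");
	return 0;
}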
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..7aa9cd36bf1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -872,7 +873,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
872 873
873#ifdef CONFIG_MIGRATION 874#ifdef CONFIG_MIGRATION
874static int btree_migratepage(struct address_space *mapping, 875static int btree_migratepage(struct address_space *mapping,
875 struct page *newpage, struct page *page) 876 struct page *newpage, struct page *page,
877 enum migrate_mode mode)
876{ 878{
877 /* 879 /*
878 * we can't safely write a btree page from here, 880 * we can't safely write a btree page from here,
@@ -887,7 +889,7 @@ static int btree_migratepage(struct address_space *mapping,
887 if (page_has_private(page) && 889 if (page_has_private(page) &&
888 !try_to_release_page(page, GFP_KERNEL)) 890 !try_to_release_page(page, GFP_KERNEL))
889 return -EAGAIN; 891 return -EAGAIN;
890 return migrate_page(mapping, newpage, page); 892 return migrate_page(mapping, newpage, page, mode);
891} 893}
892#endif 894#endif
893 895
@@ -1142,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1142 root->orphan_item_inserted = 0; 1144 root->orphan_item_inserted = 0;
1143 root->orphan_cleanup_state = 0; 1145 root->orphan_cleanup_state = 0;
1144 1146
1145 root->fs_info = fs_info;
1146 root->objectid = objectid; 1147 root->objectid = objectid;
1147 root->last_trans = 0; 1148 root->last_trans = 0;
1148 root->highest_objectid = 0; 1149 root->highest_objectid = 0;
@@ -1216,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1216 return 0; 1217 return 0;
1217} 1218}
1218 1219
1220static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1221{
1222 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1223 if (root)
1224 root->fs_info = fs_info;
1225 return root;
1226}
1227
1219static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1220 struct btrfs_fs_info *fs_info) 1229 struct btrfs_fs_info *fs_info)
1221{ 1230{
@@ -1223,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1223 struct btrfs_root *tree_root = fs_info->tree_root; 1232 struct btrfs_root *tree_root = fs_info->tree_root;
1224 struct extent_buffer *leaf; 1233 struct extent_buffer *leaf;
1225 1234
1226 root = kzalloc(sizeof(*root), GFP_NOFS); 1235 root = btrfs_alloc_root(fs_info);
1227 if (!root) 1236 if (!root)
1228 return ERR_PTR(-ENOMEM); 1237 return ERR_PTR(-ENOMEM);
1229 1238
@@ -1243,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1243 root->ref_cows = 0; 1252 root->ref_cows = 0;
1244 1253
1245 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1254 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1246 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1255 BTRFS_TREE_LOG_OBJECTID, NULL,
1256 0, 0, 0, 0);
1247 if (IS_ERR(leaf)) { 1257 if (IS_ERR(leaf)) {
1248 kfree(root); 1258 kfree(root);
1249 return ERR_CAST(leaf); 1259 return ERR_CAST(leaf);
@@ -1317,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1317 u32 blocksize; 1327 u32 blocksize;
1318 int ret = 0; 1328 int ret = 0;
1319 1329
1320 root = kzalloc(sizeof(*root), GFP_NOFS); 1330 root = btrfs_alloc_root(fs_info);
1321 if (!root) 1331 if (!root)
1322 return ERR_PTR(-ENOMEM); 1332 return ERR_PTR(-ENOMEM);
1323 if (location->offset == (u64)-1) { 1333 if (location->offset == (u64)-1) {
@@ -1873,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1873} 1883}
1874 1884
1875 1885
1876struct btrfs_root *open_ctree(struct super_block *sb, 1886int open_ctree(struct super_block *sb,
1877 struct btrfs_fs_devices *fs_devices, 1887 struct btrfs_fs_devices *fs_devices,
1878 char *options) 1888 char *options)
1879{ 1889{
1880 u32 sectorsize; 1890 u32 sectorsize;
1881 u32 nodesize; 1891 u32 nodesize;
@@ -1887,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1887 struct btrfs_key location; 1897 struct btrfs_key location;
1888 struct buffer_head *bh; 1898 struct buffer_head *bh;
1889 struct btrfs_super_block *disk_super; 1899 struct btrfs_super_block *disk_super;
1890 struct btrfs_root *tree_root = btrfs_sb(sb); 1900 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1891 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1901 struct btrfs_root *tree_root;
1892 struct btrfs_root *extent_root; 1902 struct btrfs_root *extent_root;
1893 struct btrfs_root *csum_root; 1903 struct btrfs_root *csum_root;
1894 struct btrfs_root *chunk_root; 1904 struct btrfs_root *chunk_root;
@@ -1899,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1899 int num_backups_tried = 0; 1909 int num_backups_tried = 0;
1900 int backup_index = 0; 1910 int backup_index = 0;
1901 1911
1902 extent_root = fs_info->extent_root = 1912 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
1903 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1913 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
1904 csum_root = fs_info->csum_root = 1914 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1905 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1915 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1906 chunk_root = fs_info->chunk_root = 1916 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 dev_root = fs_info->dev_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 1917
1911 if (!extent_root || !csum_root || !chunk_root || !dev_root) { 1918 if (!tree_root || !extent_root || !csum_root ||
1919 !chunk_root || !dev_root) {
1912 err = -ENOMEM; 1920 err = -ENOMEM;
1913 goto fail; 1921 goto fail;
1914 } 1922 }
@@ -1997,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1997 init_waitqueue_head(&fs_info->scrub_pause_wait); 2005 init_waitqueue_head(&fs_info->scrub_pause_wait);
1998 init_rwsem(&fs_info->scrub_super_lock); 2006 init_rwsem(&fs_info->scrub_super_lock);
1999 fs_info->scrub_workers_refcnt = 0; 2007 fs_info->scrub_workers_refcnt = 0;
2008#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2009 fs_info->check_integrity_print_mask = 0;
2010#endif
2011
2012 spin_lock_init(&fs_info->balance_lock);
2013 mutex_init(&fs_info->balance_mutex);
2014 atomic_set(&fs_info->balance_running, 0);
2015 atomic_set(&fs_info->balance_pause_req, 0);
2016 atomic_set(&fs_info->balance_cancel_req, 0);
2017 fs_info->balance_ctl = NULL;
2018 init_waitqueue_head(&fs_info->balance_wait_q);
2000 2019
2001 sb->s_blocksize = 4096; 2020 sb->s_blocksize = 4096;
2002 sb->s_blocksize_bits = blksize_bits(4096); 2021 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2266,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2266 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2285 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2267 BTRFS_UUID_SIZE); 2286 BTRFS_UUID_SIZE);
2268 2287
2269 mutex_lock(&fs_info->chunk_mutex);
2270 ret = btrfs_read_chunk_tree(chunk_root); 2288 ret = btrfs_read_chunk_tree(chunk_root);
2271 mutex_unlock(&fs_info->chunk_mutex);
2272 if (ret) { 2289 if (ret) {
2273 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2290 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2274 sb->s_id); 2291 sb->s_id);
@@ -2317,9 +2334,6 @@ retry_root_backup:
2317 2334
2318 fs_info->generation = generation; 2335 fs_info->generation = generation;
2319 fs_info->last_trans_committed = generation; 2336 fs_info->last_trans_committed = generation;
2320 fs_info->data_alloc_profile = (u64)-1;
2321 fs_info->metadata_alloc_profile = (u64)-1;
2322 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2323 2337
2324 ret = btrfs_init_space_info(fs_info); 2338 ret = btrfs_init_space_info(fs_info);
2325 if (ret) { 2339 if (ret) {
@@ -2352,6 +2366,19 @@ retry_root_backup:
2352 btrfs_set_opt(fs_info->mount_opt, SSD); 2366 btrfs_set_opt(fs_info->mount_opt, SSD);
2353 } 2367 }
2354 2368
2369#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2370 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2371 ret = btrfsic_mount(tree_root, fs_devices,
2372 btrfs_test_opt(tree_root,
2373 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2374 1 : 0,
2375 fs_info->check_integrity_print_mask);
2376 if (ret)
2377 printk(KERN_WARNING "btrfs: failed to initialize"
2378 " integrity check module %s\n", sb->s_id);
2379 }
2380#endif
2381
2355 /* do not make disk changes in broken FS */ 2382 /* do not make disk changes in broken FS */
2356 if (btrfs_super_log_root(disk_super) != 0 && 2383 if (btrfs_super_log_root(disk_super) != 0 &&
2357 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2384 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2367,7 +2394,7 @@ retry_root_backup:
2367 btrfs_level_size(tree_root, 2394 btrfs_level_size(tree_root,
2368 btrfs_super_log_root_level(disk_super)); 2395 btrfs_super_log_root_level(disk_super));
2369 2396
2370 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 2397 log_tree_root = btrfs_alloc_root(fs_info);
2371 if (!log_tree_root) { 2398 if (!log_tree_root) {
2372 err = -ENOMEM; 2399 err = -ENOMEM;
2373 goto fail_trans_kthread; 2400 goto fail_trans_kthread;
@@ -2422,13 +2449,17 @@ retry_root_backup:
2422 if (!err) 2449 if (!err)
2423 err = btrfs_orphan_cleanup(fs_info->tree_root); 2450 err = btrfs_orphan_cleanup(fs_info->tree_root);
2424 up_read(&fs_info->cleanup_work_sem); 2451 up_read(&fs_info->cleanup_work_sem);
2452
2453 if (!err)
2454 err = btrfs_recover_balance(fs_info->tree_root);
2455
2425 if (err) { 2456 if (err) {
2426 close_ctree(tree_root); 2457 close_ctree(tree_root);
2427 return ERR_PTR(err); 2458 return err;
2428 } 2459 }
2429 } 2460 }
2430 2461
2431 return tree_root; 2462 return 0;
2432 2463
2433fail_trans_kthread: 2464fail_trans_kthread:
2434 kthread_stop(fs_info->transaction_kthread); 2465 kthread_stop(fs_info->transaction_kthread);
@@ -2474,8 +2505,7 @@ fail_srcu:
2474 cleanup_srcu_struct(&fs_info->subvol_srcu); 2505 cleanup_srcu_struct(&fs_info->subvol_srcu);
2475fail: 2506fail:
2476 btrfs_close_devices(fs_info->fs_devices); 2507 btrfs_close_devices(fs_info->fs_devices);
2477 free_fs_info(fs_info); 2508 return err;
2478 return ERR_PTR(err);
2479 2509
2480recovery_tree_root: 2510recovery_tree_root:
2481 if (!btrfs_test_opt(tree_root, RECOVERY)) 2511 if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2630,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device,
2630 * we fua the first super. The others we allow 2660 * we fua the first super. The others we allow
2631 * to go down lazy. 2661 * to go down lazy.
2632 */ 2662 */
2633 ret = submit_bh(WRITE_FUA, bh); 2663 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2634 if (ret) 2664 if (ret)
2635 errors++; 2665 errors++;
2636 } 2666 }
@@ -2707,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2707 device->flush_bio = bio; 2737 device->flush_bio = bio;
2708 2738
2709 bio_get(bio); 2739 bio_get(bio);
2710 submit_bio(WRITE_FLUSH, bio); 2740 btrfsic_submit_bio(WRITE_FLUSH, bio);
2711 2741
2712 return 0; 2742 return 0;
2713} 2743}
@@ -2971,6 +3001,9 @@ int close_ctree(struct btrfs_root *root)
2971 fs_info->closing = 1; 3001 fs_info->closing = 1;
2972 smp_mb(); 3002 smp_mb();
2973 3003
3004 /* pause restriper - we want to resume on mount */
3005 btrfs_pause_balance(root->fs_info);
3006
2974 btrfs_scrub_cancel(root); 3007 btrfs_scrub_cancel(root);
2975 3008
2976 /* wait for any defraggers to finish */ 3009 /* wait for any defraggers to finish */
@@ -2978,7 +3011,7 @@ int close_ctree(struct btrfs_root *root)
2978 (atomic_read(&fs_info->defrag_running) == 0)); 3011 (atomic_read(&fs_info->defrag_running) == 0));
2979 3012
2980 /* clear out the rbtree of defraggable inodes */ 3013 /* clear out the rbtree of defraggable inodes */
2981 btrfs_run_defrag_inodes(root->fs_info); 3014 btrfs_run_defrag_inodes(fs_info);
2982 3015
2983 /* 3016 /*
2984 * Here come 2 situations when btrfs is broken to flip readonly: 3017 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3007,8 +3040,8 @@ int close_ctree(struct btrfs_root *root)
3007 3040
3008 btrfs_put_block_group_cache(fs_info); 3041 btrfs_put_block_group_cache(fs_info);
3009 3042
3010 kthread_stop(root->fs_info->transaction_kthread); 3043 kthread_stop(fs_info->transaction_kthread);
3011 kthread_stop(root->fs_info->cleaner_kthread); 3044 kthread_stop(fs_info->cleaner_kthread);
3012 3045
3013 fs_info->closing = 2; 3046 fs_info->closing = 2;
3014 smp_mb(); 3047 smp_mb();
@@ -3026,14 +3059,14 @@ int close_ctree(struct btrfs_root *root)
3026 free_extent_buffer(fs_info->extent_root->commit_root); 3059 free_extent_buffer(fs_info->extent_root->commit_root);
3027 free_extent_buffer(fs_info->tree_root->node); 3060 free_extent_buffer(fs_info->tree_root->node);
3028 free_extent_buffer(fs_info->tree_root->commit_root); 3061 free_extent_buffer(fs_info->tree_root->commit_root);
3029 free_extent_buffer(root->fs_info->chunk_root->node); 3062 free_extent_buffer(fs_info->chunk_root->node);
3030 free_extent_buffer(root->fs_info->chunk_root->commit_root); 3063 free_extent_buffer(fs_info->chunk_root->commit_root);
3031 free_extent_buffer(root->fs_info->dev_root->node); 3064 free_extent_buffer(fs_info->dev_root->node);
3032 free_extent_buffer(root->fs_info->dev_root->commit_root); 3065 free_extent_buffer(fs_info->dev_root->commit_root);
3033 free_extent_buffer(root->fs_info->csum_root->node); 3066 free_extent_buffer(fs_info->csum_root->node);
3034 free_extent_buffer(root->fs_info->csum_root->commit_root); 3067 free_extent_buffer(fs_info->csum_root->commit_root);
3035 3068
3036 btrfs_free_block_groups(root->fs_info); 3069 btrfs_free_block_groups(fs_info);
3037 3070
3038 del_fs_roots(fs_info); 3071 del_fs_roots(fs_info);
3039 3072
@@ -3053,14 +3086,17 @@ int close_ctree(struct btrfs_root *root)
3053 btrfs_stop_workers(&fs_info->caching_workers); 3086 btrfs_stop_workers(&fs_info->caching_workers);
3054 btrfs_stop_workers(&fs_info->readahead_workers); 3087 btrfs_stop_workers(&fs_info->readahead_workers);
3055 3088
3089#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3090 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3091 btrfsic_unmount(root, fs_info->fs_devices);
3092#endif
3093
3056 btrfs_close_devices(fs_info->fs_devices); 3094 btrfs_close_devices(fs_info->fs_devices);
3057 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3095 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3058 3096
3059 bdi_destroy(&fs_info->bdi); 3097 bdi_destroy(&fs_info->bdi);
3060 cleanup_srcu_struct(&fs_info->subvol_srcu); 3098 cleanup_srcu_struct(&fs_info->subvol_srcu);
3061 3099
3062 free_fs_info(fs_info);
3063
3064 return 0; 3100 return 0;
3065} 3101}
3066 3102
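[Annotator note] With the disk-io.c changes above, open_ctree() returns an int instead of the tree root, reads fs_info directly from sb->s_fs_info via the retyped btrfs_sb(), and no longer frees fs_info on its error paths (free_fs_info() is also gone from close_ctree()), so ownership stays with the mount path. The following is a hedged, non-compilable sketch of how a caller would consume the new contract; the real caller lives in super.c, which is not part of these hunks, and example_fill_super() is an assumed name:

/* Illustrative caller sketch (assumed, not from this patch): the mount path
 * allocates fs_info, publishes it via sb->s_fs_info, and frees it itself if
 * open_ctree() fails -- open_ctree() no longer does the cleanup. */
static int example_fill_super(struct super_block *sb,
			      struct btrfs_fs_devices *fs_devices,
			      char *options)
{
	struct btrfs_fs_info *fs_info;
	int err;

	fs_info = kzalloc(sizeof(*fs_info), GFP_NOFS);
	if (!fs_info)
		return -ENOMEM;

	sb->s_fs_info = fs_info;	/* btrfs_sb(sb) now returns this */
	err = open_ctree(sb, fs_devices, options);
	if (err)
		free_fs_info(fs_info);	/* ownership stays with the caller */
	return err;
}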
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
46 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
47int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, struct extent_buffer *buf); 48 struct btrfs_root *root, struct extent_buffer *buf);
49struct btrfs_root *open_ctree(struct super_block *sb, 49int open_ctree(struct super_block *sb,
50 struct btrfs_fs_devices *fs_devices, 50 struct btrfs_fs_devices *fs_devices,
51 char *options); 51 char *options);
52int close_ctree(struct btrfs_root *root); 52int close_ctree(struct btrfs_root *root);
53int write_ctree_super(struct btrfs_trans_handle *trans, 53int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation, 67 u64 root_objectid, u32 generation,
68 int check_generation) 68 int check_generation)
69{ 69{
70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
71 struct btrfs_root *root; 71 struct btrfs_root *root;
72 struct inode *inode; 72 struct inode *inode;
73 struct btrfs_key key; 73 struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..700879ed64cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 618 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 619 struct btrfs_space_info *found;
620 620
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 621 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 622
624 rcu_read_lock(); 623 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 624 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1871int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1872 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1873 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1874 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1875{
1877 int ret; 1876 int ret;
1877 struct btrfs_fs_info *fs_info = root->fs_info;
1878
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1879 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1880 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1881
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1882 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1883 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1884 num_bytes,
1883 parent, root_objectid, (int)owner, 1885 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1886 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1887 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1888 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1889 num_bytes,
1887 parent, root_objectid, owner, offset, 1890 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1891 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1892 }
1890 return ret; 1893 return ret;
1891} 1894}
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2236 }
2234 2237
2235 /* 2238 /*
2239 * locked_ref is the head node, so we have to go one
2240 * node back for any delayed ref updates
2241 */
2242 ref = select_delayed_ref(locked_ref);
2243
2244 if (ref && ref->seq &&
2245 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2246 /*
2247 * there are still refs with lower seq numbers in the
2248 * process of being added. Don't run this ref yet.
2249 */
2250 list_del_init(&locked_ref->cluster);
2251 mutex_unlock(&locked_ref->mutex);
2252 locked_ref = NULL;
2253 delayed_refs->num_heads_ready++;
2254 spin_unlock(&delayed_refs->lock);
2255 cond_resched();
2256 spin_lock(&delayed_refs->lock);
2257 continue;
2258 }
2259
2260 /*
2236 * record the must insert reserved flag before we 2261 * record the must insert reserved flag before we
2237 * drop the spin lock. 2262 * drop the spin lock.
2238 */ 2263 */
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2267 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2268 locked_ref->extent_op = NULL;
2244 2269
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2270 if (!ref) {
2251 /* All delayed refs have been processed, Go ahead 2271 /* All delayed refs have been processed, Go ahead
2252 * and send the head node to run_one_delayed_ref, 2272 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2287 BUG_ON(ret);
2268 kfree(extent_op); 2288 kfree(extent_op);
2269 2289
2270 cond_resched(); 2290 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2291 }
2274 2292
2275 list_del_init(&locked_ref->cluster); 2293 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2297 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2298 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2299 delayed_refs->num_entries--;
2282 2300 /*
2301 * we modified num_entries, but as we're currently running
2302 * delayed refs, skip
2303 * wake_up(&delayed_refs->seq_wait);
2304 * here.
2305 */
2283 spin_unlock(&delayed_refs->lock); 2306 spin_unlock(&delayed_refs->lock);
2284 2307
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2308 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2312 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2313 kfree(extent_op);
2291 count++; 2314 count++;
2292 2315next:
2316 do_chunk_alloc(trans, root->fs_info->extent_root,
2317 2 * 1024 * 1024,
2318 btrfs_get_alloc_profile(root, 0),
2319 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2320 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2321 spin_lock(&delayed_refs->lock);
2295 } 2322 }
2296 return count; 2323 return count;
2297} 2324}
2298 2325
2326
2327static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2328 unsigned long num_refs)
2329{
2330 struct list_head *first_seq = delayed_refs->seq_head.next;
2331
2332 spin_unlock(&delayed_refs->lock);
2333 pr_debug("waiting for more refs (num %ld, first %p)\n",
2334 num_refs, first_seq);
2335 wait_event(delayed_refs->seq_wait,
2336 num_refs != delayed_refs->num_entries ||
2337 delayed_refs->seq_head.next != first_seq);
2338 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2339 delayed_refs->num_entries, delayed_refs->seq_head.next);
2340 spin_lock(&delayed_refs->lock);
2341}
2342
2299/* 2343/*
2300 * this starts processing the delayed reference count updates and 2344 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2345 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2355 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2356 struct list_head cluster;
2313 int ret; 2357 int ret;
2358 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2359 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2360 int run_most = 0;
2361 unsigned long num_refs = 0;
2362 int consider_waiting;
2316 2363
2317 if (root == root->fs_info->extent_root) 2364 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2365 root = root->fs_info->tree_root;
2319 2366
2367 do_chunk_alloc(trans, root->fs_info->extent_root,
2368 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2369 CHUNK_ALLOC_NO_FORCE);
2370
2320 delayed_refs = &trans->transaction->delayed_refs; 2371 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2372 INIT_LIST_HEAD(&cluster);
2322again: 2373again:
2374 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2375 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2376 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2377 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2388 * of refs to process starting at the first one we are able to
2337 * lock 2389 * lock
2338 */ 2390 */
2391 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2392 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2393 delayed_refs->run_delayed_start);
2341 if (ret) 2394 if (ret)
2342 break; 2395 break;
2343 2396
2397 if (delayed_start >= delayed_refs->run_delayed_start) {
2398 if (consider_waiting == 0) {
2399 /*
2400 * btrfs_find_ref_cluster looped. let's do one
2401 * more cycle. if we don't run any delayed ref
2402 * during that cycle (because we can't because
2403 * all of them are blocked) and if the number of
2404 * refs doesn't change, we avoid busy waiting.
2405 */
2406 consider_waiting = 1;
2407 num_refs = delayed_refs->num_entries;
2408 } else {
2409 wait_for_more_refs(delayed_refs, num_refs);
2410 /*
2411 * after waiting, things have changed. we
2412 * dropped the lock and someone else might have
2413 * run some refs, built new clusters and so on.
2414 * therefore, we restart staleness detection.
2415 */
2416 consider_waiting = 0;
2417 }
2418 }
2419
2344 ret = run_clustered_refs(trans, root, &cluster); 2420 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2421 BUG_ON(ret < 0);
2346 2422
@@ -2348,6 +2424,11 @@ again:
2348 2424
2349 if (count == 0) 2425 if (count == 0)
2350 break; 2426 break;
2427
2428 if (ret || delayed_refs->run_delayed_start == 0) {
2429 /* refs were run, let's reset staleness detection */
2430 consider_waiting = 0;
2431 }
2351 } 2432 }
2352 2433
2353 if (run_all) { 2434 if (run_all) {
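[Annotator note] The btrfs_run_delayed_refs() hunk above adds a two-strike staleness check: the first time btrfs_find_ref_cluster() wraps run_delayed_start without progress, the loop only arms consider_waiting and records num_entries; if the next pass wraps again with the same picture, wait_for_more_refs() sleeps on seq_wait until the entry count changes or the oldest pinned seq is released, instead of busy-looping on refs that are all held back. A deliberately simplified user-space model of that control flow (placeholder data, not kernel code):

#include <stdio.h>

/* Simplified two-strike model: arm on the first wrap, block (stubbed as a
 * printf) on a second wrap without progress, disarm whenever refs are run. */
struct pass { int wrapped; int made_progress; };

int main(void)
{
	struct pass passes[] = {
		{ 1, 0 },	/* first wrap: arm the check        */
		{ 1, 0 },	/* second wrap, no progress: wait   */
		{ 0, 1 },	/* refs ran: disarm the check again */
	};
	int consider_waiting = 0;

	for (unsigned i = 0; i < sizeof(passes) / sizeof(passes[0]); i++) {
		if (passes[i].wrapped) {
			if (!consider_waiting) {
				consider_waiting = 1;
			} else {
				printf("pass %u: wait_for_more_refs()\n", i);
				consider_waiting = 0;	/* re-evaluate after waking */
			}
		}
		if (passes[i].made_progress)
			consider_waiting = 0;
	}
	return 0;
}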
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2486 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2487 extent_op->is_data = is_data ? 1 : 0;
2407 2488
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2489 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2490 num_bytes, extent_op);
2409 if (ret) 2491 if (ret)
2410 kfree(extent_op); 2492 kfree(extent_op);
2411 return ret; 2493 return ret;
@@ -2590,7 +2672,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2672static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2673 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2674 struct extent_buffer *buf,
2593 int full_backref, int inc) 2675 int full_backref, int inc, int for_cow)
2594{ 2676{
2595 u64 bytenr; 2677 u64 bytenr;
2596 u64 num_bytes; 2678 u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2685 int level;
2604 int ret = 0; 2686 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2687 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2688 u64, u64, u64, u64, u64, u64, int);
2607 2689
2608 ref_root = btrfs_header_owner(buf); 2690 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2691 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2722 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2723 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2724 parent, ref_root, key.objectid,
2643 key.offset); 2725 key.offset, for_cow);
2644 if (ret) 2726 if (ret)
2645 goto fail; 2727 goto fail;
2646 } else { 2728 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2729 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2730 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2731 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2732 parent, ref_root, level - 1, 0,
2733 for_cow);
2651 if (ret) 2734 if (ret)
2652 goto fail; 2735 goto fail;
2653 } 2736 }
@@ -2659,15 +2742,15 @@ fail:
2659} 2742}
2660 2743
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2744int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2745 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2746{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2747 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2748}
2666 2749
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2750int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2751 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2752{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2753 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2754}
2672 2755
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2756static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3076 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3077 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3078 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3079 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3080 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3081 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3082 found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3097
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3098static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3099{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3100 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3101
3021 BTRFS_BLOCK_GROUP_RAID10 | 3102 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3103 if (extra_flags == 0)
3023 if (extra_flags) { 3104 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3105
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3106 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3107 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3108 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3109 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3110 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3111 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3112}
3032 3113
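set_avail_alloc_bits() (and the matching clear_avail_alloc_bits() added near the end of this file) now store profiles in the extended format: a chunk-format value with no RAID/DUP bit means "single", and that case is made explicit with BTRFS_AVAIL_ALLOC_BIT_SINGLE so it can be set and cleared like any other profile bit. A stand-alone sketch of the chunk-to-extended mapping; the bit values are illustrative placeholders, not the real ctree.h constants.

#include <stdint.h>
#include <stdio.h>

/* illustrative placeholder bits, not the real ctree.h values */
#define BG_RAID0           (1ULL << 3)
#define BG_RAID1           (1ULL << 4)
#define BG_DUP             (1ULL << 5)
#define BG_RAID10          (1ULL << 6)
#define BG_PROFILE_MASK    (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
#define AVAIL_ALLOC_SINGLE (1ULL << 48)

/* chunk format -> extended format: "no profile bit" becomes an explicit bit */
static uint64_t chunk_to_extended(uint64_t flags)
{
    uint64_t extra = flags & BG_PROFILE_MASK;

    if (extra == 0)
        extra = AVAIL_ALLOC_SINGLE;
    return extra;
}

int main(void)
{
    printf("raid1  -> %#llx\n", (unsigned long long)chunk_to_extended(BG_RAID1));
    printf("single -> %#llx\n", (unsigned long long)chunk_to_extended(0));
    return 0;
}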
3114/*
3115 * @flags: available profiles in extended format (see ctree.h)
3116 *
3117 * Returns reduced profile in chunk format. If profile changing is in
 3118 * progress (either running or paused), picks the target profile (if it's
3119 * already available), otherwise falls back to plain reducing.
3120 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3121u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3122{
3035 /* 3123 /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3128 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3129 root->fs_info->fs_devices->missing_devices;
3042 3130
3131 /* pick restriper's target profile if it's available */
3132 spin_lock(&root->fs_info->balance_lock);
3133 if (root->fs_info->balance_ctl) {
3134 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3135 u64 tgt = 0;
3136
3137 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3138 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3139 (flags & bctl->data.target)) {
3140 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3141 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3142 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3143 (flags & bctl->sys.target)) {
3144 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3145 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3146 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3147 (flags & bctl->meta.target)) {
3148 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3149 }
3150
3151 if (tgt) {
3152 spin_unlock(&root->fs_info->balance_lock);
3153 flags = tgt;
3154 goto out;
3155 }
3156 }
3157 spin_unlock(&root->fs_info->balance_lock);
3158
3043 if (num_devices == 1) 3159 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3160 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3161 if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3175 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3176 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3177 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3178 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3179 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3180 }
3181
3182out:
3183 /* extended -> chunk profile */
3184 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3185 return flags;
3065} 3186}
3066 3187
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3188static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3189{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3190 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3191 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3192 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3193 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3194 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3195 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3196
3078 return btrfs_reduce_alloc_profile(root, flags); 3197 return btrfs_reduce_alloc_profile(root, flags);
3079} 3198}
3080 3199
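As the new comment above btrfs_reduce_alloc_profile() describes, the function now prefers the restriper's conversion target whenever a balance is running or paused, and only otherwise falls back to the device-count reduction, stripping the SINGLE bit at the end so the result is back in chunk format. A compact model of that decision order, reusing the placeholder bits from the sketch above and leaving out the balance_lock and the remaining pairwise reductions.

#include <stdint.h>
#include <stdio.h>

#define BG_DATA      (1ULL << 0)
#define BG_RAID0     (1ULL << 3)
#define BG_RAID1     (1ULL << 4)
#define BG_RAID10    (1ULL << 6)
#define AVAIL_SINGLE (1ULL << 48)

/* hypothetical stand-in: the restriper's convert target for this type, or 0 */
static uint64_t balance_target(uint64_t flags)
{
    (void)flags;
    return 0;   /* pretend no balance is in progress */
}

static uint64_t reduce_alloc_profile(uint64_t flags, int num_devices)
{
    uint64_t tgt = balance_target(flags);

    if (tgt)
        return tgt & ~AVAIL_SINGLE;     /* extended -> chunk profile */

    if (num_devices == 1)
        flags &= ~(BG_RAID1 | BG_RAID0);
    if (num_devices < 4)
        flags &= ~BG_RAID10;
    /* ... the RAID0/RAID1/RAID10/DUP pairwise reductions elided ... */

    return flags & ~AVAIL_SINGLE;       /* extended -> chunk profile */
}

int main(void)
{
    uint64_t f = BG_DATA | BG_RAID1 | AVAIL_SINGLE;

    printf("reduced: %#llx\n", (unsigned long long)reduce_alloc_profile(f, 1));
    return 0;
}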
@@ -3191,6 +3310,8 @@ commit_trans:
3191 return -ENOSPC; 3310 return -ENOSPC;
3192 } 3311 }
3193 data_sinfo->bytes_may_use += bytes; 3312 data_sinfo->bytes_may_use += bytes;
3313 trace_btrfs_space_reservation(root->fs_info, "space_info",
3314 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3315 spin_unlock(&data_sinfo->lock);
3195 3316
3196 return 0; 3317 return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3331 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3332 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3333 data_sinfo->bytes_may_use -= bytes;
3334 trace_btrfs_space_reservation(root->fs_info, "space_info",
3335 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3336 spin_unlock(&data_sinfo->lock);
3214} 3337}
3215 3338
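Every bytes_may_use adjustment in this patch gains a matching trace_btrfs_space_reservation() call: the string names the reservation type, the next argument identifies the object (a space_info pointer cast to u64, an inode number, or the transaction), and the last flag is 1 on reserve and 0 on release. Schematically, the data-space pair added in the two hunks above is:

    /* reserve: account the bytes and emit the trace with reserve = 1 */
    data_sinfo->bytes_may_use += bytes;
    trace_btrfs_space_reservation(root->fs_info, "space_info",
                                  (u64)data_sinfo, bytes, 1);

    /* release: the mirror image with reserve = 0 */
    data_sinfo->bytes_may_use -= bytes;
    trace_btrfs_space_reservation(root->fs_info, "space_info",
                                  (u64)data_sinfo, bytes, 0);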
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3380 if (num_bytes - num_allocated < thresh)
3258 return 1; 3381 return 1;
3259 } 3382 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
 3263 * and one based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
 3265 * room, don't allocate a chunk. A good amount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3383 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3384
3277 /* 256MB or 5% of the FS */ 3385 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3386 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
 3387 /* system chunks need a much smaller threshold */
3388 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3389 thresh = 32 * 1024 * 1024;
3279 3390
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3391 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3392 return 0;
3282 return 1; 3393 return 1;
3283} 3394}
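With the old 256MB/80% early return removed, should_alloc_chunk() now boils down to: take max(256MB, 2% of the whole filesystem) as the threshold, drop it to 32MB for system chunks, and refuse to allocate only when this space_info is already larger than the threshold yet less than 80% used. A small stand-alone version of that arithmetic; the force/CHUNK_ALLOC_LIMITED handling at the top of the real function is elided, and the div_factor helpers are re-implemented here for illustration.

#include <stdint.h>
#include <stdio.h>

#define SZ_256M (256ULL * 1024 * 1024)
#define SZ_32M  (32ULL * 1024 * 1024)

static uint64_t div_factor(uint64_t num, int factor)      { return num * factor / 10; }
static uint64_t div_factor_fine(uint64_t num, int factor) { return num * factor / 100; }

/* returns 1 if a new chunk should be allocated for this space_info */
static int should_alloc_chunk(uint64_t fs_total, uint64_t si_total,
                              uint64_t si_used, int is_system)
{
    /* 256MB or 2% of the whole filesystem, whichever is larger */
    uint64_t thresh = div_factor_fine(fs_total, 2);

    if (thresh < SZ_256M)
        thresh = SZ_256M;
    if (is_system)
        thresh = SZ_32M;    /* system chunks need a much smaller threshold */

    /* already plenty of room and under 80% used: don't allocate */
    if (si_total > thresh && si_used < div_factor(si_total, 8))
        return 0;
    return 1;
}

int main(void)
{
    uint64_t tib = 1ULL << 40;

    printf("%d\n", should_alloc_chunk(tib, 100ULL << 30, 50ULL << 30, 0));
    return 0;
}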
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3402 int wait_for_alloc = 0;
3292 int ret = 0; 3403 int ret = 0;
3293 3404
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3405 BUG_ON(!profile_is_valid(flags, 0));
3295 3406
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3407 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3408 if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3693 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3694 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3695 space_info->bytes_may_use += orig_bytes;
3696 trace_btrfs_space_reservation(root->fs_info,
3697 "space_info",
3698 (u64)space_info,
3699 orig_bytes, 1);
3585 ret = 0; 3700 ret = 0;
3586 } else { 3701 } else {
3587 /* 3702 /*
@@ -3649,6 +3764,10 @@ again:
3649 3764
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3765 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3766 space_info->bytes_may_use += orig_bytes;
3767 trace_btrfs_space_reservation(root->fs_info,
3768 "space_info",
3769 (u64)space_info,
3770 orig_bytes, 1);
3652 ret = 0; 3771 ret = 0;
3653 } else { 3772 } else {
3654 wait_ordered = true; 3773 wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3874 spin_unlock(&block_rsv->lock);
3756} 3875}
3757 3876
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3877static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3878 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3879 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3880{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3881 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3911 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3912 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3913 space_info->bytes_may_use -= num_bytes;
3914 trace_btrfs_space_reservation(fs_info, "space_info",
3915 (u64)space_info,
3916 num_bytes, 0);
3794 space_info->reservation_progress++; 3917 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3918 spin_unlock(&space_info->lock);
3796 } 3919 }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4070 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4071 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4072 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4073 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4074 num_bytes);
3951} 4075}
3952 4076
3953/* 4077/*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4130 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4131 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4132 sinfo->bytes_may_use += num_bytes;
4133 trace_btrfs_space_reservation(fs_info, "space_info",
4134 (u64)sinfo, num_bytes, 1);
4009 } 4135 }
4010 4136
4011 if (block_rsv->reserved >= block_rsv->size) { 4137 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4138 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4139 sinfo->bytes_may_use -= num_bytes;
4140 trace_btrfs_space_reservation(fs_info, "space_info",
4141 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4142 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4143 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4144 block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4173
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4174static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4175{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4176 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4177 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4178 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4180 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4191 if (!trans->bytes_reserved)
4063 return; 4192 return;
4064 4193
4194 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4195 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4196 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4197 trans->bytes_reserved = 0;
4067} 4198}
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4210 * when we are truly done with the orphan item.
4080 */ 4211 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4212 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4213 trace_btrfs_space_reservation(root->fs_info, "orphan",
4214 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4215 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4216}
4084 4217
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4219{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4220 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4221 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4222 trace_btrfs_space_reservation(root->fs_info, "orphan",
4223 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4224 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4225}
4091 4226
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4348 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4349 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4350 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4351
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4352 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4353 schedule_timeout(1);
4221 4354
4355 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4356 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4357
4224 spin_lock(&BTRFS_I(inode)->lock); 4358 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4400 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4401 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4402
4269 if (to_free) 4403 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4404 btrfs_block_rsv_release(root, block_rsv, to_free);
4405 trace_btrfs_space_reservation(root->fs_info,
4406 "delalloc",
4407 btrfs_ino(inode),
4408 to_free, 0);
4409 }
4410 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4411 return ret;
4272 } 4412 }
4273 4413
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4418 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4419 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4420 spin_unlock(&BTRFS_I(inode)->lock);
4421 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4422
4423 if (to_reserve)
 4424 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4425 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4426 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4427
4284 return 0; 4428 return 0;
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4452 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4453 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4454
4455 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4456 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4457 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4458 to_free);
4313} 4459}
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4708 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4709 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4710 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4711 trace_btrfs_space_reservation(cache->fs_info,
4712 "space_info",
4713 (u64)space_info,
4714 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4715 space_info->bytes_may_use -= num_bytes;
4567 } 4716 }
4568 } 4717 }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5077 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5078
4930 delayed_refs->num_entries--; 5079 delayed_refs->num_entries--;
5080 if (waitqueue_active(&delayed_refs->seq_wait))
5081 wake_up(&delayed_refs->seq_wait);
4931 5082
4932 /* 5083 /*
4933 * we don't take a ref on the node because we're removing it from the 5084 * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5106,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5106void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5107 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5108 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5109 u64 parent, int last_ref, int for_cow)
4959{ 5110{
4960 struct btrfs_block_group_cache *cache = NULL; 5111 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5112 int ret;
4962 5113
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5114 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5115 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5116 buf->start, buf->len,
4966 btrfs_header_level(buf), 5117 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5118 btrfs_header_level(buf),
5119 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5120 BUG_ON(ret);
4969 } 5121 }
4970 5122
@@ -4999,12 +5151,12 @@ out:
4999 btrfs_put_block_group(cache); 5151 btrfs_put_block_group(cache);
5000} 5152}
5001 5153
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5154int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5155 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5156 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5157{
5007 int ret; 5158 int ret;
5159 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5160
5009 /* 5161 /*
5010 * tree log blocks never actually go into the extent allocation 5162 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5168 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5169 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5170 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5171 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5172 num_bytes,
5020 parent, root_objectid, (int)owner, 5173 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5174 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5175 BUG_ON(ret);
5023 } else { 5176 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5177 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5178 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5179 parent, root_objectid, owner,
5180 offset, BTRFS_DROP_DELAYED_REF,
5181 NULL, for_cow);
5027 BUG_ON(ret); 5182 BUG_ON(ret);
5028 } 5183 }
5029 return ret; 5184 return ret;
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5301 ins->objectid = 0;
5147 ins->offset = 0; 5302 ins->offset = 0;
5148 5303
5304 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5305
5149 space_info = __find_space_info(root->fs_info, data); 5306 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5307 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5308 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5452 if (unlikely(block_group->ro))
5296 goto loop; 5453 goto loop;
5297 5454
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5455 /*
5308 * Ok we want to try and use the cluster allocator, so 5456 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5457 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
5331 if (offset) { 5479 if (offset) {
5332 /* we have a block, we're done */ 5480 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5481 spin_unlock(&last_ptr->refill_lock);
5482 trace_btrfs_reserve_extent_cluster(root,
5483 block_group, search_start, num_bytes);
5334 goto checks; 5484 goto checks;
5335 } 5485 }
5336 5486
@@ -5349,8 +5499,15 @@ refill_cluster:
5349 * plenty of times and not have found 5499 * plenty of times and not have found
5350 * anything, so we are likely way too 5500 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5501 * fragmented for the clustering stuff to find
5352 * anything. */ 5502 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5503 *
5504 * However, if the cluster is taken from the
5505 * current block group, release the cluster
5506 * first, so that we stand a better chance of
5507 * succeeding in the unclustered
5508 * allocation. */
5509 if (loop >= LOOP_NO_EMPTY_SIZE &&
5510 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5511 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5512 goto unclustered_alloc;
5356 } 5513 }
@@ -5361,6 +5518,11 @@ refill_cluster:
5361 */ 5518 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5519 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5520
5521 if (loop >= LOOP_NO_EMPTY_SIZE) {
5522 spin_unlock(&last_ptr->refill_lock);
5523 goto unclustered_alloc;
5524 }
5525
5364 /* allocate a cluster in this block group */ 5526 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5527 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5528 block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
5377 if (offset) { 5539 if (offset) {
5378 /* we found one, proceed */ 5540 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5541 spin_unlock(&last_ptr->refill_lock);
5542 trace_btrfs_reserve_extent_cluster(root,
5543 block_group, search_start,
5544 num_bytes);
5380 goto checks; 5545 goto checks;
5381 } 5546 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5547 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
5401 } 5566 }
5402 5567
5403unclustered_alloc: 5568unclustered_alloc:
5569 spin_lock(&block_group->free_space_ctl->tree_lock);
5570 if (cached &&
5571 block_group->free_space_ctl->free_space <
5572 num_bytes + empty_cluster + empty_size) {
5573 spin_unlock(&block_group->free_space_ctl->tree_lock);
5574 goto loop;
5575 }
5576 spin_unlock(&block_group->free_space_ctl->tree_lock);
5577
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5578 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5579 num_bytes, empty_size);
5406 /* 5580 /*
@@ -5438,9 +5612,6 @@ checks:
5438 goto loop; 5612 goto loop;
5439 } 5613 }
5440 5614
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5615 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5616 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5617 search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
5457 ins->objectid = search_start; 5628 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5629 ins->offset = num_bytes;
5459 5630
5631 trace_btrfs_reserve_extent(orig_root, block_group,
5632 search_start, num_bytes);
5460 if (offset < search_start) 5633 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5634 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5635 search_start - offset);
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6015
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6016 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6017
5845 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 6018 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
5846 0, root_objectid, owner, offset, 6019 ins->offset, 0,
5847 BTRFS_ADD_DELAYED_EXTENT, NULL); 6020 root_objectid, owner, offset,
6021 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6022 return ret;
5849} 6023}
5850 6024
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6171 return ERR_PTR(-ENOSPC);
5998} 6172}
5999 6173
6000static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) 6174static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6175 struct btrfs_block_rsv *block_rsv, u32 blocksize)
6001{ 6176{
6002 block_rsv_add_bytes(block_rsv, blocksize, 0); 6177 block_rsv_add_bytes(block_rsv, blocksize, 0);
6003 block_rsv_release_bytes(block_rsv, NULL, 0); 6178 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6004} 6179}
6005 6180
6006/* 6181/*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6189 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6190 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6191 struct btrfs_disk_key *key, int level,
6017 u64 hint, u64 empty_size) 6192 u64 hint, u64 empty_size, int for_cow)
6018{ 6193{
6019 struct btrfs_key ins; 6194 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6195 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6205 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6206 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6207 if (ret) {
6033 unuse_block_rsv(block_rsv, blocksize); 6208 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6209 return ERR_PTR(ret);
6035 } 6210 }
6036 6211
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6233 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6234 extent_op->is_data = 0;
6060 6235
6061 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 6236 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6237 ins.objectid,
6062 ins.offset, parent, root_objectid, 6238 ins.offset, parent, root_objectid,
6063 level, BTRFS_ADD_DELAYED_EXTENT, 6239 level, BTRFS_ADD_DELAYED_EXTENT,
6064 extent_op); 6240 extent_op, for_cow);
6065 BUG_ON(ret); 6241 BUG_ON(ret);
6066 } 6242 }
6067 return buf; 6243 return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
6078 int keep_locks; 6254 int keep_locks;
6079 int reada_slot; 6255 int reada_slot;
6080 int reada_count; 6256 int reada_count;
6257 int for_reloc;
6081}; 6258};
6082 6259
6083#define DROP_REFERENCE 1 6260#define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6393 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6394 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6395 BUG_ON(!path->locks[level]);
6219 ret = btrfs_inc_ref(trans, root, eb, 1); 6396 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6220 BUG_ON(ret); 6397 BUG_ON(ret);
6221 ret = btrfs_dec_ref(trans, root, eb, 0); 6398 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6399 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6400 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6401 eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
6362 } 6539 }
6363 6540
6364 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6541 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6365 root->root_key.objectid, level - 1, 0); 6542 root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6543 BUG_ON(ret);
6367 } 6544 }
6368 btrfs_tree_unlock(next); 6545 btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6613 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6614 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6615 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6439 ret = btrfs_dec_ref(trans, root, eb, 1); 6616 ret = btrfs_dec_ref(trans, root, eb, 1,
6617 wc->for_reloc);
6440 else 6618 else
6441 ret = btrfs_dec_ref(trans, root, eb, 0); 6619 ret = btrfs_dec_ref(trans, root, eb, 0,
6620 wc->for_reloc);
6442 BUG_ON(ret); 6621 BUG_ON(ret);
6443 } 6622 }
6444 /* make block locked assertion in clean_tree_block happy */ 6623 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6644 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6645 }
6467 6646
6468 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6647 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6648out:
6470 wc->refs[level] = 0; 6649 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6650 wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6728 * blocks are properly updated.
6550 */ 6729 */
6551void btrfs_drop_snapshot(struct btrfs_root *root, 6730void btrfs_drop_snapshot(struct btrfs_root *root,
6552 struct btrfs_block_rsv *block_rsv, int update_ref) 6731 struct btrfs_block_rsv *block_rsv, int update_ref,
6732 int for_reloc)
6553{ 6733{
6554 struct btrfs_path *path; 6734 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6735 struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6817 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6818 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6819 wc->keep_locks = 0;
6820 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6821 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6822
6642 while (1) { 6823 while (1) {
@@ -6721,6 +6902,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6902 * drop subtree rooted at tree block 'node'.
6722 * 6903 *
6723 * NOTE: this function will unlock and release tree block 'node' 6904 * NOTE: this function will unlock and release tree block 'node'
6905 * only used by relocation code
6724 */ 6906 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6907int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6908 struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6947 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6948 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6949 wc->keep_locks = 1;
6950 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6951 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6952
6770 while (1) { 6953 while (1) {
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6975 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6976 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6977
6978 if (root->fs_info->balance_ctl) {
6979 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6980 u64 tgt = 0;
6981
6982 /* pick restriper's target profile and return */
6983 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6984 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6985 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6986 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6987 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6988 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6989 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6990 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6991 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6992 }
6993
6994 if (tgt) {
6995 /* extended -> chunk profile */
6996 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
6997 return tgt;
6998 }
6999 }
7000
6795 /* 7001 /*
6796 * we add in the count of missing devices because we want 7002 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7003 * to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7291 * space to fit our block group in.
7086 */ 7292 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7293 if (device->total_bytes > device->bytes_used + min_free) {
7088 ret = find_free_dev_extent(NULL, device, min_free, 7294 ret = find_free_dev_extent(device, min_free,
7089 &dev_offset, NULL); 7295 &dev_offset, NULL);
7090 if (!ret) 7296 if (!ret)
7091 dev_nr++; 7297 dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7653 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7654 &cache->space_info);
7449 BUG_ON(ret); 7655 BUG_ON(ret);
7656 update_global_block_rsv(root->fs_info);
7450 7657
7451 spin_lock(&cache->space_info->lock); 7658 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7659 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7673 return 0;
7467} 7674}
7468 7675
7676static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7677{
7678 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7679
7680 /* chunk -> extended profile */
7681 if (extra_flags == 0)
7682 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7683
7684 if (flags & BTRFS_BLOCK_GROUP_DATA)
7685 fs_info->avail_data_alloc_bits &= ~extra_flags;
7686 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7687 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7688 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7689 fs_info->avail_system_alloc_bits &= ~extra_flags;
7690}
7691
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7692int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7693 struct btrfs_root *root, u64 group_start)
7471{ 7694{
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7699 struct btrfs_key key;
7477 struct inode *inode; 7700 struct inode *inode;
7478 int ret; 7701 int ret;
7702 int index;
7479 int factor; 7703 int factor;
7480 7704
7481 root = root->fs_info->extent_root; 7705 root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7715 free_excluded_extents(root, block_group);
7492 7716
7493 memcpy(&key, &block_group->key, sizeof(key)); 7717 memcpy(&key, &block_group->key, sizeof(key));
7718 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7719 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7720 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7721 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7790 * are still on the list after taking the semaphore
7566 */ 7791 */
7567 list_del_init(&block_group->list); 7792 list_del_init(&block_group->list);
7793 if (list_empty(&block_group->space_info->block_groups[index]))
7794 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7795 up_write(&block_group->space_info->groups_sem);
7569 7796
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7797 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..9d09a4f81875 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio); 1899 btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
2396 submit_bio(rw, bio); 2397 btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 034d98503229..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
681 start - extent_offset); 681 start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
756 extent_offset); 756 extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
965 ino, orig_offset); 965 ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
992 ino, orig_offset); 992 ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
1009 ino, orig_offset); 1009 ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1274 dirty_pages); 1274 dirty_pages);
1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1276 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1277 btrfs_throttle(root);
1278 1277
1279 pos += copied; 1278 pos += copied;
1280 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf79538..d20ff87ca603 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]); 322 if (io_ctl->pages[i]) {
323 unlock_page(io_ctl->pages[i]); 323 ClearPageChecked(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]); 324 unlock_page(io_ctl->pages[i]);
325 page_cache_release(io_ctl->pages[i]);
326 }
325 } 327 }
326} 328}
327 329
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
638 io_ctl_init(&io_ctl, inode, root); 640 ret = io_ctl_init(&io_ctl, inode, root);
641 if (ret)
642 return ret;
643
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
841 u64 start, end, len; 846 u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
852 io_ctl_init(&io_ctl, inode, root); 857 ret = io_ctl_init(&io_ctl, inode, root);
858 if (ret)
859 return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
923 ret = find_first_extent_bit(unpin, start, &start, &end, 927 ret = find_first_extent_bit(unpin, start,
928 &extent_start, &extent_end,
924 EXTENT_DIRTY); 929 EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
931 if (start >= block_group->key.objectid + 936 if (extent_start >= block_group->key.objectid +
932 block_group->key.offset) 937 block_group->key.offset)
933 break; 938 break;
934 939
935 len = block_group->key.objectid + 940 extent_start = max(extent_start, start);
936 block_group->key.offset - start; 941 extent_end = min(block_group->key.objectid +
937 len = min(len, end + 1 - start); 942 block_group->key.offset, extent_end + 1);
943 len = extent_end - extent_start;
938 944
939 entries++; 945 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL); 946 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
941 if (ret) 947 if (ret)
942 goto out_nospc; 948 goto out_nospc;
943 949
944 start = end + 1; 950 start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
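The rewritten pinned-extent loop above no longer trusts the raw range returned by find_first_extent_bit(): the start is raised to the current search position and the (inclusive) end is capped at the block group boundary before the entry is written, and the next search resumes at the clamped end. In isolation the clamping is just this (toy values, none of the io_ctl machinery):

#include <stdint.h>
#include <stdio.h>

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
    uint64_t bg_start = 1024, bg_len = 4096;        /* toy block group */
    uint64_t start = bg_start;                      /* current search position */
    uint64_t extent_start = 512, extent_end = 2047; /* pinned range, inclusive end */
    uint64_t len;

    /* clamp the pinned extent to [start, bg_start + bg_len) */
    extent_start = max_u64(extent_start, start);
    extent_end = min_u64(bg_start + bg_len, extent_end + 1);
    len = extent_end - extent_start;

    printf("entry: start=%llu len=%llu\n",
           (unsigned long long)extent_start, (unsigned long long)len);
    printf("next search starts at %llu\n", (unsigned long long)extent_end);
    return 0;
}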
@@ -2283,23 +2289,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2289static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2290 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2291 struct btrfs_free_cluster *cluster,
2286 u64 offset, u64 bytes, u64 min_bytes) 2292 u64 offset, u64 bytes,
2293 u64 cont1_bytes, u64 min_bytes)
2287{ 2294{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2295 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2296 unsigned long next_zero;
2290 unsigned long i; 2297 unsigned long i;
2291 unsigned long search_bits; 2298 unsigned long want_bits;
2292 unsigned long total_bits; 2299 unsigned long min_bits;
2293 unsigned long found_bits; 2300 unsigned long found_bits;
2294 unsigned long start = 0; 2301 unsigned long start = 0;
2295 unsigned long total_found = 0; 2302 unsigned long total_found = 0;
2296 int ret; 2303 int ret;
2297 bool found = false;
2298 2304
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2305 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2306 max_t(u64, offset, entry->offset));
2301 search_bits = bytes_to_bits(bytes, block_group->sectorsize); 2307 want_bits = bytes_to_bits(bytes, block_group->sectorsize);
2302 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2308 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2309
2304again: 2310again:
2305 found_bits = 0; 2311 found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2314 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2315 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2316 BITS_PER_BITMAP, i);
2311 if (next_zero - i >= search_bits) { 2317 if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2318 found_bits = next_zero - i;
2313 break; 2319 break;
2314 } 2320 }
@@ -2318,10 +2324,9 @@ again:
2318 if (!found_bits) 2324 if (!found_bits)
2319 return -ENOSPC; 2325 return -ENOSPC;
2320 2326
2321 if (!found) { 2327 if (!total_found) {
2322 start = i; 2328 start = i;
2323 cluster->max_size = 0; 2329 cluster->max_size = 0;
2324 found = true;
2325 } 2330 }
2326 2331
2327 total_found += found_bits; 2332 total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2334 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2335 cluster->max_size = found_bits * block_group->sectorsize;
2331 2336
2332 if (total_found < total_bits) { 2337 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2333 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); 2338 i = next_zero + 1;
2334 if (i - start > total_bits * 2) {
2335 total_found = 0;
2336 cluster->max_size = 0;
2337 found = false;
2338 }
2339 goto again; 2339 goto again;
2340 } 2340 }
2341 2341
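btrfs_bitmap_cluster() now tracks two targets instead of one: runs of at least min_bits set bits are added to the cluster, but the search only stops once want_bits worth of space has been gathered and the largest single run covers cont1_bytes; otherwise it resumes right after the zero that ended the last run. A small stand-alone model of that scan over a toy bitmap, using plain chars instead of the kernel bitmap helpers.

#include <stdio.h>

#define NBITS 32

/* toy bitmap: 1 = free sector */
static const char bmp[NBITS] = {
    0,1,1,0, 1,1,1,1, 1,0,0,1, 1,1,1,1,
    1,1,1,0, 0,0,1,1, 0,0,0,0, 1,1,1,1,
};

static int next_set(int i)  { while (i < NBITS && !bmp[i]) i++; return i; }
static int next_zero(int i) { while (i < NBITS &&  bmp[i]) i++; return i; }

int main(void)
{
    int want_bits = 12, cont1_bits = 5, min_bits = 3;
    int total_found = 0, max_run = 0, start = 0;

    for (int i = next_set(0); i < NBITS; i = next_set(i)) {
        int zero = next_zero(i);
        int run = zero - i;

        if (run >= min_bits) {          /* usable run for the cluster */
            if (!total_found)
                start = i;
            total_found += run;
            if (run > max_run)
                max_run = run;
            if (total_found >= want_bits && max_run >= cont1_bits)
                break;                  /* enough space and one big enough run */
        }
        i = zero + 1;                   /* resume after the zero that ended the run */
    }
    printf("start=%d total=%d max_run=%d\n", start, total_found, max_run);
    return 0;
}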
@@ -2346,28 +2346,31 @@ again:
2346 &entry->offset_index, 1); 2346 &entry->offset_index, 1);
2347 BUG_ON(ret); 2347 BUG_ON(ret);
2348 2348
2349 trace_btrfs_setup_cluster(block_group, cluster,
2350 total_found * block_group->sectorsize, 1);
2349 return 0; 2351 return 0;
2350} 2352}
2351 2353
2352/* 2354/*
2353 * This searches the block group for just extents to fill the cluster with. 2355 * This searches the block group for just extents to fill the cluster with.
2356 * Try to find a cluster with at least bytes total bytes, at least one
2357 * extent of cont1_bytes, and other extents of at least min_bytes.
2354 */ 2358 */
2355static noinline int 2359static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2360setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2361 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2362 struct list_head *bitmaps, u64 offset, u64 bytes,
2359 u64 min_bytes) 2363 u64 cont1_bytes, u64 min_bytes)
2360{ 2364{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2365 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2366 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2367 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2368 struct btrfs_free_space *last;
2366 struct rb_node *node; 2369 struct rb_node *node;
2367 u64 window_start; 2370 u64 window_start;
2368 u64 window_free; 2371 u64 window_free;
2369 u64 max_extent; 2372 u64 max_extent;
2370 u64 max_gap = 128 * 1024; 2373 u64 total_size = 0;
2371 2374
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2375 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2376 if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2380 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2381 * extent entry.
2379 */ 2382 */
2380 while (entry->bitmap) { 2383 while (entry->bitmap || entry->bytes < min_bytes) {
2381 if (list_empty(&entry->list)) 2384 if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2385 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2386 node = rb_next(&entry->offset_index);
2384 if (!node) 2387 if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2394 max_extent = entry->bytes;
2392 first = entry; 2395 first = entry;
2393 last = entry; 2396 last = entry;
2394 prev = entry;
2395 2397
2396 while (window_free <= min_bytes) { 2398 for (node = rb_next(&entry->offset_index); node;
2397 node = rb_next(&entry->offset_index); 2399 node = rb_next(&entry->offset_index)) {
2398 if (!node)
2399 return -ENOSPC;
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2400 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2401
2402 if (entry->bitmap) { 2402 if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2405 continue;
2406 } 2406 }
2407 2407
2408 /* 2408 if (entry->bytes < min_bytes)
2409 * we haven't filled the empty size and the window is 2409 continue;
2410 * very large. reset and try again 2410
2411 */ 2411 last = entry;
2412 if (entry->offset - (prev->offset + prev->bytes) > max_gap || 2412 window_free += entry->bytes;
2413 entry->offset - window_start > (min_bytes * 2)) { 2413 if (entry->bytes > max_extent)
2414 first = entry;
2415 window_start = entry->offset;
2416 window_free = entry->bytes;
2417 last = entry;
2418 max_extent = entry->bytes; 2414 max_extent = entry->bytes;
2419 } else {
2420 last = entry;
2421 window_free += entry->bytes;
2422 if (entry->bytes > max_extent)
2423 max_extent = entry->bytes;
2424 }
2425 prev = entry;
2426 } 2415 }
2427 2416
2417 if (window_free < bytes || max_extent < cont1_bytes)
2418 return -ENOSPC;
2419
2428 cluster->window_start = first->offset; 2420 cluster->window_start = first->offset;
2429 2421
2430 node = &first->offset_index; 2422 node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2430
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2431 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2432 node = rb_next(&entry->offset_index);
2441 if (entry->bitmap) 2433 if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2434 continue;
2443 2435
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2436 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2437 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2438 &entry->offset_index, 0);
2439 total_size += entry->bytes;
2447 BUG_ON(ret); 2440 BUG_ON(ret);
2448 } while (node && entry != last); 2441 } while (node && entry != last);
2449 2442
2450 cluster->max_size = max_extent; 2443 cluster->max_size = max_extent;
2451 2444 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2445 return 0;
2453} 2446}
2454 2447
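With the window-reset heuristic gone, setup_cluster_no_bitmap() makes a single pass over the extent entries: everything at least min_bytes long is accumulated, and the result is only accepted if the window holds the requested bytes in total and contains one extent of at least cont1_bytes. A self-contained sketch of that accumulation over a plain array, with hypothetical names (free_extent, pick_cluster_window):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct free_extent {
	uint64_t offset;
	uint64_t bytes;
};

static int pick_cluster_window(const struct free_extent *ext, size_t n,
			       uint64_t bytes, uint64_t cont1_bytes,
			       uint64_t min_bytes)
{
	uint64_t window_free = 0, max_extent = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (ext[i].bytes < min_bytes)
			continue;		/* too small to track */
		window_free += ext[i].bytes;
		if (ext[i].bytes > max_extent)
			max_extent = ext[i].bytes;
	}
	/* enough total space, plus one large enough contiguous extent */
	if (window_free < bytes || max_extent < cont1_bytes)
		return -1;			/* -ENOSPC in the kernel */
	return 0;
}

int main(void)
{
	struct free_extent ext[] = {
		{ 0, 64 * 1024 },
		{ 256 * 1024, 512 * 1024 },
		{ 1024 * 1024, 8 * 1024 },
	};

	printf("cluster %s\n",
	       pick_cluster_window(ext, 3, 512 * 1024, 256 * 1024,
				   16 * 1024) ? "not found" : "found");
	return 0;
}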
@@ -2460,7 +2453,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2453setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2454 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2455 struct list_head *bitmaps, u64 offset, u64 bytes,
2463 u64 min_bytes) 2456 u64 cont1_bytes, u64 min_bytes)
2464{ 2457{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2458 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2459 struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2485 if (entry->bytes < min_bytes) 2478 if (entry->bytes < min_bytes)
2486 continue; 2479 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2480 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2488 bytes, min_bytes); 2481 bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2482 if (!ret)
2490 return 0; 2483 return 0;
2491 } 2484 }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2492
2500/* 2493/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2494 * here we try to find a cluster of blocks in a block group. The goal
2502 * is to find at least bytes free and up to empty_size + bytes free. 2495 * is to find at least bytes+empty_size.
2503 * We might not find them all in one contiguous area. 2496 * We might not find them all in one contiguous area.
2504 * 2497 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2498 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2508 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2509 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2510 u64 min_bytes;
2511 u64 cont1_bytes;
2518 int ret; 2512 int ret;
2519 2513
2520 /* for metadata, allow allocates with more holes */ 2514 /*
2515 * Choose the minimum extent size we'll require for this
2516 * cluster. For SSD_SPREAD, don't allow any fragmentation.
2517 * For metadata, allow allocations with smaller extents. For
2518 * data, keep it dense.
2519 */
2521 if (btrfs_test_opt(root, SSD_SPREAD)) { 2520 if (btrfs_test_opt(root, SSD_SPREAD)) {
2522 min_bytes = bytes + empty_size; 2521 cont1_bytes = min_bytes = bytes + empty_size;
2523 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2522 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2524 /* 2523 cont1_bytes = bytes;
2525 * we want to do larger allocations when we are 2524 min_bytes = block_group->sectorsize;
2526 * flushing out the delayed refs, it helps prevent 2525 } else {
2527 * making more work as we go along. 2526 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2528 */ 2527 min_bytes = block_group->sectorsize;
2529 if (trans->transaction->delayed_refs.flushing) 2528 }
2530 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2531 else
2532 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2533 } else
2534 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2535 2529
2536 spin_lock(&ctl->tree_lock); 2530 spin_lock(&ctl->tree_lock);
2537 2531
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2533 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2534 * bother doing all the work to try and find one.
2541 */ 2535 */
2542 if (ctl->free_space < min_bytes) { 2536 if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2537 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2538 return -ENOSPC;
2545 } 2539 }
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2546 goto out;
2553 } 2547 }
2554 2548
2549 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2550 min_bytes);
2551
2552 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2553 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2556 bytes, min_bytes); 2554 bytes + empty_size,
2555 cont1_bytes, min_bytes);
2557 if (ret) 2556 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2557 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2559 offset, bytes, min_bytes); 2558 offset, bytes + empty_size,
2559 cont1_bytes, min_bytes);
2560 2560
2561 /* Clear our temporary list */ 2561 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2567 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2568 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2569 cluster->block_group = block_group;
2570 } else {
2571 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2572 }
2571out: 2573out:
2572 spin_unlock(&cluster->lock); 2574 spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2590 cluster->block_group = NULL;
2589} 2591}
2590 2592
2591int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 2593static int do_trimming(struct btrfs_block_group_cache *block_group,
2592 u64 *trimmed, u64 start, u64 end, u64 minlen) 2594 u64 *total_trimmed, u64 start, u64 bytes,
2595 u64 reserved_start, u64 reserved_bytes)
2593{ 2596{
2594 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2597 struct btrfs_space_info *space_info = block_group->space_info;
2595 struct btrfs_free_space *entry = NULL;
2596 struct btrfs_fs_info *fs_info = block_group->fs_info; 2598 struct btrfs_fs_info *fs_info = block_group->fs_info;
2597 u64 bytes = 0; 2599 int ret;
2598 u64 actually_trimmed; 2600 int update = 0;
2599 int ret = 0; 2601 u64 trimmed = 0;
2600 2602
2601 *trimmed = 0; 2603 spin_lock(&space_info->lock);
2604 spin_lock(&block_group->lock);
2605 if (!block_group->ro) {
2606 block_group->reserved += reserved_bytes;
2607 space_info->bytes_reserved += reserved_bytes;
2608 update = 1;
2609 }
2610 spin_unlock(&block_group->lock);
2611 spin_unlock(&space_info->lock);
2612
2613 ret = btrfs_error_discard_extent(fs_info->extent_root,
2614 start, bytes, &trimmed);
2615 if (!ret)
2616 *total_trimmed += trimmed;
2617
2618 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2619
2620 if (update) {
2621 spin_lock(&space_info->lock);
2622 spin_lock(&block_group->lock);
2623 if (block_group->ro)
2624 space_info->bytes_readonly += reserved_bytes;
2625 block_group->reserved -= reserved_bytes;
2626 space_info->bytes_reserved -= reserved_bytes;
2627 spin_unlock(&space_info->lock);
2628 spin_unlock(&block_group->lock);
2629 }
2630
2631 return ret;
2632}
2633
2634static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2635 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2636{
2637 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2638 struct btrfs_free_space *entry;
2639 struct rb_node *node;
2640 int ret = 0;
2641 u64 extent_start;
2642 u64 extent_bytes;
2643 u64 bytes;
2602 2644
2603 while (start < end) { 2645 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2646 spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2609 } 2651 }
2610 2652
2611 entry = tree_search_offset(ctl, start, 0, 1); 2653 entry = tree_search_offset(ctl, start, 0, 1);
2612 if (!entry) 2654 if (!entry) {
2613 entry = tree_search_offset(ctl,
2614 offset_to_bitmap(ctl, start),
2615 1, 1);
2616
2617 if (!entry || entry->offset >= end) {
2618 spin_unlock(&ctl->tree_lock); 2655 spin_unlock(&ctl->tree_lock);
2619 break; 2656 break;
2620 } 2657 }
2621 2658
2622 if (entry->bitmap) { 2659 /* skip bitmaps */
2623 ret = search_bitmap(ctl, entry, &start, &bytes); 2660 while (entry->bitmap) {
2624 if (!ret) { 2661 node = rb_next(&entry->offset_index);
2625 if (start >= end) { 2662 if (!node) {
2626 spin_unlock(&ctl->tree_lock);
2627 break;
2628 }
2629 bytes = min(bytes, end - start);
2630 bitmap_clear_bits(ctl, entry, start, bytes);
2631 if (entry->bytes == 0)
2632 free_bitmap(ctl, entry);
2633 } else {
2634 start = entry->offset + BITS_PER_BITMAP *
2635 block_group->sectorsize;
2636 spin_unlock(&ctl->tree_lock); 2663 spin_unlock(&ctl->tree_lock);
2637 ret = 0; 2664 goto out;
2638 continue;
2639 } 2665 }
2640 } else { 2666 entry = rb_entry(node, struct btrfs_free_space,
2641 start = entry->offset; 2667 offset_index);
2642 bytes = min(entry->bytes, end - start);
2643 unlink_free_space(ctl, entry);
2644 kmem_cache_free(btrfs_free_space_cachep, entry);
2645 } 2668 }
2646 2669
2670 if (entry->offset >= end) {
2671 spin_unlock(&ctl->tree_lock);
2672 break;
2673 }
2674
2675 extent_start = entry->offset;
2676 extent_bytes = entry->bytes;
2677 start = max(start, extent_start);
2678 bytes = min(extent_start + extent_bytes, end) - start;
2679 if (bytes < minlen) {
2680 spin_unlock(&ctl->tree_lock);
2681 goto next;
2682 }
2683
2684 unlink_free_space(ctl, entry);
2685 kmem_cache_free(btrfs_free_space_cachep, entry);
2686
2647 spin_unlock(&ctl->tree_lock); 2687 spin_unlock(&ctl->tree_lock);
2648 2688
2649 if (bytes >= minlen) { 2689 ret = do_trimming(block_group, total_trimmed, start, bytes,
2650 struct btrfs_space_info *space_info; 2690 extent_start, extent_bytes);
2651 int update = 0; 2691 if (ret)
2652 2692 break;
2653 space_info = block_group->space_info; 2693next:
2654 spin_lock(&space_info->lock); 2694 start += bytes;
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2663
2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2665 start,
2666 bytes,
2667 &actually_trimmed);
2668
2669 btrfs_add_free_space(block_group, start, bytes);
2670 if (update) {
2671 spin_lock(&space_info->lock);
2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2680 2695
2681 if (ret) 2696 if (fatal_signal_pending(current)) {
2682 break; 2697 ret = -ERESTARTSYS;
2683 *trimmed += actually_trimmed; 2698 break;
2699 }
2700
2701 cond_resched();
2702 }
2703out:
2704 return ret;
2705}
2706
2707static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2708 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2709{
2710 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2711 struct btrfs_free_space *entry;
2712 int ret = 0;
2713 int ret2;
2714 u64 bytes;
2715 u64 offset = offset_to_bitmap(ctl, start);
2716
2717 while (offset < end) {
2718 bool next_bitmap = false;
2719
2720 spin_lock(&ctl->tree_lock);
2721
2722 if (ctl->free_space < minlen) {
2723 spin_unlock(&ctl->tree_lock);
2724 break;
2725 }
2726
2727 entry = tree_search_offset(ctl, offset, 1, 0);
2728 if (!entry) {
2729 spin_unlock(&ctl->tree_lock);
2730 next_bitmap = true;
2731 goto next;
2732 }
2733
2734 bytes = minlen;
2735 ret2 = search_bitmap(ctl, entry, &start, &bytes);
2736 if (ret2 || start >= end) {
2737 spin_unlock(&ctl->tree_lock);
2738 next_bitmap = true;
2739 goto next;
2740 }
2741
2742 bytes = min(bytes, end - start);
2743 if (bytes < minlen) {
2744 spin_unlock(&ctl->tree_lock);
2745 goto next;
2746 }
2747
2748 bitmap_clear_bits(ctl, entry, start, bytes);
2749 if (entry->bytes == 0)
2750 free_bitmap(ctl, entry);
2751
2752 spin_unlock(&ctl->tree_lock);
2753
2754 ret = do_trimming(block_group, total_trimmed, start, bytes,
2755 start, bytes);
2756 if (ret)
2757 break;
2758next:
2759 if (next_bitmap) {
2760 offset += BITS_PER_BITMAP * ctl->unit;
2761 } else {
2762 start += bytes;
2763 if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2764 offset += BITS_PER_BITMAP * ctl->unit;
2684 } 2765 }
2685 start += bytes;
2686 bytes = 0;
2687 2766
2688 if (fatal_signal_pending(current)) { 2767 if (fatal_signal_pending(current)) {
2689 ret = -ERESTARTSYS; 2768 ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2775 return ret;
2697} 2776}
2698 2777
2778int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2779 u64 *trimmed, u64 start, u64 end, u64 minlen)
2780{
2781 int ret;
2782
2783 *trimmed = 0;
2784
2785 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2786 if (ret)
2787 return ret;
2788
2789 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2790
2791 return ret;
2792}
2793
2699/* 2794/*
2700 * Find the left-most item in the cache tree, and then return the 2795 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2796 * smallest inode number in the item.
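The trim rework above splits btrfs_trim_block_group() into an extent pass and a bitmap pass, and both hand each candidate range to do_trimming(), which reserves the range, issues the discard, and only then returns the space to the free-space cache. A reduced user-space model of that ordering, with the locking and the read-only block-group case left out and all names invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct group {
	uint64_t reserved;	/* space allocators must stay away from */
	uint64_t free;		/* space the free-space cache hands out */
};

/* stand-in for the real discard path */
static int discard_range(uint64_t start, uint64_t bytes, uint64_t *trimmed)
{
	(void)start;
	*trimmed = bytes;
	return 0;
}

static int do_trimming(struct group *g, uint64_t *total_trimmed,
		       uint64_t start, uint64_t bytes)
{
	uint64_t trimmed = 0;
	int ret;

	g->reserved += bytes;				/* 1. keep allocators away */
	ret = discard_range(start, bytes, &trimmed);	/* 2. issue the discard */
	if (!ret)
		*total_trimmed += trimmed;
	g->free += bytes;				/* 3. hand the space back */
	g->reserved -= bytes;				/* 4. drop the reservation */
	return ret;
}

int main(void)
{
	struct group g = { 0, 0 };
	uint64_t total = 0;

	do_trimming(&g, &total, 4096, 65536);
	printf("trimmed %llu bytes\n", (unsigned long long)total);
	return 0;
}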
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8c..0da19a0ea00d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
1968 if (root->orphan_block_rsv) { 1984 if (block_rsv) {
1969 WARN_ON(root->orphan_block_rsv->size > 0); 1985 WARN_ON(block_rsv->size > 0);
1970 btrfs_free_block_rsv(root, root->orphan_block_rsv); 1986 btrfs_free_block_rsv(root, block_rsv);
1971 root->orphan_block_rsv = NULL;
1972 } 1987 }
1973} 1988}
1974 1989
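The orphan commit change above re-checks the orphan state under orphan_lock, detaches root->orphan_block_rsv while still holding the lock, and frees it only after the lock is dropped, so no other task can observe a half-freed reservation. A small pthreads sketch of that detach-then-free pattern (compile with -pthread; types and names are placeholders):

#include <pthread.h>
#include <stdlib.h>

struct rsv { long size; };

struct root {
	pthread_mutex_t lock;	/* stands in for orphan_lock */
	int cleanup_done;
	struct rsv *orphan_rsv;
};

static void commit_orphan_rsv(struct root *r)
{
	struct rsv *rsv;

	pthread_mutex_lock(&r->lock);
	if (!r->cleanup_done) {			/* nothing to release yet */
		pthread_mutex_unlock(&r->lock);
		return;
	}
	rsv = r->orphan_rsv;			/* detach while still locked */
	r->orphan_rsv = NULL;
	pthread_mutex_unlock(&r->lock);

	free(rsv);				/* no lock held across the free */
}

int main(void)
{
	struct root r = { PTHREAD_MUTEX_INITIALIZER, 1, malloc(sizeof(struct rsv)) };

	commit_orphan_rsv(&r);
	return 0;
}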
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
2848 btrfs_end_transaction_throttle(trans, root); 2856 btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6402 u64 page_start; 6404 u64 page_start;
6403 u64 page_end; 6405 u64 page_end;
6404 6406
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret) 6408 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6409 ret = btrfs_update_time(vma->vm_file);
6411 if (ret) { 6410 if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
6494 if (!ret) 6493 if (!ret)
6495 return VM_FAULT_LOCKED; 6494 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6495 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6496out:
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6499 return ret; 6498 return ret;
6500} 6499}
6501 6500
@@ -6668,7 +6667,7 @@ end_trans:
6668 err = ret; 6667 err = ret;
6669 6668
6670 nr = trans->blocks_used; 6669 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6670 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6671 btrfs_btree_balance_dirty(root, nr);
6673 } 6672 }
6674 6673
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6748 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6749 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6750 mutex_init(&ei->log_mutex);
6751 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6753 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6754 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7074 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
7075 } 7075 }
7076out_fail: 7076out_fail:
7077 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction(trans, root);
7078out_notrans: 7078out_notrans:
7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7246,7 @@ out_unlock:
7246 if (!err) 7246 if (!err)
7247 d_instantiate(dentry, inode); 7247 d_instantiate(dentry, inode);
7248 nr = trans->blocks_used; 7248 nr = trans->blocks_used;
7249 btrfs_end_transaction_throttle(trans, root); 7249 btrfs_end_transaction(trans, root);
7250 if (drop_inode) { 7250 if (drop_inode) {
7251 inode_dec_link_count(inode); 7251 inode_dec_link_count(inode);
7252 iput(inode); 7252 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..ab620014bcc3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write_file(file); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
276 286
277static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 287static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278{ 288{
279 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; 289 struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
280 struct btrfs_fs_info *fs_info = root->fs_info;
281 struct btrfs_device *device; 290 struct btrfs_device *device;
282 struct request_queue *q; 291 struct request_queue *q;
283 struct fstrim_range range; 292 struct fstrim_range range;
284 u64 minlen = ULLONG_MAX; 293 u64 minlen = ULLONG_MAX;
285 u64 num_devices = 0; 294 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 295 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
287 int ret; 296 int ret;
288 297
289 if (!capable(CAP_SYS_ADMIN)) 298 if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
312 321
313 range.len = min(range.len, total_bytes - range.start); 322 range.len = min(range.len, total_bytes - range.start);
314 range.minlen = max(range.minlen, minlen); 323 range.minlen = max(range.minlen, minlen);
315 ret = btrfs_trim_fs(root, &range); 324 ret = btrfs_trim_fs(fs_info->tree_root, &range);
316 if (ret < 0) 325 if (ret < 0)
317 return ret; 326 return ret;
318 327
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 367 return PTR_ERR(trans);
359 368
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 370 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 371 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 372 ret = PTR_ERR(leaf);
364 goto fail; 373 goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 867 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 868 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 869
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 870 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 871 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 872 if (ret)
866 return ret; 873 return ret;
867again: 874again:
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1210 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1211 return -EPERM;
1205 1212
1213 mutex_lock(&root->fs_info->volume_mutex);
1214 if (root->fs_info->balance_ctl) {
1215 printk(KERN_INFO "btrfs: balance in progress\n");
1216 ret = -EINVAL;
1217 goto out;
1218 }
1219
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1220 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1221 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1222 ret = PTR_ERR(vol_args);
1223 goto out;
1224 }
1209 1225
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1226 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1227
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1228 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1229 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1230 if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1241 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1242 (unsigned long long)devid);
1228 ret = -EINVAL; 1243 ret = -EINVAL;
1229 goto out_unlock; 1244 goto out_free;
1230 } 1245 }
1231 if (!strcmp(sizestr, "max")) 1246 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1247 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1256 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1257 if (new_size == 0) {
1243 ret = -EINVAL; 1258 ret = -EINVAL;
1244 goto out_unlock; 1259 goto out_free;
1245 } 1260 }
1246 } 1261 }
1247 1262
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1265 if (mod < 0) {
1251 if (new_size > old_size) { 1266 if (new_size > old_size) {
1252 ret = -EINVAL; 1267 ret = -EINVAL;
1253 goto out_unlock; 1268 goto out_free;
1254 } 1269 }
1255 new_size = old_size - new_size; 1270 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1271 } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1274
1260 if (new_size < 256 * 1024 * 1024) { 1275 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1276 ret = -EINVAL;
1262 goto out_unlock; 1277 goto out_free;
1263 } 1278 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1279 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1280 ret = -EFBIG;
1266 goto out_unlock; 1281 goto out_free;
1267 } 1282 }
1268 1283
1269 do_div(new_size, root->sectorsize); 1284 do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1291 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1292 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1293 ret = PTR_ERR(trans);
1279 goto out_unlock; 1294 goto out_free;
1280 } 1295 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1296 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1297 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1299 ret = btrfs_shrink_device(device, new_size);
1285 } 1300 }
1286 1301
1287out_unlock: 1302out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1303 kfree(vol_args);
1304out:
1305 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1306 return ret;
1291} 1307}
1292 1308
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2068 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2069 return -EPERM;
2054 2070
2071 mutex_lock(&root->fs_info->volume_mutex);
2072 if (root->fs_info->balance_ctl) {
2073 printk(KERN_INFO "btrfs: balance in progress\n");
2074 ret = -EINVAL;
2075 goto out;
2076 }
2077
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2078 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2079 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2080 ret = PTR_ERR(vol_args);
2081 goto out;
2082 }
2058 2083
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2084 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2085 ret = btrfs_init_new_device(root, vol_args->name);
2061 2086
2062 kfree(vol_args); 2087 kfree(vol_args);
2088out:
2089 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2090 return ret;
2064} 2091}
2065 2092
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2101 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2102 return -EROFS;
2076 2103
2104 mutex_lock(&root->fs_info->volume_mutex);
2105 if (root->fs_info->balance_ctl) {
2106 printk(KERN_INFO "btrfs: balance in progress\n");
2107 ret = -EINVAL;
2108 goto out;
2109 }
2110
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2111 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2112 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2113 ret = PTR_ERR(vol_args);
2114 goto out;
2115 }
2080 2116
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2117 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2118 ret = btrfs_rm_device(root, vol_args->name);
2083 2119
2084 kfree(vol_args); 2120 kfree(vol_args);
2121out:
2122 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2123 return ret;
2086} 2124}
2087 2125
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2465 disko, diskl, 0,
2428 root->root_key.objectid, 2466 root->root_key.objectid,
2429 btrfs_ino(inode), 2467 btrfs_ino(inode),
2430 new_key.offset - datao); 2468 new_key.offset - datao,
2469 0);
2431 BUG_ON(ret); 2470 BUG_ON(ret);
2432 } 2471 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2472 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3016{
2978 int ret = 0; 3017 int ret = 0;
2979 int size; 3018 int size;
2980 u64 extent_offset; 3019 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3020 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3021 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3022 struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3047 }
3009 3048
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3049 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3050 btrfs_release_path(path);
3011 3051
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3052 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3053 ret = -ENOENT;
3014 if (ret < 0) 3054 if (ret < 0)
3015 goto out; 3055 goto out;
3016 3056
3017 extent_offset = loi->logical - key.objectid; 3057 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3058 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3059 extent_item_pos, build_ino_list,
3060 inodes);
3020 3061
3021 if (ret < 0) 3062 if (ret < 0)
3022 goto out; 3063 goto out;
@@ -3034,6 +3075,163 @@ out:
3034 return ret; 3075 return ret;
3035} 3076}
3036 3077
3078void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3079 struct btrfs_ioctl_balance_args *bargs)
3080{
3081 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3082
3083 bargs->flags = bctl->flags;
3084
3085 if (atomic_read(&fs_info->balance_running))
3086 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3087 if (atomic_read(&fs_info->balance_pause_req))
3088 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3089 if (atomic_read(&fs_info->balance_cancel_req))
3090 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3091
3092 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3093 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3094 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3095
3096 if (lock) {
3097 spin_lock(&fs_info->balance_lock);
3098 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3099 spin_unlock(&fs_info->balance_lock);
3100 } else {
3101 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3102 }
3103}
3104
3105static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3106{
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct btrfs_ioctl_balance_args *bargs;
3109 struct btrfs_balance_control *bctl;
3110 int ret;
3111
3112 if (!capable(CAP_SYS_ADMIN))
3113 return -EPERM;
3114
3115 if (fs_info->sb->s_flags & MS_RDONLY)
3116 return -EROFS;
3117
3118 mutex_lock(&fs_info->volume_mutex);
3119 mutex_lock(&fs_info->balance_mutex);
3120
3121 if (arg) {
3122 bargs = memdup_user(arg, sizeof(*bargs));
3123 if (IS_ERR(bargs)) {
3124 ret = PTR_ERR(bargs);
3125 goto out;
3126 }
3127
3128 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3129 if (!fs_info->balance_ctl) {
3130 ret = -ENOTCONN;
3131 goto out_bargs;
3132 }
3133
3134 bctl = fs_info->balance_ctl;
3135 spin_lock(&fs_info->balance_lock);
3136 bctl->flags |= BTRFS_BALANCE_RESUME;
3137 spin_unlock(&fs_info->balance_lock);
3138
3139 goto do_balance;
3140 }
3141 } else {
3142 bargs = NULL;
3143 }
3144
3145 if (fs_info->balance_ctl) {
3146 ret = -EINPROGRESS;
3147 goto out_bargs;
3148 }
3149
3150 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3151 if (!bctl) {
3152 ret = -ENOMEM;
3153 goto out_bargs;
3154 }
3155
3156 bctl->fs_info = fs_info;
3157 if (arg) {
3158 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3159 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3160 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3161
3162 bctl->flags = bargs->flags;
3163 } else {
3164 /* balance everything - no filters */
3165 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3166 }
3167
3168do_balance:
3169 ret = btrfs_balance(bctl, bargs);
3170 /*
3171 * bctl is freed in __cancel_balance or in free_fs_info if
3172 * restriper was paused all the way until unmount
3173 */
3174 if (arg) {
3175 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3176 ret = -EFAULT;
3177 }
3178
3179out_bargs:
3180 kfree(bargs);
3181out:
3182 mutex_unlock(&fs_info->balance_mutex);
3183 mutex_unlock(&fs_info->volume_mutex);
3184 return ret;
3185}
3186
3187static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3188{
3189 if (!capable(CAP_SYS_ADMIN))
3190 return -EPERM;
3191
3192 switch (cmd) {
3193 case BTRFS_BALANCE_CTL_PAUSE:
3194 return btrfs_pause_balance(root->fs_info);
3195 case BTRFS_BALANCE_CTL_CANCEL:
3196 return btrfs_cancel_balance(root->fs_info);
3197 }
3198
3199 return -EINVAL;
3200}
3201
3202static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3203 void __user *arg)
3204{
3205 struct btrfs_fs_info *fs_info = root->fs_info;
3206 struct btrfs_ioctl_balance_args *bargs;
3207 int ret = 0;
3208
3209 if (!capable(CAP_SYS_ADMIN))
3210 return -EPERM;
3211
3212 mutex_lock(&fs_info->balance_mutex);
3213 if (!fs_info->balance_ctl) {
3214 ret = -ENOTCONN;
3215 goto out;
3216 }
3217
3218 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3219 if (!bargs) {
3220 ret = -ENOMEM;
3221 goto out;
3222 }
3223
3224 update_ioctl_balance_args(fs_info, 1, bargs);
3225
3226 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3227 ret = -EFAULT;
3228
3229 kfree(bargs);
3230out:
3231 mutex_unlock(&fs_info->balance_mutex);
3232 return ret;
3233}
3234
3037long btrfs_ioctl(struct file *file, unsigned int 3235long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3236 cmd, unsigned long arg)
3039{ 3237{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3276 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3277 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3278 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3279 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3280 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3281 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3282 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3308 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3309 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3310 return btrfs_ioctl_scrub_progress(root, argp);
3311 case BTRFS_IOC_BALANCE_V2:
3312 return btrfs_ioctl_balance(root, argp);
3313 case BTRFS_IOC_BALANCE_CTL:
3314 return btrfs_ioctl_balance_ctl(root, arg);
3315 case BTRFS_IOC_BALANCE_PROGRESS:
3316 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3317 }
3114 3318
3115 return -ENOTTY; 3319 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
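The new ioctls can be driven from user space with nothing more than the structures declared above. A minimal sketch that queries balance progress and then requests a pause, assuming the updated ioctl.h is on the include path and /mnt/btrfs is the mounted filesystem; error handling is kept to the bare minimum:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* the btrfs ioctl definitions above */

int main(void)
{
	struct btrfs_ioctl_balance_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &args) == 0)
		printf("considered %llu, completed %llu of ~%llu chunks\n",
		       (unsigned long long)args.stat.considered,
		       (unsigned long long)args.stat.completed,
		       (unsigned long long)args.stat.expected);

	/* ask a running balance to pause at the next opportunity */
	ioctl(fd, BTRFS_IOC_BALANCE_CTL, BTRFS_BALANCE_CTL_PAUSE);

	close(fd);
	return 0;
}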
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
70 if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
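The locking changes above let the thread that already owns a tree block's write lock take and drop an extra read lock without blocking: the lock_owner pid identifies the writer, and lock_nested records the recursion so the paired unlock becomes a no-op. A rough pthreads model of that idea (it glosses over the blocking-lock states and memory ordering the kernel version has to deal with; names are illustrative):

#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>

struct eb_lock {
	pthread_rwlock_t lock;
	pid_t owner;		/* tid of the current write-lock holder */
	int nested;		/* write-lock owner also holds a read lock */
};

static pid_t tid(void)
{
	return (pid_t)syscall(SYS_gettid);
}

static void eb_read_lock(struct eb_lock *l)
{
	if (l->owner == tid()) {	/* we already hold the write lock */
		l->nested = 1;
		return;
	}
	pthread_rwlock_rdlock(&l->lock);
}

static void eb_read_unlock(struct eb_lock *l)
{
	if (l->nested && l->owner == tid()) {
		l->nested = 0;		/* unwinds the recursive acquire */
		return;
	}
	pthread_rwlock_unlock(&l->lock);
}

int main(void)
{
	struct eb_lock l = { PTHREAD_RWLOCK_INITIALIZER, 0, 0 };

	l.owner = tid();	/* pretend this thread took the write lock */
	eb_read_lock(&l);	/* recursion: only sets nested, no deadlock */
	eb_read_unlock(&l);
	return 0;
}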
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..3ce97b217cbe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
147 147
148static void btrfs_put_super(struct super_block *sb) 148static void btrfs_put_super(struct super_block *sb)
149{ 149{
150 struct btrfs_root *root = btrfs_sb(sb); 150 (void)close_ctree(btrfs_sb(sb)->tree_root);
151 int ret; 151 /* FIXME: need to fix VFS to return error? */
152 152 /* AV: return it _where_? ->put_super() can be triggered by any number
153 ret = close_ctree(root); 153 * of async events, up to and including delivery of SIGKILL to the
154 sb->s_fs_info = NULL; 154 * last process that kept it busy. Or segfault in the aforementioned
155 155 * process... Whom would you report that to?
156 (void)ret; /* FIXME: need to fix VFS to return error? */ 156 */
157} 157}
158 158
159enum { 159enum {
@@ -163,8 +163,11 @@ enum {
163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
168}; 171};
169 172
170static match_table_t tokens = { 173static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
199 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
202 {Opt_err, NULL}, 209 {Opt_err, NULL},
203}; 210};
204 211
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
397 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
400 case Opt_err: 441 case Opt_err:
401 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
402 "'%s'\n", p); 443 "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
500static struct dentry *get_default_root(struct super_block *sb, 541static struct dentry *get_default_root(struct super_block *sb,
501 u64 subvol_objectid) 542 u64 subvol_objectid)
502{ 543{
503 struct btrfs_root *root = sb->s_fs_info; 544 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
545 struct btrfs_root *root = fs_info->tree_root;
504 struct btrfs_root *new_root; 546 struct btrfs_root *new_root;
505 struct btrfs_dir_item *di; 547 struct btrfs_dir_item *di;
506 struct btrfs_path *path; 548 struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
530 * will mount by default if we haven't been given a specific subvolume 572 * will mount by default if we haven't been given a specific subvolume
531 * to mount. 573 * to mount.
532 */ 574 */
533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 575 dir_id = btrfs_super_root_dir(fs_info->super_copy);
534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 576 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
535 if (IS_ERR(di)) { 577 if (IS_ERR(di)) {
536 btrfs_free_path(path); 578 btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
544 */ 586 */
545 btrfs_free_path(path); 587 btrfs_free_path(path);
546 dir_id = BTRFS_FIRST_FREE_OBJECTID; 588 dir_id = BTRFS_FIRST_FREE_OBJECTID;
547 new_root = root->fs_info->fs_root; 589 new_root = fs_info->fs_root;
548 goto setup_root; 590 goto setup_root;
549 } 591 }
550 592
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
552 btrfs_free_path(path); 594 btrfs_free_path(path);
553 595
554find_root: 596find_root:
555 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 597 new_root = btrfs_read_fs_root_no_name(fs_info, &location);
556 if (IS_ERR(new_root)) 598 if (IS_ERR(new_root))
557 return ERR_CAST(new_root); 599 return ERR_CAST(new_root);
558 600
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
588{ 630{
589 struct inode *inode; 631 struct inode *inode;
590 struct dentry *root_dentry; 632 struct dentry *root_dentry;
591 struct btrfs_root *tree_root; 633 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
592 struct btrfs_key key; 634 struct btrfs_key key;
593 int err; 635 int err;
594 636
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
603 sb->s_flags |= MS_POSIXACL; 645 sb->s_flags |= MS_POSIXACL;
604#endif 646#endif
605 647
606 tree_root = open_ctree(sb, fs_devices, (char *)data); 648 err = open_ctree(sb, fs_devices, (char *)data);
607 649 if (err) {
608 if (IS_ERR(tree_root)) {
609 printk("btrfs: open_ctree failed\n"); 650 printk("btrfs: open_ctree failed\n");
610 return PTR_ERR(tree_root); 651 return err;
611 } 652 }
612 sb->s_fs_info = tree_root;
613 653
614 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 654 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
615 key.type = BTRFS_INODE_ITEM_KEY; 655 key.type = BTRFS_INODE_ITEM_KEY;
616 key.offset = 0; 656 key.offset = 0;
617 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); 657 inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
618 if (IS_ERR(inode)) { 658 if (IS_ERR(inode)) {
619 err = PTR_ERR(inode); 659 err = PTR_ERR(inode);
620 goto fail_close; 660 goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
631 671
632 save_mount_options(sb, data); 672 save_mount_options(sb, data);
633 cleancache_init_fs(sb); 673 cleancache_init_fs(sb);
674 sb->s_flags |= MS_ACTIVE;
634 return 0; 675 return 0;
635 676
636fail_close: 677fail_close:
637 close_ctree(tree_root); 678 close_ctree(fs_info->tree_root);
638 return err; 679 return err;
639} 680}
640 681
641int btrfs_sync_fs(struct super_block *sb, int wait) 682int btrfs_sync_fs(struct super_block *sb, int wait)
642{ 683{
643 struct btrfs_trans_handle *trans; 684 struct btrfs_trans_handle *trans;
644 struct btrfs_root *root = btrfs_sb(sb); 685 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
686 struct btrfs_root *root = fs_info->tree_root;
645 int ret; 687 int ret;
646 688
647 trace_btrfs_sync_fs(wait); 689 trace_btrfs_sync_fs(wait);
648 690
649 if (!wait) { 691 if (!wait) {
650 filemap_flush(root->fs_info->btree_inode->i_mapping); 692 filemap_flush(fs_info->btree_inode->i_mapping);
651 return 0; 693 return 0;
652 } 694 }
653 695
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
663 705
664static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) 706static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
665{ 707{
666 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 708 struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
667 struct btrfs_fs_info *info = root->fs_info; 709 struct btrfs_root *root = info->tree_root;
668 char *compress_type; 710 char *compress_type;
669 711
670 if (btrfs_test_opt(root, DEGRADED)) 712 if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
722 seq_puts(seq, ",autodefrag"); 764 seq_puts(seq, ",autodefrag");
723 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 765 if (btrfs_test_opt(root, INODE_MAP_CACHE))
724 seq_puts(seq, ",inode_cache"); 766 seq_puts(seq, ",inode_cache");
767 if (btrfs_test_opt(root, SKIP_BALANCE))
768 seq_puts(seq, ",skip_balance");
725 return 0; 769 return 0;
726} 770}
727 771
728static int btrfs_test_super(struct super_block *s, void *data) 772static int btrfs_test_super(struct super_block *s, void *data)
729{ 773{
730 struct btrfs_root *test_root = data; 774 struct btrfs_fs_info *p = data;
731 struct btrfs_root *root = btrfs_sb(s); 775 struct btrfs_fs_info *fs_info = btrfs_sb(s);
732 776
733 /* 777 return fs_info->fs_devices == p->fs_devices;
734 * If this super block is going away, return false as it
735 * can't match as an existing super block.
736 */
737 if (!atomic_read(&s->s_active))
738 return 0;
739 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
740} 778}
741 779
742static int btrfs_set_super(struct super_block *s, void *data) 780static int btrfs_set_super(struct super_block *s, void *data)
743{ 781{
744 s->s_fs_info = data; 782 int err = set_anon_super(s, data);
745 783 if (!err)
746 return set_anon_super(s, data); 784 s->s_fs_info = data;
785 return err;
747} 786}
748 787
749/* 788/*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
903 if (!fs_info) 942 if (!fs_info)
904 return ERR_PTR(-ENOMEM); 943 return ERR_PTR(-ENOMEM);
905 944
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
908 error = -ENOMEM;
909 goto error_fs_info;
910 }
911 fs_info->tree_root->fs_info = fs_info;
912 fs_info->fs_devices = fs_devices; 945 fs_info->fs_devices = fs_devices;
913 946
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 947 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
928 } 961 }
929 962
930 bdev = fs_devices->latest_bdev; 963 bdev = fs_devices->latest_bdev;
931 s = sget(fs_type, btrfs_test_super, btrfs_set_super, 964 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
932 fs_info->tree_root);
933 if (IS_ERR(s)) { 965 if (IS_ERR(s)) {
934 error = PTR_ERR(s); 966 error = PTR_ERR(s);
935 goto error_close_devices; 967 goto error_close_devices;
936 } 968 }
937 969
938 if (s->s_root) { 970 if (s->s_root) {
939 if ((flags ^ s->s_flags) & MS_RDONLY) {
940 deactivate_locked_super(s);
941 error = -EBUSY;
942 goto error_close_devices;
943 }
944
945 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
946 free_fs_info(fs_info); 972 free_fs_info(fs_info);
973 if ((flags ^ s->s_flags) & MS_RDONLY)
974 error = -EBUSY;
947 } else { 975 } else {
948 char b[BDEVNAME_SIZE]; 976 char b[BDEVNAME_SIZE];
949 977
950 s->s_flags = flags | MS_NOSEC; 978 s->s_flags = flags | MS_NOSEC;
951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 979 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 980 btrfs_sb(s)->bdev_holder = fs_type;
953 error = btrfs_fill_super(s, fs_devices, data, 981 error = btrfs_fill_super(s, fs_devices, data,
954 flags & MS_SILENT ? 1 : 0); 982 flags & MS_SILENT ? 1 : 0);
955 if (error) {
956 deactivate_locked_super(s);
957 return ERR_PTR(error);
958 }
959
960 s->s_flags |= MS_ACTIVE;
961 } 983 }
962 984
963 root = get_default_root(s, subvol_objectid); 985 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
964 if (IS_ERR(root)) { 986 if (IS_ERR(root))
965 deactivate_locked_super(s); 987 deactivate_locked_super(s);
966 return root;
967 }
968 988
969 return root; 989 return root;
970 990
@@ -977,7 +997,8 @@ error_fs_info:
977 997
978static int btrfs_remount(struct super_block *sb, int *flags, char *data) 998static int btrfs_remount(struct super_block *sb, int *flags, char *data)
979{ 999{
980 struct btrfs_root *root = btrfs_sb(sb); 1000 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1001 struct btrfs_root *root = fs_info->tree_root;
981 int ret; 1002 int ret;
982 1003
983 ret = btrfs_parse_options(root, data); 1004 ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
993 ret = btrfs_commit_super(root); 1014 ret = btrfs_commit_super(root);
994 WARN_ON(ret); 1015 WARN_ON(ret);
995 } else { 1016 } else {
996 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (fs_info->fs_devices->rw_devices == 0)
997 return -EACCES; 1018 return -EACCES;
998 1019
999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(fs_info->super_copy) != 0)
1000 return -EINVAL; 1021 return -EINVAL;
1001 1022
1002 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(fs_info);
1003 WARN_ON(ret); 1024 WARN_ON(ret);
1004 1025
1005 /* recover relocation */ 1026 /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1168 1189
1169static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1190static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1170{ 1191{
1171 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1192 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1172 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1193 struct btrfs_super_block *disk_super = fs_info->super_copy;
1173 struct list_head *head = &root->fs_info->space_info; 1194 struct list_head *head = &fs_info->space_info;
1174 struct btrfs_space_info *found; 1195 struct btrfs_space_info *found;
1175 u64 total_used = 0; 1196 u64 total_used = 0;
1176 u64 total_free_data = 0; 1197 u64 total_free_data = 0;
1177 int bits = dentry->d_sb->s_blocksize_bits; 1198 int bits = dentry->d_sb->s_blocksize_bits;
1178 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1199 __be32 *fsid = (__be32 *)fs_info->fsid;
1179 int ret; 1200 int ret;
1180 1201
1181 /* holding chunk_mutex to avoid allocating new chunks */ 1202 /* holding chunk_mutex to avoid allocating new chunks */
1182 mutex_lock(&root->fs_info->chunk_mutex); 1203 mutex_lock(&fs_info->chunk_mutex);
1183 rcu_read_lock(); 1204 rcu_read_lock();
1184 list_for_each_entry_rcu(found, head, list) { 1205 list_for_each_entry_rcu(found, head, list) {
1185 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1206 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1198 buf->f_bsize = dentry->d_sb->s_blocksize; 1219 buf->f_bsize = dentry->d_sb->s_blocksize;
1199 buf->f_type = BTRFS_SUPER_MAGIC; 1220 buf->f_type = BTRFS_SUPER_MAGIC;
1200 buf->f_bavail = total_free_data; 1221 buf->f_bavail = total_free_data;
1201 ret = btrfs_calc_avail_data_space(root, &total_free_data); 1222 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1202 if (ret) { 1223 if (ret) {
1203 mutex_unlock(&root->fs_info->chunk_mutex); 1224 mutex_unlock(&fs_info->chunk_mutex);
1204 return ret; 1225 return ret;
1205 } 1226 }
1206 buf->f_bavail += total_free_data; 1227 buf->f_bavail += total_free_data;
1207 buf->f_bavail = buf->f_bavail >> bits; 1228 buf->f_bavail = buf->f_bavail >> bits;
1208 mutex_unlock(&root->fs_info->chunk_mutex); 1229 mutex_unlock(&fs_info->chunk_mutex);
1209 1230
1210 /* We treat it as constant endianness (it doesn't matter _which_) 1231 /* We treat it as constant endianness (it doesn't matter _which_)
1211 because we want the fsid to come out the same whether mounted 1232 because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1219 return 0; 1240 return 0;
1220} 1241}
1221 1242
1243static void btrfs_kill_super(struct super_block *sb)
1244{
1245 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1246 kill_anon_super(sb);
1247 free_fs_info(fs_info);
1248}
1249
1222static struct file_system_type btrfs_fs_type = { 1250static struct file_system_type btrfs_fs_type = {
1223 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1224 .name = "btrfs", 1252 .name = "btrfs",
1225 .mount = btrfs_mount, 1253 .mount = btrfs_mount,
1226 .kill_sb = kill_anon_super, 1254 .kill_sb = btrfs_kill_super,
1227 .fs_flags = FS_REQUIRES_DEV, 1255 .fs_flags = FS_REQUIRES_DEV,
1228}; 1256};
1229 1257
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1257 1285
1258static int btrfs_freeze(struct super_block *sb) 1286static int btrfs_freeze(struct super_block *sb)
1259{ 1287{
1260 struct btrfs_root *root = btrfs_sb(sb); 1288 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1261 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1289 mutex_lock(&fs_info->transaction_kthread_mutex);
1262 mutex_lock(&root->fs_info->cleaner_mutex); 1290 mutex_lock(&fs_info->cleaner_mutex);
1263 return 0; 1291 return 0;
1264} 1292}
1265 1293
1266static int btrfs_unfreeze(struct super_block *sb) 1294static int btrfs_unfreeze(struct super_block *sb)
1267{ 1295{
1268 struct btrfs_root *root = btrfs_sb(sb); 1296 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1269 mutex_unlock(&root->fs_info->cleaner_mutex); 1297 mutex_unlock(&fs_info->cleaner_mutex);
1270 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1298 mutex_unlock(&fs_info->transaction_kthread_mutex);
1271 return 0; 1299 return 0;
1272} 1300}
1273 1301
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..cb877e0886a7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
 28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
 35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
 39 * It is also useful for tree enumeration, which could be done elegantly
 40 * with recursion but is not possible due to kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
48 * Note: don't use this function to init an already used ulist, use
49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
 70 * Only if more elements are allocated do they need to be freed.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
 94 * @gfp_mask: allocation flags to use for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
 115 * It is not necessary to call ulist_fini beforehand.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
 133 * Note: locking must be provided by the caller. In case of rwlocks, write
 134 * locking is needed.
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
 193 * Note: locking must be provided by the caller. In case of rwlocks, only read
 194 * locking is needed.
195 *
 196 * This function is used to iterate a ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
 48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
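A minimal usage sketch of the ulist API declared above (illustrative only; assumes a kernel context with GFP_NOFS as the allocation mask, and hypothetical names "logical" and process_root()):

	struct ulist *seen;
	struct ulist_node *node = NULL;

	seen = ulist_alloc(GFP_NOFS);
	if (!seen)
		return -ENOMEM;

	/* ulist_add() returns 1 if inserted, 0 if already present, -ENOMEM on failure */
	if (ulist_add(seen, logical, 0, GFP_NOFS) < 0) {
		ulist_free(seen);
		return -ENOMEM;
	}

	while ((node = ulist_next(seen, node))) {
		/* node->val is the stored u64, node->aux the auxiliary value */
		process_root(node->val);
	}

	ulist_free(seen);
	return 0;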
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..0b4e2af7954d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 u64 devid; 708 u64 devid;
707 u64 transid; 709 u64 transid;
708 710
709 mutex_lock(&uuid_mutex);
710
711 flags |= FMODE_EXCL; 711 flags |= FMODE_EXCL;
712 bdev = blkdev_get_by_path(path, flags, holder); 712 bdev = blkdev_get_by_path(path, flags, holder);
713 713
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 goto error; 716 goto error;
717 } 717 }
718 718
719 mutex_lock(&uuid_mutex);
719 ret = set_blocksize(bdev, 4096); 720 ret = set_blocksize(bdev, 4096);
720 if (ret) 721 if (ret)
721 goto error_close; 722 goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
737 738
738 brelse(bh); 739 brelse(bh);
739error_close: 740error_close:
741 mutex_unlock(&uuid_mutex);
740 blkdev_put(bdev, flags); 742 blkdev_put(bdev, flags);
741error: 743error:
742 mutex_unlock(&uuid_mutex);
743 return ret; 744 return ret;
744} 745}
745 746
@@ -829,7 +830,6 @@ out:
829 830
830/* 831/*
831 * find_free_dev_extent - find free space in the specified device 832 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 833 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 834 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 835 * @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 848 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 849 * the max free space.
850 */ 850 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 851int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 852 u64 *start, u64 *len)
854{ 853{
855 struct btrfs_key key; 854 struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 892 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 893 key.type = BTRFS_DEV_EXTENT_KEY;
895 894
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 895 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 896 if (ret < 0)
898 goto out; 897 goto out;
899 if (ret > 0) { 898 if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1281 bool clear_super = false;
1283 1282
1284 mutex_lock(&uuid_mutex); 1283 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1284
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1285 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1286 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
1452 if (bdev) 1450 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1451 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1452out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1453 mutex_unlock(&uuid_mutex);
1457 return ret; 1454 return ret;
1458error_undo: 1455error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
1469/* 1466/*
1470 * does all the dirty work required for changing file system's UUID. 1467 * does all the dirty work required for changing file system's UUID.
1471 */ 1468 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1469static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1470{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1471 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1472 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1625 }
1630 1626
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1627 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1628
1634 devices = &root->fs_info->fs_devices->devices; 1629 devices = &root->fs_info->fs_devices->devices;
1635 /* 1630 /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1690
1696 if (seeding_dev) { 1691 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1692 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1693 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1694 BUG_ON(ret);
1700 } 1695 }
1701 1696
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1752 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1753 BUG_ON(ret);
1759 } 1754 }
1760out: 1755
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1756 return ret;
1763error: 1757error:
1764 blkdev_put(bdev, FMODE_EXCL); 1758 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1760 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1761 up_write(&sb->s_umount);
1768 } 1762 }
1769 goto out; 1763 return ret;
1770} 1764}
1771 1765
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1766static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
2077 return ret; 2071 return ret;
2078} 2072}
2079 2073
2074static int insert_balance_item(struct btrfs_root *root,
2075 struct btrfs_balance_control *bctl)
2076{
2077 struct btrfs_trans_handle *trans;
2078 struct btrfs_balance_item *item;
2079 struct btrfs_disk_balance_args disk_bargs;
2080 struct btrfs_path *path;
2081 struct extent_buffer *leaf;
2082 struct btrfs_key key;
2083 int ret, err;
2084
2085 path = btrfs_alloc_path();
2086 if (!path)
2087 return -ENOMEM;
2088
2089 trans = btrfs_start_transaction(root, 0);
2090 if (IS_ERR(trans)) {
2091 btrfs_free_path(path);
2092 return PTR_ERR(trans);
2093 }
2094
2095 key.objectid = BTRFS_BALANCE_OBJECTID;
2096 key.type = BTRFS_BALANCE_ITEM_KEY;
2097 key.offset = 0;
2098
2099 ret = btrfs_insert_empty_item(trans, root, path, &key,
2100 sizeof(*item));
2101 if (ret)
2102 goto out;
2103
2104 leaf = path->nodes[0];
2105 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110 btrfs_set_balance_data(leaf, item, &disk_bargs);
2111 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116 btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118 btrfs_mark_buffer_dirty(leaf);
2119out:
2120 btrfs_free_path(path);
2121 err = btrfs_commit_transaction(trans, root);
2122 if (err && !ret)
2123 ret = err;
2124 return ret;
2125}
2126
2127static int del_balance_item(struct btrfs_root *root)
2128{
2129 struct btrfs_trans_handle *trans;
2130 struct btrfs_path *path;
2131 struct btrfs_key key;
2132 int ret, err;
2133
2134 path = btrfs_alloc_path();
2135 if (!path)
2136 return -ENOMEM;
2137
2138 trans = btrfs_start_transaction(root, 0);
2139 if (IS_ERR(trans)) {
2140 btrfs_free_path(path);
2141 return PTR_ERR(trans);
2142 }
2143
2144 key.objectid = BTRFS_BALANCE_OBJECTID;
2145 key.type = BTRFS_BALANCE_ITEM_KEY;
2146 key.offset = 0;
2147
2148 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149 if (ret < 0)
2150 goto out;
2151 if (ret > 0) {
2152 ret = -ENOENT;
2153 goto out;
2154 }
2155
2156 ret = btrfs_del_item(trans, root, path);
2157out:
2158 btrfs_free_path(path);
2159 err = btrfs_commit_transaction(trans, root);
2160 if (err && !ret)
2161 ret = err;
2162 return ret;
2163}
2164
2165/*
2166 * This is a heuristic used to reduce the number of chunks balanced on
2167 * resume after balance was interrupted.
2168 */
2169static void update_balance_args(struct btrfs_balance_control *bctl)
2170{
2171 /*
2172 * Turn on soft mode for chunk types that were being converted.
2173 */
2174 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181 /*
 2182 * Turn on the usage filter if it is not already in use. The idea is
2183 * that chunks that we have already balanced should be
2184 * reasonably full. Don't do it for chunks that are being
2185 * converted - that will keep us from relocating unconverted
2186 * (albeit full) chunks.
2187 */
2188 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191 bctl->data.usage = 90;
2192 }
2193 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196 bctl->sys.usage = 90;
2197 }
2198 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201 bctl->meta.usage = 90;
2202 }
2203}
2204
2205/*
2206 * Should be called with both balance and volume mutexes held to
2207 * serialize other volume operations (add_dev/rm_dev/resize) with
2208 * restriper. Same goes for unset_balance_control.
2209 */
2210static void set_balance_control(struct btrfs_balance_control *bctl)
2211{
2212 struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214 BUG_ON(fs_info->balance_ctl);
2215
2216 spin_lock(&fs_info->balance_lock);
2217 fs_info->balance_ctl = bctl;
2218 spin_unlock(&fs_info->balance_lock);
2219}
2220
2221static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222{
2223 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225 BUG_ON(!fs_info->balance_ctl);
2226
2227 spin_lock(&fs_info->balance_lock);
2228 fs_info->balance_ctl = NULL;
2229 spin_unlock(&fs_info->balance_lock);
2230
2231 kfree(bctl);
2232}
2233
2234/*
2235 * Balance filters. Return 1 if chunk should be filtered out
2236 * (should not be balanced).
2237 */
2238static int chunk_profiles_filter(u64 chunk_profile,
2239 struct btrfs_balance_args *bargs)
2240{
2241 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
2243 if (chunk_profile == 0)
2244 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246 if (bargs->profiles & chunk_profile)
2247 return 0;
2248
2249 return 1;
2250}
2251
2252static u64 div_factor_fine(u64 num, int factor)
2253{
2254 if (factor <= 0)
2255 return 0;
2256 if (factor >= 100)
2257 return num;
2258
2259 num *= factor;
2260 do_div(num, 100);
2261 return num;
2262}
2263
2264static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265 struct btrfs_balance_args *bargs)
2266{
2267 struct btrfs_block_group_cache *cache;
2268 u64 chunk_used, user_thresh;
2269 int ret = 1;
2270
2271 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272 chunk_used = btrfs_block_group_used(&cache->item);
2273
2274 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275 if (chunk_used < user_thresh)
2276 ret = 0;
2277
2278 btrfs_put_block_group(cache);
2279 return ret;
2280}
2281
2282static int chunk_devid_filter(struct extent_buffer *leaf,
2283 struct btrfs_chunk *chunk,
2284 struct btrfs_balance_args *bargs)
2285{
2286 struct btrfs_stripe *stripe;
2287 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288 int i;
2289
2290 for (i = 0; i < num_stripes; i++) {
2291 stripe = btrfs_stripe_nr(chunk, i);
2292 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* [pstart, pend) */
2300static int chunk_drange_filter(struct extent_buffer *leaf,
2301 struct btrfs_chunk *chunk,
2302 u64 chunk_offset,
2303 struct btrfs_balance_args *bargs)
2304{
2305 struct btrfs_stripe *stripe;
2306 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307 u64 stripe_offset;
2308 u64 stripe_length;
2309 int factor;
2310 int i;
2311
2312 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313 return 0;
2314
2315 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317 factor = 2;
2318 else
2319 factor = 1;
2320 factor = num_stripes / factor;
2321
2322 for (i = 0; i < num_stripes; i++) {
2323 stripe = btrfs_stripe_nr(chunk, i);
2324 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325 continue;
2326
2327 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328 stripe_length = btrfs_chunk_length(leaf, chunk);
2329 do_div(stripe_length, factor);
2330
2331 if (stripe_offset < bargs->pend &&
2332 stripe_offset + stripe_length > bargs->pstart)
2333 return 0;
2334 }
2335
2336 return 1;
2337}
2338
2339/* [vstart, vend) */
2340static int chunk_vrange_filter(struct extent_buffer *leaf,
2341 struct btrfs_chunk *chunk,
2342 u64 chunk_offset,
2343 struct btrfs_balance_args *bargs)
2344{
2345 if (chunk_offset < bargs->vend &&
2346 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347 /* at least part of the chunk is inside this vrange */
2348 return 0;
2349
2350 return 1;
2351}
2352
2353static int chunk_soft_convert_filter(u64 chunk_profile,
2354 struct btrfs_balance_args *bargs)
2355{
2356 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357 return 0;
2358
2359 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361 if (chunk_profile == 0)
2362 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364 if (bargs->target & chunk_profile)
2365 return 1;
2366
2367 return 0;
2368}
2369
2370static int should_balance_chunk(struct btrfs_root *root,
2371 struct extent_buffer *leaf,
2372 struct btrfs_chunk *chunk, u64 chunk_offset)
2373{
2374 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375 struct btrfs_balance_args *bargs = NULL;
2376 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378 /* type filter */
2379 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381 return 0;
2382 }
2383
2384 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385 bargs = &bctl->data;
2386 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387 bargs = &bctl->sys;
2388 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389 bargs = &bctl->meta;
2390
2391 /* profiles filter */
2392 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393 chunk_profiles_filter(chunk_type, bargs)) {
2394 return 0;
2395 }
2396
2397 /* usage filter */
2398 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400 return 0;
2401 }
2402
2403 /* devid filter */
2404 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405 chunk_devid_filter(leaf, chunk, bargs)) {
2406 return 0;
2407 }
2408
2409 /* drange filter, makes sense only with devid filter */
2410 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412 return 0;
2413 }
2414
2415 /* vrange filter */
2416 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418 return 0;
2419 }
2420
2421 /* soft profile changing mode */
2422 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423 chunk_soft_convert_filter(chunk_type, bargs)) {
2424 return 0;
2425 }
2426
2427 return 1;
2428}
2429
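To illustrate how these filters compose in should_balance_chunk(), a minimal sketch of a balance control that would relocate only data chunks less than half full (field names follow the structures used in this patch; BTRFS_BALANCE_DATA is assumed to select data block groups via the type mask as elsewhere in this series, and fs_info comes from the caller):

	struct btrfs_balance_control *bctl;

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl)
		return -ENOMEM;

	bctl->fs_info = fs_info;			/* supplied by the caller */
	bctl->flags = BTRFS_BALANCE_DATA;		/* type filter: data chunks only */
	bctl->data.flags = BTRFS_BALANCE_ARGS_USAGE;	/* enable the usage filter */
	bctl->data.usage = 50;				/* chunks >= 50% used are filtered out */

	set_balance_control(bctl);			/* as defined above, with both mutexes held */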
2080static u64 div_factor(u64 num, int factor) 2430static u64 div_factor(u64 num, int factor)
2081{ 2431{
2082 if (factor == 10) 2432 if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2436 return num;
2087} 2437}
2088 2438
2089int btrfs_balance(struct btrfs_root *dev_root) 2439static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2440{
2091 int ret; 2441 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2442 struct btrfs_root *chunk_root = fs_info->chunk_root;
2443 struct btrfs_root *dev_root = fs_info->dev_root;
2444 struct list_head *devices;
2093 struct btrfs_device *device; 2445 struct btrfs_device *device;
2094 u64 old_size; 2446 u64 old_size;
2095 u64 size_to_free; 2447 u64 size_to_free;
2448 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2449 struct btrfs_path *path;
2097 struct btrfs_key key; 2450 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2451 struct btrfs_key found_key;
2101 2452 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2453 struct extent_buffer *leaf;
2103 return -EROFS; 2454 int slot;
2104 2455 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2456 int enospc_errors = 0;
2106 return -EPERM; 2457 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2458
2111 /* step one make some room on all the devices */ 2459 /* step one make some room on all the devices */
2460 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2461 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2462 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2463 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2486 ret = -ENOMEM;
2138 goto error; 2487 goto error;
2139 } 2488 }
2489
2490 /* zero out stat counters */
2491 spin_lock(&fs_info->balance_lock);
2492 memset(&bctl->stat, 0, sizeof(bctl->stat));
2493 spin_unlock(&fs_info->balance_lock);
2494again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2495 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2496 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2497 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2498
2144 while (1) { 2499 while (1) {
2500 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501 atomic_read(&fs_info->balance_cancel_req)) {
2502 ret = -ECANCELED;
2503 goto error;
2504 }
2505
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2506 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2507 if (ret < 0)
2147 goto error; 2508 goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2512 * failed
2152 */ 2513 */
2153 if (ret == 0) 2514 if (ret == 0)
2154 break; 2515 BUG(); /* FIXME break ? */
2155 2516
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2517 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2518 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2519 if (ret) {
2520 ret = 0;
2159 break; 2521 break;
2522 }
2523
2524 leaf = path->nodes[0];
2525 slot = path->slots[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2527
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2528 if (found_key.objectid != key.objectid)
2164 break; 2529 break;
2165 2530
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2532 if (found_key.offset == 0)
2168 break; 2533 break;
2169 2534
2535 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537 if (!counting) {
2538 spin_lock(&fs_info->balance_lock);
2539 bctl->stat.considered++;
2540 spin_unlock(&fs_info->balance_lock);
2541 }
2542
2543 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544 found_key.offset);
2170 btrfs_release_path(path); 2545 btrfs_release_path(path);
2546 if (!ret)
2547 goto loop;
2548
2549 if (counting) {
2550 spin_lock(&fs_info->balance_lock);
2551 bctl->stat.expected++;
2552 spin_unlock(&fs_info->balance_lock);
2553 goto loop;
2554 }
2555
2171 ret = btrfs_relocate_chunk(chunk_root, 2556 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2557 chunk_root->root_key.objectid,
2173 found_key.objectid, 2558 found_key.objectid,
2174 found_key.offset); 2559 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2560 if (ret && ret != -ENOSPC)
2176 goto error; 2561 goto error;
2562 if (ret == -ENOSPC) {
2563 enospc_errors++;
2564 } else {
2565 spin_lock(&fs_info->balance_lock);
2566 bctl->stat.completed++;
2567 spin_unlock(&fs_info->balance_lock);
2568 }
2569loop:
2177 key.offset = found_key.offset - 1; 2570 key.offset = found_key.offset - 1;
2178 } 2571 }
2179 ret = 0; 2572
2573 if (counting) {
2574 btrfs_release_path(path);
2575 counting = false;
2576 goto again;
2577 }
2180error: 2578error:
2181 btrfs_free_path(path); 2579 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2580 if (enospc_errors) {
2581 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582 enospc_errors);
2583 if (!ret)
2584 ret = -ENOSPC;
2585 }
2586
2183 return ret; 2587 return ret;
2184} 2588}
2185 2589
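__btrfs_balance() above walks the chunk tree twice with the same loop: the first pass (counting == true) only tallies bctl->stat.expected, then the goto-again rerun relocates the matching chunks and records completed relocations and ENOSPC failures. A hedged userspace sketch of that two-pass pattern, with the tree walk replaced by an array; matches() and relocate() are made-up stand-ins for should_balance_chunk() and btrfs_relocate_chunk():

/* illustrative sketch only -- not kernel code */
#include <stdbool.h>
#include <stdio.h>

struct stats { int considered, expected, completed, failed; };

static bool matches(int chunk)  { return chunk % 2 == 0; }
static int  relocate(int chunk) { return chunk == 8 ? -1 : 0; } /* pretend one chunk hits ENOSPC */

static void balance(const int *chunks, int n, struct stats *st)
{
	bool counting = true;
again:
	for (int i = n - 1; i >= 0; i--) {   /* the kernel walks key offsets downwards */
		if (!counting)
			st->considered++;
		if (!matches(chunks[i]))
			continue;
		if (counting) {
			st->expected++;      /* first pass: just count */
			continue;
		}
		if (relocate(chunks[i]))     /* second pass: do the work */
			st->failed++;
		else
			st->completed++;
	}
	if (counting) {
		counting = false;
		goto again;
	}
}

int main(void)
{
	int chunks[] = { 2, 3, 4, 7, 8, 10 };
	struct stats st = { 0 };

	balance(chunks, 6, &st);
	printf("expected=%d considered=%d completed=%d failed=%d\n",
	       st.expected, st.considered, st.completed, st.failed);
	return 0;
}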
2590static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591{
2592 /* cancel requested || normal exit path */
2593 return atomic_read(&fs_info->balance_cancel_req) ||
2594 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595 atomic_read(&fs_info->balance_cancel_req) == 0);
2596}
2597
2598static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599{
2600 int ret;
2601
2602 unset_balance_control(fs_info);
2603 ret = del_balance_item(fs_info->tree_root);
2604 BUG_ON(ret);
2605}
2606
2607void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608 struct btrfs_ioctl_balance_args *bargs);
2609
2610/*
2611 * Should be called with both balance and volume mutexes held
2612 */
2613int btrfs_balance(struct btrfs_balance_control *bctl,
2614 struct btrfs_ioctl_balance_args *bargs)
2615{
2616 struct btrfs_fs_info *fs_info = bctl->fs_info;
2617 u64 allowed;
2618 int ret;
2619
2620 if (btrfs_fs_closing(fs_info) ||
2621 atomic_read(&fs_info->balance_pause_req) ||
2622 atomic_read(&fs_info->balance_cancel_req)) {
2623 ret = -EINVAL;
2624 goto out;
2625 }
2626
2627 /*
2628 * In case of mixed groups both data and meta should be picked,
2629 * and identical options should be given for both of them.
2630 */
2631 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637 printk(KERN_ERR "btrfs: with mixed groups data and "
2638 "metadata balance options must be the same\n");
2639 ret = -EINVAL;
2640 goto out;
2641 }
2642 }
2643
2644 /*
2645 * Profile changing sanity checks. Skip them if a simple
2646 * balance is requested.
2647 */
2648 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649 BTRFS_BALANCE_ARGS_CONVERT))
2650 goto do_balance;
2651
2652 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653 if (fs_info->fs_devices->num_devices == 1)
2654 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655 else if (fs_info->fs_devices->num_devices < 4)
2656 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657 else
2658 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659 BTRFS_BLOCK_GROUP_RAID10);
2660
2661 if (!profile_is_valid(bctl->data.target, 1) ||
2662 bctl->data.target & ~allowed) {
2663 printk(KERN_ERR "btrfs: unable to start balance with target "
2664 "data profile %llu\n",
2665 (unsigned long long)bctl->data.target);
2666 ret = -EINVAL;
2667 goto out;
2668 }
2669 if (!profile_is_valid(bctl->meta.target, 1) ||
2670 bctl->meta.target & ~allowed) {
2671 printk(KERN_ERR "btrfs: unable to start balance with target "
2672 "metadata profile %llu\n",
2673 (unsigned long long)bctl->meta.target);
2674 ret = -EINVAL;
2675 goto out;
2676 }
2677 if (!profile_is_valid(bctl->sys.target, 1) ||
2678 bctl->sys.target & ~allowed) {
2679 printk(KERN_ERR "btrfs: unable to start balance with target "
2680 "system profile %llu\n",
2681 (unsigned long long)bctl->sys.target);
2682 ret = -EINVAL;
2683 goto out;
2684 }
2685
2686 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688 ret = -EINVAL;
2689 goto out;
2690 }
2691
2692 /* allow to reduce meta or sys integrity only if force set */
2693 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694 BTRFS_BLOCK_GROUP_RAID10;
2695 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696 (fs_info->avail_system_alloc_bits & allowed) &&
2697 !(bctl->sys.target & allowed)) ||
2698 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699 (fs_info->avail_metadata_alloc_bits & allowed) &&
2700 !(bctl->meta.target & allowed))) {
2701 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702 printk(KERN_INFO "btrfs: force reducing metadata "
2703 "integrity\n");
2704 } else {
2705 printk(KERN_ERR "btrfs: balance will reduce metadata "
2706 "integrity, use force if you want this\n");
2707 ret = -EINVAL;
2708 goto out;
2709 }
2710 }
2711
2712do_balance:
2713 ret = insert_balance_item(fs_info->tree_root, bctl);
2714 if (ret && ret != -EEXIST)
2715 goto out;
2716
2717 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718 BUG_ON(ret == -EEXIST);
2719 set_balance_control(bctl);
2720 } else {
2721 BUG_ON(ret != -EEXIST);
2722 spin_lock(&fs_info->balance_lock);
2723 update_balance_args(bctl);
2724 spin_unlock(&fs_info->balance_lock);
2725 }
2726
2727 atomic_inc(&fs_info->balance_running);
2728 mutex_unlock(&fs_info->balance_mutex);
2729
2730 ret = __btrfs_balance(fs_info);
2731
2732 mutex_lock(&fs_info->balance_mutex);
2733 atomic_dec(&fs_info->balance_running);
2734
2735 if (bargs) {
2736 memset(bargs, 0, sizeof(*bargs));
2737 update_ioctl_balance_args(fs_info, 0, bargs);
2738 }
2739
2740 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741 balance_need_close(fs_info)) {
2742 __cancel_balance(fs_info);
2743 }
2744
2745 wake_up(&fs_info->balance_wait_q);
2746
2747 return ret;
2748out:
2749 if (bctl->flags & BTRFS_BALANCE_RESUME)
2750 __cancel_balance(fs_info);
2751 else
2752 kfree(bctl);
2753 return ret;
2754}
2755
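Before allowing a convert, btrfs_balance() above builds the set of acceptable target profiles from the device count: single is always allowed, DUP only on a single device, RAID0/RAID1 from two devices, RAID10 from four or more. A small sketch of that mask computation, using illustrative bit values rather than the kernel's BTRFS_BLOCK_GROUP_* constants:

/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define PROF_SINGLE (1ULL << 0)
#define PROF_DUP    (1ULL << 1)
#define PROF_RAID0  (1ULL << 2)
#define PROF_RAID1  (1ULL << 3)
#define PROF_RAID10 (1ULL << 4)

static uint64_t allowed_target_profiles(unsigned num_devices)
{
	uint64_t allowed = PROF_SINGLE;

	if (num_devices == 1)
		allowed |= PROF_DUP;
	else if (num_devices < 4)
		allowed |= PROF_RAID0 | PROF_RAID1;
	else
		allowed |= PROF_RAID0 | PROF_RAID1 | PROF_RAID10;
	return allowed;
}

int main(void)
{
	uint64_t target = PROF_RAID10;
	unsigned ndevs = 2;

	if (target & ~allowed_target_profiles(ndevs))
		printf("unable to start balance with that target profile\n");
	return 0;
}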
2756static int balance_kthread(void *data)
2757{
2758 struct btrfs_balance_control *bctl =
2759 (struct btrfs_balance_control *)data;
2760 struct btrfs_fs_info *fs_info = bctl->fs_info;
2761 int ret = 0;
2762
2763 mutex_lock(&fs_info->volume_mutex);
2764 mutex_lock(&fs_info->balance_mutex);
2765
2766 set_balance_control(bctl);
2767
2768 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769 printk(KERN_INFO "btrfs: force skipping balance\n");
2770 } else {
2771 printk(KERN_INFO "btrfs: continuing balance\n");
2772 ret = btrfs_balance(bctl, NULL);
2773 }
2774
2775 mutex_unlock(&fs_info->balance_mutex);
2776 mutex_unlock(&fs_info->volume_mutex);
2777 return ret;
2778}
2779
2780int btrfs_recover_balance(struct btrfs_root *tree_root)
2781{
2782 struct task_struct *tsk;
2783 struct btrfs_balance_control *bctl;
2784 struct btrfs_balance_item *item;
2785 struct btrfs_disk_balance_args disk_bargs;
2786 struct btrfs_path *path;
2787 struct extent_buffer *leaf;
2788 struct btrfs_key key;
2789 int ret;
2790
2791 path = btrfs_alloc_path();
2792 if (!path)
2793 return -ENOMEM;
2794
2795 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796 if (!bctl) {
2797 ret = -ENOMEM;
2798 goto out;
2799 }
2800
2801 key.objectid = BTRFS_BALANCE_OBJECTID;
2802 key.type = BTRFS_BALANCE_ITEM_KEY;
2803 key.offset = 0;
2804
2805 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806 if (ret < 0)
2807 goto out_bctl;
2808 if (ret > 0) { /* ret = -ENOENT; */
2809 ret = 0;
2810 goto out_bctl;
2811 }
2812
2813 leaf = path->nodes[0];
2814 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816 bctl->fs_info = tree_root->fs_info;
2817 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819 btrfs_balance_data(leaf, item, &disk_bargs);
2820 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821 btrfs_balance_meta(leaf, item, &disk_bargs);
2822 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823 btrfs_balance_sys(leaf, item, &disk_bargs);
2824 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
2826 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827 if (IS_ERR(tsk))
2828 ret = PTR_ERR(tsk);
2829 else
2830 goto out;
2831
2832out_bctl:
2833 kfree(bctl);
2834out:
2835 btrfs_free_path(path);
2836 return ret;
2837}
2838

2839int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840{
2841 int ret = 0;
2842
2843 mutex_lock(&fs_info->balance_mutex);
2844 if (!fs_info->balance_ctl) {
2845 mutex_unlock(&fs_info->balance_mutex);
2846 return -ENOTCONN;
2847 }
2848
2849 if (atomic_read(&fs_info->balance_running)) {
2850 atomic_inc(&fs_info->balance_pause_req);
2851 mutex_unlock(&fs_info->balance_mutex);
2852
2853 wait_event(fs_info->balance_wait_q,
2854 atomic_read(&fs_info->balance_running) == 0);
2855
2856 mutex_lock(&fs_info->balance_mutex);
2857 /* we are good with balance_ctl ripped off from under us */
2858 BUG_ON(atomic_read(&fs_info->balance_running));
2859 atomic_dec(&fs_info->balance_pause_req);
2860 } else {
2861 ret = -ENOTCONN;
2862 }
2863
2864 mutex_unlock(&fs_info->balance_mutex);
2865 return ret;
2866}
2867
2868int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869{
2870 mutex_lock(&fs_info->balance_mutex);
2871 if (!fs_info->balance_ctl) {
2872 mutex_unlock(&fs_info->balance_mutex);
2873 return -ENOTCONN;
2874 }
2875
2876 atomic_inc(&fs_info->balance_cancel_req);
2877 /*
2878 * if we are running just wait and return, balance item is
2879 * deleted in btrfs_balance in this case
2880 */
2881 if (atomic_read(&fs_info->balance_running)) {
2882 mutex_unlock(&fs_info->balance_mutex);
2883 wait_event(fs_info->balance_wait_q,
2884 atomic_read(&fs_info->balance_running) == 0);
2885 mutex_lock(&fs_info->balance_mutex);
2886 } else {
2887 /* __cancel_balance needs volume_mutex */
2888 mutex_unlock(&fs_info->balance_mutex);
2889 mutex_lock(&fs_info->volume_mutex);
2890 mutex_lock(&fs_info->balance_mutex);
2891
2892 if (fs_info->balance_ctl)
2893 __cancel_balance(fs_info);
2894
2895 mutex_unlock(&fs_info->volume_mutex);
2896 }
2897
2898 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899 atomic_dec(&fs_info->balance_cancel_req);
2900 mutex_unlock(&fs_info->balance_mutex);
2901 return 0;
2902}
2903
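btrfs_pause_balance() and btrfs_cancel_balance() above share one handshake: raise a request counter, drop the mutex, wait until balance_running reaches zero, then retake the mutex and drop the request. A rough pthread-based sketch of that handshake under simplified assumptions (plain ints instead of atomics, a condition variable instead of the kernel wait queue):

/* illustrative sketch only -- simplified userspace analogue */
#include <pthread.h>
#include <stdio.h>

struct balance_state {
	pthread_mutex_t lock;
	pthread_cond_t  wait_q;
	int running;        /* stands in for fs_info->balance_running */
	int pause_req;      /* stands in for fs_info->balance_pause_req */
};

/* the worker marks itself not running and wakes waiters, as btrfs_balance() does */
static void balance_finished(struct balance_state *s)
{
	pthread_mutex_lock(&s->lock);
	s->running = 0;
	pthread_cond_broadcast(&s->wait_q);
	pthread_mutex_unlock(&s->lock);
}

/* pause: raise the request, wait for running == 0, then lower the request again */
static void pause_balance(struct balance_state *s)
{
	pthread_mutex_lock(&s->lock);
	s->pause_req++;
	while (s->running)
		pthread_cond_wait(&s->wait_q, &s->lock);
	s->pause_req--;
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct balance_state s = {
		.lock   = PTHREAD_MUTEX_INITIALIZER,
		.wait_q = PTHREAD_COND_INITIALIZER,
		.running = 1,
	};

	balance_finished(&s);   /* pretend the worker already stopped */
	pause_balance(&s);      /* returns immediately since running == 0 */
	printf("paused, pause_req=%d\n", s.pause_req);
	return 0;
}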
2186/* 2904/*
2187 * shrinking a device means finding all of the device extents past 2905 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2906 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
2323 return ret; 3041 return ret;
2324} 3042}
2325 3043
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3044static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3045 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3046 struct btrfs_chunk *chunk, int item_size)
2330{ 3047{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3158 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3159 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3161 /* for larger filesystems, use larger metadata chunks */
3162 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163 max_stripe_size = 1024 * 1024 * 1024;
3164 else
3165 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3166 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3167 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3168 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3169 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3170 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3171 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3217 if (total_avail == 0)
2497 continue; 3218 continue;
2498 3219
2499 ret = find_free_dev_extent(trans, device, 3220 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3221 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3222 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3223 if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3408 BUG_ON(ret);
2688 3409
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3410 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3411 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3412 item_size);
2692 BUG_ON(ret); 3413 BUG_ON(ret);
2693 } 3414 }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3473 return ret;
2753 3474
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3475 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3476 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3477 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3478
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3479 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3483 sys_chunk_offset = chunk_offset + chunk_size;
2764 3484
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3485 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3486 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3487 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3488
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3489 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3620 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3621 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3622 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3623 int stripe_index;
2907 int i; 3624 int i;
3625 int ret = 0;
2908 int num_stripes; 3626 int num_stripes;
2909 int max_errors = 0; 3627 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3628 struct btrfs_bio *bbio = NULL;
2911 3629
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3630 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3631 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3632 read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3645 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3646 mirror_num = 0;
2941 3647
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3648 stripe_nr = offset;
2969 /* 3649 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3650 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
2980 3660
2981 if (rw & REQ_DISCARD) 3661 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3662 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3663 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3664 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3665 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3666 map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
3059 } 3736 }
3060 BUG_ON(stripe_index >= map->num_stripes); 3737 BUG_ON(stripe_index >= map->num_stripes);
3061 3738
3739 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3740 if (!bbio) {
3741 ret = -ENOMEM;
3742 goto out;
3743 }
3744 atomic_set(&bbio->error, 0);
3745
3062 if (rw & REQ_DISCARD) { 3746 if (rw & REQ_DISCARD) {
3747 int factor = 0;
3748 int sub_stripes = 0;
3749 u64 stripes_per_dev = 0;
3750 u32 remaining_stripes = 0;
3751
3752 if (map->type &
3753 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3754 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3755 sub_stripes = 1;
3756 else
3757 sub_stripes = map->sub_stripes;
3758
3759 factor = map->num_stripes / sub_stripes;
3760 stripes_per_dev = div_u64_rem(stripe_nr_end -
3761 stripe_nr_orig,
3762 factor,
3763 &remaining_stripes);
3764 }
3765
3063 for (i = 0; i < num_stripes; i++) { 3766 for (i = 0; i < num_stripes; i++) {
3064 bbio->stripes[i].physical = 3767 bbio->stripes[i].physical =
3065 map->stripes[stripe_index].physical + 3768 map->stripes[stripe_index].physical +
3066 stripe_offset + stripe_nr * map->stripe_len; 3769 stripe_offset + stripe_nr * map->stripe_len;
3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3770 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3068 3771
3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3772 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3070 u64 stripes; 3773 BTRFS_BLOCK_GROUP_RAID10)) {
3071 u32 last_stripe = 0; 3774 bbio->stripes[i].length = stripes_per_dev *
3072 int j; 3775 map->stripe_len;
3073 3776 if (i / sub_stripes < remaining_stripes)
3074 div_u64_rem(stripe_nr_end - 1, 3777 bbio->stripes[i].length +=
3075 map->num_stripes, 3778 map->stripe_len;
3076 &last_stripe); 3779 if (i < sub_stripes)
3077
3078 for (j = 0; j < map->num_stripes; j++) {
3079 u32 test;
3080
3081 div_u64_rem(stripe_nr_end - 1 - j,
3082 map->num_stripes, &test);
3083 if (test == stripe_index)
3084 break;
3085 }
3086 stripes = stripe_nr_end - 1 - j;
3087 do_div(stripes, map->num_stripes);
3088 bbio->stripes[i].length = map->stripe_len *
3089 (stripes - stripe_nr + 1);
3090
3091 if (i == 0) {
3092 bbio->stripes[i].length -=
3093 stripe_offset;
3094 stripe_offset = 0;
3095 }
3096 if (stripe_index == last_stripe)
3097 bbio->stripes[i].length -=
3098 stripe_end_offset;
3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3100 u64 stripes;
3101 int j;
3102 int factor = map->num_stripes /
3103 map->sub_stripes;
3104 u32 last_stripe = 0;
3105
3106 div_u64_rem(stripe_nr_end - 1,
3107 factor, &last_stripe);
3108 last_stripe *= map->sub_stripes;
3109
3110 for (j = 0; j < factor; j++) {
3111 u32 test;
3112
3113 div_u64_rem(stripe_nr_end - 1 - j,
3114 factor, &test);
3115
3116 if (test ==
3117 stripe_index / map->sub_stripes)
3118 break;
3119 }
3120 stripes = stripe_nr_end - 1 - j;
3121 do_div(stripes, factor);
3122 bbio->stripes[i].length = map->stripe_len *
3123 (stripes - stripe_nr + 1);
3124
3125 if (i < map->sub_stripes) {
3126 bbio->stripes[i].length -= 3780 bbio->stripes[i].length -=
3127 stripe_offset; 3781 stripe_offset;
3128 if (i == map->sub_stripes - 1) 3782 if ((i / sub_stripes + 1) %
3129 stripe_offset = 0; 3783 sub_stripes == remaining_stripes)
3130 }
3131 if (stripe_index >= last_stripe &&
3132 stripe_index <= (last_stripe +
3133 map->sub_stripes - 1)) {
3134 bbio->stripes[i].length -= 3784 bbio->stripes[i].length -=
3135 stripe_end_offset; 3785 stripe_end_offset;
3136 } 3786 if (i == sub_stripes - 1)
3787 stripe_offset = 0;
3137 } else 3788 } else
3138 bbio->stripes[i].length = *length; 3789 bbio->stripes[i].length = *length;
3139 3790
@@ -3155,15 +3806,22 @@ again:
3155 stripe_index++; 3806 stripe_index++;
3156 } 3807 }
3157 } 3808 }
3158 if (bbio_ret) { 3809
3159 *bbio_ret = bbio; 3810 if (rw & REQ_WRITE) {
3160 bbio->num_stripes = num_stripes; 3811 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3161 bbio->max_errors = max_errors; 3812 BTRFS_BLOCK_GROUP_RAID10 |
3162 bbio->mirror_num = mirror_num; 3813 BTRFS_BLOCK_GROUP_DUP)) {
3814 max_errors = 1;
3815 }
3163 } 3816 }
3817
3818 *bbio_ret = bbio;
3819 bbio->num_stripes = num_stripes;
3820 bbio->max_errors = max_errors;
3821 bbio->mirror_num = mirror_num;
3164out: 3822out:
3165 free_extent_map(em); 3823 free_extent_map(em);
3166 return 0; 3824 return ret;
3167} 3825}
3168 3826
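For RAID0/RAID10 discards the rewritten loop above replaces the per-stripe scan with a single div_u64_rem(): every device covers stripes_per_dev full stripes, the first remaining_stripes device groups cover one extra, and the first and last stripes are further trimmed by stripe_offset and stripe_end_offset. A userspace sketch of just the whole-stripe arithmetic (the first/last trim is left out and the parameters only loosely mirror map_lookup):

/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

static uint64_t discard_whole_stripes(int i, int num_stripes, int sub_stripes,
				      uint64_t stripe_len,
				      uint64_t stripe_nr_orig,
				      uint64_t stripe_nr_end)
{
	int factor = num_stripes / sub_stripes;
	uint64_t span = stripe_nr_end - stripe_nr_orig;
	uint64_t stripes_per_dev = span / factor;  /* div_u64_rem() quotient  */
	uint64_t remaining = span % factor;        /* div_u64_rem() remainder */
	uint64_t len = stripes_per_dev * stripe_len;

	/* the first 'remaining' device groups cover one extra stripe each */
	if ((uint64_t)(i / sub_stripes) < remaining)
		len += stripe_len;
	return len;
}

int main(void)
{
	/* 4-device RAID0 (sub_stripes == 1): 10 stripes spread as 3,3,2,2 */
	for (int i = 0; i < 4; i++)
		printf("stripe %d covers %llu bytes\n", i,
		       (unsigned long long)discard_whole_stripes(i, 4, 1,
								 65536, 0, 10));
	return 0;
}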
3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3827int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
3304 /* don't bother with additional async steps for reads, right now */ 3962 /* don't bother with additional async steps for reads, right now */
3305 if (!(rw & REQ_WRITE)) { 3963 if (!(rw & REQ_WRITE)) {
3306 bio_get(bio); 3964 bio_get(bio);
3307 submit_bio(rw, bio); 3965 btrfsic_submit_bio(rw, bio);
3308 bio_put(bio); 3966 bio_put(bio);
3309 return 0; 3967 return 0;
3310 } 3968 }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3399 if (async_submit) 4057 if (async_submit)
3400 schedule_bio(root, dev, rw, bio); 4058 schedule_bio(root, dev, rw, bio);
3401 else 4059 else
3402 submit_bio(rw, bio); 4060 btrfsic_submit_bio(rw, bio);
3403 } else { 4061 } else {
3404 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4062 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3405 bio->bi_sector = logical >> 9; 4063 bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3568 struct btrfs_fs_devices *fs_devices; 4226 struct btrfs_fs_devices *fs_devices;
3569 int ret; 4227 int ret;
3570 4228
3571 mutex_lock(&uuid_mutex); 4229 BUG_ON(!mutex_is_locked(&uuid_mutex));
3572 4230
3573 fs_devices = root->fs_info->fs_devices->seed; 4231 fs_devices = root->fs_info->fs_devices->seed;
3574 while (fs_devices) { 4232 while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3606 fs_devices->seed = root->fs_info->fs_devices->seed; 4264 fs_devices->seed = root->fs_info->fs_devices->seed;
3607 root->fs_info->fs_devices->seed = fs_devices; 4265 root->fs_info->fs_devices->seed = fs_devices;
3608out: 4266out:
3609 mutex_unlock(&uuid_mutex);
3610 return ret; 4267 return ret;
3611} 4268}
3612 4269
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3749 if (!path) 4406 if (!path)
3750 return -ENOMEM; 4407 return -ENOMEM;
3751 4408
4409 mutex_lock(&uuid_mutex);
4410 lock_chunks(root);
4411
3752 /* first we search for all of the device items, and then we 4412 /* first we search for all of the device items, and then we
3753 * read in all of the chunk items. This way we can create chunk 4413 * read in all of the chunk items. This way we can create chunk
 3754 * mappings that reference all of the devices that are found 4414 * mappings that reference all of the devices that are found
@@ -3799,6 +4459,9 @@ again:
3799 } 4459 }
3800 ret = 0; 4460 ret = 0;
3801error: 4461error:
4462 unlock_chunks(root);
4463 mutex_unlock(&uuid_mutex);
4464
3802 btrfs_free_path(path); 4465 btrfs_free_path(path);
3803 return ret; 4466 return ret;
3804} 4467}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
186#define map_lookup_size(n) (sizeof(struct map_lookup) + \ 186#define map_lookup_size(n) (sizeof(struct map_lookup) + \
187 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
188 188
189/*
190 * Restriper's general type filter
191 */
192#define BTRFS_BALANCE_DATA (1ULL << 0)
193#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
194#define BTRFS_BALANCE_METADATA (1ULL << 2)
195
196#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
197 BTRFS_BALANCE_SYSTEM | \
198 BTRFS_BALANCE_METADATA)
199
200#define BTRFS_BALANCE_FORCE (1ULL << 3)
201#define BTRFS_BALANCE_RESUME (1ULL << 4)
202
203/*
204 * Balance filters
205 */
206#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
207#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
208#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
209#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
210#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
211
212/*
213 * Profile changing flags. When SOFT is set we won't relocate chunk if
214 * it already has the target profile (even though it may be
215 * half-filled).
216 */
217#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
218#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
219
220struct btrfs_balance_args;
221struct btrfs_balance_progress;
222struct btrfs_balance_control {
223 struct btrfs_fs_info *fs_info;
224
225 struct btrfs_balance_args data;
226 struct btrfs_balance_args meta;
227 struct btrfs_balance_args sys;
228
229 u64 flags;
230
231 struct btrfs_balance_progress stat;
232};
233
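The new bits combine a coarse type mask (which block-group classes a balance touches) with per-args filter flags. A tiny sketch of how a caller might fill in a control structure of this shape, e.g. "balance data chunks under 30% usage"; the bit layout copies the defines above, but the structs are userspace stand-ins, not btrfs_balance_control/btrfs_balance_args:

/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define BALANCE_DATA      (1ULL << 0)
#define BALANCE_SYSTEM    (1ULL << 1)
#define BALANCE_METADATA  (1ULL << 2)
#define ARGS_USAGE        (1ULL << 1)

struct args {                  /* stand-in for btrfs_balance_args */
	uint64_t flags;
	uint64_t usage;
};

struct control {               /* stand-in for btrfs_balance_control */
	uint64_t flags;
	struct args data, meta, sys;
};

int main(void)
{
	struct control bctl = {
		.flags = BALANCE_DATA,
		.data  = { .flags = ARGS_USAGE, .usage = 30 },
	};

	printf("touch data? %d  touch metadata? %d\n",
	       !!(bctl.flags & BALANCE_DATA),
	       !!(bctl.flags & BALANCE_METADATA));
	return 0;
}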
189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 234int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
190 u64 end, u64 *length); 235 u64 end, u64 *length);
191 236
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
228 u8 *uuid, u8 *fsid); 273 u8 *uuid, u8 *fsid);
229int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 274int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
230int btrfs_init_new_device(struct btrfs_root *root, char *path); 275int btrfs_init_new_device(struct btrfs_root *root, char *path);
231int btrfs_balance(struct btrfs_root *dev_root); 276int btrfs_balance(struct btrfs_balance_control *bctl,
277 struct btrfs_ioctl_balance_args *bargs);
278int btrfs_recover_balance(struct btrfs_root *tree_root);
279int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
280int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
232int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
233int find_free_dev_extent(struct btrfs_trans_handle *trans, 282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
234 struct btrfs_device *device, u64 num_bytes,
235 u64 *start, u64 *max_avail); 283 u64 *start, u64 *max_avail);
236#endif 284#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
200 ret = btrfs_update_inode(trans, root, inode); 200 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 201 BUG_ON(ret);
202out: 202out:
203 btrfs_end_transaction_throttle(trans, root); 203 btrfs_end_transaction(trans, root);
204 return ret; 204 return ret;
205} 205}
206 206
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 74fd74719dc2..618246bc2196 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -973,7 +973,7 @@ static int dentry_lease_is_valid(struct dentry *dentry)
973 973
974 spin_lock(&dentry->d_lock); 974 spin_lock(&dentry->d_lock);
975 di = ceph_dentry(dentry); 975 di = ceph_dentry(dentry);
976 if (di && di->lease_session) { 976 if (di->lease_session) {
977 s = di->lease_session; 977 s = di->lease_session;
978 spin_lock(&s->s_cap_lock); 978 spin_lock(&s->s_cap_lock);
979 gen = s->s_cap_gen; 979 gen = s->s_cap_gen;
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
1072 struct ceph_dentry_info *di = ceph_dentry(dentry); 1072 struct ceph_dentry_info *di = ceph_dentry(dentry);
1073 1073
1074 dout("d_release %p\n", dentry); 1074 dout("d_release %p\n", dentry);
1075 if (di) { 1075 ceph_dentry_lru_del(dentry);
1076 ceph_dentry_lru_del(dentry); 1076 if (di->lease_session)
1077 if (di->lease_session) 1077 ceph_put_mds_session(di->lease_session);
1078 ceph_put_mds_session(di->lease_session); 1078 kmem_cache_free(ceph_dentry_cachep, di);
1079 kmem_cache_free(ceph_dentry_cachep, di); 1079 dentry->d_fsdata = NULL;
1080 dentry->d_fsdata = NULL;
1081 }
1082} 1080}
1083 1081
1084static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1082static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1096 */ 1094 */
1097void ceph_dir_set_complete(struct inode *inode) 1095void ceph_dir_set_complete(struct inode *inode)
1098{ 1096{
1099 /* not yet implemented */ 1097 struct dentry *dentry = d_find_any_alias(inode);
1098
1099 if (dentry && ceph_dentry(dentry) &&
1100 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1101 dout(" marking %p (%p) complete\n", inode, dentry);
1102 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1103 }
1104 dput(dentry);
1100} 1105}
1101 1106
1102void ceph_dir_clear_complete(struct inode *inode) 1107void ceph_dir_clear_complete(struct inode *inode)
1103{ 1108{
1104 /* not yet implemented */ 1109 struct dentry *dentry = d_find_any_alias(inode);
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115 dput(dentry);
1105} 1116}
1106 1117
1107bool ceph_dir_test_complete(struct inode *inode) 1118bool ceph_dir_test_complete(struct inode *inode)
1108{ 1119{
1109 /* not yet implemented */ 1120 struct dentry *dentry = d_find_any_alias(inode);
1121
1122 if (dentry && ceph_dentry(dentry)) {
1123 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1124 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1125 }
1126 dput(dentry);
1110 return false; 1127 return false;
1111} 1128}
1112 1129
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1220 do { 1237 do {
1221 ceph_mdsc_get_request(req); 1238 ceph_mdsc_get_request(req);
1222 spin_unlock(&ci->i_unsafe_lock); 1239 spin_unlock(&ci->i_unsafe_lock);
1240
1223 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1241 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1224 inode, req->r_tid, last_tid); 1242 inode, req->r_tid, last_tid);
1225 if (req->r_timeout) { 1243 if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1232 } else { 1250 } else {
1233 wait_for_completion(&req->r_safe_completion); 1251 wait_for_completion(&req->r_safe_completion);
1234 } 1252 }
1235 spin_lock(&ci->i_unsafe_lock);
1236 ceph_mdsc_put_request(req); 1253 ceph_mdsc_put_request(req);
1237 1254
1255 spin_lock(&ci->i_unsafe_lock);
1238 if (ret || list_empty(head)) 1256 if (ret || list_empty(head))
1239 break; 1257 break;
1240 req = list_entry(head->next, 1258 req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
1259 1277
1260 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1278 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1261 dn->d_name.len, dn->d_name.name); 1279 dn->d_name.len, dn->d_name.name);
1262 if (di) { 1280 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1263 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1281 spin_lock(&mdsc->dentry_lru_lock);
1264 spin_lock(&mdsc->dentry_lru_lock); 1282 list_add_tail(&di->lru, &mdsc->dentry_lru);
1265 list_add_tail(&di->lru, &mdsc->dentry_lru); 1283 mdsc->num_dentry++;
1266 mdsc->num_dentry++; 1284 spin_unlock(&mdsc->dentry_lru_lock);
1267 spin_unlock(&mdsc->dentry_lru_lock);
1268 }
1269} 1285}
1270 1286
1271void ceph_dentry_lru_touch(struct dentry *dn) 1287void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1275 1291
1276 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1292 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1277 dn->d_name.len, dn->d_name.name, di->offset); 1293 dn->d_name.len, dn->d_name.name, di->offset);
1278 if (di) { 1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1279 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1295 spin_lock(&mdsc->dentry_lru_lock);
1280 spin_lock(&mdsc->dentry_lru_lock); 1296 list_move_tail(&di->lru, &mdsc->dentry_lru);
1281 list_move_tail(&di->lru, &mdsc->dentry_lru); 1297 spin_unlock(&mdsc->dentry_lru_lock);
1282 spin_unlock(&mdsc->dentry_lru_lock);
1283 }
1284} 1298}
1285 1299
1286void ceph_dentry_lru_del(struct dentry *dn) 1300void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
1290 1304
1291 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1305 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1292 dn->d_name.len, dn->d_name.name); 1306 dn->d_name.len, dn->d_name.name);
1293 if (di) { 1307 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1308 spin_lock(&mdsc->dentry_lru_lock);
1295 spin_lock(&mdsc->dentry_lru_lock); 1309 list_del_init(&di->lru);
1296 list_del_init(&di->lru); 1310 mdsc->num_dentry--;
1297 mdsc->num_dentry--; 1311 spin_unlock(&mdsc->dentry_lru_lock);
1298 spin_unlock(&mdsc->dentry_lru_lock);
1299 }
1300} 1312}
1301 1313
1302/* 1314/*
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaaccd..fbb2a643ef10 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
56 return -EINVAL; 56 return -EINVAL;
57 57
58 spin_lock(&dentry->d_lock); 58 spin_lock(&dentry->d_lock);
59 parent = dget(dentry->d_parent); 59 parent = dentry->d_parent;
60 spin_unlock(&dentry->d_lock);
61
62 if (*max_len >= connected_handle_length) { 60 if (*max_len >= connected_handle_length) {
63 dout("encode_fh %p connectable\n", dentry); 61 dout("encode_fh %p connectable\n", dentry);
64 cfh->ino = ceph_ino(dentry->d_inode); 62 cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
81 *max_len = handle_length; 79 *max_len = handle_length;
82 type = 255; 80 type = 255;
83 } 81 }
84 dput(parent); 82 spin_unlock(&dentry->d_lock);
85 return type; 83 return type;
86} 84}
87 85
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 25283e7a37f8..2c489378b4cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
850{ 850{
851 struct dentry *dir = dn->d_parent; 851 struct dentry *dir = dn->d_parent;
852 struct inode *inode = dir->d_inode; 852 struct inode *inode = dir->d_inode;
853 struct ceph_inode_info *ci = ceph_inode(inode); 853 struct ceph_inode_info *ci;
854 struct ceph_dentry_info *di; 854 struct ceph_dentry_info *di;
855 855
856 BUG_ON(!inode); 856 BUG_ON(!inode);
857 857
858 ci = ceph_inode(inode);
858 di = ceph_dentry(dn); 859 di = ceph_dentry(dn);
859 860
860 spin_lock(&ci->i_ceph_lock); 861 spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..23ab6a3f1825 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2772,7 +2772,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2772 di = ceph_dentry(dentry); 2772 di = ceph_dentry(dentry);
2773 switch (h->action) { 2773 switch (h->action) {
2774 case CEPH_MDS_LEASE_REVOKE: 2774 case CEPH_MDS_LEASE_REVOKE:
2775 if (di && di->lease_session == session) { 2775 if (di->lease_session == session) {
2776 if (ceph_seq_cmp(di->lease_seq, seq) > 0) 2776 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2777 h->seq = cpu_to_le32(di->lease_seq); 2777 h->seq = cpu_to_le32(di->lease_seq);
2778 __ceph_mdsc_drop_dentry_lease(dentry); 2778 __ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2781,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2781 break; 2781 break;
2782 2782
2783 case CEPH_MDS_LEASE_RENEW: 2783 case CEPH_MDS_LEASE_RENEW:
2784 if (di && di->lease_session == session && 2784 if (di->lease_session == session &&
2785 di->lease_gen == session->s_cap_gen && 2785 di->lease_gen == session->s_cap_gen &&
2786 di->lease_renew_from && 2786 di->lease_renew_from &&
2787 di->lease_renew_after == 0) { 2787 di->lease_renew_after == 0) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 48f61a12af66..00de2c9568cd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,8 @@ enum {
131 Opt_rbytes, 131 Opt_rbytes,
132 Opt_norbytes, 132 Opt_norbytes,
133 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
134 Opt_dcache,
135 Opt_nodcache,
134 Opt_ino32, 136 Opt_ino32,
135}; 137};
136 138
@@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = {
152 {Opt_rbytes, "rbytes"}, 154 {Opt_rbytes, "rbytes"},
153 {Opt_norbytes, "norbytes"}, 155 {Opt_norbytes, "norbytes"},
154 {Opt_noasyncreaddir, "noasyncreaddir"}, 156 {Opt_noasyncreaddir, "noasyncreaddir"},
157 {Opt_dcache, "dcache"},
158 {Opt_nodcache, "nodcache"},
155 {Opt_ino32, "ino32"}, 159 {Opt_ino32, "ino32"},
156 {-1, NULL} 160 {-1, NULL}
157}; 161};
@@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private)
231 case Opt_noasyncreaddir: 235 case Opt_noasyncreaddir:
232 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; 236 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
233 break; 237 break;
238 case Opt_dcache:
239 fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
240 break;
241 case Opt_nodcache:
242 fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
243 break;
234 case Opt_ino32: 244 case Opt_ino32:
235 fsopt->flags |= CEPH_MOUNT_OPT_INO32; 245 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
236 break; 246 break;
@@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
377 seq_puts(m, ",norbytes"); 387 seq_puts(m, ",norbytes");
378 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 388 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
379 seq_puts(m, ",noasyncreaddir"); 389 seq_puts(m, ",noasyncreaddir");
390 if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
391 seq_puts(m, ",dcache");
392 else
393 seq_puts(m, ",nodcache");
380 394
381 if (fsopt->wsize) 395 if (fsopt->wsize)
382 seq_printf(m, ",wsize=%d", fsopt->wsize); 396 seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -647,10 +661,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
647 root = ERR_PTR(-ENOMEM); 661 root = ERR_PTR(-ENOMEM);
648 goto out; 662 goto out;
649 } 663 }
650 ceph_init_dentry(root);
651 } else { 664 } else {
652 root = d_obtain_alias(inode); 665 root = d_obtain_alias(inode);
653 } 666 }
667 ceph_init_dentry(root);
654 dout("open_root_inode success, root dentry is %p\n", root); 668 dout("open_root_inode success, root dentry is %p\n", root);
655 } else { 669 } else {
656 root = ERR_PTR(err); 670 root = ERR_PTR(err);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cb3652b37271..1421f3d875a2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ 30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
31#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
31 32
32#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 33#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
33 34
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..857214ae8c08 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -818,6 +818,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
818 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 818 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
819 int issued; 819 int issued;
820 int err; 820 int err;
821 int required_blob_size;
821 int dirty; 822 int dirty;
822 823
823 if (ceph_snap(inode) != CEPH_NOSNAP) 824 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -833,14 +834,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
833 return -EOPNOTSUPP; 834 return -EOPNOTSUPP;
834 } 835 }
835 836
837 err = -ENOMEM;
836 spin_lock(&ci->i_ceph_lock); 838 spin_lock(&ci->i_ceph_lock);
837 __build_xattrs(inode); 839 __build_xattrs(inode);
840retry:
838 issued = __ceph_caps_issued(ci, NULL); 841 issued = __ceph_caps_issued(ci, NULL);
839 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); 842 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
840 843
841 if (!(issued & CEPH_CAP_XATTR_EXCL)) 844 if (!(issued & CEPH_CAP_XATTR_EXCL))
842 goto do_sync; 845 goto do_sync;
843 846
847 required_blob_size = __get_required_blob_size(ci, 0, 0);
848
849 if (!ci->i_xattrs.prealloc_blob ||
850 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
851 struct ceph_buffer *blob;
852
853 spin_unlock(&ci->i_ceph_lock);
854 dout(" preaallocating new blob size=%d\n", required_blob_size);
855 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
856 if (!blob)
857 goto out;
858 spin_lock(&ci->i_ceph_lock);
859 if (ci->i_xattrs.prealloc_blob)
860 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
861 ci->i_xattrs.prealloc_blob = blob;
862 goto retry;
863 }
864
844 err = __remove_xattr_by_name(ceph_inode(inode), name); 865 err = __remove_xattr_by_name(ceph_inode(inode), name);
845 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 866 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
846 ci->i_xattrs.dirty = true; 867 ci->i_xattrs.dirty = true;
@@ -853,6 +874,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
853do_sync: 874do_sync:
854 spin_unlock(&ci->i_ceph_lock); 875 spin_unlock(&ci->i_ceph_lock);
855 err = ceph_send_removexattr(dentry, name); 876 err = ceph_send_removexattr(dentry, name);
877out:
856 return err; 878 return err;
857} 879}
858 880
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a10e428b32b4..a26bea10e81b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -105,6 +105,7 @@
105 105
106#include <linux/hiddev.h> 106#include <linux/hiddev.h>
107 107
108#define __DVB_CORE__
108#include <linux/dvb/audio.h> 109#include <linux/dvb/audio.h>
109#include <linux/dvb/dmx.h> 110#include <linux/dvb/dmx.h>
110#include <linux/dvb/frontend.h> 111#include <linux/dvb/frontend.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 616fedff011a..16a53cc2cc02 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1475,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
1475 return alias; 1475 return alias;
1476} 1476}
1477 1477
1478static struct dentry * d_find_any_alias(struct inode *inode) 1478/**
1479 * d_find_any_alias - find any alias for a given inode
1480 * @inode: inode to find an alias for
1481 *
1482 * If any aliases exist for the given inode, take and return a
1483 * reference for one of them. If no aliases exist, return %NULL.
1484 */
1485struct dentry *d_find_any_alias(struct inode *inode)
1479{ 1486{
1480 struct dentry *de; 1487 struct dentry *de;
1481 1488
@@ -1484,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
1484 spin_unlock(&inode->i_lock); 1491 spin_unlock(&inode->i_lock);
1485 return de; 1492 return de;
1486} 1493}
1487 1494EXPORT_SYMBOL(d_find_any_alias);
1488 1495
1489/** 1496/**
1490 * d_obtain_alias - find or allocate a dentry for a given inode 1497 * d_obtain_alias - find or allocate a dentry for a given inode
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/atomic.h> 38#include <linux/atomic.h>
39#include <linux/prefetch.h>
39 40
40/* 41/*
41 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
580{ 581{
581 int ret; 582 int ret;
582 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 583 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
584 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
583 unsigned long fs_count; /* Number of filesystem-sized blocks */ 585 unsigned long fs_count; /* Number of filesystem-sized blocks */
584 unsigned long dio_count;/* Number of dio_block-sized blocks */
585 unsigned long blkmask;
586 int create; 586 int create;
587 587
588 /* 588 /*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
593 if (ret == 0) { 593 if (ret == 0) {
594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
595 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 595 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
596 dio_count = sdio->final_block_in_request - sdio->block_in_file; 596 fs_endblk = (sdio->final_block_in_request - 1) >>
597 fs_count = dio_count >> sdio->blkfactor; 597 sdio->blkfactor;
598 blkmask = (1 << sdio->blkfactor) - 1; 598 fs_count = fs_endblk - fs_startblk + 1;
599 if (dio_count & blkmask)
600 fs_count++;
601 599
602 map_bh->b_state = 0; 600 map_bh->b_state = 0;
603 map_bh->b_size = fs_count << dio->inode->i_blkbits; 601 map_bh->b_size = fs_count << dio->inode->i_blkbits;
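The replacement above derives the filesystem-block span from the first and last dio blocks instead of rounding a block count, which also fixes an undercount when the request straddles an fs-block boundary without being a whole multiple of one. A quick standalone check of the arithmetic (blkfactor is log2 of fs blocksize over dio blocksize):

/* illustrative sketch only */
#include <stdio.h>

static unsigned long fs_block_count(unsigned long block_in_file,
				    unsigned long final_block_in_request,
				    unsigned blkfactor)
{
	unsigned long fs_startblk = block_in_file >> blkfactor;
	unsigned long fs_endblk = (final_block_in_request - 1) >> blkfactor;

	return fs_endblk - fs_startblk + 1;
}

int main(void)
{
	/* 512-byte dio blocks on a 4096-byte filesystem: blkfactor == 3 */
	printf("%lu\n", fs_block_count(5, 21, 3));   /* dio blocks 5..20 span fs blocks 0..2 */
	return 0;
}

With these numbers the old dio_count >> blkfactor formula would have returned 2, since 16 dio blocks happen to be a multiple of 8 even though the range touches three filesystem blocks.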
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
1090 * individual fields and will generate much worse code. This is important 1088 * individual fields and will generate much worse code. This is important
1091 * for the whole file. 1089 * for the whole file.
1092 */ 1090 */
1093ssize_t 1091static inline ssize_t
1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1092do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1095 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1093 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1096 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1094 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1097 dio_submit_t submit_io, int flags) 1095 dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1100 size_t size; 1098 size_t size;
1101 unsigned long addr; 1099 unsigned long addr;
1102 unsigned blkbits = inode->i_blkbits; 1100 unsigned blkbits = inode->i_blkbits;
1103 unsigned bdev_blkbits = 0;
1104 unsigned blocksize_mask = (1 << blkbits) - 1; 1101 unsigned blocksize_mask = (1 << blkbits) - 1;
1105 ssize_t retval = -EINVAL; 1102 ssize_t retval = -EINVAL;
1106 loff_t end = offset; 1103 loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1113 if (rw & WRITE) 1110 if (rw & WRITE)
1114 rw = WRITE_ODIRECT; 1111 rw = WRITE_ODIRECT;
1115 1112
1116 if (bdev) 1113 /*
1117 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1114 * Avoid references to bdev if not absolutely needed to give
1115 * the early prefetch in the caller enough time.
1116 */
1118 1117
1119 if (offset & blocksize_mask) { 1118 if (offset & blocksize_mask) {
1120 if (bdev) 1119 if (bdev)
1121 blkbits = bdev_blkbits; 1120 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1122 blocksize_mask = (1 << blkbits) - 1; 1121 blocksize_mask = (1 << blkbits) - 1;
1123 if (offset & blocksize_mask) 1122 if (offset & blocksize_mask)
1124 goto out; 1123 goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1129 addr = (unsigned long)iov[seg].iov_base; 1128 addr = (unsigned long)iov[seg].iov_base;
1130 size = iov[seg].iov_len; 1129 size = iov[seg].iov_len;
1131 end += size; 1130 end += size;
1132 if ((addr & blocksize_mask) || (size & blocksize_mask)) { 1131 if (unlikely((addr & blocksize_mask) ||
1132 (size & blocksize_mask))) {
1133 if (bdev) 1133 if (bdev)
1134 blkbits = bdev_blkbits; 1134 blkbits = blksize_bits(
1135 bdev_logical_block_size(bdev));
1135 blocksize_mask = (1 << blkbits) - 1; 1136 blocksize_mask = (1 << blkbits) - 1;
1136 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1137 if ((addr & blocksize_mask) || (size & blocksize_mask))
1137 goto out; 1138 goto out;
1138 } 1139 }
1139 } 1140 }
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1316out: 1317out:
1317 return retval; 1318 return retval;
1318} 1319}
1320
1321ssize_t
1322__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1323 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1324 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1325 dio_submit_t submit_io, int flags)
1326{
1327 /*
1328 * The block device state is needed in the end to finally
1329 * submit everything. Since it's likely to be cache cold
1330 * prefetch it here as first thing to hide some of the
1331 * latency.
1332 *
1333 * Attempt to prefetch the pieces we likely need later.
1334 */
1335 prefetch(&bdev->bd_disk->part_tbl);
1336 prefetch(bdev->bd_queue);
1337 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1338
1339 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1340 nr_segs, get_block, end_io,
1341 submit_io, flags);
1342}
1343
1319EXPORT_SYMBOL(__blockdev_direct_IO); 1344EXPORT_SYMBOL(__blockdev_direct_IO);
1320 1345
1321static __init int dio_init(void) 1346static __init int dio_init(void)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
197 197
198 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
199 struct user_struct *user; 199 struct user_struct *user;
200
201 struct file *file;
202
203 /* used to optimize loop detection check */
204 int visited;
205 struct list_head visited_list_link;
200}; 206};
201 207
202/* Wait structure used by the poll hooks */ 208/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
255/* Slab cache used to allocate "struct eppoll_entry" */ 261/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly; 262static struct kmem_cache *pwq_cache __read_mostly;
257 263
264/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
265static LIST_HEAD(visited_list);
266
267/*
268 * List of files with newly added links, where we may need to limit the number
269 * of emanating paths. Protected by the epmutex.
270 */
271static LIST_HEAD(tfile_check_list);
272
258#ifdef CONFIG_SYSCTL 273#ifdef CONFIG_SYSCTL
259 274
260#include <linux/sysctl.h> 275#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
276}; 291};
277#endif /* CONFIG_SYSCTL */ 292#endif /* CONFIG_SYSCTL */
278 293
294static const struct file_operations eventpoll_fops;
295
296static inline int is_file_epoll(struct file *f)
297{
298 return f->f_op == &eventpoll_fops;
299}
279 300
280/* Setup the structure that is used as key for the RB tree */ 301/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd, 302static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
711 .llseek = noop_llseek, 732 .llseek = noop_llseek,
712}; 733};
713 734
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720/* 735/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll 736 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are 737 * interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
926 rb_insert_color(&epi->rbn, &ep->rbr); 941 rb_insert_color(&epi->rbn, &ep->rbr);
927} 942}
928 943
944
945
946#define PATH_ARR_SIZE 5
947/*
948 * These are the number paths of length 1 to 5, that we are allowing to emanate
949 * from a single file of interest. For example, we allow 1000 paths of length
950 * 1, to emanate from each file of interest. This essentially represents the
951 * potential wakeup paths, which need to be limited in order to avoid massive
952 * uncontrolled wakeup storms. The common use case should be a single ep which
953 * is connected to n file sources. In this case each file source has 1 path
954 * of length 1. Thus, the numbers below should be more than sufficient. These
955 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
956 * and delete can't add additional paths. Protected by the epmutex.
957 */
958static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
959static int path_count[PATH_ARR_SIZE];
960
961static int path_count_inc(int nests)
962{
963 if (++path_count[nests] > path_limits[nests])
964 return -1;
965 return 0;
966}
967
968static void path_count_init(void)
969{
970 int i;
971
972 for (i = 0; i < PATH_ARR_SIZE; i++)
973 path_count[i] = 0;
974}
975
976static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
977{
978 int error = 0;
979 struct file *file = priv;
980 struct file *child_file;
981 struct epitem *epi;
982
983 list_for_each_entry(epi, &file->f_ep_links, fllink) {
984 child_file = epi->ep->file;
985 if (is_file_epoll(child_file)) {
986 if (list_empty(&child_file->f_ep_links)) {
987 if (path_count_inc(call_nests)) {
988 error = -1;
989 break;
990 }
991 } else {
992 error = ep_call_nested(&poll_loop_ncalls,
993 EP_MAX_NESTS,
994 reverse_path_check_proc,
995 child_file, child_file,
996 current);
997 }
998 if (error != 0)
999 break;
1000 } else {
1001 printk(KERN_ERR "reverse_path_check_proc: "
1002 "file is not an ep!\n");
1003 }
1004 }
1005 return error;
1006}
1007
1008/**
 1009 * reverse_path_check - The tfile_check_list is a list of file *, which have
1010 * links that are proposed to be newly added. We need to
1011 * make sure that those added links don't add too many
1012 * paths such that we will spend all our time waking up
1013 * eventpoll objects.
1014 *
 1015 * Returns: Zero if the proposed links don't create too many paths,
1016 * -1 otherwise.
1017 */
1018static int reverse_path_check(void)
1019{
1020 int length = 0;
1021 int error = 0;
1022 struct file *current_file;
1023
1024 /* let's call this for all tfiles */
1025 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1026 length++;
1027 path_count_init();
1028 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1029 reverse_path_check_proc, current_file,
1030 current_file, current);
1031 if (error)
1032 break;
1033 }
1034 return error;
1035}
1036
929/* 1037/*
930 * Must be called with "mtx" held. 1038 * Must be called with "mtx" held.
931 */ 1039 */
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
987 */ 1095 */
988 ep_rbtree_insert(ep, epi); 1096 ep_rbtree_insert(ep, epi);
989 1097
1098 /* now check if we've created too many backpaths */
1099 error = -EINVAL;
1100 if (reverse_path_check())
1101 goto error_remove_epi;
1102
990 /* We have to drop the new item inside our item list to keep track of it */ 1103 /* We have to drop the new item inside our item list to keep track of it */
991 spin_lock_irqsave(&ep->lock, flags); 1104 spin_lock_irqsave(&ep->lock, flags);
992 1105
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 1124
1012 return 0; 1125 return 0;
1013 1126
1127error_remove_epi:
1128 spin_lock(&tfile->f_lock);
1129 if (ep_is_linked(&epi->fllink))
1130 list_del_init(&epi->fllink);
1131 spin_unlock(&tfile->f_lock);
1132
1133 rb_erase(&epi->rbn, &ep->rbr);
1134
1014error_unregister: 1135error_unregister:
1015 ep_unregister_pollwait(ep, epi); 1136 ep_unregister_pollwait(ep, epi);
1016 1137
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1275 int error = 0; 1396 int error = 0;
1276 struct file *file = priv; 1397 struct file *file = priv;
1277 struct eventpoll *ep = file->private_data; 1398 struct eventpoll *ep = file->private_data;
1399 struct eventpoll *ep_tovisit;
1278 struct rb_node *rbp; 1400 struct rb_node *rbp;
1279 struct epitem *epi; 1401 struct epitem *epi;
1280 1402
1281 mutex_lock_nested(&ep->mtx, call_nests + 1); 1403 mutex_lock_nested(&ep->mtx, call_nests + 1);
1404 ep->visited = 1;
1405 list_add(&ep->visited_list_link, &visited_list);
1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1406 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283 epi = rb_entry(rbp, struct epitem, rbn); 1407 epi = rb_entry(rbp, struct epitem, rbn);
1284 if (unlikely(is_file_epoll(epi->ffd.file))) { 1408 if (unlikely(is_file_epoll(epi->ffd.file))) {
1409 ep_tovisit = epi->ffd.file->private_data;
1410 if (ep_tovisit->visited)
1411 continue;
1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1412 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286 ep_loop_check_proc, epi->ffd.file, 1413 ep_loop_check_proc, epi->ffd.file,
1287 epi->ffd.file->private_data, current); 1414 ep_tovisit, current);
1288 if (error != 0) 1415 if (error != 0)
1289 break; 1416 break;
1417 } else {
1418 /*
1419 * If we've reached a file that is not associated with
1420 * an ep, then we need to check if the newly added
1421 * links are going to add too many wakeup paths. We do
1422 * this by adding it to the tfile_check_list, if it's
1423 * not already there, and calling reverse_path_check()
1424 * during ep_insert().
1425 */
1426 if (list_empty(&epi->ffd.file->f_tfile_llink))
1427 list_add(&epi->ffd.file->f_tfile_llink,
1428 &tfile_check_list);
1290 } 1429 }
1291 } 1430 }
1292 mutex_unlock(&ep->mtx); 1431 mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1307 */ 1446 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file) 1447static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{ 1448{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1449 int ret;
1450 struct eventpoll *ep_cur, *ep_next;
1451
1452 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current); 1453 ep_loop_check_proc, file, ep, current);
1454 /* clear visited list */
1455 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1456 visited_list_link) {
1457 ep_cur->visited = 0;
1458 list_del(&ep_cur->visited_list_link);
1459 }
1460 return ret;
1461}
1462
1463static void clear_tfile_check_list(void)
1464{
1465 struct file *file;
1466
1467 /* first clear the tfile_check_list */
1468 while (!list_empty(&tfile_check_list)) {
1469 file = list_first_entry(&tfile_check_list, struct file,
1470 f_tfile_llink);
1471 list_del_init(&file->f_tfile_llink);
1472 }
1473 INIT_LIST_HEAD(&tfile_check_list);
1312} 1474}
1313 1475
1314/* 1476/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
1316 */ 1478 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags) 1479SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{ 1480{
1319 int error; 1481 int error, fd;
1320 struct eventpoll *ep = NULL; 1482 struct eventpoll *ep = NULL;
1483 struct file *file;
1321 1484
1322 /* Check the EPOLL_* constant for consistency. */ 1485 /* Check the EPOLL_* constant for consistency. */
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1486 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1334 * Creates all the items needed to setup an eventpoll file. That is, 1497 * Creates all the items needed to setup an eventpoll file. That is,
1335 * a file structure and a free file descriptor. 1498 * a file structure and a free file descriptor.
1336 */ 1499 */
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1500 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1501 if (fd < 0) {
1502 error = fd;
1503 goto out_free_ep;
1504 }
1505 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC)); 1506 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0) 1507 if (IS_ERR(file)) {
1340 ep_free(ep); 1508 error = PTR_ERR(file);
1341 1509 goto out_free_fd;
1510 }
1511 fd_install(fd, file);
1512 ep->file = file;
1513 return fd;
1514
1515out_free_fd:
1516 put_unused_fd(fd);
1517out_free_ep:
1518 ep_free(ep);
1342 return error; 1519 return error;
1343} 1520}
1344 1521
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1404 /* 1581 /*
1405 * When we insert an epoll file descriptor, inside another epoll file 1582 * When we insert an epoll file descriptor, inside another epoll file
1406 * descriptor, there is the chance of creating closed loops, which are 1583 * descriptor, there is the chance of creating closed loops, which are
1407 * better handled here than in more critical paths. 1584 * better handled here than in more critical paths. While we are
1585 * checking for loops we also determine the list of files reachable
1586 * and hang them on the tfile_check_list, so we can check that we
1587 * haven't created too many possible wakeup paths.
1408 * 1588 *
1409 * We hold epmutex across the loop check and the insert in this case, in 1589 * We need to hold the epmutex across both ep_insert and ep_remove
1410 * order to prevent two separate inserts from racing and each doing the 1590 * because we want to make sure we are looking at a coherent view of
1411 * insert "at the same time" such that ep_loop_check passes on both 1591 * the epoll network.
1412 * before either one does the insert, thereby creating a cycle.
1413 */ 1592 */
1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { 1593 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1415 mutex_lock(&epmutex); 1594 mutex_lock(&epmutex);
1416 did_lock_epmutex = 1; 1595 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1420 } 1596 }
1421 1597 if (op == EPOLL_CTL_ADD) {
1598 if (is_file_epoll(tfile)) {
1599 error = -ELOOP;
1600 if (ep_loop_check(ep, tfile) != 0)
1601 goto error_tgt_fput;
1602 } else
1603 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1604 }
1422 1605
1423 mutex_lock_nested(&ep->mtx, 0); 1606 mutex_lock_nested(&ep->mtx, 0);
1424 1607
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1437 error = ep_insert(ep, &epds, tfile, fd); 1620 error = ep_insert(ep, &epds, tfile, fd);
1438 } else 1621 } else
1439 error = -EEXIST; 1622 error = -EEXIST;
1623 clear_tfile_check_list();
1440 break; 1624 break;
1441 case EPOLL_CTL_DEL: 1625 case EPOLL_CTL_DEL:
1442 if (epi) 1626 if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1455 mutex_unlock(&ep->mtx); 1639 mutex_unlock(&ep->mtx);
1456 1640
1457error_tgt_fput: 1641error_tgt_fput:
1458 if (unlikely(did_lock_epmutex)) 1642 if (did_lock_epmutex)
1459 mutex_unlock(&epmutex); 1643 mutex_unlock(&epmutex);
1460 1644
1461 fput(tfile); 1645 fput(tfile);
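For illustration only (not part of the patch): a minimal user-space program showing the behaviour the loop check above enforces. Nesting one epoll descriptor inside another is allowed, but closing the cycle makes the second EPOLL_CTL_ADD fail with ELOOP.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int ep1 = epoll_create1(0);
	int ep2 = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	/* ep1 watches ep2: legal single-level nesting */
	ev.data.fd = ep2;
	if (epoll_ctl(ep1, EPOLL_CTL_ADD, ep2, &ev) == -1)
		perror("add ep2 to ep1");

	/* ep2 watches ep1: this closes a loop, so the kernel rejects it */
	ev.data.fd = ep1;
	if (epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &ev) == -1)
		printf("second add failed as expected: %s\n", strerror(errno));

	close(ep1);
	close(ep2);
	return 0;
}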
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13d..5f3368ab0fa9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1378 down_read(&fc->killsb); 1378 down_read(&fc->killsb);
1379 err = -ENOENT; 1379 err = -ENOENT;
1380 if (fc->sb) 1380 if (fc->sb)
1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); 1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
1382 up_read(&fc->killsb);
1383 kfree(buf);
1384 return err;
1385
1386err:
1387 kfree(buf);
1388 fuse_copy_finish(cs);
1389 return err;
1390}
1391
1392static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
1393 struct fuse_copy_state *cs)
1394{
1395 struct fuse_notify_delete_out outarg;
1396 int err = -ENOMEM;
1397 char *buf;
1398 struct qstr name;
1399
1400 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
1401 if (!buf)
1402 goto err;
1403
1404 err = -EINVAL;
1405 if (size < sizeof(outarg))
1406 goto err;
1407
1408 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1409 if (err)
1410 goto err;
1411
1412 err = -ENAMETOOLONG;
1413 if (outarg.namelen > FUSE_NAME_MAX)
1414 goto err;
1415
1416 err = -EINVAL;
1417 if (size != sizeof(outarg) + outarg.namelen + 1)
1418 goto err;
1419
1420 name.name = buf;
1421 name.len = outarg.namelen;
1422 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
1423 if (err)
1424 goto err;
1425 fuse_copy_finish(cs);
1426 buf[outarg.namelen] = 0;
1427 name.hash = full_name_hash(name.name, name.len);
1428
1429 down_read(&fc->killsb);
1430 err = -ENOENT;
1431 if (fc->sb)
1432 err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
1433 outarg.child, &name);
1382 up_read(&fc->killsb); 1434 up_read(&fc->killsb);
1383 kfree(buf); 1435 kfree(buf);
1384 return err; 1436 return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1597 case FUSE_NOTIFY_RETRIEVE: 1649 case FUSE_NOTIFY_RETRIEVE:
1598 return fuse_notify_retrieve(fc, size, cs); 1650 return fuse_notify_retrieve(fc, size, cs);
1599 1651
1652 case FUSE_NOTIFY_DELETE:
1653 return fuse_notify_delete(fc, size, cs);
1654
1600 default: 1655 default:
1601 fuse_copy_finish(cs); 1656 fuse_copy_finish(cs);
1602 return -EINVAL; 1657 return -EINVAL;
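For illustration only (not part of the patch): a hedged sketch of how a user-space FUSE daemon could emit the new FUSE_NOTIFY_DELETE message that fuse_notify_delete() above parses. A notification is written to the fuse device with unique set to 0 and the notify code carried in the error field of the out header, followed by the fuse_notify_delete_out payload and the NUL-terminated name; libfuse 2.9 and later wrap this as fuse_lowlevel_notify_delete().

#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

/* fuse_fd is the daemon's open /dev/fuse descriptor */
static int notify_delete(int fuse_fd, uint64_t parent, uint64_t child,
			 const char *name)
{
	struct fuse_notify_delete_out arg = {
		.parent  = parent,
		.child   = child,
		.namelen = (uint32_t)strlen(name),
	};
	struct fuse_out_header hdr = {
		.unique = 0,			/* 0 marks a notification */
		.error  = FUSE_NOTIFY_DELETE,	/* notify code, not an errno */
	};
	struct iovec iov[3] = {
		{ &hdr, sizeof(hdr) },
		{ &arg, sizeof(arg) },
		{ (void *)name, arg.namelen + 1 },	/* include the NUL */
	};

	hdr.len = sizeof(hdr) + sizeof(arg) + arg.namelen + 1;
	return writev(fuse_fd, iov, 3) < 0 ? -1 : 0;
}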
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5ddd6ea8f839..206632887bb4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
868} 868}
869 869
870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
871 struct qstr *name) 871 u64 child_nodeid, struct qstr *name)
872{ 872{
873 int err = -ENOTDIR; 873 int err = -ENOTDIR;
874 struct inode *parent; 874 struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
895 895
896 fuse_invalidate_attr(parent); 896 fuse_invalidate_attr(parent);
897 fuse_invalidate_entry(entry); 897 fuse_invalidate_entry(entry);
898
899 if (child_nodeid != 0 && entry->d_inode) {
900 mutex_lock(&entry->d_inode->i_mutex);
901 if (get_node_id(entry->d_inode) != child_nodeid) {
902 err = -ENOENT;
903 goto badentry;
904 }
905 if (d_mountpoint(entry)) {
906 err = -EBUSY;
907 goto badentry;
908 }
909 if (S_ISDIR(entry->d_inode->i_mode)) {
910 shrink_dcache_parent(entry);
911 if (!simple_empty(entry)) {
912 err = -ENOTEMPTY;
913 goto badentry;
914 }
915 entry->d_inode->i_flags |= S_DEAD;
916 }
917 dont_mount(entry);
918 clear_nlink(entry->d_inode);
919 err = 0;
920 badentry:
921 mutex_unlock(&entry->d_inode->i_mutex);
922 if (!err)
923 d_delete(entry);
924 } else {
925 err = 0;
926 }
898 dput(entry); 927 dput(entry);
899 err = 0;
900 928
901 unlock: 929 unlock:
902 mutex_unlock(&parent->i_mutex); 930 mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
1182 return fuse_fsync_common(file, start, end, datasync, 1); 1210 return fuse_fsync_common(file, start, end, datasync, 1);
1183} 1211}
1184 1212
1213static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
1214 unsigned long arg)
1215{
1216 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1217
1218 /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
1219 if (fc->minor < 18)
1220 return -ENOTTY;
1221
1222 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
1223}
1224
1225static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1226 unsigned long arg)
1227{
1228 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1229
1230 if (fc->minor < 18)
1231 return -ENOTTY;
1232
1233 return fuse_ioctl_common(file, cmd, arg,
1234 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1235}
1236
1185static bool update_mtime(unsigned ivalid) 1237static bool update_mtime(unsigned ivalid)
1186{ 1238{
1187 /* Always update if mtime is explicitly set */ 1239 /* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
1596 .open = fuse_dir_open, 1648 .open = fuse_dir_open,
1597 .release = fuse_dir_release, 1649 .release = fuse_dir_release,
1598 .fsync = fuse_dir_fsync, 1650 .fsync = fuse_dir_fsync,
1651 .unlocked_ioctl = fuse_dir_ioctl,
1652 .compat_ioctl = fuse_dir_compat_ioctl,
1599}; 1653};
1600 1654
1601static const struct inode_operations fuse_common_inode_operations = { 1655static const struct inode_operations fuse_common_inode_operations = {
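For illustration only (not part of the patch): with .unlocked_ioctl and .compat_ioctl now wired up for directories, an ioctl issued on a directory of a FUSE mount is forwarded to the server (tagged FUSE_IOCTL_DIR) when the connection speaks protocol 7.18 or later; older servers still see ENOTTY. A small user-space sketch, assuming a hypothetical FUSE filesystem mounted at /mnt/fuse whose server chooses to handle FS_IOC_GETFLAGS:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int flags;
	int fd = open("/mnt/fuse", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Forwarded to the FUSE server on protocol >= 7.18; a server that
	 * does not implement it simply returns its own error. */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == -1)
		perror("FS_IOC_GETFLAGS");
	else
		printf("flags: %#x\n", (unsigned)flags);

	close(fd);
	return 0;
}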
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd44..4a199fd93fbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1555 loff_t retval; 1555 loff_t retval;
1556 struct inode *inode = file->f_path.dentry->d_inode; 1556 struct inode *inode = file->f_path.dentry->d_inode;
1557 1557
1558 mutex_lock(&inode->i_mutex); 1558 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1559 if (origin != SEEK_CUR && origin != SEEK_SET) { 1559 if (origin == SEEK_CUR || origin == SEEK_SET)
1560 retval = fuse_update_attributes(inode, NULL, file, NULL); 1560 return generic_file_llseek(file, offset, origin);
1561 if (retval)
1562 goto exit;
1563 }
1564 1561
1565 switch (origin) { 1562 mutex_lock(&inode->i_mutex);
1566 case SEEK_END: 1563 retval = fuse_update_attributes(inode, NULL, file, NULL);
1567 offset += i_size_read(inode); 1564 if (!retval)
1568 break; 1565 retval = generic_file_llseek(file, offset, origin);
1569 case SEEK_CUR:
1570 if (offset == 0) {
1571 retval = file->f_pos;
1572 goto exit;
1573 }
1574 offset += file->f_pos;
1575 break;
1576 case SEEK_DATA:
1577 if (offset >= i_size_read(inode)) {
1578 retval = -ENXIO;
1579 goto exit;
1580 }
1581 break;
1582 case SEEK_HOLE:
1583 if (offset >= i_size_read(inode)) {
1584 retval = -ENXIO;
1585 goto exit;
1586 }
1587 offset = i_size_read(inode);
1588 break;
1589 }
1590 retval = -EINVAL;
1591 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
1592 if (offset != file->f_pos) {
1593 file->f_pos = offset;
1594 file->f_version = 0;
1595 }
1596 retval = offset;
1597 }
1598exit:
1599 mutex_unlock(&inode->i_mutex); 1566 mutex_unlock(&inode->i_mutex);
1567
1600 return retval; 1568 return retval;
1601} 1569}
1602 1570
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1808 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1776 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1809 1777
1810 err = -ENOMEM; 1778 err = -ENOMEM;
1811 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1779 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
1812 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 1780 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1813 if (!pages || !iov_page) 1781 if (!pages || !iov_page)
1814 goto out; 1782 goto out;
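For illustration only (not part of the patch): the switch from kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, ...) to kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), ...) above uses the canonical idiom for zeroed array allocation; kcalloc() also checks the count-times-size multiplication for overflow, although with a compile-time constant count, as here, the change is mainly about readability. A minimal kernel-style sketch with a hypothetical helper name:

#include <linux/slab.h>

/* Hypothetical helper: allocate a zeroed array of n page pointers.
 * kcalloc() fails cleanly if n * sizeof(struct page *) would overflow. */
static struct page **alloc_page_array(size_t n)
{
	return kcalloc(n, sizeof(struct page *), GFP_KERNEL);
}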
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1958} 1926}
1959EXPORT_SYMBOL_GPL(fuse_do_ioctl); 1927EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1960 1928
1961static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, 1929long fuse_ioctl_common(struct file *file, unsigned int cmd,
1962 unsigned long arg, unsigned int flags) 1930 unsigned long arg, unsigned int flags)
1963{ 1931{
1964 struct inode *inode = file->f_dentry->d_inode; 1932 struct inode *inode = file->f_dentry->d_inode;
1965 struct fuse_conn *fc = get_fuse_conn(inode); 1933 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1976static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1944static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1977 unsigned long arg) 1945 unsigned long arg)
1978{ 1946{
1979 return fuse_file_ioctl_common(file, cmd, arg, 0); 1947 return fuse_ioctl_common(file, cmd, arg, 0);
1980} 1948}
1981 1949
1982static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1950static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1983 unsigned long arg) 1951 unsigned long arg)
1984{ 1952{
1985 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 1953 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1986} 1954}
1987 1955
1988/* 1956/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1964da0257d9..572cefc78012 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
755/** 755/**
756 * File-system tells the kernel to invalidate parent attributes and 756 * File-system tells the kernel to invalidate parent attributes and
757 * the dentry matching parent/name. 757 * the dentry matching parent/name.
758 *
759 * If the child_nodeid is non-zero and:
760 * - matches the inode number for the dentry matching parent/name,
761 * - is not a mount point
 762 * - is a file or an empty directory
763 * then the dentry is unhashed (d_delete()).
758 */ 764 */
759int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 765int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
760 struct qstr *name); 766 u64 child_nodeid, struct qstr *name);
761 767
762int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 768int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
763 bool isdir); 769 bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
765 size_t count, loff_t *ppos, int write); 771 size_t count, loff_t *ppos, int write);
766long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 772long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
767 unsigned int flags); 773 unsigned int flags);
774long fuse_ioctl_common(struct file *file, unsigned int cmd,
775 unsigned long arg, unsigned int flags);
768unsigned fuse_file_poll(struct file *file, poll_table *wait); 776unsigned fuse_file_poll(struct file *file, poll_table *wait);
769int fuse_dev_release(struct inode *inode, struct file *file); 777int fuse_dev_release(struct inode *inode, struct file *file);
770 778
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d0026..376816fcd040 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1353 spin_lock(&gl->gl_spin); 1353 spin_lock(&gl->gl_spin);
1354 gl->gl_reply = ret; 1354 gl->gl_reply = ret;
1355 1355
1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
1357 if (gfs2_should_freeze(gl)) { 1357 if (gfs2_should_freeze(gl)) {
1358 set_bit(GLF_FROZEN, &gl->gl_flags); 1358 set_bit(GLF_FROZEN, &gl->gl_flags);
1359 spin_unlock(&gl->gl_spin); 1359 spin_unlock(&gl->gl_spin);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72e..307ac31df781 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
121 121
122struct lm_lockops { 122struct lm_lockops {
123 const char *lm_proto_name; 123 const char *lm_proto_name;
124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
125 void (*lm_unmount) (struct gfs2_sbd *sdp); 125 void (*lm_first_done) (struct gfs2_sbd *sdp);
126 void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
127 unsigned int result);
128 void (*lm_unmount) (struct gfs2_sbd *sdp);
126 void (*lm_withdraw) (struct gfs2_sbd *sdp); 129 void (*lm_withdraw) (struct gfs2_sbd *sdp);
127 void (*lm_put_lock) (struct gfs2_glock *gl); 130 void (*lm_put_lock) (struct gfs2_glock *gl);
128 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 131 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e1d3bb59945c..97742a7ea9cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -139,8 +139,45 @@ struct gfs2_bufdata {
139#define GDLM_STRNAME_BYTES 25 139#define GDLM_STRNAME_BYTES 25
140#define GDLM_LVB_SIZE 32 140#define GDLM_LVB_SIZE 32
141 141
142/*
143 * ls_recover_flags:
144 *
145 * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
146 * held by failed nodes whose journals need recovery. Those locks should
147 * only be used for journal recovery until the journal recovery is done.
148 * This is set by the dlm recover_prep callback and cleared by the
149 * gfs2_control thread when journal recovery is complete. To avoid
150 * races between recover_prep setting and gfs2_control clearing, recover_spin
151 * is held while changing this bit and reading/writing recover_block
152 * and recover_start.
153 *
154 * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
155 *
156 * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
157 * recovery of all journals before allowing other nodes to mount the fs.
158 * This is cleared when FIRST_MOUNT_DONE is set.
159 *
160 * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
161 * recovery of all journals, and now allows other nodes to mount the fs.
162 *
163 * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
164 * BLOCK_LOCKS for the first time. The gfs2_control thread should now
165 * control clearing BLOCK_LOCKS for further recoveries.
166 *
 167 * DFL_UNMOUNT: gdlm_unmount sets this to keep sdp off gfs2_control_wq.
168 *
169 * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
170 * and recover_done(), i.e. set while recover_block == recover_start.
171 */
172
142enum { 173enum {
143 DFL_BLOCK_LOCKS = 0, 174 DFL_BLOCK_LOCKS = 0,
175 DFL_NO_DLM_OPS = 1,
176 DFL_FIRST_MOUNT = 2,
177 DFL_FIRST_MOUNT_DONE = 3,
178 DFL_MOUNT_DONE = 4,
179 DFL_UNMOUNT = 5,
180 DFL_DLM_RECOVERY = 6,
144}; 181};
145 182
146struct lm_lockname { 183struct lm_lockname {
@@ -392,6 +429,7 @@ struct gfs2_jdesc {
392#define JDF_RECOVERY 1 429#define JDF_RECOVERY 1
393 unsigned int jd_jid; 430 unsigned int jd_jid;
394 unsigned int jd_blocks; 431 unsigned int jd_blocks;
432 int jd_recover_error;
395}; 433};
396 434
397struct gfs2_statfs_change_host { 435struct gfs2_statfs_change_host {
@@ -461,6 +499,7 @@ enum {
461 SDF_NORECOVERY = 4, 499 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5, 500 SDF_DEMOTE = 5,
463 SDF_NOJOURNALID = 6, 501 SDF_NOJOURNALID = 6,
502 SDF_RORECOVERY = 7, /* read only recovery */
464}; 503};
465 504
466#define GFS2_FSNAME_LEN 256 505#define GFS2_FSNAME_LEN 256
@@ -499,14 +538,26 @@ struct gfs2_sb_host {
499struct lm_lockstruct { 538struct lm_lockstruct {
500 int ls_jid; 539 int ls_jid;
501 unsigned int ls_first; 540 unsigned int ls_first;
502 unsigned int ls_first_done;
503 unsigned int ls_nodir; 541 unsigned int ls_nodir;
504 const struct lm_lockops *ls_ops; 542 const struct lm_lockops *ls_ops;
505 unsigned long ls_flags;
506 dlm_lockspace_t *ls_dlm; 543 dlm_lockspace_t *ls_dlm;
507 544
508 int ls_recover_jid_done; 545 int ls_recover_jid_done; /* These two are deprecated, */
509 int ls_recover_jid_status; 546 int ls_recover_jid_status; /* used previously by gfs_controld */
547
548 struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
549 struct dlm_lksb ls_control_lksb; /* control_lock */
550 char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
551 struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
552
553 spinlock_t ls_recover_spin; /* protects following fields */
554 unsigned long ls_recover_flags; /* DFL_ */
555 uint32_t ls_recover_mount; /* gen in first recover_done cb */
556 uint32_t ls_recover_start; /* gen in last recover_done cb */
557 uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
558 uint32_t ls_recover_size; /* size of recover_submit, recover_result */
559 uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
560 uint32_t *ls_recover_result; /* result of last jid recovery */
510}; 561};
511 562
512struct gfs2_sbd { 563struct gfs2_sbd {
@@ -544,6 +595,7 @@ struct gfs2_sbd {
544 wait_queue_head_t sd_glock_wait; 595 wait_queue_head_t sd_glock_wait;
545 atomic_t sd_glock_disposal; 596 atomic_t sd_glock_disposal;
546 struct completion sd_locking_init; 597 struct completion sd_locking_init;
598 struct delayed_work sd_control_work;
547 599
548 /* Inode Stuff */ 600 /* Inode Stuff */
549 601
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 017960cf1d7a..a7d611b93f0f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
599 error = gfs2_meta_inode_buffer(ip, &dibh); 599 error = gfs2_meta_inode_buffer(ip, &dibh);
600 if (error) 600 if (error)
601 goto fail_end_trans; 601 goto fail_end_trans;
602 inc_nlink(&ip->i_inode); 602 set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
603 if (S_ISDIR(ip->i_inode.i_mode))
604 inc_nlink(&ip->i_inode);
605 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 603 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
606 gfs2_dinode_out(ip, dibh->b_data); 604 gfs2_dinode_out(ip, dibh->b_data);
607 brelse(dibh); 605 brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index ce85b62bc0a2..8944d1e32ab5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 3 * Copyright 2004-2011 Red Hat, Inc.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -11,12 +11,15 @@
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/delay.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15 16
16#include "incore.h" 17#include "incore.h"
17#include "glock.h" 18#include "glock.h"
18#include "util.h" 19#include "util.h"
20#include "sys.h"
19 21
22extern struct workqueue_struct *gfs2_control_wq;
20 23
21static void gdlm_ast(void *arg) 24static void gdlm_ast(void *arg)
22{ 25{
@@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
185 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); 188 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
186} 189}
187 190
188static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) 191/*
192 * dlm/gfs2 recovery coordination using dlm_recover callbacks
193 *
194 * 1. dlm_controld sees lockspace members change
195 * 2. dlm_controld blocks dlm-kernel locking activity
196 * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
197 * 4. dlm_controld starts and finishes its own user level recovery
198 * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
199 * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
200 * 7. dlm_recoverd does its own lock recovery
201 * 8. dlm_recoverd unblocks dlm-kernel locking activity
202 * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
203 * 10. gfs2_control updates control_lock lvb with new generation and jid bits
204 * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
205 * 12. gfs2_recover dequeues and recovers journals of failed nodes
206 * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
207 * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
208 * 15. gfs2_control unblocks normal locking when all journals are recovered
209 *
210 * - failures during recovery
211 *
212 * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
213 * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
214 * recovering for a prior failure. gfs2_control needs a way to detect
215 * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
216 * the recover_block and recover_start values.
217 *
218 * recover_done() provides a new lockspace generation number each time it
219 * is called (step 9). This generation number is saved as recover_start.
220 * When recover_prep() is called, it sets BLOCK_LOCKS and sets
221 * recover_block = recover_start. So, while recover_block is equal to
222 * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
223 * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
224 *
225 * - more specific gfs2 steps in sequence above
226 *
227 * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
228 * 6. recover_slot records any failed jids (maybe none)
229 * 9. recover_done sets recover_start = new generation number
230 * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
231 * 12. gfs2_recover does journal recoveries for failed jids identified above
232 * 14. gfs2_control clears control_lock lvb bits for recovered jids
 233 * 15. gfs2_control checks if recover_block == recover_start (step 3 occurred
234 * again) then do nothing, otherwise if recover_start > recover_block
235 * then clear BLOCK_LOCKS.
236 *
237 * - parallel recovery steps across all nodes
238 *
239 * All nodes attempt to update the control_lock lvb with the new generation
240 * number and jid bits, but only the first to get the control_lock EX will
241 * do so; others will see that it's already done (lvb already contains new
242 * generation number.)
243 *
244 * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
245 * . All nodes attempt to set control_lock lvb gen + bits for the new gen
246 * . One node gets control_lock first and writes the lvb, others see it's done
247 * . All nodes attempt to recover jids for which they see control_lock bits set
248 * . One node succeeds for a jid, and that one clears the jid bit in the lvb
249 * . All nodes will eventually see all lvb bits clear and unblock locks
250 *
251 * - is there a problem with clearing an lvb bit that should be set
252 * and missing a journal recovery?
253 *
254 * 1. jid fails
255 * 2. lvb bit set for step 1
256 * 3. jid recovered for step 1
257 * 4. jid taken again (new mount)
258 * 5. jid fails (for step 4)
259 * 6. lvb bit set for step 5 (will already be set)
260 * 7. lvb bit cleared for step 3
261 *
262 * This is not a problem because the failure in step 5 does not
263 * require recovery, because the mount in step 4 could not have
264 * progressed far enough to unblock locks and access the fs. The
265 * control_mount() function waits for all recoveries to be complete
266 * for the latest lockspace generation before ever unblocking locks
267 * and returning. The mount in step 4 waits until the recovery in
268 * step 1 is done.
269 *
270 * - special case of first mounter: first node to mount the fs
271 *
272 * The first node to mount a gfs2 fs needs to check all the journals
273 * and recover any that need recovery before other nodes are allowed
274 * to mount the fs. (Others may begin mounting, but they must wait
275 * for the first mounter to be done before taking locks on the fs
276 * or accessing the fs.) This has two parts:
277 *
278 * 1. The mounted_lock tells a node it's the first to mount the fs.
279 * Each node holds the mounted_lock in PR while it's mounted.
280 * Each node tries to acquire the mounted_lock in EX when it mounts.
281 * If a node is granted the mounted_lock EX it means there are no
282 * other mounted nodes (no PR locks exist), and it is the first mounter.
283 * The mounted_lock is demoted to PR when first recovery is done, so
284 * others will fail to get an EX lock, but will get a PR lock.
285 *
286 * 2. The control_lock blocks others in control_mount() while the first
287 * mounter is doing first mount recovery of all journals.
288 * A mounting node needs to acquire control_lock in EX mode before
289 * it can proceed. The first mounter holds control_lock in EX while doing
290 * the first mount recovery, blocking mounts from other nodes, then demotes
291 * control_lock to NL when it's done (others_may_mount/first_done),
292 * allowing other nodes to continue mounting.
293 *
294 * first mounter:
295 * control_lock EX/NOQUEUE success
296 * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
297 * set first=1
298 * do first mounter recovery
299 * mounted_lock EX->PR
300 * control_lock EX->NL, write lvb generation
301 *
302 * other mounter:
303 * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
304 * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
305 * mounted_lock PR/NOQUEUE success
306 * read lvb generation
307 * control_lock EX->NL
308 * set first=0
309 *
310 * - mount during recovery
311 *
312 * If a node mounts while others are doing recovery (not first mounter),
313 * the mounting node will get its initial recover_done() callback without
314 * having seen any previous failures/callbacks.
315 *
316 * It must wait for all recoveries preceding its mount to be finished
317 * before it unblocks locks. It does this by repeating the "other mounter"
318 * steps above until the lvb generation number is >= its mount generation
319 * number (from initial recover_done) and all lvb bits are clear.
320 *
321 * - control_lock lvb format
322 *
323 * 4 bytes generation number: the latest dlm lockspace generation number
324 * from recover_done callback. Indicates the jid bitmap has been updated
325 * to reflect all slot failures through that generation.
326 * 4 bytes unused.
327 * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
328 * that jid N needs recovery.
329 */
330
331#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
332
333static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
334 char *lvb_bits)
335{
336 uint32_t gen;
337 memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
338 memcpy(&gen, lvb_bits, sizeof(uint32_t));
339 *lvb_gen = le32_to_cpu(gen);
340}
341
342static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
343 char *lvb_bits)
344{
345 uint32_t gen;
346 memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
347 gen = cpu_to_le32(lvb_gen);
348 memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
349}
350
351static int all_jid_bits_clear(char *lvb)
352{
353 int i;
354 for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
355 if (lvb[i])
356 return 0;
357 }
358 return 1;
359}
360
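For illustration only (not part of the patch): a stand-alone sketch of how a control_lock LVB decodes under the layout described above: a little-endian 32-bit generation number, four unused bytes, then a bitmap in which bit N set means jid N still needs recovery. The constants mirror GDLM_LVB_SIZE and JID_BITMAP_OFFSET from this file, and the bit order matches the __set_bit_le()/test_bit_le() helpers used below.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LVB_SIZE	32	/* mirrors GDLM_LVB_SIZE */
#define JID_BITMAP_OFF	8	/* 4-byte generation + 4 unused bytes */

static void dump_control_lvb(const unsigned char *lvb)
{
	uint32_t gen;
	int i;

	memcpy(&gen, lvb, sizeof(gen));
	printf("generation %u, jids needing recovery:", le32toh(gen));

	for (i = 0; i < (LVB_SIZE - JID_BITMAP_OFF) * 8; i++) {
		if (lvb[JID_BITMAP_OFF + i / 8] & (1u << (i % 8)))
			printf(" %d", i);
	}
	printf("\n");
}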
361static void sync_wait_cb(void *arg)
362{
363 struct lm_lockstruct *ls = arg;
364 complete(&ls->ls_sync_wait);
365}
366
367static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
189{ 368{
190 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 369 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
191 int error; 370 int error;
192 371
193 if (fsname == NULL) { 372 error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
194 fs_info(sdp, "no fsname found\n"); 373 if (error) {
195 return -EINVAL; 374 fs_err(sdp, "%s lkid %x error %d\n",
375 name, lksb->sb_lkid, error);
376 return error;
377 }
378
379 wait_for_completion(&ls->ls_sync_wait);
380
381 if (lksb->sb_status != -DLM_EUNLOCK) {
382 fs_err(sdp, "%s lkid %x status %d\n",
383 name, lksb->sb_lkid, lksb->sb_status);
384 return -1;
385 }
386 return 0;
387}
388
389static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
390 unsigned int num, struct dlm_lksb *lksb, char *name)
391{
392 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
393 char strname[GDLM_STRNAME_BYTES];
394 int error, status;
395
396 memset(strname, 0, GDLM_STRNAME_BYTES);
397 snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
398
399 error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
400 strname, GDLM_STRNAME_BYTES - 1,
401 0, sync_wait_cb, ls, NULL);
402 if (error) {
403 fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
404 name, lksb->sb_lkid, flags, mode, error);
405 return error;
406 }
407
408 wait_for_completion(&ls->ls_sync_wait);
409
410 status = lksb->sb_status;
411
412 if (status && status != -EAGAIN) {
413 fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
414 name, lksb->sb_lkid, flags, mode, status);
415 }
416
417 return status;
418}
419
420static int mounted_unlock(struct gfs2_sbd *sdp)
421{
422 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
423 return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
424}
425
426static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
427{
428 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
429 return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
430 &ls->ls_mounted_lksb, "mounted_lock");
431}
432
433static int control_unlock(struct gfs2_sbd *sdp)
434{
435 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
436 return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
437}
438
439static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
440{
441 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
442 return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
443 &ls->ls_control_lksb, "control_lock");
444}
445
446static void gfs2_control_func(struct work_struct *work)
447{
448 struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
449 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
450 char lvb_bits[GDLM_LVB_SIZE];
451 uint32_t block_gen, start_gen, lvb_gen, flags;
452 int recover_set = 0;
453 int write_lvb = 0;
454 int recover_size;
455 int i, error;
456
457 spin_lock(&ls->ls_recover_spin);
458 /*
459 * No MOUNT_DONE means we're still mounting; control_mount()
460 * will set this flag, after which this thread will take over
461 * all further clearing of BLOCK_LOCKS.
462 *
463 * FIRST_MOUNT means this node is doing first mounter recovery,
464 * for which recovery control is handled by
465 * control_mount()/control_first_done(), not this thread.
466 */
467 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
468 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
469 spin_unlock(&ls->ls_recover_spin);
470 return;
471 }
472 block_gen = ls->ls_recover_block;
473 start_gen = ls->ls_recover_start;
474 spin_unlock(&ls->ls_recover_spin);
475
476 /*
477 * Equal block_gen and start_gen implies we are between
478 * recover_prep and recover_done callbacks, which means
479 * dlm recovery is in progress and dlm locking is blocked.
480 * There's no point trying to do any work until recover_done.
481 */
482
483 if (block_gen == start_gen)
484 return;
485
486 /*
487 * Propagate recover_submit[] and recover_result[] to lvb:
488 * dlm_recoverd adds to recover_submit[] jids needing recovery
489 * gfs2_recover adds to recover_result[] journal recovery results
490 *
491 * set lvb bit for jids in recover_submit[] if the lvb has not
492 * yet been updated for the generation of the failure
493 *
494 * clear lvb bit for jids in recover_result[] if the result of
495 * the journal recovery is SUCCESS
496 */
497
498 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
499 if (error) {
500 fs_err(sdp, "control lock EX error %d\n", error);
501 return;
502 }
503
504 control_lvb_read(ls, &lvb_gen, lvb_bits);
505
506 spin_lock(&ls->ls_recover_spin);
507 if (block_gen != ls->ls_recover_block ||
508 start_gen != ls->ls_recover_start) {
509 fs_info(sdp, "recover generation %u block1 %u %u\n",
510 start_gen, block_gen, ls->ls_recover_block);
511 spin_unlock(&ls->ls_recover_spin);
512 control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
513 return;
514 }
515
516 recover_size = ls->ls_recover_size;
517
518 if (lvb_gen <= start_gen) {
519 /*
520 * Clear lvb bits for jids we've successfully recovered.
521 * Because all nodes attempt to recover failed journals,
522 * a journal can be recovered multiple times successfully
523 * in succession. Only the first will really do recovery,
524 * the others find it clean, but still report a successful
525 * recovery. So, another node may have already recovered
526 * the jid and cleared the lvb bit for it.
527 */
528 for (i = 0; i < recover_size; i++) {
529 if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
530 continue;
531
532 ls->ls_recover_result[i] = 0;
533
534 if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
535 continue;
536
537 __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
538 write_lvb = 1;
539 }
540 }
541
542 if (lvb_gen == start_gen) {
543 /*
544 * Failed slots before start_gen are already set in lvb.
545 */
546 for (i = 0; i < recover_size; i++) {
547 if (!ls->ls_recover_submit[i])
548 continue;
549 if (ls->ls_recover_submit[i] < lvb_gen)
550 ls->ls_recover_submit[i] = 0;
551 }
552 } else if (lvb_gen < start_gen) {
553 /*
554 * Failed slots before start_gen are not yet set in lvb.
555 */
556 for (i = 0; i < recover_size; i++) {
557 if (!ls->ls_recover_submit[i])
558 continue;
559 if (ls->ls_recover_submit[i] < start_gen) {
560 ls->ls_recover_submit[i] = 0;
561 __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
562 }
563 }
564 /* even if there are no bits to set, we need to write the
565 latest generation to the lvb */
566 write_lvb = 1;
567 } else {
568 /*
569 * we should be getting a recover_done() for lvb_gen soon
570 */
571 }
572 spin_unlock(&ls->ls_recover_spin);
573
574 if (write_lvb) {
575 control_lvb_write(ls, start_gen, lvb_bits);
576 flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
577 } else {
578 flags = DLM_LKF_CONVERT;
579 }
580
581 error = control_lock(sdp, DLM_LOCK_NL, flags);
582 if (error) {
583 fs_err(sdp, "control lock NL error %d\n", error);
584 return;
585 }
586
587 /*
588 * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
589 * and clear a jid bit in the lvb if the recovery is a success.
590 * Eventually all journals will be recovered, all jid bits will
591 * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
592 */
593
594 for (i = 0; i < recover_size; i++) {
595 if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
596 fs_info(sdp, "recover generation %u jid %d\n",
597 start_gen, i);
598 gfs2_recover_set(sdp, i);
599 recover_set++;
600 }
601 }
602 if (recover_set)
603 return;
604
605 /*
606 * No more jid bits set in lvb, all recovery is done, unblock locks
 607 * (unless a new recover_prep callback has occurred blocking locks
608 * again while working above)
609 */
610
611 spin_lock(&ls->ls_recover_spin);
612 if (ls->ls_recover_block == block_gen &&
613 ls->ls_recover_start == start_gen) {
614 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
615 spin_unlock(&ls->ls_recover_spin);
616 fs_info(sdp, "recover generation %u done\n", start_gen);
617 gfs2_glock_thaw(sdp);
618 } else {
619 fs_info(sdp, "recover generation %u block2 %u %u\n",
620 start_gen, block_gen, ls->ls_recover_block);
621 spin_unlock(&ls->ls_recover_spin);
622 }
623}
624
625static int control_mount(struct gfs2_sbd *sdp)
626{
627 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
628 char lvb_bits[GDLM_LVB_SIZE];
629 uint32_t start_gen, block_gen, mount_gen, lvb_gen;
630 int mounted_mode;
631 int retries = 0;
632 int error;
633
634 memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
635 memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
636 memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
637 ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
638 init_completion(&ls->ls_sync_wait);
639
640 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
641
642 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
643 if (error) {
644 fs_err(sdp, "control_mount control_lock NL error %d\n", error);
645 return error;
646 }
647
648 error = mounted_lock(sdp, DLM_LOCK_NL, 0);
649 if (error) {
650 fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
651 control_unlock(sdp);
652 return error;
653 }
654 mounted_mode = DLM_LOCK_NL;
655
656restart:
657 if (retries++ && signal_pending(current)) {
658 error = -EINTR;
659 goto fail;
660 }
661
662 /*
663 * We always start with both locks in NL. control_lock is
664 * demoted to NL below so we don't need to do it here.
665 */
666
667 if (mounted_mode != DLM_LOCK_NL) {
668 error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
669 if (error)
670 goto fail;
671 mounted_mode = DLM_LOCK_NL;
672 }
673
674 /*
675 * Other nodes need to do some work in dlm recovery and gfs2_control
676 * before the recover_done and control_lock will be ready for us below.
677 * A delay here is not required but often avoids having to retry.
678 */
679
680 msleep_interruptible(500);
681
682 /*
683 * Acquire control_lock in EX and mounted_lock in either EX or PR.
684 * control_lock lvb keeps track of any pending journal recoveries.
685 * mounted_lock indicates if any other nodes have the fs mounted.
686 */
687
688 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
689 if (error == -EAGAIN) {
690 goto restart;
691 } else if (error) {
692 fs_err(sdp, "control_mount control_lock EX error %d\n", error);
693 goto fail;
694 }
695
696 error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
697 if (!error) {
698 mounted_mode = DLM_LOCK_EX;
699 goto locks_done;
700 } else if (error != -EAGAIN) {
701 fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
702 goto fail;
703 }
704
705 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
706 if (!error) {
707 mounted_mode = DLM_LOCK_PR;
708 goto locks_done;
709 } else {
710 /* not even -EAGAIN should happen here */
711 fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
712 goto fail;
713 }
714
715locks_done:
716 /*
717 * If we got both locks above in EX, then we're the first mounter.
718 * If not, then we need to wait for the control_lock lvb to be
719 * updated by other mounted nodes to reflect our mount generation.
720 *
721 * In simple first mounter cases, first mounter will see zero lvb_gen,
722 * but in cases where all existing nodes leave/fail before mounting
723 * nodes finish control_mount, then all nodes will be mounting and
724 * lvb_gen will be non-zero.
725 */
726
727 control_lvb_read(ls, &lvb_gen, lvb_bits);
728
729 if (lvb_gen == 0xFFFFFFFF) {
730 /* special value to force mount attempts to fail */
731 fs_err(sdp, "control_mount control_lock disabled\n");
732 error = -EINVAL;
733 goto fail;
734 }
735
736 if (mounted_mode == DLM_LOCK_EX) {
737 /* first mounter, keep both EX while doing first recovery */
738 spin_lock(&ls->ls_recover_spin);
739 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
740 set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
741 set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
742 spin_unlock(&ls->ls_recover_spin);
743 fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
744 return 0;
745 }
746
747 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
748 if (error)
749 goto fail;
750
751 /*
752 * We are not first mounter, now we need to wait for the control_lock
753 * lvb generation to be >= the generation from our first recover_done
754 * and all lvb bits to be clear (no pending journal recoveries.)
755 */
756
757 if (!all_jid_bits_clear(lvb_bits)) {
758 /* journals need recovery, wait until all are clear */
759 fs_info(sdp, "control_mount wait for journal recovery\n");
760 goto restart;
761 }
762
763 spin_lock(&ls->ls_recover_spin);
764 block_gen = ls->ls_recover_block;
765 start_gen = ls->ls_recover_start;
766 mount_gen = ls->ls_recover_mount;
767
768 if (lvb_gen < mount_gen) {
769 /* wait for mounted nodes to update control_lock lvb to our
770 generation, which might include new recovery bits set */
771 fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
772 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
773 lvb_gen, ls->ls_recover_flags);
774 spin_unlock(&ls->ls_recover_spin);
775 goto restart;
776 }
777
778 if (lvb_gen != start_gen) {
779 /* wait for mounted nodes to update control_lock lvb to the
780 latest recovery generation */
781 fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
782 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
783 lvb_gen, ls->ls_recover_flags);
784 spin_unlock(&ls->ls_recover_spin);
785 goto restart;
786 }
787
788 if (block_gen == start_gen) {
789 /* dlm recovery in progress, wait for it to finish */
790 fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
791 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
792 lvb_gen, ls->ls_recover_flags);
793 spin_unlock(&ls->ls_recover_spin);
794 goto restart;
196 } 795 }
197 796
198 error = dlm_new_lockspace(fsname, NULL, 797 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
199 DLM_LSFL_FS | DLM_LSFL_NEWEXCL | 798 set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
200 (ls->ls_nodir ? DLM_LSFL_NODIR : 0), 799 memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
201 GDLM_LVB_SIZE, NULL, NULL, NULL, &ls->ls_dlm); 800 memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
801 spin_unlock(&ls->ls_recover_spin);
802 return 0;
803
804fail:
805 mounted_unlock(sdp);
806 control_unlock(sdp);
807 return error;
808}
809
810static int dlm_recovery_wait(void *word)
811{
812 schedule();
813 return 0;
814}
815
816static int control_first_done(struct gfs2_sbd *sdp)
817{
818 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
819 char lvb_bits[GDLM_LVB_SIZE];
820 uint32_t start_gen, block_gen;
821 int error;
822
823restart:
824 spin_lock(&ls->ls_recover_spin);
825 start_gen = ls->ls_recover_start;
826 block_gen = ls->ls_recover_block;
827
828 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
829 !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
830 !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
831 /* sanity check, should not happen */
832 fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
833 start_gen, block_gen, ls->ls_recover_flags);
834 spin_unlock(&ls->ls_recover_spin);
835 control_unlock(sdp);
836 return -1;
837 }
838
839 if (start_gen == block_gen) {
840 /*
841 * Wait for the end of a dlm recovery cycle to switch from
842 * first mounter recovery. We can ignore any recover_slot
843 * callbacks between the recover_prep and next recover_done
844 * because we are still the first mounter and any failed nodes
845 * have not fully mounted, so they don't need recovery.
846 */
847 spin_unlock(&ls->ls_recover_spin);
848 fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
849
850 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
851 dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
852 goto restart;
853 }
854
855 clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
856 set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
857 memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
858 memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
859 spin_unlock(&ls->ls_recover_spin);
860
861 memset(lvb_bits, 0, sizeof(lvb_bits));
862 control_lvb_write(ls, start_gen, lvb_bits);
863
864 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
865 if (error)
866 fs_err(sdp, "control_first_done mounted PR error %d\n", error);
867
868 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
202 if (error) 869 if (error)
203 printk(KERN_ERR "dlm_new_lockspace error %d", error); 870 fs_err(sdp, "control_first_done control NL error %d\n", error);
204 871
205 return error; 872 return error;
206} 873}
207 874
875/*
876 * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
 877 * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
878 * gfs2 jids start at 0, so jid = slot - 1)
879 */
880
881#define RECOVER_SIZE_INC 16
882
883static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
884 int num_slots)
885{
886 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
887 uint32_t *submit = NULL;
888 uint32_t *result = NULL;
889 uint32_t old_size, new_size;
890 int i, max_jid;
891
892 max_jid = 0;
893 for (i = 0; i < num_slots; i++) {
894 if (max_jid < slots[i].slot - 1)
895 max_jid = slots[i].slot - 1;
896 }
897
898 old_size = ls->ls_recover_size;
899
900 if (old_size >= max_jid + 1)
901 return 0;
902
903 new_size = old_size + RECOVER_SIZE_INC;
904
905 submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
906 result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
907 if (!submit || !result) {
908 kfree(submit);
909 kfree(result);
910 return -ENOMEM;
911 }
912
913 spin_lock(&ls->ls_recover_spin);
914 memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
915 memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
916 kfree(ls->ls_recover_submit);
917 kfree(ls->ls_recover_result);
918 ls->ls_recover_submit = submit;
919 ls->ls_recover_result = result;
920 ls->ls_recover_size = new_size;
921 spin_unlock(&ls->ls_recover_spin);
922 return 0;
923}
924
925static void free_recover_size(struct lm_lockstruct *ls)
926{
927 kfree(ls->ls_recover_submit);
928 kfree(ls->ls_recover_result);
929 ls->ls_recover_submit = NULL;
930 ls->ls_recover_result = NULL;
931 ls->ls_recover_size = 0;
932}
933
934/* dlm calls before it does lock recovery */
935
936static void gdlm_recover_prep(void *arg)
937{
938 struct gfs2_sbd *sdp = arg;
939 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
940
941 spin_lock(&ls->ls_recover_spin);
942 ls->ls_recover_block = ls->ls_recover_start;
943 set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
944
945 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
946 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
947 spin_unlock(&ls->ls_recover_spin);
948 return;
949 }
950 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
951 spin_unlock(&ls->ls_recover_spin);
952}
953
954/* dlm calls after recover_prep has been completed on all lockspace members;
955 identifies slot/jid of failed member */
956
957static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
958{
959 struct gfs2_sbd *sdp = arg;
960 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
961 int jid = slot->slot - 1;
962
963 spin_lock(&ls->ls_recover_spin);
964 if (ls->ls_recover_size < jid + 1) {
965 fs_err(sdp, "recover_slot jid %d gen %u short size %d",
966 jid, ls->ls_recover_block, ls->ls_recover_size);
967 spin_unlock(&ls->ls_recover_spin);
968 return;
969 }
970
971 if (ls->ls_recover_submit[jid]) {
972 fs_info(sdp, "recover_slot jid %d gen %u prev %u",
973 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
974 }
975 ls->ls_recover_submit[jid] = ls->ls_recover_block;
976 spin_unlock(&ls->ls_recover_spin);
977}
978
979/* dlm calls after recover_slot and after it completes lock recovery */
980
981static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
982 int our_slot, uint32_t generation)
983{
984 struct gfs2_sbd *sdp = arg;
985 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
986
987 /* ensure the ls jid arrays are large enough */
988 set_recover_size(sdp, slots, num_slots);
989
990 spin_lock(&ls->ls_recover_spin);
991 ls->ls_recover_start = generation;
992
993 if (!ls->ls_recover_mount) {
994 ls->ls_recover_mount = generation;
995 ls->ls_jid = our_slot - 1;
996 }
997
998 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
999 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
1000
1001 clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
1002 smp_mb__after_clear_bit();
1003 wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
1004 spin_unlock(&ls->ls_recover_spin);
1005}
1006
1007/* gfs2_recover thread has a journal recovery result */
1008
1009static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
1010 unsigned int result)
1011{
1012 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1013
1014 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1015 return;
1016
1017 /* don't care about the recovery of own journal during mount */
1018 if (jid == ls->ls_jid)
1019 return;
1020
1021 spin_lock(&ls->ls_recover_spin);
1022 if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
1023 spin_unlock(&ls->ls_recover_spin);
1024 return;
1025 }
1026 if (ls->ls_recover_size < jid + 1) {
1027 fs_err(sdp, "recovery_result jid %d short size %d",
1028 jid, ls->ls_recover_size);
1029 spin_unlock(&ls->ls_recover_spin);
1030 return;
1031 }
1032
1033 fs_info(sdp, "recover jid %d result %s\n", jid,
1034 result == LM_RD_GAVEUP ? "busy" : "success");
1035
1036 ls->ls_recover_result[jid] = result;
1037
1038 /* GAVEUP means another node is recovering the journal; delay our
1039 next attempt to recover it, to give the other node a chance to
1040 finish before trying again */
1041
1042 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
1043 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
1044 result == LM_RD_GAVEUP ? HZ : 0);
1045 spin_unlock(&ls->ls_recover_spin);
1046}
1047
1048const struct dlm_lockspace_ops gdlm_lockspace_ops = {
1049 .recover_prep = gdlm_recover_prep,
1050 .recover_slot = gdlm_recover_slot,
1051 .recover_done = gdlm_recover_done,
1052};
1053
1054static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
1055{
1056 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1057 char cluster[GFS2_LOCKNAME_LEN];
1058 const char *fsname;
1059 uint32_t flags;
1060 int error, ops_result;
1061
1062 /*
1063 * initialize everything
1064 */
1065
1066 INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
1067 spin_lock_init(&ls->ls_recover_spin);
1068 ls->ls_recover_flags = 0;
1069 ls->ls_recover_mount = 0;
1070 ls->ls_recover_start = 0;
1071 ls->ls_recover_block = 0;
1072 ls->ls_recover_size = 0;
1073 ls->ls_recover_submit = NULL;
1074 ls->ls_recover_result = NULL;
1075
1076 error = set_recover_size(sdp, NULL, 0);
1077 if (error)
1078 goto fail;
1079
1080 /*
1081 * prepare dlm_new_lockspace args
1082 */
1083
1084 fsname = strchr(table, ':');
1085 if (!fsname) {
1086 fs_info(sdp, "no fsname found\n");
1087 error = -EINVAL;
1088 goto fail_free;
1089 }
1090 memset(cluster, 0, sizeof(cluster));
1091 memcpy(cluster, table, strlen(table) - strlen(fsname));
1092 fsname++;
1093
1094 flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
1095 if (ls->ls_nodir)
1096 flags |= DLM_LSFL_NODIR;
1097
1098 /*
1099 * create/join lockspace
1100 */
1101
1102 error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
1103 &gdlm_lockspace_ops, sdp, &ops_result,
1104 &ls->ls_dlm);
1105 if (error) {
1106 fs_err(sdp, "dlm_new_lockspace error %d\n", error);
1107 goto fail_free;
1108 }
1109
1110 if (ops_result < 0) {
1111 /*
1112 * dlm does not support ops callbacks,
1113 * old dlm_controld/gfs_controld are used, try without ops.
1114 */
1115 fs_info(sdp, "dlm lockspace ops not used\n");
1116 free_recover_size(ls);
1117 set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
1118 return 0;
1119 }
1120
1121 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
1122 fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
1123 error = -EINVAL;
1124 goto fail_release;
1125 }
1126
1127 /*
1128 * control_mount() uses control_lock to determine first mounter,
1129 * and for later mounts, waits for any recoveries to be cleared.
1130 */
1131
1132 error = control_mount(sdp);
1133 if (error) {
1134 fs_err(sdp, "mount control error %d\n", error);
1135 goto fail_release;
1136 }
1137
1138 ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
1139 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
1140 smp_mb__after_clear_bit();
1141 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
1142 return 0;
1143
1144fail_release:
1145 dlm_release_lockspace(ls->ls_dlm, 2);
1146fail_free:
1147 free_recover_size(ls);
1148fail:
1149 return error;
1150}
1151
1152static void gdlm_first_done(struct gfs2_sbd *sdp)
1153{
1154 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1155 int error;
1156
1157 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1158 return;
1159
1160 error = control_first_done(sdp);
1161 if (error)
1162 fs_err(sdp, "mount first_done error %d\n", error);
1163}
1164
208static void gdlm_unmount(struct gfs2_sbd *sdp) 1165static void gdlm_unmount(struct gfs2_sbd *sdp)
209{ 1166{
210 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 1167 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
211 1168
1169 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1170 goto release;
1171
1172 /* wait for gfs2_control_wq to be done with this mount */
1173
1174 spin_lock(&ls->ls_recover_spin);
1175 set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
1176 spin_unlock(&ls->ls_recover_spin);
1177 flush_delayed_work_sync(&sdp->sd_control_work);
1178
1179 /* mounted_lock and control_lock will be purged in dlm recovery */
1180release:
212 if (ls->ls_dlm) { 1181 if (ls->ls_dlm) {
213 dlm_release_lockspace(ls->ls_dlm, 2); 1182 dlm_release_lockspace(ls->ls_dlm, 2);
214 ls->ls_dlm = NULL; 1183 ls->ls_dlm = NULL;
215 } 1184 }
1185
1186 free_recover_size(ls);
216} 1187}
217 1188
218static const match_table_t dlm_tokens = { 1189static const match_table_t dlm_tokens = {
@@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = {
226const struct lm_lockops gfs2_dlm_ops = { 1197const struct lm_lockops gfs2_dlm_ops = {
227 .lm_proto_name = "lock_dlm", 1198 .lm_proto_name = "lock_dlm",
228 .lm_mount = gdlm_mount, 1199 .lm_mount = gdlm_mount,
1200 .lm_first_done = gdlm_first_done,
1201 .lm_recovery_result = gdlm_recovery_result,
229 .lm_unmount = gdlm_unmount, 1202 .lm_unmount = gdlm_unmount,
230 .lm_put_lock = gdlm_put_lock, 1203 .lm_put_lock = gdlm_put_lock,
231 .lm_lock = gdlm_lock, 1204 .lm_lock = gdlm_lock,
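
The gdlm_recover_slot()/gdlm_recover_done() callbacks above record dlm results in two uint32_t arrays indexed by journal id, and set_recover_size() grows those arrays lazily as larger slots appear. Below is a standalone userspace sketch of that sizing rule, assuming nothing beyond what the hunk shows; the model_* names are illustrative and not kernel symbols, and the real code does this under ls_recover_spin with GFP_NOFS allocations.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RECOVER_SIZE_INC 16

struct model_ls {
	uint32_t *submit;
	uint32_t *result;
	uint32_t size;
};

/* Grow the per-jid submit/result arrays in RECOVER_SIZE_INC steps so the
 * largest jid seen (dlm slot - 1) indexes into them, keeping any values
 * recorded so far. */
static int model_set_recover_size(struct model_ls *ls, const int *slots, int n)
{
	uint32_t *submit, *result;
	uint32_t new_size;
	int i, max_jid = 0;

	for (i = 0; i < n; i++)
		if (max_jid < slots[i] - 1)	/* jid = dlm slot - 1 */
			max_jid = slots[i] - 1;

	if (ls->size >= (uint32_t)max_jid + 1)
		return 0;

	new_size = ls->size + RECOVER_SIZE_INC;
	submit = calloc(new_size, sizeof(*submit));
	result = calloc(new_size, sizeof(*result));
	if (!submit || !result) {
		free(submit);
		free(result);
		return -1;			/* -ENOMEM in the kernel */
	}
	if (ls->size) {
		memcpy(submit, ls->submit, ls->size * sizeof(*submit));
		memcpy(result, ls->result, ls->size * sizeof(*result));
	}
	free(ls->submit);
	free(ls->result);
	ls->submit = submit;
	ls->result = result;
	ls->size = new_size;
	return 0;
}

int main(void)
{
	struct model_ls ls = { NULL, NULL, 0 };
	int slots[] = { 1, 2, 5 };		/* dlm slots -> jids 0, 1, 4 */

	model_set_recover_size(&ls, slots, 3);
	printf("recover_size grew to %u\n", ls.size);	/* prints 16 */
	free(ls.submit);
	free(ls.result);
	return 0;
}
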
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c150298e2d8e..a8d9bcd0e19c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -28,6 +28,8 @@
28#include "recovery.h" 28#include "recovery.h"
29#include "dir.h" 29#include "dir.h"
30 30
31struct workqueue_struct *gfs2_control_wq;
32
31static struct shrinker qd_shrinker = { 33static struct shrinker qd_shrinker = {
32 .shrink = gfs2_shrink_qd_memory, 34 .shrink = gfs2_shrink_qd_memory,
33 .seeks = DEFAULT_SEEKS, 35 .seeks = DEFAULT_SEEKS,
@@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void)
146 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
147 goto fail_wq; 149 goto fail_wq;
148 150
151 gfs2_control_wq = alloc_workqueue("gfs2_control",
152 WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
153 if (!gfs2_control_wq)
154 goto fail_control;
155
149 gfs2_register_debugfs(); 156 gfs2_register_debugfs();
150 157
151 printk("GFS2 installed\n"); 158 printk("GFS2 installed\n");
152 159
153 return 0; 160 return 0;
154 161
162fail_control:
163 destroy_workqueue(gfs_recovery_wq);
155fail_wq: 164fail_wq:
156 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
157fail_unregister: 166fail_unregister:
@@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
195 unregister_filesystem(&gfs2_fs_type); 204 unregister_filesystem(&gfs2_fs_type);
196 unregister_filesystem(&gfs2meta_fs_type); 205 unregister_filesystem(&gfs2meta_fs_type);
197 destroy_workqueue(gfs_recovery_wq); 206 destroy_workqueue(gfs_recovery_wq);
207 destroy_workqueue(gfs2_control_wq);
198 208
199 rcu_barrier(); 209 rcu_barrier();
200 210
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fe72e79e6ff9..6aacf3f230a2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
562{ 562{
563 char *message = "FIRSTMOUNT=Done"; 563 char *message = "FIRSTMOUNT=Done";
564 char *envp[] = { message, NULL }; 564 char *envp[] = { message, NULL };
565 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 565
566 ls->ls_first_done = 1; 566 fs_info(sdp, "first mount done, others may mount\n");
567
568 if (sdp->sd_lockstruct.ls_ops->lm_first_done)
569 sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
570
567 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 571 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
568} 572}
569 573
@@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
944 struct gfs2_args *args = &sdp->sd_args; 948 struct gfs2_args *args = &sdp->sd_args;
945 const char *proto = sdp->sd_proto_name; 949 const char *proto = sdp->sd_proto_name;
946 const char *table = sdp->sd_table_name; 950 const char *table = sdp->sd_table_name;
947 const char *fsname;
948 char *o, *options; 951 char *o, *options;
949 int ret; 952 int ret;
950 953
@@ -1004,21 +1007,12 @@ hostdata_error:
1004 } 1007 }
1005 } 1008 }
1006 1009
1007 if (sdp->sd_args.ar_spectator)
1008 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
1009 else
1010 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
1011 sdp->sd_lockstruct.ls_jid);
1012
1013 fsname = strchr(table, ':');
1014 if (fsname)
1015 fsname++;
1016 if (lm->lm_mount == NULL) { 1010 if (lm->lm_mount == NULL) {
1017 fs_info(sdp, "Now mounting FS...\n"); 1011 fs_info(sdp, "Now mounting FS...\n");
1018 complete_all(&sdp->sd_locking_init); 1012 complete_all(&sdp->sd_locking_init);
1019 return 0; 1013 return 0;
1020 } 1014 }
1021 ret = lm->lm_mount(sdp, fsname); 1015 ret = lm->lm_mount(sdp, table);
1022 if (ret == 0) 1016 if (ret == 0)
1023 fs_info(sdp, "Joined cluster. Now mounting FS...\n"); 1017 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
1024 complete_all(&sdp->sd_locking_init); 1018 complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1084 1078
1085 if (sdp->sd_args.ar_spectator) { 1079 if (sdp->sd_args.ar_spectator) {
1086 sb->s_flags |= MS_RDONLY; 1080 sb->s_flags |= MS_RDONLY;
1087 set_bit(SDF_NORECOVERY, &sdp->sd_flags); 1081 set_bit(SDF_RORECOVERY, &sdp->sd_flags);
1088 } 1082 }
1089 if (sdp->sd_args.ar_posix_acl) 1083 if (sdp->sd_args.ar_posix_acl)
1090 sb->s_flags |= MS_POSIXACL; 1084 sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1124 if (error) 1118 if (error)
1125 goto fail; 1119 goto fail;
1126 1120
1121 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
1122
1127 gfs2_create_debugfs_file(sdp); 1123 gfs2_create_debugfs_file(sdp);
1128 1124
1129 error = gfs2_sys_fs_add(sdp); 1125 error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1160 goto fail_sb; 1156 goto fail_sb;
1161 } 1157 }
1162 1158
1159 if (sdp->sd_args.ar_spectator)
1160 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
1161 sdp->sd_table_name);
1162 else
1163 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
1164 sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
1165
1163 error = init_inodes(sdp, DO); 1166 error = init_inodes(sdp, DO);
1164 if (error) 1167 if (error)
1165 goto fail_sb; 1168 goto fail_sb;
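
This hunk makes gfs2_lm_mount() hand the unparsed lock table string to lm_mount(), so gdlm_mount() in the lock_dlm hunk above now splits "cluster:fsname" itself before calling dlm_new_lockspace(). A hedged userspace model of that split; split_table and LOCKNAME_LEN are illustrative stand-ins, and the explicit length guard is added only for this sketch.

#include <stdio.h>
#include <string.h>

#define LOCKNAME_LEN 64			/* stands in for GFS2_LOCKNAME_LEN */

/* Split "clustername:fsname" the way gdlm_mount() does: everything before
 * the first ':' is the cluster name, everything after it is the fsname. */
static int split_table(const char *table, char *cluster, const char **fsname)
{
	const char *colon = strchr(table, ':');

	if (!colon)
		return -1;		/* "no fsname found" -> -EINVAL */
	if (colon - table >= LOCKNAME_LEN)
		return -1;		/* guard added for this model */
	memset(cluster, 0, LOCKNAME_LEN);
	memcpy(cluster, table, colon - table);
	*fsname = colon + 1;
	return 0;
}

int main(void)
{
	char cluster[LOCKNAME_LEN];
	const char *fsname;

	if (!split_table("mycluster:gfs2vol", cluster, &fsname))
		printf("cluster='%s' fsname='%s'\n", cluster, fsname);
	return 0;
}
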
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8f..963b2d75200c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
436 char env_status[20]; 436 char env_status[20];
437 char *envp[] = { env_jid, env_status, NULL }; 437 char *envp[] = { env_jid, env_status, NULL };
438 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 438 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
439
439 ls->ls_recover_jid_done = jid; 440 ls->ls_recover_jid_done = jid;
440 ls->ls_recover_jid_status = message; 441 ls->ls_recover_jid_status = message;
441 sprintf(env_jid, "JID=%d", jid); 442 sprintf(env_jid, "JID=%d", jid);
442 sprintf(env_status, "RECOVERY=%s", 443 sprintf(env_status, "RECOVERY=%s",
443 message == LM_RD_SUCCESS ? "Done" : "Failed"); 444 message == LM_RD_SUCCESS ? "Done" : "Failed");
444 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
446
447 if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
448 sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
445} 449}
446 450
447void gfs2_recover_func(struct work_struct *work) 451void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
512 if (error) 516 if (error)
513 goto fail_gunlock_ji; 517 goto fail_gunlock_ji;
514 518
515 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { 519 if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
520 ro = 1;
521 } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
516 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) 522 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
517 ro = 1; 523 ro = 1;
518 } else { 524 } else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
577 583
578 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); 584 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
579fail: 585fail:
586 jd->jd_recover_error = error;
580 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 587 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
581done: 588done:
582 clear_bit(JDF_RECOVERY, &jd->jd_flags); 589 clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
605 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, 612 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
606 TASK_UNINTERRUPTIBLE); 613 TASK_UNINTERRUPTIBLE);
607 614
608 return 0; 615 return wait ? jd->jd_recover_error : 0;
609} 616}
610 617
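
The recovery.c change stores the worker's outcome in jd_recover_error and reports it through lm_recovery_result(), while gfs2_recover_journal() returns that outcome only to callers that waited. A minimal userspace model of that behaviour; model_* names are illustrative, and the real worker runs from gfs_recovery_wq.

#include <stdio.h>

struct model_jdesc {
	int recover_error;
};

/* The recovery worker records its outcome on the journal descriptor just
 * before reporting LM_RD_SUCCESS or LM_RD_GAVEUP. */
static void model_recover_func(struct model_jdesc *jd, int outcome)
{
	jd->recover_error = outcome;
}

/* Only a caller that asked to wait sees the recorded outcome; a
 * fire-and-forget caller still gets 0. */
static int model_recover_journal(struct model_jdesc *jd, int wait, int outcome)
{
	model_recover_func(jd, outcome);	/* normally queued on a workqueue */
	return wait ? jd->recover_error : 0;
}

int main(void)
{
	struct model_jdesc jd = { 0 };

	printf("async: %d\n", model_recover_journal(&jd, 0, -5));	/* 0 */
	printf("sync:  %d\n", model_recover_journal(&jd, 1, -5));	/* -5 */
	return 0;
}
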
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 22234627f684..981bfa32121a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1108{ 1108{
1109 struct gfs2_blkreserv *rs = ip->i_res; 1109 struct gfs2_blkreserv *rs = ip->i_res;
1110 1110
1111 gfs2_blkrsv_put(ip);
1112 if (rs->rs_rgd_gh.gh_gl) 1111 if (rs->rs_rgd_gh.gh_gl)
1113 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1112 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1113 gfs2_blkrsv_put(ip);
1114} 1114}
1115 1115
1116/** 1116/**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd23..d33172c291ba 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
298 ssize_t ret; 298 ssize_t ret;
299 int val = 0; 299 int val = 0;
300 300
301 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) 301 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
302 val = 1; 302 val = 1;
303 ret = sprintf(buf, "%d\n", val); 303 ret = sprintf(buf, "%d\n", val);
304 return ret; 304 return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
313 val = simple_strtol(buf, NULL, 0); 313 val = simple_strtol(buf, NULL, 0);
314 314
315 if (val == 1) 315 if (val == 1)
316 set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); 316 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
317 else if (val == 0) { 317 else if (val == 0) {
318 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); 318 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
319 smp_mb__after_clear_bit(); 319 smp_mb__after_clear_bit();
320 gfs2_glock_thaw(sdp); 320 gfs2_glock_thaw(sdp);
321 } else { 321 } else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
350 goto out; 350 goto out;
351 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 351 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
352 goto out; 352 goto out;
353 sdp->sd_lockstruct.ls_first = first; 353 sdp->sd_lockstruct.ls_first = first;
354 rv = 0; 354 rv = 0;
355out: 355out:
356 spin_unlock(&sdp->sd_jindex_spin); 356 spin_unlock(&sdp->sd_jindex_spin);
357 return rv ? rv : len; 357 return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
360static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) 360static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
361{ 361{
362 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 362 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
363 return sprintf(buf, "%d\n", ls->ls_first_done); 363 return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
364} 364}
365 365
366static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 366int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
367{ 367{
368 unsigned jid;
369 struct gfs2_jdesc *jd; 368 struct gfs2_jdesc *jd;
370 int rv; 369 int rv;
371 370
372 rv = sscanf(buf, "%u", &jid);
373 if (rv != 1)
374 return -EINVAL;
375
376 rv = -ESHUTDOWN; 371 rv = -ESHUTDOWN;
377 spin_lock(&sdp->sd_jindex_spin); 372 spin_lock(&sdp->sd_jindex_spin);
378 if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) 373 if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
389 } 384 }
390out: 385out:
391 spin_unlock(&sdp->sd_jindex_spin); 386 spin_unlock(&sdp->sd_jindex_spin);
387 return rv;
388}
389
390static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
391{
392 unsigned jid;
393 int rv;
394
395 rv = sscanf(buf, "%u", &jid);
396 if (rv != 1)
397 return -EINVAL;
398
399 rv = gfs2_recover_set(sdp, jid);
400
392 return rv ? rv : len; 401 return rv ? rv : len;
393} 402}
394 403
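
The sysfs hunk splits recover_store() so the parsing stays in the store method while the action moves into gfs2_recover_set(), which other callers can now use directly. A small userspace model of the resulting split, with model_* names standing in for the kernel functions.

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

/* Illustrative stand-in for gfs2_recover_set(): returns 0 on success or a
 * negative errno, exactly like the kernel helper is expected to. */
static int model_recover_set(unsigned jid)
{
	printf("queue recovery for jid %u\n", jid);
	return 0;
}

/* Model of the store side: validate the buffer, then hand the parsed jid
 * to the setter, mirroring recover_store() above. */
static ssize_t model_recover_store(const char *buf, size_t len)
{
	unsigned jid;
	int rv;

	if (sscanf(buf, "%u", &jid) != 1)
		return -EINVAL;

	rv = model_recover_set(jid);
	return rv ? rv : (ssize_t)len;
}

int main(void)
{
	printf("%zd\n", model_recover_store("3\n", 2));
	printf("%zd\n", model_recover_store("oops", 4));
	return 0;
}
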
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d7..79182d6ad6ac 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19int gfs2_sys_init(void); 19int gfs2_sys_init(void);
20void gfs2_sys_uninit(void); 20void gfs2_sys_uninit(void);
21 21
22int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
23
22#endif /* __SYS_DOT_H__ */ 24#endif /* __SYS_DOT_H__ */
23 25
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
583} 583}
584 584
585static int hugetlbfs_migrate_page(struct address_space *mapping, 585static int hugetlbfs_migrate_page(struct address_space *mapping,
586 struct page *newpage, struct page *page) 586 struct page *newpage, struct page *page,
587 enum migrate_mode mode)
587{ 588{
588 int rc; 589 int rc;
589 590
diff --git a/fs/inode.c b/fs/inode.c
index 4fa4f0916af9..fb10d86ffad7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -322,9 +322,6 @@ EXPORT_SYMBOL(clear_nlink);
322void set_nlink(struct inode *inode, unsigned int nlink) 322void set_nlink(struct inode *inode, unsigned int nlink)
323{ 323{
324 if (!nlink) { 324 if (!nlink) {
325 printk_ratelimited(KERN_INFO
326 "set_nlink() clearing i_nlink on %s inode %li\n",
327 inode->i_sb->s_type->name, inode->i_ino);
328 clear_nlink(inode); 325 clear_nlink(inode);
329 } else { 326 } else {
330 /* Yes, some filesystems do change nlink from zero to one */ 327 /* Yes, some filesystems do change nlink from zero to one */
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..f84b380d65e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 task_lock(task); 51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 do { 52 if (ioc) {
53 ioc = task->io_context; 53 ioc_ioprio_changed(ioc, ioprio);
54 /* see wmb() in current_io_context() */ 54 put_io_context(ioc, NULL);
55 smp_read_barrier_depends();
56 if (ioc)
57 break;
58
59 ioc = alloc_io_context(GFP_ATOMIC, -1);
60 if (!ioc) {
61 err = -ENOMEM;
62 break;
63 }
64 task->io_context = ioc;
65 } while (1);
66
67 if (!err) {
68 ioc->ioprio = ioprio;
69 ioc->ioprio_changed = 1;
70 } 55 }
71 56
72 task_unlock(task);
73 return err; 57 return err;
74} 58}
75EXPORT_SYMBOL_GPL(set_task_ioprio); 59EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 23d7451b2938..65ba36b80a9e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock);
55 * Local NSM state 55 * Local NSM state
56 */ 56 */
57u32 __read_mostly nsm_local_state; 57u32 __read_mostly nsm_local_state;
58int __read_mostly nsm_use_hostnames; 58bool __read_mostly nsm_use_hostnames;
59 59
60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) 60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61{ 61{
diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98cd..643e9f55ef29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
371 sector_t last_block_in_bio = 0; 371 sector_t last_block_in_bio = 0;
372 struct buffer_head map_bh; 372 struct buffer_head map_bh;
373 unsigned long first_logical_block = 0; 373 unsigned long first_logical_block = 0;
374 struct blk_plug plug;
375
376 blk_start_plug(&plug);
377 374
378 map_bh.b_state = 0; 375 map_bh.b_state = 0;
379 map_bh.b_size = 0; 376 map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
395 BUG_ON(!list_empty(pages)); 392 BUG_ON(!list_empty(pages));
396 if (bio) 393 if (bio)
397 mpage_bio_submit(READ, bio); 394 mpage_bio_submit(READ, bio);
398 blk_finish_plug(&plug);
399 return 0; 395 return 0;
400} 396}
401EXPORT_SYMBOL(mpage_readpages); 397EXPORT_SYMBOL(mpage_readpages);
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
140 140
141static char *getname_flags(const char __user *filename, int flags, int *empty) 141static char *getname_flags(const char __user *filename, int flags, int *empty)
142{ 142{
143 char *tmp, *result; 143 char *result = __getname();
144 144 int retval;
145 result = ERR_PTR(-ENOMEM); 145
146 tmp = __getname(); 146 if (!result)
147 if (tmp) { 147 return ERR_PTR(-ENOMEM);
148 int retval = do_getname(filename, tmp); 148
149 149 retval = do_getname(filename, result);
150 result = tmp; 150 if (retval < 0) {
151 if (retval < 0) { 151 if (retval == -ENOENT && empty)
152 if (retval == -ENOENT && empty) 152 *empty = 1;
153 *empty = 1; 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
154 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 154 __putname(result);
155 __putname(tmp); 155 return ERR_PTR(retval);
156 result = ERR_PTR(retval);
157 }
158 } 156 }
159 } 157 }
160 audit_getname(result); 158 audit_getname(result);
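
The getname_flags() rewrite replaces the nested if/else around __getname() with early returns, keeping the buffer only when an empty name is acceptable. A userspace model of the same control flow; model_getname, the fixed 64-byte buffer and errno are stand-ins for __getname()/__putname() and ERR_PTR(), not the kernel API.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Allocate up front, return early on failure, and keep the buffer for an
 * empty name only when the caller allows it (LOOKUP_EMPTY in the kernel). */
static char *model_getname(const char *user, int allow_empty, int *empty)
{
	char *result = malloc(64);		/* stands in for __getname() */
	if (!result)
		return NULL;			/* ERR_PTR(-ENOMEM) in the kernel */

	if (user[0] == '\0') {			/* models the -ENOENT case */
		if (empty)
			*empty = 1;
		if (!allow_empty) {
			free(result);		/* __putname() */
			errno = ENOENT;
			return NULL;
		}
	}
	snprintf(result, 64, "%s", user);
	return result;
}

int main(void)
{
	int empty = 0;
	char *n = model_getname("", 1, &empty);

	printf("empty=%d name='%s'\n", empty, n ? n : "(err)");
	free(n);
	return 0;
}
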
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95932c9..48cfac31f64c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,9 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
90 */ 90 */
91struct parallel_io { 91struct parallel_io {
92 struct kref refcnt; 92 struct kref refcnt;
93 struct rpc_call_ops call_ops; 93 void (*pnfs_callback) (void *data, int num_se);
94 void (*pnfs_callback) (void *data);
95 void *data; 94 void *data;
95 int bse_count;
96}; 96};
97 97
98static inline struct parallel_io *alloc_parallel(void *data) 98static inline struct parallel_io *alloc_parallel(void *data)
@@ -103,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
103 if (rv) { 103 if (rv) {
104 rv->data = data; 104 rv->data = data;
105 kref_init(&rv->refcnt); 105 kref_init(&rv->refcnt);
106 rv->bse_count = 0;
106 } 107 }
107 return rv; 108 return rv;
108} 109}
@@ -117,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
117 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 118 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118 119
119 dprintk("%s enter\n", __func__); 120 dprintk("%s enter\n", __func__);
120 p->pnfs_callback(p->data); 121 p->pnfs_callback(p->data, p->bse_count);
121 kfree(p); 122 kfree(p);
122} 123}
123 124
@@ -146,14 +147,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
146{ 147{
147 struct bio *bio; 148 struct bio *bio;
148 149
150 npg = min(npg, BIO_MAX_PAGES);
149 bio = bio_alloc(GFP_NOIO, npg); 151 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio) 152 if (!bio && (current->flags & PF_MEMALLOC)) {
151 return NULL; 153 while (!bio && (npg /= 2))
154 bio = bio_alloc(GFP_NOIO, npg);
155 }
152 156
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; 157 if (bio) {
154 bio->bi_bdev = be->be_mdev; 158 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
155 bio->bi_end_io = end_io; 159 bio->bi_bdev = be->be_mdev;
156 bio->bi_private = par; 160 bio->bi_end_io = end_io;
161 bio->bi_private = par;
162 }
157 return bio; 163 return bio;
158} 164}
159 165
@@ -212,22 +218,15 @@ static void bl_read_cleanup(struct work_struct *work)
212} 218}
213 219
214static void 220static void
215bl_end_par_io_read(void *data) 221bl_end_par_io_read(void *data, int unused)
216{ 222{
217 struct nfs_read_data *rdata = data; 223 struct nfs_read_data *rdata = data;
218 224
225 rdata->task.tk_status = rdata->pnfs_error;
219 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 226 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
220 schedule_work(&rdata->task.u.tk_work); 227 schedule_work(&rdata->task.u.tk_work);
221} 228}
222 229
223/* We don't want normal .rpc_call_done callback used, so we replace it
224 * with this stub.
225 */
226static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
227{
228 return;
229}
230
231static enum pnfs_try_status 230static enum pnfs_try_status
232bl_read_pagelist(struct nfs_read_data *rdata) 231bl_read_pagelist(struct nfs_read_data *rdata)
233{ 232{
@@ -247,8 +246,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
247 par = alloc_parallel(rdata); 246 par = alloc_parallel(rdata);
248 if (!par) 247 if (!par)
249 goto use_mds; 248 goto use_mds;
250 par->call_ops = *rdata->mds_ops;
251 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
252 par->pnfs_callback = bl_end_par_io_read; 249 par->pnfs_callback = bl_end_par_io_read;
253 /* At this point, we can no longer jump to use_mds */ 250 /* At this point, we can no longer jump to use_mds */
254 251
@@ -322,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
322{ 319{
323 sector_t isect, end; 320 sector_t isect, end;
324 struct pnfs_block_extent *be; 321 struct pnfs_block_extent *be;
322 struct pnfs_block_short_extent *se;
325 323
326 dprintk("%s(%llu, %u)\n", __func__, offset, count); 324 dprintk("%s(%llu, %u)\n", __func__, offset, count);
327 if (count == 0) 325 if (count == 0)
@@ -334,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
334 be = bl_find_get_extent(bl, isect, NULL); 332 be = bl_find_get_extent(bl, isect, NULL);
335 BUG_ON(!be); /* FIXME */ 333 BUG_ON(!be); /* FIXME */
336 len = min(end, be->be_f_offset + be->be_length) - isect; 334 len = min(end, be->be_f_offset + be->be_length) - isect;
337 if (be->be_state == PNFS_BLOCK_INVALID_DATA) 335 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
338 bl_mark_for_commit(be, isect, len); /* What if fails? */ 336 se = bl_pop_one_short_extent(be->be_inval);
337 BUG_ON(!se);
338 bl_mark_for_commit(be, isect, len, se);
339 }
339 isect += len; 340 isect += len;
340 bl_put_extent(be); 341 bl_put_extent(be);
341 } 342 }
@@ -357,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
357 end_page_writeback(page); 358 end_page_writeback(page);
358 page_cache_release(page); 359 page_cache_release(page);
359 } while (bvec >= bio->bi_io_vec); 360 } while (bvec >= bio->bi_io_vec);
360 if (!uptodate) { 361
362 if (unlikely(!uptodate)) {
361 if (!wdata->pnfs_error) 363 if (!wdata->pnfs_error)
362 wdata->pnfs_error = -EIO; 364 wdata->pnfs_error = -EIO;
363 pnfs_set_lo_fail(wdata->lseg); 365 pnfs_set_lo_fail(wdata->lseg);
@@ -366,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
366 put_parallel(par); 368 put_parallel(par);
367} 369}
368 370
369/* This is basically copied from mpage_end_io_read */
370static void bl_end_io_write(struct bio *bio, int err) 371static void bl_end_io_write(struct bio *bio, int err)
371{ 372{
372 struct parallel_io *par = bio->bi_private; 373 struct parallel_io *par = bio->bi_private;
@@ -392,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work)
392 dprintk("%s enter\n", __func__); 393 dprintk("%s enter\n", __func__);
393 task = container_of(work, struct rpc_task, u.tk_work); 394 task = container_of(work, struct rpc_task, u.tk_work);
394 wdata = container_of(task, struct nfs_write_data, task); 395 wdata = container_of(task, struct nfs_write_data, task);
395 if (!wdata->pnfs_error) { 396 if (likely(!wdata->pnfs_error)) {
396 /* Marks for LAYOUTCOMMIT */ 397 /* Marks for LAYOUTCOMMIT */
397 mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 398 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
398 wdata->args.offset, wdata->args.count); 399 wdata->args.offset, wdata->args.count);
@@ -401,11 +402,16 @@ static void bl_write_cleanup(struct work_struct *work)
401} 402}
402 403
403/* Called when last of bios associated with a bl_write_pagelist call finishes */ 404/* Called when last of bios associated with a bl_write_pagelist call finishes */
404static void bl_end_par_io_write(void *data) 405static void bl_end_par_io_write(void *data, int num_se)
405{ 406{
406 struct nfs_write_data *wdata = data; 407 struct nfs_write_data *wdata = data;
407 408
408 wdata->task.tk_status = 0; 409 if (unlikely(wdata->pnfs_error)) {
410 bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
411 num_se);
412 }
413
414 wdata->task.tk_status = wdata->pnfs_error;
409 wdata->verf.committed = NFS_FILE_SYNC; 415 wdata->verf.committed = NFS_FILE_SYNC;
410 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 416 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
411 schedule_work(&wdata->task.u.tk_work); 417 schedule_work(&wdata->task.u.tk_work);
@@ -484,6 +490,55 @@ cleanup:
484 return ret; 490 return ret;
485} 491}
486 492
493/* Find or create a zeroing page marked as being under writeback.
494 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
495 * to indicate write out.
496 */
497static struct page *
498bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
499 struct pnfs_block_extent *cow_read)
500{
501 struct page *page;
502 int locked = 0;
503 page = find_get_page(inode->i_mapping, index);
504 if (page)
505 goto check_page;
506
507 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
508 if (unlikely(!page)) {
509 dprintk("%s oom\n", __func__);
510 return ERR_PTR(-ENOMEM);
511 }
512 locked = 1;
513
514check_page:
515 /* PageDirty: Other will write this out
516 * PageWriteback: Other is writing this out
517 * PageUptodate: It was read before
518 */
519 if (PageDirty(page) || PageWriteback(page)) {
520 print_page(page);
521 if (locked)
522 unlock_page(page);
523 page_cache_release(page);
524 return NULL;
525 }
526
527 if (!locked) {
528 lock_page(page);
529 locked = 1;
530 goto check_page;
531 }
532 if (!PageUptodate(page)) {
533 /* New page, read in or zero it */
534 init_page_for_write(page, cow_read);
535 }
536 set_page_writeback(page);
537 unlock_page(page);
538
539 return page;
540}
541
487static enum pnfs_try_status 542static enum pnfs_try_status
488bl_write_pagelist(struct nfs_write_data *wdata, int sync) 543bl_write_pagelist(struct nfs_write_data *wdata, int sync)
489{ 544{
@@ -508,9 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
508 */ 563 */
509 par = alloc_parallel(wdata); 564 par = alloc_parallel(wdata);
510 if (!par) 565 if (!par)
511 return PNFS_NOT_ATTEMPTED; 566 goto out_mds;
512 par->call_ops = *wdata->mds_ops;
513 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
514 par->pnfs_callback = bl_end_par_io_write; 567 par->pnfs_callback = bl_end_par_io_write;
515 /* At this point, have to be more careful with error handling */ 568 /* At this point, have to be more careful with error handling */
516 569
@@ -518,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
518 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); 571 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
519 if (!be || !is_writable(be, isect)) { 572 if (!be || !is_writable(be, isect)) {
520 dprintk("%s no matching extents!\n", __func__); 573 dprintk("%s no matching extents!\n", __func__);
521 wdata->pnfs_error = -EINVAL; 574 goto out_mds;
522 goto out;
523 } 575 }
524 576
525 /* First page inside INVALID extent */ 577 /* First page inside INVALID extent */
526 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 578 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
579 if (likely(!bl_push_one_short_extent(be->be_inval)))
580 par->bse_count++;
581 else
582 goto out_mds;
527 temp = offset >> PAGE_CACHE_SHIFT; 583 temp = offset >> PAGE_CACHE_SHIFT;
528 npg_zero = do_div(temp, npg_per_block); 584 npg_zero = do_div(temp, npg_per_block);
529 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & 585 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -543,36 +599,16 @@ fill_invalid_ext:
543 dprintk("%s zero %dth page: index %lu isect %llu\n", 599 dprintk("%s zero %dth page: index %lu isect %llu\n",
544 __func__, npg_zero, index, 600 __func__, npg_zero, index,
545 (unsigned long long)isect); 601 (unsigned long long)isect);
546 page = 602 page = bl_find_get_zeroing_page(wdata->inode, index,
547 find_or_create_page(wdata->inode->i_mapping, index, 603 cow_read);
548 GFP_NOFS); 604 if (unlikely(IS_ERR(page))) {
549 if (!page) { 605 wdata->pnfs_error = PTR_ERR(page);
550 dprintk("%s oom\n", __func__);
551 wdata->pnfs_error = -ENOMEM;
552 goto out; 606 goto out;
553 } 607 } else if (page == NULL)
554
555 /* PageDirty: Other will write this out
556 * PageWriteback: Other is writing this out
557 * PageUptodate: It was read before
558 * sector_initialized: already written out
559 */
560 if (PageDirty(page) || PageWriteback(page)) {
561 print_page(page);
562 unlock_page(page);
563 page_cache_release(page);
564 goto next_page; 608 goto next_page;
565 }
566 if (!PageUptodate(page)) {
567 /* New page, readin or zero it */
568 init_page_for_write(page, cow_read);
569 }
570 set_page_writeback(page);
571 unlock_page(page);
572 609
573 ret = bl_mark_sectors_init(be->be_inval, isect, 610 ret = bl_mark_sectors_init(be->be_inval, isect,
574 PAGE_CACHE_SECTORS, 611 PAGE_CACHE_SECTORS);
575 NULL);
576 if (unlikely(ret)) { 612 if (unlikely(ret)) {
577 dprintk("%s bl_mark_sectors_init fail %d\n", 613 dprintk("%s bl_mark_sectors_init fail %d\n",
578 __func__, ret); 614 __func__, ret);
@@ -581,6 +617,19 @@ fill_invalid_ext:
581 wdata->pnfs_error = ret; 617 wdata->pnfs_error = ret;
582 goto out; 618 goto out;
583 } 619 }
620 if (likely(!bl_push_one_short_extent(be->be_inval)))
621 par->bse_count++;
622 else {
623 end_page_writeback(page);
624 page_cache_release(page);
625 wdata->pnfs_error = -ENOMEM;
626 goto out;
627 }
628 /* FIXME: This should be done in bi_end_io */
629 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
630 page->index << PAGE_CACHE_SHIFT,
631 PAGE_CACHE_SIZE);
632
584 bio = bl_add_page_to_bio(bio, npg_zero, WRITE, 633 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
585 isect, page, be, 634 isect, page, be,
586 bl_end_io_write_zero, par); 635 bl_end_io_write_zero, par);
@@ -589,10 +638,6 @@ fill_invalid_ext:
589 bio = NULL; 638 bio = NULL;
590 goto out; 639 goto out;
591 } 640 }
592 /* FIXME: This should be done in bi_end_io */
593 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
594 page->index << PAGE_CACHE_SHIFT,
595 PAGE_CACHE_SIZE);
596next_page: 641next_page:
597 isect += PAGE_CACHE_SECTORS; 642 isect += PAGE_CACHE_SECTORS;
598 extent_length -= PAGE_CACHE_SECTORS; 643 extent_length -= PAGE_CACHE_SECTORS;
@@ -616,13 +661,21 @@ next_page:
616 wdata->pnfs_error = -EINVAL; 661 wdata->pnfs_error = -EINVAL;
617 goto out; 662 goto out;
618 } 663 }
664 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
665 if (likely(!bl_push_one_short_extent(
666 be->be_inval)))
667 par->bse_count++;
668 else {
669 wdata->pnfs_error = -ENOMEM;
670 goto out;
671 }
672 }
619 extent_length = be->be_length - 673 extent_length = be->be_length -
620 (isect - be->be_f_offset); 674 (isect - be->be_f_offset);
621 } 675 }
622 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 676 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
623 ret = bl_mark_sectors_init(be->be_inval, isect, 677 ret = bl_mark_sectors_init(be->be_inval, isect,
624 PAGE_CACHE_SECTORS, 678 PAGE_CACHE_SECTORS);
625 NULL);
626 if (unlikely(ret)) { 679 if (unlikely(ret)) {
627 dprintk("%s bl_mark_sectors_init fail %d\n", 680 dprintk("%s bl_mark_sectors_init fail %d\n",
628 __func__, ret); 681 __func__, ret);
@@ -664,6 +717,10 @@ out:
664 bl_submit_bio(WRITE, bio); 717 bl_submit_bio(WRITE, bio);
665 put_parallel(par); 718 put_parallel(par);
666 return PNFS_ATTEMPTED; 719 return PNFS_ATTEMPTED;
720out_mds:
721 bl_put_extent(be);
722 kfree(par);
723 return PNFS_NOT_ATTEMPTED;
667} 724}
668 725
669/* FIXME - range ignored */ 726/* FIXME - range ignored */
@@ -690,11 +747,17 @@ static void
690release_inval_marks(struct pnfs_inval_markings *marks) 747release_inval_marks(struct pnfs_inval_markings *marks)
691{ 748{
692 struct pnfs_inval_tracking *pos, *temp; 749 struct pnfs_inval_tracking *pos, *temp;
750 struct pnfs_block_short_extent *se, *stemp;
693 751
694 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { 752 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
695 list_del(&pos->it_link); 753 list_del(&pos->it_link);
696 kfree(pos); 754 kfree(pos);
697 } 755 }
756
757 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
758 list_del(&se->bse_node);
759 kfree(se);
760 }
698 return; 761 return;
699} 762}
700 763
@@ -779,16 +842,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
779static void free_blk_mountid(struct block_mount_id *mid) 842static void free_blk_mountid(struct block_mount_id *mid)
780{ 843{
781 if (mid) { 844 if (mid) {
782 struct pnfs_block_dev *dev; 845 struct pnfs_block_dev *dev, *tmp;
783 spin_lock(&mid->bm_lock); 846
784 while (!list_empty(&mid->bm_devlist)) { 847 /* No need to take bm_lock as we are last user freeing bm_devlist */
785 dev = list_first_entry(&mid->bm_devlist, 848 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
786 struct pnfs_block_dev,
787 bm_node);
788 list_del(&dev->bm_node); 849 list_del(&dev->bm_node);
789 bl_free_block_dev(dev); 850 bl_free_block_dev(dev);
790 } 851 }
791 spin_unlock(&mid->bm_lock);
792 kfree(mid); 852 kfree(mid);
793 } 853 }
794} 854}
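
In this file struct parallel_io drops the cloned rpc_call_ops and instead counts reserved short extents in bse_count, which the final completion passes to the pnfs callback so unused reservations can be freed on error. A single-threaded userspace model of that reference-count-then-callback shape (the kernel uses kref and atomic counting; model_* names are illustrative).

#include <stdio.h>
#include <stdlib.h>

/* One reference per in-flight bio; whoever drops the last reference runs
 * the completion callback with the accumulated short-extent count. */
struct model_parallel {
	int refcnt;
	int bse_count;
	void (*done)(void *data, int num_se);
	void *data;
};

static void model_get(struct model_parallel *p) { p->refcnt++; }

static void model_put(struct model_parallel *p)
{
	if (--p->refcnt == 0) {
		p->done(p->data, p->bse_count);
		free(p);
	}
}

static void write_done(void *data, int num_se)
{
	printf("all bios finished for %s, %d short extents reserved\n",
	       (char *)data, num_se);
}

int main(void)
{
	struct model_parallel *par = calloc(1, sizeof(*par));

	if (!par)
		return 1;
	par->refcnt = 1;			/* reference held by the submitter */
	par->done = write_done;
	par->data = "wdata";

	model_get(par);  par->bse_count++;	/* bio 1, INVALID extent */
	model_get(par);				/* bio 2 */

	model_put(par);				/* bio 1 completes */
	model_put(par);				/* bio 2 completes */
	model_put(par);				/* submitter's final put */
	return 0;
}
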
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7ef5992..e31a2df28e70 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
70 spinlock_t im_lock; 70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */ 72 sector_t im_block_size; /* Server blocksize in sectors */
73 struct list_head im_extents; /* Short extents for INVAL->RW conversion */
73}; 74};
74 75
75struct pnfs_inval_tracking { 76struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{ 106{
106 spin_lock_init(&marks->im_lock); 107 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub); 108 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
109 INIT_LIST_HEAD(&marks->im_extents);
108 marks->im_block_size = blocksize; 110 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, 111 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize); 112 blocksize);
@@ -186,8 +188,7 @@ struct pnfs_block_extent *
186bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
187 struct pnfs_block_extent **cow_read); 189 struct pnfs_block_extent **cow_read);
188int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
189 sector_t offset, sector_t length, 191 sector_t offset, sector_t length);
190 sector_t **pages);
191void bl_put_extent(struct pnfs_block_extent *be); 192void bl_put_extent(struct pnfs_block_extent *be);
192struct pnfs_block_extent *bl_alloc_extent(void); 193struct pnfs_block_extent *bl_alloc_extent(void);
193int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
@@ -200,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200int bl_add_merge_extent(struct pnfs_block_layout *bl, 201int bl_add_merge_extent(struct pnfs_block_layout *bl,
201 struct pnfs_block_extent *new); 202 struct pnfs_block_extent *new);
202int bl_mark_for_commit(struct pnfs_block_extent *be, 203int bl_mark_for_commit(struct pnfs_block_extent *be,
203 sector_t offset, sector_t length); 204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
204 210
205#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0b8c00..1abac09f7cd5 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
110 return 0; 110 return 0;
111 } else { 111 } else {
112 struct pnfs_inval_tracking *new; 112 struct pnfs_inval_tracking *new;
113 if (storage) 113 new = storage;
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s; 114 new->it_sector = s;
121 new->it_tags = (1 << tag); 115 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link); 116 list_add(&new->it_link, &pos->it_link);
@@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
139} 133}
140 134
141/* Ensure that future operations on given range of tree will not malloc */ 135/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length) 136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
143{ 138{
144 u64 start, end, s; 139 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM; 140 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage; 141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
147 143
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); 144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size); 145 start = normalize(offset, tree->mtt_step_size);
@@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
161 goto out_cleanup; 157 goto out_cleanup;
162 } 158 }
163 159
164 /* Now need lock - HOW??? */ 160 spin_lock_bh(&marks->im_lock);
165
166 for (s = start; s < end; s += tree->mtt_step_size) 161 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); 162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
168 164
169 /* Unlock - HOW??? */
170 status = 0; 165 status = 0;
171 166
172 out_cleanup: 167 out_cleanup:
@@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
179 return status; 174 return status;
180} 175}
181 176
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on page lock to serialize this */ 177/* We are relying on page lock to serialize this */
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) 178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{ 179{
212 int rv; 180 int rv;
213 181
214 spin_lock(&marks->im_lock); 182 spin_lock_bh(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); 183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock); 184 spin_unlock_bh(&marks->im_lock);
217 return rv; 185 return rv;
218} 186}
219 187
@@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks,
253{ 221{
254 int rv; 222 int rv;
255 223
256 spin_lock(&marks->im_lock); 224 spin_lock_bh(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); 225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock); 226 spin_unlock_bh(&marks->im_lock);
259 return rv; 227 return rv;
260} 228}
261 229
262/* Marks sectors in [offset, offset_length) as having been initialized. 230/* Marks sectors in [offset, offset_length) as having been initialized.
263 * All lengths are step-aligned, where step is min(pagesize, blocksize). 231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where partial block is initialized, and helps prepare it for 232 * Currently assumes offset is page-aligned
265 * complete initialization later.
266 */ 233 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length, 235 sector_t offset, sector_t length)
270 sector_t **pages)
271{ 236{
272 sector_t s, start, end; 237 sector_t start, end;
273 sector_t *array = NULL; /* Pages to mark */
274 238
275 dprintk("%s(offset=%llu,len=%llu) enter\n", 239 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length); 240 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286 241
287 start = normalize(offset, marks->im_block_size); 242 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size); 243 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start)) 244 if (_preload_range(marks, start, end - start))
290 goto outerr; 245 goto outerr;
291 246
292 spin_lock(&marks->im_lock); 247 spin_lock_bh(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) 248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock; 249 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); 250 spin_unlock_bh(&marks->im_lock);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311 251
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0; 252 return 0;
320 253
321 out_unlock: 254out_unlock:
322 spin_unlock(&marks->im_lock); 255 spin_unlock_bh(&marks->im_lock);
323 outerr: 256outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM; 257 return -ENOMEM;
329} 258}
330 259
@@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
338 267
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, 268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length); 269 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock); 270 spin_lock_bh(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); 271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock); 272 spin_unlock_bh(&marks->im_lock);
344 return status; 273 return status;
345} 274}
346 275
@@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
440 369
441/* Note the range described by offset, length is guaranteed to be contained 370/* Note the range described by offset, length is guaranteed to be contained
442 * within be. 371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
443 */ 374 */
444int bl_mark_for_commit(struct pnfs_block_extent *be, 375int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length) 376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
446{ 378{
447 sector_t new_end, end = offset + length; 379 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval, 380 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout, 381 struct pnfs_block_layout,
451 bl_inval); 382 bl_inval);
452 383
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length); 384 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be 385 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has 386 * block-normalized, and verified that the normalized range has
@@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
483 new->bse_mdev = be->be_mdev; 410 new->bse_mdev = be->be_mdev;
484 411
485 spin_lock(&bl->bl_ext_lock); 412 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new); 413 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock); 414 spin_unlock(&bl->bl_ext_lock);
491 return 0; 415 return 0;
@@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
933 } 857 }
934 } 858 }
935} 859}
860
861int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
862{
863 struct pnfs_block_short_extent *new;
864
865 new = kmalloc(sizeof(*new), GFP_NOFS);
866 if (unlikely(!new))
867 return -ENOMEM;
868
869 spin_lock_bh(&marks->im_lock);
870 list_add(&new->bse_node, &marks->im_extents);
871 spin_unlock_bh(&marks->im_lock);
872
873 return 0;
874}
875
876struct pnfs_block_short_extent *
877bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
878{
879 struct pnfs_block_short_extent *rv = NULL;
880
881 spin_lock_bh(&marks->im_lock);
882 if (!list_empty(&marks->im_extents)) {
883 rv = list_entry((&marks->im_extents)->next,
884 struct pnfs_block_short_extent, bse_node);
885 list_del_init(&rv->bse_node);
886 }
887 spin_unlock_bh(&marks->im_lock);
888
889 return rv;
890}
891
892void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
893{
894 struct pnfs_block_short_extent *se = NULL, *tmp;
895
896 if (num_to_free <= 0)
897 return;
898
899 spin_lock(&marks->im_lock);
900 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
901 list_del(&se->bse_node);
902 kfree(se);
903 if (--num_to_free == 0)
904 break;
905 }
906 spin_unlock(&marks->im_lock);
907
908 BUG_ON(num_to_free > 0);
909}
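
bl_push_one_short_extent() and bl_pop_one_short_extent() above reserve the commit-list node while a GFP_NOFS allocation is still permitted and hand it out later under im_lock, where allocating would not be safe. A userspace model of that reserve-then-consume list; model_* names are illustrative and the kernel protects the list with spin_lock_bh.

#include <stdio.h>
#include <stdlib.h>

struct model_se {
	struct model_se *next;
	int id;
};

static struct model_se *pool;

/* Reserve a node up front, before entering the critical section. */
static int model_push_one(int id)
{
	struct model_se *se = malloc(sizeof(*se));
	if (!se)
		return -1;
	se->id = id;
	se->next = pool;		/* spin_lock_bh(&im_lock) in the kernel */
	pool = se;
	return 0;
}

/* Hand out a pre-allocated node; never allocates, so safe under a lock. */
static struct model_se *model_pop_one(void)
{
	struct model_se *se = pool;
	if (se)
		pool = se->next;
	return se;
}

int main(void)
{
	struct model_se *se;

	model_push_one(1);		/* reserved when the write was set up */
	se = model_pop_one();		/* consumed at commit time */
	printf("popped %d\n", se ? se->id : -1);
	free(se);
	return 0;
}
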
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07df5f1d85e5..c89d3b9e483c 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -162,7 +162,7 @@ struct cb_layoutrecallargs {
162 }; 162 };
163}; 163};
164 164
165extern unsigned nfs4_callback_layoutrecall( 165extern __be32 nfs4_callback_layoutrecall(
166 struct cb_layoutrecallargs *args, 166 struct cb_layoutrecallargs *args,
167 void *dummy, struct cb_process_state *cps); 167 void *dummy, struct cb_process_state *cps);
168 168
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 726e59a9e50f..d50b2742f23b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
305 n = ntohl(*p++); 305 n = ntohl(*p++);
306 if (n <= 0) 306 if (n <= 0)
307 goto out; 307 goto out;
308 if (n > ULONG_MAX / sizeof(*args->devs)) {
309 status = htonl(NFS4ERR_BADXDR);
310 goto out;
311 }
308 312
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); 313 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) { 314 if (!args->devs) {
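
The added bound check stops a decoded count from overflowing the n * sizeof(*args->devs) multiplication before it reaches kmalloc(). A standalone model of the same guard; dev_entry and alloc_devs are illustrative, and the real decoder returns NFS4ERR_BADXDR rather than NULL.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

struct dev_entry { unsigned long id; char name[32]; };

/* Reject any count that would make n * sizeof(*devs) wrap around before
 * it ever reaches the allocator. */
static struct dev_entry *alloc_devs(unsigned long n)
{
	if (n == 0 || n > ULONG_MAX / sizeof(struct dev_entry))
		return NULL;
	return malloc(n * sizeof(struct dev_entry));
}

int main(void)
{
	struct dev_entry *devs = alloc_devs(4);
	struct dev_entry *bogus = alloc_devs(ULONG_MAX / 8);	/* rejected */

	printf("devs=%s bogus=%s\n", devs ? "ok" : "null",
	       bogus ? "ok" : "null");
	free(devs);
	free(bogus);
	return 0;
}
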
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 277dfaf2e99a..31778f74357d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS 85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */ 86 */
87static int nfs4_disable_idmapping = 1; 87static bool nfs4_disable_idmapping = true;
88 88
89/* 89/*
90 * RPC cruft for NFS 90 * RPC cruft for NFS
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 25c3bfad7953..f649fba8c384 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,7 +57,7 @@
57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
58 58
59/* Default is to see 64-bit inode numbers */ 59/* Default is to see 64-bit inode numbers */
60static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; 60static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
61 61
62static void nfs_invalidate_inode(struct inode *); 62static void nfs_invalidate_inode(struct inode *);
63static int nfs_update_inode(struct inode *, struct nfs_fattr *); 63static int nfs_update_inode(struct inode *, struct nfs_fattr *);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ee92538b063..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
332 332
333#ifdef CONFIG_MIGRATION 333#ifdef CONFIG_MIGRATION
334extern int nfs_migrate_page(struct address_space *, 334extern int nfs_migrate_page(struct address_space *,
335 struct page *, struct page *); 335 struct page *, struct page *, enum migrate_mode);
336#else 336#else
337#define nfs_migrate_page NULL 337#define nfs_migrate_page NULL
338#endif 338#endif
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ed388aae9689..8ae91908f5aa 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
382{ 382{
383 struct nfs4_pnfs_ds_addr *da = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
384 char *buf, *portstr; 384 char *buf, *portstr;
385 u32 port; 385 __be16 port;
386 int nlen, rlen; 386 int nlen, rlen;
387 int tmp[2]; 387 int tmp[2];
388 __be32 *p; 388 __be32 *p;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75366dc89686..f0c849c98fe4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3587,7 +3587,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3587 res.acl_flags |= NFS4_ACL_LEN_REQUEST; 3587 res.acl_flags |= NFS4_ACL_LEN_REQUEST;
3588 resp_buf = page_address(pages[0]); 3588 resp_buf = page_address(pages[0]);
3589 3589
3590 dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n", 3590 dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
3591 __func__, buf, buflen, npages, args.acl_len); 3591 __func__, buf, buflen, npages, args.acl_len);
3592 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), 3592 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
3593 &msg, &args.seq_args, &res.seq_res, 0); 3593 &msg, &args.seq_args, &res.seq_res, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0c3885255f97..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1688,7 +1688,7 @@ out_error:
1688 1688
1689#ifdef CONFIG_MIGRATION 1689#ifdef CONFIG_MIGRATION
1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1691 struct page *page) 1691 struct page *page, enum migrate_mode mode)
1692{ 1692{
1693 /* 1693 /*
1694 * If PagePrivate is set, then the page is currently associated with 1694 * If PagePrivate is set, then the page is currently associated with
@@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1703 1703
1704 nfs_fscache_release_page(page, GFP_KERNEL); 1704 nfs_fscache_release_page(page, GFP_KERNEL);
1705 1705
1706 return migrate_page(mapping, newpage, page); 1706 return migrate_page(mapping, newpage, page, mode);
1707} 1707}
1708#endif 1708#endif
1709 1709
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 10e6366608f2..8df1ea4a6ff9 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -80,3 +80,13 @@ config NFSD_V4
80 available from http://linux-nfs.org/. 80 available from http://linux-nfs.org/.
81 81
82 If unsure, say N. 82 If unsure, say N.
83
84config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL
87 help
88 This option enables support for manually injecting faults
89 into the NFS server. This is intended to be used for
90 testing error recovery on the NFS client.
91
92 If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9b118ee20193..af32ef06b4fe 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o
6 6
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
9nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
10nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o 11nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
11nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 62f3b9074e84..cf8a6bd062fa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
87 struct svc_expkey key; 87 struct svc_expkey key;
88 struct svc_expkey *ek = NULL; 88 struct svc_expkey *ek = NULL;
89 89
90 if (mesg[mlen-1] != '\n') 90 if (mlen < 1 || mesg[mlen-1] != '\n')
91 return -EINVAL; 91 return -EINVAL;
92 mesg[mlen-1] = 0; 92 mesg[mlen-1] = 0;
93 93
@@ -1226,12 +1226,12 @@ nfsd_export_init(void)
1226 int rv; 1226 int rv;
1227 dprintk("nfsd: initializing export module.\n"); 1227 dprintk("nfsd: initializing export module.\n");
1228 1228
1229 rv = cache_register(&svc_export_cache); 1229 rv = cache_register_net(&svc_export_cache, &init_net);
1230 if (rv) 1230 if (rv)
1231 return rv; 1231 return rv;
1232 rv = cache_register(&svc_expkey_cache); 1232 rv = cache_register_net(&svc_expkey_cache, &init_net);
1233 if (rv) 1233 if (rv)
1234 cache_unregister(&svc_export_cache); 1234 cache_unregister_net(&svc_export_cache, &init_net);
1235 return rv; 1235 return rv;
1236 1236
1237} 1237}
@@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void)
1255 1255
1256 dprintk("nfsd: shutting down export module.\n"); 1256 dprintk("nfsd: shutting down export module.\n");
1257 1257
1258 cache_unregister(&svc_expkey_cache); 1258 cache_unregister_net(&svc_expkey_cache, &init_net);
1259 cache_unregister(&svc_export_cache); 1259 cache_unregister_net(&svc_export_cache, &init_net);
1260 svcauth_unix_purge(); 1260 svcauth_unix_purge();
1261 1261
1262 dprintk("nfsd: export shutdown complete.\n"); 1262 dprintk("nfsd: export shutdown complete.\n");
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000000..ce7f0758d84c
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,91 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Uses debugfs to create fault injection points for client testing
5 */
6
7#include <linux/types.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/module.h>
11
12#include "state.h"
13#include "fault_inject.h"
14
15struct nfsd_fault_inject_op {
16 char *file;
17 void (*func)(u64);
18};
19
20static struct nfsd_fault_inject_op inject_ops[] = {
21 {
22 .file = "forget_clients",
23 .func = nfsd_forget_clients,
24 },
25 {
26 .file = "forget_locks",
27 .func = nfsd_forget_locks,
28 },
29 {
30 .file = "forget_openowners",
31 .func = nfsd_forget_openowners,
32 },
33 {
34 .file = "forget_delegations",
35 .func = nfsd_forget_delegations,
36 },
37 {
38 .file = "recall_delegations",
39 .func = nfsd_recall_delegations,
40 },
41};
42
43static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
44static struct dentry *debug_dir;
45
46static int nfsd_inject_set(void *op_ptr, u64 val)
47{
48 struct nfsd_fault_inject_op *op = op_ptr;
49
50 if (val == 0)
51 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
52 else
53 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
54
55 op->func(val);
56 return 0;
57}
58
59static int nfsd_inject_get(void *data, u64 *val)
60{
61 return 0;
62}
63
64DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
65
66void nfsd_fault_inject_cleanup(void)
67{
68 debugfs_remove_recursive(debug_dir);
69}
70
71int nfsd_fault_inject_init(void)
72{
73 unsigned int i;
74 struct nfsd_fault_inject_op *op;
75 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
76
77 debug_dir = debugfs_create_dir("nfsd", NULL);
78 if (!debug_dir)
79 goto fail;
80
81 for (i = 0; i < NUM_INJECT_OPS; i++) {
82 op = &inject_ops[i];
83 if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
84 goto fail;
85 }
86 return 0;
87
88fail:
89 nfsd_fault_inject_cleanup();
90 return -ENOMEM;
91}
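With the files above in place, the interface is driven by writing a count to the matching debugfs file; nfsd_inject_set() treats 0 as "act on everything". A hedged user-space sketch, assuming debugfs is mounted at the conventional /sys/kernel/debug; the helper and the example values are illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Write n to one of the fault-injection files created by
 * nfsd_fault_inject_init() above, e.g. "forget_clients" or
 * "recall_delegations".  n == 0 asks the server to act on all
 * matching objects.
 */
static int nfsd_inject(const char *op, unsigned long long n)
{
        char path[96], val[32];
        int fd, len, ret = 0;

        snprintf(path, sizeof(path), "/sys/kernel/debug/nfsd/%s", op);
        len = snprintf(val, sizeof(val), "%llu\n", n);

        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        if (write(fd, val, len) != len)
                ret = -1;
        close(fd);
        return ret;
}

int main(void)
{
        /* Forget two delegations, then expire every known client. */
        if (nfsd_inject("forget_delegations", 2) || nfsd_inject("forget_clients", 0))
                perror("nfsd fault injection");
        return 0;
}
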
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
new file mode 100644
index 000000000000..90bd0570956c
--- /dev/null
+++ b/fs/nfsd/fault_inject.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Function definitions for fault injection
5 */
6
7#ifndef LINUX_NFSD_FAULT_INJECT_H
8#define LINUX_NFSD_FAULT_INJECT_H
9
10#ifdef CONFIG_NFSD_FAULT_INJECTION
11int nfsd_fault_inject_init(void);
12void nfsd_fault_inject_cleanup(void);
13void nfsd_forget_clients(u64);
14void nfsd_forget_locks(u64);
15void nfsd_forget_openowners(u64);
16void nfsd_forget_delegations(u64);
17void nfsd_recall_delegations(u64);
18#else /* CONFIG_NFSD_FAULT_INJECTION */
19static inline int nfsd_fault_inject_init(void) { return 0; }
20static inline void nfsd_fault_inject_cleanup(void) {}
21static inline void nfsd_forget_clients(u64 num) {}
22static inline void nfsd_forget_locks(u64 num) {}
23static inline void nfsd_forget_openowners(u64 num) {}
24static inline void nfsd_forget_delegations(u64 num) {}
25static inline void nfsd_recall_delegations(u64 num) {}
26#endif /* CONFIG_NFSD_FAULT_INJECTION */
27
28#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 55780a22fdbd..94096273cd6c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <net/net_namespace.h>
39#include "idmap.h" 40#include "idmap.h"
40#include "nfsd.h" 41#include "nfsd.h"
41 42
@@ -466,20 +467,20 @@ nfsd_idmap_init(void)
466{ 467{
467 int rv; 468 int rv;
468 469
469 rv = cache_register(&idtoname_cache); 470 rv = cache_register_net(&idtoname_cache, &init_net);
470 if (rv) 471 if (rv)
471 return rv; 472 return rv;
472 rv = cache_register(&nametoid_cache); 473 rv = cache_register_net(&nametoid_cache, &init_net);
473 if (rv) 474 if (rv)
474 cache_unregister(&idtoname_cache); 475 cache_unregister_net(&idtoname_cache, &init_net);
475 return rv; 476 return rv;
476} 477}
477 478
478void 479void
479nfsd_idmap_shutdown(void) 480nfsd_idmap_shutdown(void)
480{ 481{
481 cache_unregister(&idtoname_cache); 482 cache_unregister_net(&idtoname_cache, &init_net);
482 cache_unregister(&nametoid_cache); 483 cache_unregister_net(&nametoid_cache, &init_net);
483} 484}
484 485
485static int 486static int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c5e28ed8bca0..896da74ec563 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
266{ 266{
267 __be32 status; 267 __be32 status;
268 268
269 /* Only reclaims from previously confirmed clients are valid */
270 if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
271 return status;
272
273 /* We don't know the target directory, and therefore can not 269 /* We don't know the target directory, and therefore can not
274 * set the change info 270 * set the change info
275 */ 271 */
@@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
373 break; 369 break;
374 case NFS4_OPEN_CLAIM_PREVIOUS: 370 case NFS4_OPEN_CLAIM_PREVIOUS:
375 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 371 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
372 status = nfs4_check_open_reclaim(&open->op_clientid);
373 if (status)
374 goto out;
376 case NFS4_OPEN_CLAIM_FH: 375 case NFS4_OPEN_CLAIM_FH:
377 case NFS4_OPEN_CLAIM_DELEG_CUR_FH: 376 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
378 status = do_open_fhandle(rqstp, &cstate->current_fh, 377 status = do_open_fhandle(rqstp, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 80a0be9ed008..0b3e875d1abd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -117,8 +117,7 @@ out_no_tfm:
117 return status; 117 return status;
118} 118}
119 119
120int 120void nfsd4_create_clid_dir(struct nfs4_client *clp)
121nfsd4_create_clid_dir(struct nfs4_client *clp)
122{ 121{
123 const struct cred *original_cred; 122 const struct cred *original_cred;
124 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
@@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
127 126
128 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
129 128
130 if (!rec_file || clp->cl_firststate) 129 if (clp->cl_firststate)
131 return 0; 130 return;
132
133 clp->cl_firststate = 1; 131 clp->cl_firststate = 1;
132 if (!rec_file)
133 return;
134 status = nfs4_save_creds(&original_cred); 134 status = nfs4_save_creds(&original_cred);
135 if (status < 0) 135 if (status < 0)
136 return status; 136 return;
137 137
138 dir = rec_file->f_path.dentry; 138 dir = rec_file->f_path.dentry;
139 /* lock the parent */ 139 /* lock the parent */
@@ -144,8 +144,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
144 status = PTR_ERR(dentry); 144 status = PTR_ERR(dentry);
145 goto out_unlock; 145 goto out_unlock;
146 } 146 }
147 status = -EEXIST;
148 if (dentry->d_inode) 147 if (dentry->d_inode)
148 /*
149 * In the 4.1 case, where we're called from
150 * reclaim_complete(), records from the previous reboot
151 * may still be left, so this is OK.
152 *
153 * In the 4.0 case, we should never get here; but we may
154 * as well be forgiving and just succeed silently.
155 */
149 goto out_put; 156 goto out_put;
150 status = mnt_want_write_file(rec_file); 157 status = mnt_want_write_file(rec_file);
151 if (status) 158 if (status)
@@ -164,7 +171,6 @@ out_unlock:
164 " and is writeable", status, 171 " and is writeable", status,
165 user_recovery_dirname); 172 user_recovery_dirname);
166 nfs4_reset_creds(original_cred); 173 nfs4_reset_creds(original_cred);
167 return status;
168} 174}
169 175
170typedef int (recdir_func)(struct dentry *, struct dentry *); 176typedef int (recdir_func)(struct dentry *, struct dentry *);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ca16dc09e04..e8c98f009670 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,12 +49,20 @@
49time_t nfsd4_lease = 90; /* default lease time */ 49time_t nfsd4_lease = 90; /* default lease time */
50time_t nfsd4_grace = 90; 50time_t nfsd4_grace = 90;
51static time_t boot_time; 51static time_t boot_time;
52static stateid_t zerostateid; /* bits all 0 */ 52
53static stateid_t onestateid; /* bits all 1 */ 53#define all_ones {{~0,~0},~0}
54static const stateid_t one_stateid = {
55 .si_generation = ~0,
56 .si_opaque = all_ones,
57};
58static const stateid_t zero_stateid = {
59 /* all fields zero */
60};
61
54static u64 current_sessionid = 1; 62static u64 current_sessionid = 1;
55 63
56#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 64#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
57#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 65#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
58 66
59/* forward declarations */ 67/* forward declarations */
60static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); 68static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
@@ -133,21 +141,21 @@ unsigned int max_delegations;
133 * Open owner state (share locks) 141 * Open owner state (share locks)
134 */ 142 */
135 143
136/* hash tables for open owners */ 144/* hash tables for lock and open owners */
137#define OPEN_OWNER_HASH_BITS 8 145#define OWNER_HASH_BITS 8
138#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) 146#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
139#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) 147#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
140 148
141static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) 149static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
142{ 150{
143 unsigned int ret; 151 unsigned int ret;
144 152
145 ret = opaque_hashval(ownername->data, ownername->len); 153 ret = opaque_hashval(ownername->data, ownername->len);
146 ret += clientid; 154 ret += clientid;
147 return ret & OPEN_OWNER_HASH_MASK; 155 return ret & OWNER_HASH_MASK;
148} 156}
149 157
150static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; 158static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
151 159
152/* hash table for nfs4_file */ 160/* hash table for nfs4_file */
153#define FILE_HASH_BITS 8 161#define FILE_HASH_BITS 8
@@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
514 522
515 list_del(&lo->lo_owner.so_strhash); 523 list_del(&lo->lo_owner.so_strhash);
516 list_del(&lo->lo_perstateid); 524 list_del(&lo->lo_perstateid);
525 list_del(&lo->lo_owner_ino_hash);
517 while (!list_empty(&lo->lo_owner.so_stateids)) { 526 while (!list_empty(&lo->lo_owner.so_stateids)) {
518 stp = list_first_entry(&lo->lo_owner.so_stateids, 527 stp = list_first_entry(&lo->lo_owner.so_stateids,
519 struct nfs4_ol_stateid, st_perstateowner); 528 struct nfs4_ol_stateid, st_perstateowner);
@@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
985 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 994 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
986 if (clp == NULL) 995 if (clp == NULL)
987 return NULL; 996 return NULL;
988 clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); 997 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
989 if (clp->cl_name.data == NULL) { 998 if (clp->cl_name.data == NULL) {
990 kfree(clp); 999 kfree(clp);
991 return NULL; 1000 return NULL;
992 } 1001 }
993 memcpy(clp->cl_name.data, name.data, name.len);
994 clp->cl_name.len = name.len; 1002 clp->cl_name.len = name.len;
995 return clp; 1003 return clp;
996} 1004}
@@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp)
1058 spin_unlock(&recall_lock); 1066 spin_unlock(&recall_lock);
1059 while (!list_empty(&reaplist)) { 1067 while (!list_empty(&reaplist)) {
1060 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1068 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1061 list_del_init(&dp->dl_recall_lru);
1062 unhash_delegation(dp); 1069 unhash_delegation(dp);
1063 } 1070 }
1064 while (!list_empty(&clp->cl_openowners)) { 1071 while (!list_empty(&clp->cl_openowners)) {
@@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void)
2301 nfsd4_free_slab(&deleg_slab); 2308 nfsd4_free_slab(&deleg_slab);
2302} 2309}
2303 2310
2304static int 2311int
2305nfsd4_init_slabs(void) 2312nfsd4_init_slabs(void)
2306{ 2313{
2307 openowner_slab = kmem_cache_create("nfsd4_openowners", 2314 openowner_slab = kmem_cache_create("nfsd4_openowners",
@@ -2373,7 +2380,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2373 2380
2374static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 2381static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2375{ 2382{
2376 list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); 2383 list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
2377 list_add(&oo->oo_perclient, &clp->cl_openowners); 2384 list_add(&oo->oo_perclient, &clp->cl_openowners);
2378} 2385}
2379 2386
@@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
2436 struct nfs4_stateowner *so; 2443 struct nfs4_stateowner *so;
2437 struct nfs4_openowner *oo; 2444 struct nfs4_openowner *oo;
2438 2445
2439 list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { 2446 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2447 if (!so->so_is_open_owner)
2448 continue;
2440 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2449 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2441 oo = openowner(so); 2450 oo = openowner(so);
2442 renew_client(oo->oo_owner.so_client); 2451 renew_client(oo->oo_owner.so_client);
@@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2580 if (open->op_file == NULL) 2589 if (open->op_file == NULL)
2581 return nfserr_jukebox; 2590 return nfserr_jukebox;
2582 2591
2583 strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); 2592 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2584 oo = find_openstateowner_str(strhashval, open); 2593 oo = find_openstateowner_str(strhashval, open);
2585 open->op_openowner = oo; 2594 open->op_openowner = oo;
2586 if (!oo) { 2595 if (!oo) {
@@ -3123,7 +3132,6 @@ nfs4_laundromat(void)
3123 spin_unlock(&recall_lock); 3132 spin_unlock(&recall_lock);
3124 list_for_each_safe(pos, next, &reaplist) { 3133 list_for_each_safe(pos, next, &reaplist) {
3125 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3134 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3126 list_del_init(&dp->dl_recall_lru);
3127 unhash_delegation(dp); 3135 unhash_delegation(dp);
3128 } 3136 }
3129 test_val = nfsd4_lease; 3137 test_val = nfsd4_lease;
@@ -3718,13 +3726,11 @@ out:
3718} 3726}
3719 3727
3720 3728
3721/*
3722 * Lock owner state (byte-range locks)
3723 */
3724#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 3729#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
3725#define LOCK_HASH_BITS 8 3730
3726#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 3731#define LOCKOWNER_INO_HASH_BITS 8
3727#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 3732#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
3733#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
3728 3734
3729static inline u64 3735static inline u64
3730end_offset(u64 start, u64 len) 3736end_offset(u64 start, u64 len)
@@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len)
3746 return end > start ? end - 1: NFS4_MAX_UINT64; 3752 return end > start ? end - 1: NFS4_MAX_UINT64;
3747} 3753}
3748 3754
3749static inline unsigned int 3755static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
3750lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
3751 struct xdr_netobj *ownername)
3752{ 3756{
3753 return (file_hashval(inode) + cl_id 3757 return (file_hashval(inode) + cl_id
3754 + opaque_hashval(ownername->data, ownername->len)) 3758 + opaque_hashval(ownername->data, ownername->len))
3755 & LOCK_HASH_MASK; 3759 & LOCKOWNER_INO_HASH_MASK;
3756} 3760}
3757 3761
3758static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; 3762static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
3759 3763
3760/* 3764/*
3761 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3765 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3809,23 +3813,39 @@ nevermind:
3809 deny->ld_type = NFS4_WRITE_LT; 3813 deny->ld_type = NFS4_WRITE_LT;
3810} 3814}
3811 3815
3816static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
3817{
3818 struct nfs4_ol_stateid *lst;
3819
3820 if (!same_owner_str(&lo->lo_owner, owner, clid))
3821 return false;
3822 lst = list_first_entry(&lo->lo_owner.so_stateids,
3823 struct nfs4_ol_stateid, st_perstateowner);
3824 return lst->st_file->fi_inode == inode;
3825}
3826
3812static struct nfs4_lockowner * 3827static struct nfs4_lockowner *
3813find_lockowner_str(struct inode *inode, clientid_t *clid, 3828find_lockowner_str(struct inode *inode, clientid_t *clid,
3814 struct xdr_netobj *owner) 3829 struct xdr_netobj *owner)
3815{ 3830{
3816 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); 3831 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
3817 struct nfs4_stateowner *op; 3832 struct nfs4_lockowner *lo;
3818 3833
3819 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { 3834 list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
3820 if (same_owner_str(op, owner, clid)) 3835 if (same_lockowner_ino(lo, inode, clid, owner))
3821 return lockowner(op); 3836 return lo;
3822 } 3837 }
3823 return NULL; 3838 return NULL;
3824} 3839}
3825 3840
3826static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) 3841static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
3827{ 3842{
3828 list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); 3843 struct inode *inode = open_stp->st_file->fi_inode;
3844 unsigned int inohash = lockowner_ino_hashval(inode,
3845 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
3846
3847 list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
3848 list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
3829 list_add(&lo->lo_perstateid, &open_stp->st_lockowners); 3849 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3830} 3850}
3831 3851
@@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
3834 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3854 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
3835 * occurred. 3855 * occurred.
3836 * 3856 *
3837 * strhashval = lock_ownerstr_hashval 3857 * strhashval = ownerstr_hashval
3838 */ 3858 */
3839 3859
3840static struct nfs4_lockowner * 3860static struct nfs4_lockowner *
@@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
3892 __set_bit(access, &lock_stp->st_access_bmap); 3912 __set_bit(access, &lock_stp->st_access_bmap);
3893} 3913}
3894 3914
3915__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
3916{
3917 struct nfs4_file *fi = ost->st_file;
3918 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
3919 struct nfs4_client *cl = oo->oo_owner.so_client;
3920 struct nfs4_lockowner *lo;
3921 unsigned int strhashval;
3922
3923 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
3924 if (lo) {
3925 if (!cstate->minorversion)
3926 return nfserr_bad_seqid;
3927 /* XXX: a lockowner always has exactly one stateid: */
3928 *lst = list_first_entry(&lo->lo_owner.so_stateids,
3929 struct nfs4_ol_stateid, st_perstateowner);
3930 return nfs_ok;
3931 }
3932 strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
3933 &lock->v.new.owner);
3934 lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
3935 if (lo == NULL)
3936 return nfserr_jukebox;
3937 *lst = alloc_init_lock_stateid(lo, fi, ost);
3938 if (*lst == NULL) {
3939 release_lockowner(lo);
3940 return nfserr_jukebox;
3941 }
3942 *new = true;
3943 return nfs_ok;
3944}
3945
3895/* 3946/*
3896 * LOCK operation 3947 * LOCK operation
3897 */ 3948 */
@@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3907 struct file_lock file_lock; 3958 struct file_lock file_lock;
3908 struct file_lock conflock; 3959 struct file_lock conflock;
3909 __be32 status = 0; 3960 __be32 status = 0;
3910 unsigned int strhashval; 3961 bool new_state = false;
3911 int lkflg; 3962 int lkflg;
3912 int err; 3963 int err;
3913 3964
@@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3933 * lock stateid. 3984 * lock stateid.
3934 */ 3985 */
3935 struct nfs4_ol_stateid *open_stp = NULL; 3986 struct nfs4_ol_stateid *open_stp = NULL;
3936 3987
3988 if (nfsd4_has_session(cstate))
3989 /* See rfc 5661 18.10.3: given clientid is ignored: */
3990 memcpy(&lock->v.new.clientid,
3991 &cstate->session->se_client->cl_clientid,
3992 sizeof(clientid_t));
3993
3937 status = nfserr_stale_clientid; 3994 status = nfserr_stale_clientid;
3938 if (!nfsd4_has_session(cstate) && 3995 if (STALE_CLIENTID(&lock->lk_new_clientid))
3939 STALE_CLIENTID(&lock->lk_new_clientid))
3940 goto out; 3996 goto out;
3941 3997
3942 /* validate and update open stateid and open seqid */ 3998 /* validate and update open stateid and open seqid */
@@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3948 goto out; 4004 goto out;
3949 open_sop = openowner(open_stp->st_stateowner); 4005 open_sop = openowner(open_stp->st_stateowner);
3950 status = nfserr_bad_stateid; 4006 status = nfserr_bad_stateid;
3951 if (!nfsd4_has_session(cstate) && 4007 if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3952 !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3953 &lock->v.new.clientid)) 4008 &lock->v.new.clientid))
3954 goto out; 4009 goto out;
3955 /* create lockowner and lock stateid */ 4010 status = lookup_or_create_lock_state(cstate, open_stp, lock,
3956 fp = open_stp->st_file; 4011 &lock_stp, &new_state);
3957 strhashval = lock_ownerstr_hashval(fp->fi_inode, 4012 if (status)
3958 open_sop->oo_owner.so_client->cl_clientid.cl_id,
3959 &lock->v.new.owner);
3960 /* XXX: Do we need to check for duplicate stateowners on
3961 * the same file, or should they just be allowed (and
3962 * create new stateids)? */
3963 status = nfserr_jukebox;
3964 lock_sop = alloc_init_lock_stateowner(strhashval,
3965 open_sop->oo_owner.so_client, open_stp, lock);
3966 if (lock_sop == NULL)
3967 goto out;
3968 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
3969 if (lock_stp == NULL)
3970 goto out; 4013 goto out;
3971 } else { 4014 } else {
3972 /* lock (lock owner + lock stateid) already exists */ 4015 /* lock (lock owner + lock stateid) already exists */
@@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3976 NFS4_LOCK_STID, &lock_stp); 4019 NFS4_LOCK_STID, &lock_stp);
3977 if (status) 4020 if (status)
3978 goto out; 4021 goto out;
3979 lock_sop = lockowner(lock_stp->st_stateowner);
3980 fp = lock_stp->st_file;
3981 } 4022 }
3982 /* lock_sop and lock_stp have been created or found */ 4023 lock_sop = lockowner(lock_stp->st_stateowner);
4024 fp = lock_stp->st_file;
3983 4025
3984 lkflg = setlkflg(lock->lk_type); 4026 lkflg = setlkflg(lock->lk_type);
3985 status = nfs4_check_openmode(lock_stp, lkflg); 4027 status = nfs4_check_openmode(lock_stp, lkflg);
@@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4054 break; 4096 break;
4055 } 4097 }
4056out: 4098out:
4057 if (status && lock->lk_is_new && lock_sop) 4099 if (status && new_state)
4058 release_lockowner(lock_sop); 4100 release_lockowner(lock_sop);
4059 if (!cstate->replay_owner) 4101 if (!cstate->replay_owner)
4060 nfs4_unlock_state(); 4102 nfs4_unlock_state();
@@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4251 struct nfs4_ol_stateid *stp; 4293 struct nfs4_ol_stateid *stp;
4252 struct xdr_netobj *owner = &rlockowner->rl_owner; 4294 struct xdr_netobj *owner = &rlockowner->rl_owner;
4253 struct list_head matches; 4295 struct list_head matches;
4254 int i; 4296 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4255 __be32 status; 4297 __be32 status;
4256 4298
4257 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 4299 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
@@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4266 nfs4_lock_state(); 4308 nfs4_lock_state();
4267 4309
4268 status = nfserr_locks_held; 4310 status = nfserr_locks_held;
4269 /* XXX: we're doing a linear search through all the lockowners.
4270 * Yipes! For now we'll just hope clients aren't really using
4271 * release_lockowner much, but eventually we have to fix these
4272 * data structures. */
4273 INIT_LIST_HEAD(&matches); 4311 INIT_LIST_HEAD(&matches);
4274 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4312
4275 list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { 4313 list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
4276 if (!same_owner_str(sop, owner, clid)) 4314 if (sop->so_is_open_owner)
4277 continue; 4315 continue;
4278 list_for_each_entry(stp, &sop->so_stateids, 4316 if (!same_owner_str(sop, owner, clid))
4279 st_perstateowner) { 4317 continue;
4280 lo = lockowner(sop); 4318 list_for_each_entry(stp, &sop->so_stateids,
4281 if (check_for_locks(stp->st_file, lo)) 4319 st_perstateowner) {
4282 goto out; 4320 lo = lockowner(sop);
4283 list_add(&lo->lo_list, &matches); 4321 if (check_for_locks(stp->st_file, lo))
4284 } 4322 goto out;
4323 list_add(&lo->lo_list, &matches);
4285 } 4324 }
4286 } 4325 }
4287 /* Clients probably won't expect us to return with some (but not all) 4326 /* Clients probably won't expect us to return with some (but not all)
@@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid)
4394 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; 4433 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
4395} 4434}
4396 4435
4436#ifdef CONFIG_NFSD_FAULT_INJECTION
4437
4438void nfsd_forget_clients(u64 num)
4439{
4440 struct nfs4_client *clp, *next;
4441 int count = 0;
4442
4443 nfs4_lock_state();
4444 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4445 nfsd4_remove_clid_dir(clp);
4446 expire_client(clp);
4447 if (++count == num)
4448 break;
4449 }
4450 nfs4_unlock_state();
4451
4452 printk(KERN_INFO "NFSD: Forgot %d clients", count);
4453}
4454
4455static void release_lockowner_sop(struct nfs4_stateowner *sop)
4456{
4457 release_lockowner(lockowner(sop));
4458}
4459
4460static void release_openowner_sop(struct nfs4_stateowner *sop)
4461{
4462 release_openowner(openowner(sop));
4463}
4464
4465static int nfsd_release_n_owners(u64 num, bool is_open_owner,
4466 void (*release_sop)(struct nfs4_stateowner *))
4467{
4468 int i, count = 0;
4469 struct nfs4_stateowner *sop, *next;
4470
4471 for (i = 0; i < OWNER_HASH_SIZE; i++) {
4472 list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
4473 if (sop->so_is_open_owner != is_open_owner)
4474 continue;
4475 release_sop(sop);
4476 if (++count == num)
4477 return count;
4478 }
4479 }
4480 return count;
4481}
4482
4483void nfsd_forget_locks(u64 num)
4484{
4485 int count;
4486
4487 nfs4_lock_state();
4488 count = nfsd_release_n_owners(num, false, release_lockowner_sop);
4489 nfs4_unlock_state();
4490
4491 printk(KERN_INFO "NFSD: Forgot %d locks", count);
4492}
4493
4494void nfsd_forget_openowners(u64 num)
4495{
4496 int count;
4497
4498 nfs4_lock_state();
4499 count = nfsd_release_n_owners(num, true, release_openowner_sop);
4500 nfs4_unlock_state();
4501
4502 printk(KERN_INFO "NFSD: Forgot %d open owners", count);
4503}
4504
4505int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *))
4506{
4507 int i, count = 0;
4508 struct nfs4_file *fp, *fnext;
4509 struct nfs4_delegation *dp, *dnext;
4510
4511 for (i = 0; i < FILE_HASH_SIZE; i++) {
4512 list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
4513 list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
4514 deleg_func(dp);
4515 if (++count == num)
4516 return count;
4517 }
4518 }
4519 }
4520
4521 return count;
4522}
4523
4524void nfsd_forget_delegations(u64 num)
4525{
4526 unsigned int count;
4527
4528 nfs4_lock_state();
4529 count = nfsd_process_n_delegations(num, unhash_delegation);
4530 nfs4_unlock_state();
4531
4532 printk(KERN_INFO "NFSD: Forgot %d delegations", count);
4533}
4534
4535void nfsd_recall_delegations(u64 num)
4536{
4537 unsigned int count;
4538
4539 nfs4_lock_state();
4540 spin_lock(&recall_lock);
4541 count = nfsd_process_n_delegations(num, nfsd_break_one_deleg);
4542 spin_unlock(&recall_lock);
4543 nfs4_unlock_state();
4544
4545 printk(KERN_INFO "NFSD: Recalled %d delegations", count);
4546}
4547
4548#endif /* CONFIG_NFSD_FAULT_INJECTION */
4549
4397/* initialization to perform at module load time: */ 4550/* initialization to perform at module load time: */
4398 4551
4399int 4552void
4400nfs4_state_init(void) 4553nfs4_state_init(void)
4401{ 4554{
4402 int i, status; 4555 int i;
4403 4556
4404 status = nfsd4_init_slabs();
4405 if (status)
4406 return status;
4407 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4557 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4408 INIT_LIST_HEAD(&conf_id_hashtbl[i]); 4558 INIT_LIST_HEAD(&conf_id_hashtbl[i]);
4409 INIT_LIST_HEAD(&conf_str_hashtbl[i]); 4559 INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -4416,18 +4566,15 @@ nfs4_state_init(void)
4416 for (i = 0; i < FILE_HASH_SIZE; i++) { 4566 for (i = 0; i < FILE_HASH_SIZE; i++) {
4417 INIT_LIST_HEAD(&file_hashtbl[i]); 4567 INIT_LIST_HEAD(&file_hashtbl[i]);
4418 } 4568 }
4419 for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { 4569 for (i = 0; i < OWNER_HASH_SIZE; i++) {
4420 INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); 4570 INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
4421 }
4422 for (i = 0; i < LOCK_HASH_SIZE; i++) {
4423 INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
4424 } 4571 }
4425 memset(&onestateid, ~0, sizeof(stateid_t)); 4572 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
4573 INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
4426 INIT_LIST_HEAD(&close_lru); 4574 INIT_LIST_HEAD(&close_lru);
4427 INIT_LIST_HEAD(&client_lru); 4575 INIT_LIST_HEAD(&client_lru);
4428 INIT_LIST_HEAD(&del_recall_lru); 4576 INIT_LIST_HEAD(&del_recall_lru);
4429 reclaim_str_hashtbl_size = 0; 4577 reclaim_str_hashtbl_size = 0;
4430 return 0;
4431} 4578}
4432 4579
4433static void 4580static void
@@ -4526,7 +4673,6 @@ __nfs4_state_shutdown(void)
4526 spin_unlock(&recall_lock); 4673 spin_unlock(&recall_lock);
4527 list_for_each_safe(pos, next, &reaplist) { 4674 list_for_each_safe(pos, next, &reaplist) {
4528 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 4675 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
4529 list_del_init(&dp->dl_recall_lru);
4530 unhash_delegation(dp); 4676 unhash_delegation(dp);
4531 } 4677 }
4532 4678
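Much of the nfs4state.c churn above is structural: the separate open-owner and lock-owner string hash tables collapse into a single ownerstr_hashtbl, lookups such as find_openstateowner_str() and nfsd4_release_lockowner() now filter bucket entries on so_is_open_owner, and lock owners gain an extra per-(owner, inode) hash. A toy user-space sketch of that shared-bucket-plus-type-flag lookup pattern; the structures and names below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define OWNER_HASH_BITS 8
#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)

struct owner {
        struct owner *next;             /* bucket chain */
        bool is_open_owner;             /* open owner vs. lock owner */
        const char *name;
};

static struct owner *buckets[OWNER_HASH_SIZE];

static unsigned int owner_hash(const char *name)
{
        unsigned int h = 0;

        while (*name)
                h = h * 31 + (unsigned char)*name++;
        return h & OWNER_HASH_MASK;
}

static void hash_owner(struct owner *o)
{
        unsigned int b = owner_hash(o->name);

        o->next = buckets[b];
        buckets[b] = o;
}

/* Both owner types live in the same table; callers say which kind
 * they want, mirroring the so_is_open_owner checks added above. */
static struct owner *find_owner(const char *name, bool want_open_owner)
{
        struct owner *o;

        for (o = buckets[owner_hash(name)]; o; o = o->next) {
                if (o->is_open_owner != want_open_owner)
                        continue;
                if (strcmp(o->name, name) == 0)
                        return o;
        }
        return NULL;
}

int main(void)
{
        struct owner oo = { .is_open_owner = true,  .name = "client1:owner" };
        struct owner lo = { .is_open_owner = false, .name = "client1:owner" };

        hash_owner(&oo);
        hash_owner(&lo);
        printf("open owner found: %d, lock owner found: %d\n",
               find_owner("client1:owner", true)  == &oo,
               find_owner("client1:owner", false) == &lo);
        return 0;
}

Sharing one table keeps walkers such as nfsd_release_n_owners() to a single scan, while the flag preserves the old per-type lookup semantics.
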
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b6fa792d6b85..0ec5a1b9700e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp,
215static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 215static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
216{ 216{
217 if (p == argp->tmp) { 217 if (p == argp->tmp) {
218 p = kmalloc(nbytes, GFP_KERNEL); 218 p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
219 if (!p) 219 if (!p)
220 return NULL; 220 return NULL;
221 memcpy(p, argp->tmp, nbytes);
222 } else { 221 } else {
223 BUG_ON(p != argp->tmpp); 222 BUG_ON(p != argp->tmpp);
224 argp->tmpp = NULL; 223 argp->tmpp = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bb4a11d58a5a..748eda93ce59 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -18,6 +18,7 @@
18#include "idmap.h" 18#include "idmap.h"
19#include "nfsd.h" 19#include "nfsd.h"
20#include "cache.h" 20#include "cache.h"
21#include "fault_inject.h"
21 22
22/* 23/*
23 * We have a single directory with several nodes in it. 24 * We have a single directory with several nodes in it.
@@ -1128,9 +1129,13 @@ static int __init init_nfsd(void)
1128 int retval; 1129 int retval;
1129 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); 1130 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
1130 1131
1131 retval = nfs4_state_init(); /* nfs4 locking state */ 1132 retval = nfsd4_init_slabs();
1132 if (retval) 1133 if (retval)
1133 return retval; 1134 return retval;
1135 nfs4_state_init();
1136 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1137 if (retval)
1138 goto out_free_slabs;
1134 nfsd_stat_init(); /* Statistics */ 1139 nfsd_stat_init(); /* Statistics */
1135 retval = nfsd_reply_cache_init(); 1140 retval = nfsd_reply_cache_init();
1136 if (retval) 1141 if (retval)
@@ -1161,6 +1166,8 @@ out_free_cache:
1161 nfsd_reply_cache_shutdown(); 1166 nfsd_reply_cache_shutdown();
1162out_free_stat: 1167out_free_stat:
1163 nfsd_stat_shutdown(); 1168 nfsd_stat_shutdown();
1169 nfsd_fault_inject_cleanup();
1170out_free_slabs:
1164 nfsd4_free_slabs(); 1171 nfsd4_free_slabs();
1165 return retval; 1172 return retval;
1166} 1173}
@@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void)
1175 nfsd_lockd_shutdown(); 1182 nfsd_lockd_shutdown();
1176 nfsd_idmap_shutdown(); 1183 nfsd_idmap_shutdown();
1177 nfsd4_free_slabs(); 1184 nfsd4_free_slabs();
1185 nfsd_fault_inject_cleanup();
1178 unregister_filesystem(&nfsd_fs_type); 1186 unregister_filesystem(&nfsd_fs_type);
1179} 1187}
1180 1188
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 58134a23fdfb..1d1e8589b4ce 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
104 */ 104 */
105#ifdef CONFIG_NFSD_V4 105#ifdef CONFIG_NFSD_V4
106extern unsigned int max_delegations; 106extern unsigned int max_delegations;
107int nfs4_state_init(void); 107void nfs4_state_init(void);
108int nfsd4_init_slabs(void);
108void nfsd4_free_slabs(void); 109void nfsd4_free_slabs(void);
109int nfs4_state_start(void); 110int nfs4_state_start(void);
110void nfs4_state_shutdown(void); 111void nfs4_state_shutdown(void);
111void nfs4_reset_lease(time_t leasetime); 112void nfs4_reset_lease(time_t leasetime);
112int nfs4_reset_recoverydir(char *recdir); 113int nfs4_reset_recoverydir(char *recdir);
113#else 114#else
114static inline int nfs4_state_init(void) { return 0; } 115static inline void nfs4_state_init(void) { }
116static inline int nfsd4_init_slabs(void) { return 0; }
115static inline void nfsd4_free_slabs(void) { } 117static inline void nfsd4_free_slabs(void) { }
116static inline int nfs4_state_start(void) { return 0; } 118static inline int nfs4_state_start(void) { return 0; }
117static inline void nfs4_state_shutdown(void) { } 119static inline void nfs4_state_shutdown(void) { }
@@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
338} 340}
339 341
340/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ 342/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
341#define NFSD_WRITEONLY_ATTRS_WORD1 \ 343#define NFSD_WRITEONLY_ATTRS_WORD1 \
342(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) 344 (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
343 345
344/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ 346/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
345#define NFSD_WRITEABLE_ATTRS_WORD0 \ 347#define NFSD_WRITEABLE_ATTRS_WORD0 \
346(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) 348 (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
347#define NFSD_WRITEABLE_ATTRS_WORD1 \ 349#define NFSD_WRITEABLE_ATTRS_WORD1 \
348(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ 350 (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
349 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) 351 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
350#define NFSD_WRITEABLE_ATTRS_WORD2 0 352#define NFSD_WRITEABLE_ATTRS_WORD2 0
351 353
352#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ 354#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index a3cf38476a1b..ffb5df1db94f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -366,6 +366,7 @@ struct nfs4_openowner {
366 366
367struct nfs4_lockowner { 367struct nfs4_lockowner {
368 struct nfs4_stateowner lo_owner; /* must be first element */ 368 struct nfs4_stateowner lo_owner; /* must be first element */
369 struct list_head lo_owner_ino_hash; /* hash by owner,file */
369 struct list_head lo_perstateid; /* for lockowners only */ 370 struct list_head lo_perstateid; /* for lockowners only */
370 struct list_head lo_list; /* for temporary uses */ 371 struct list_head lo_list; /* for temporary uses */
371}; 372};
@@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void);
482extern int nfs4_client_to_reclaim(const char *name); 483extern int nfs4_client_to_reclaim(const char *name);
483extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); 484extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
484extern void nfsd4_recdir_purge_old(void); 485extern void nfsd4_recdir_purge_old(void);
485extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 486extern void nfsd4_create_clid_dir(struct nfs4_client *clp);
486extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 487extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
487extern void release_session_client(struct nfsd4_session *); 488extern void release_session_client(struct nfsd4_session *);
488extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); 489extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d25a723b68ad..edf6d3ed8777 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
594 return error; 594 return error;
595} 595}
596 596
597#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." 597/*
598#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" 598 * NFS junction information is stored in an extended attribute.
599 */
600#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
601
602/**
603 * nfsd4_is_junction - Test if an object could be an NFS junction
604 *
605 * @dentry: object to test
606 *
607 * Returns 1 if "dentry" appears to contain NFS junction information.
608 * Otherwise 0 is returned.
609 */
599int nfsd4_is_junction(struct dentry *dentry) 610int nfsd4_is_junction(struct dentry *dentry)
600{ 611{
601 struct inode *inode = dentry->d_inode; 612 struct inode *inode = dentry->d_inode;
@@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry)
606 return 0; 617 return 0;
607 if (!(inode->i_mode & S_ISVTX)) 618 if (!(inode->i_mode & S_ISVTX))
608 return 0; 619 return 0;
609 if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) 620 if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
610 return 0; 621 return 0;
611 return 1; 622 return 1;
612} 623}
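For context, nfsd4_is_junction() above keys off a single xattr, XATTR_TRUSTED_PREFIX "junction.nfs" (i.e. "trusted.junction.nfs"), on objects carrying the sticky bit. A hedged user-space sketch of tagging a directory so that test would succeed; the helper name is illustrative and the xattr payload format is not defined by this patch:

#include <sys/stat.h>
#include <sys/xattr.h>

/*
 * Set the sticky bit (preserving the rest of the permission bits) and
 * attach the trusted.junction.nfs xattr that nfsd4_is_junction()
 * probes for.  Writing a "trusted." xattr requires CAP_SYS_ADMIN.
 */
int mark_as_nfs_junction(const char *path, const void *data, size_t len)
{
        struct stat st;

        if (stat(path, &st) < 0)
                return -1;
        if (chmod(path, (st.st_mode & 07777) | S_ISVTX) < 0)
                return -1;
        return setxattr(path, "trusted.junction.nfs", data, len, 0);
}
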
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index e14587d55689..f104d565b682 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
135 135
136 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 136 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
137 137
138 /* 1 from caller and 1 for being on i_list/g_list */
139 BUG_ON(atomic_read(&mark->refcnt) < 2);
140
141 spin_lock(&group->mark_lock); 138 spin_lock(&group->mark_lock);
142 139
143 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 140 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
@@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
182 iput(inode); 179 iput(inode);
183 180
184 /* 181 /*
182 * We don't necessarily have a ref on mark from caller so the above iput
183 * may have already destroyed it. Don't touch from now on.
184 */
185
186 /*
185 * it's possible that this group tried to destroy itself, but this 187 * it's possible that this group tried to destroy itself, but this
186 * this mark was simultaneously being freed by inode. If that's the 188 * this mark was simultaneously being freed by inode. If that's the
187 * case, we finish freeing the group here. 189 * case, we finish freeing the group here.
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 608be4516091..5a4a8af5c406 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
3198MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3199MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3200#ifdef DEBUG 3200#ifdef DEBUG
3201module_param(debug_msgs, bool, 0); 3201module_param(debug_msgs, bint, 0);
3202MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); 3202MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
3203#endif 3203#endif
3204 3204
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1137 if (nr_pages < pipe->nrbufs) 1137 if (nr_pages < pipe->nrbufs)
1138 return -EBUSY; 1138 return -EBUSY;
1139 1139
1140 bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); 1140 bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
1141 if (unlikely(!bufs)) 1141 if (unlikely(!bufs))
1142 return -ENOMEM; 1142 return -ENOMEM;
1143 1143
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..c602b8d20f06 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
380 380
381 state = *get_task_state(task); 381 state = *get_task_state(task);
382 vsize = eip = esp = 0; 382 vsize = eip = esp = 0;
383 permitted = ptrace_may_access(task, PTRACE_MODE_READ); 383 permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
384 mm = get_task_mm(task); 384 mm = get_task_mm(task);
385 if (mm) { 385 if (mm) {
386 vsize = task_vsize(mm); 386 vsize = task_vsize(mm);
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
464 464
465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ 465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", 467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
468 pid_nr_ns(pid, ns), 468 pid_nr_ns(pid, ns),
469 tcomm, 469 tcomm,
470 state, 470 state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
511 task->policy, 511 task->policy,
512 (unsigned long long)delayacct_blkio_ticks(task), 512 (unsigned long long)delayacct_blkio_ticks(task),
513 cputime_to_clock_t(gtime), 513 cputime_to_clock_t(gtime),
514 cputime_to_clock_t(cgtime)); 514 cputime_to_clock_t(cgtime),
515 (mm && permitted) ? mm->start_data : 0,
516 (mm && permitted) ? mm->end_data : 0,
517 (mm && permitted) ? mm->start_brk : 0);
515 if (mm) 518 if (mm)
516 mmput(mm); 519 mmput(mm);
517 return 0; 520 return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb2..9cde9edf9c4d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
198 return result; 198 return result;
199} 199}
200 200
201static struct mm_struct *__check_mem_permission(struct task_struct *task) 201static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
202{
203 struct mm_struct *mm;
204
205 mm = get_task_mm(task);
206 if (!mm)
207 return ERR_PTR(-EINVAL);
208
209 /*
210 * A task can always look at itself, in case it chooses
211 * to use system calls instead of load instructions.
212 */
213 if (task == current)
214 return mm;
215
216 /*
217 * If current is actively ptrace'ing, and would also be
218 * permitted to freshly attach with ptrace now, permit it.
219 */
220 if (task_is_stopped_or_traced(task)) {
221 int match;
222 rcu_read_lock();
223 match = (ptrace_parent(task) == current);
224 rcu_read_unlock();
225 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
226 return mm;
227 }
228
229 /*
230 * No one else is allowed.
231 */
232 mmput(mm);
233 return ERR_PTR(-EPERM);
234}
235
236/*
237 * If current may access user memory in @task return a reference to the
238 * corresponding mm, otherwise ERR_PTR.
239 */
240static struct mm_struct *check_mem_permission(struct task_struct *task)
241{
242 struct mm_struct *mm;
243 int err;
244
245 /*
246 * Avoid racing if task exec's as we might get a new mm but validate
247 * against old credentials.
248 */
249 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
250 if (err)
251 return ERR_PTR(err);
252
253 mm = __check_mem_permission(task);
254 mutex_unlock(&task->signal->cred_guard_mutex);
255
256 return mm;
257}
258
259struct mm_struct *mm_for_maps(struct task_struct *task)
260{ 202{
261 struct mm_struct *mm; 203 struct mm_struct *mm;
262 int err; 204 int err;
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
267 209
268 mm = get_task_mm(task); 210 mm = get_task_mm(task);
269 if (mm && mm != current->mm && 211 if (mm && mm != current->mm &&
270 !ptrace_may_access(task, PTRACE_MODE_READ)) { 212 !ptrace_may_access(task, mode)) {
271 mmput(mm); 213 mmput(mm);
272 mm = ERR_PTR(-EACCES); 214 mm = ERR_PTR(-EACCES);
273 } 215 }
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
276 return mm; 218 return mm;
277} 219}
278 220
221struct mm_struct *mm_for_maps(struct task_struct *task)
222{
223 return mm_access(task, PTRACE_MODE_READ);
224}
225
279static int proc_pid_cmdline(struct task_struct *task, char * buffer) 226static int proc_pid_cmdline(struct task_struct *task, char * buffer)
280{ 227{
281 int res = 0; 228 int res = 0;
@@ -654,6 +601,8 @@ static int proc_pid_permission(struct inode *inode, int mask)
654 bool has_perms; 601 bool has_perms;
655 602
656 task = get_proc_task(inode); 603 task = get_proc_task(inode);
604 if (!task)
605 return -ESRCH;
657 has_perms = has_pid_permissions(pid, task, 1); 606 has_perms = has_pid_permissions(pid, task, 1);
658 put_task_struct(task); 607 put_task_struct(task);
659 608
@@ -750,38 +699,39 @@ static const struct file_operations proc_single_file_operations = {
750 699
751static int mem_open(struct inode* inode, struct file* file) 700static int mem_open(struct inode* inode, struct file* file)
752{ 701{
753 file->private_data = (void*)((long)current->self_exec_id); 702 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
703 struct mm_struct *mm;
704
705 if (!task)
706 return -ESRCH;
707
708 mm = mm_access(task, PTRACE_MODE_ATTACH);
709 put_task_struct(task);
710
711 if (IS_ERR(mm))
712 return PTR_ERR(mm);
713
754 /* OK to pass negative loff_t, we can catch out-of-range */ 714 /* OK to pass negative loff_t, we can catch out-of-range */
755 file->f_mode |= FMODE_UNSIGNED_OFFSET; 715 file->f_mode |= FMODE_UNSIGNED_OFFSET;
716 file->private_data = mm;
717
756 return 0; 718 return 0;
757} 719}
758 720
759static ssize_t mem_read(struct file * file, char __user * buf, 721static ssize_t mem_read(struct file * file, char __user * buf,
760 size_t count, loff_t *ppos) 722 size_t count, loff_t *ppos)
761{ 723{
762 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 724 int ret;
763 char *page; 725 char *page;
764 unsigned long src = *ppos; 726 unsigned long src = *ppos;
765 int ret = -ESRCH; 727 struct mm_struct *mm = file->private_data;
766 struct mm_struct *mm;
767 728
768 if (!task) 729 if (!mm)
769 goto out_no_task; 730 return 0;
770 731
771 ret = -ENOMEM;
772 page = (char *)__get_free_page(GFP_TEMPORARY); 732 page = (char *)__get_free_page(GFP_TEMPORARY);
773 if (!page) 733 if (!page)
774 goto out; 734 return -ENOMEM;
775
776 mm = check_mem_permission(task);
777 ret = PTR_ERR(mm);
778 if (IS_ERR(mm))
779 goto out_free;
780
781 ret = -EIO;
782
783 if (file->private_data != (void*)((long)current->self_exec_id))
784 goto out_put;
785 735
786 ret = 0; 736 ret = 0;
787 737
@@ -808,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
808 } 758 }
809 *ppos = src; 759 *ppos = src;
810 760
811out_put:
812 mmput(mm);
813out_free:
814 free_page((unsigned long) page); 761 free_page((unsigned long) page);
815out:
816 put_task_struct(task);
817out_no_task:
818 return ret; 762 return ret;
819} 763}
820 764
@@ -823,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
823{ 767{
824 int copied; 768 int copied;
825 char *page; 769 char *page;
826 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
827 unsigned long dst = *ppos; 770 unsigned long dst = *ppos;
828 struct mm_struct *mm; 771 struct mm_struct *mm = file->private_data;
829 772
830 copied = -ESRCH; 773 if (!mm)
831 if (!task) 774 return 0;
832 goto out_no_task;
833 775
834 copied = -ENOMEM;
835 page = (char *)__get_free_page(GFP_TEMPORARY); 776 page = (char *)__get_free_page(GFP_TEMPORARY);
836 if (!page) 777 if (!page)
837 goto out_task; 778 return -ENOMEM;
838
839 mm = check_mem_permission(task);
840 copied = PTR_ERR(mm);
841 if (IS_ERR(mm))
842 goto out_free;
843
844 copied = -EIO;
845 if (file->private_data != (void *)((long)current->self_exec_id))
846 goto out_mm;
847 779
848 copied = 0; 780 copied = 0;
849 while (count > 0) { 781 while (count > 0) {
@@ -867,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
867 } 799 }
868 *ppos = dst; 800 *ppos = dst;
869 801
870out_mm:
871 mmput(mm);
872out_free:
873 free_page((unsigned long) page); 802 free_page((unsigned long) page);
874out_task:
875 put_task_struct(task);
876out_no_task:
877 return copied; 803 return copied;
878} 804}
879 805
@@ -893,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
893 return file->f_pos; 819 return file->f_pos;
894} 820}
895 821
822static int mem_release(struct inode *inode, struct file *file)
823{
824 struct mm_struct *mm = file->private_data;
825
826 mmput(mm);
827 return 0;
828}
829
896static const struct file_operations proc_mem_operations = { 830static const struct file_operations proc_mem_operations = {
897 .llseek = mem_lseek, 831 .llseek = mem_lseek,
898 .read = mem_read, 832 .read = mem_read,
899 .write = mem_write, 833 .write = mem_write,
900 .open = mem_open, 834 .open = mem_open,
835 .release = mem_release,
901}; 836};
902 837
903static ssize_t environ_read(struct file *file, char __user *buf, 838static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1197,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1197 ssize_t length; 1132 ssize_t length;
1198 uid_t loginuid; 1133 uid_t loginuid;
1199 1134
1200 if (!capable(CAP_AUDIT_CONTROL))
1201 return -EPERM;
1202
1203 rcu_read_lock(); 1135 rcu_read_lock();
1204 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 1136 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1205 rcu_read_unlock(); 1137 rcu_read_unlock();
@@ -1228,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1228 goto out_free_page; 1160 goto out_free_page;
1229 1161
1230 } 1162 }
1231 length = audit_set_loginuid(current, loginuid); 1163 length = audit_set_loginuid(loginuid);
1232 if (likely(length == 0)) 1164 if (likely(length == 0))
1233 length = count; 1165 length = count;
1234 1166
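The restructured /proc/<pid>/mem handlers above resolve and permission-check the target mm once, at open time, and stash it in file->private_data; read and write then operate on that cached mm, and release drops the reference. A minimal sketch of the open path, condensed from the hunk above (error handling kept as shown, nothing added beyond it):

static int mem_open(struct inode *inode, struct file *file)
{
	struct task_struct *task = get_proc_task(inode);
	struct mm_struct *mm;

	if (!task)
		return -ESRCH;

	/* ptrace-style permission check; returns the mm with a reference held */
	mm = mm_access(task, PTRACE_MODE_ATTACH);
	put_task_struct(task);
	if (IS_ERR(mm))
		return PTR_ERR(mm);

	file->f_mode |= FMODE_UNSIGNED_OFFSET;
	file->private_data = mm;	/* dropped via mmput() in mem_release() */
	return 0;
}

Compared with the old scheme there is no self_exec_id comparison on every I/O: once the opener has passed the PTRACE_MODE_ATTACH check, the mm it was granted stays pinned for the lifetime of the open file.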
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1b..121f77cfef76 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
77 steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 77 steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
78 guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 78 guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
79 guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 79 guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
80 sum += kstat_cpu_irqs_sum(i);
81 sum += arch_irq_stat_cpu(i);
80 82
81 for (j = 0; j < NR_SOFTIRQS; j++) { 83 for (j = 0; j < NR_SOFTIRQS; j++) {
82 unsigned int softirq_stat = kstat_softirqs_cpu(j, i); 84 unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f4853..6b009548d2e0 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,47 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
179 struct qnx4_inode_entry *rootdir; 179 struct qnx4_inode_entry *rootdir;
180 int rd, rl; 180 int rd, rl;
181 int i, j; 181 int i, j;
182 int found = 0;
183 182
184 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 183 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
185 return "no qnx4 filesystem (no root dir)."; 184 return "no qnx4 filesystem (no root dir).";
186 } else { 185 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
187 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); 186 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
188 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 187 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
189 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 188 for (j = 0; j < rl; j++) {
190 for (j = 0; j < rl; j++) { 189 bh = sb_bread(sb, rd + j); /* root dir, first block */
191 bh = sb_bread(sb, rd + j); /* root dir, first block */ 190 if (bh == NULL)
192 if (bh == NULL) { 191 return "unable to read root entry.";
193 return "unable to read root entry."; 192 rootdir = (struct qnx4_inode_entry *) bh->b_data;
194 } 193 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
195 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 194 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
196 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 195 if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
197 if (rootdir->di_fname != NULL) { 196 continue;
198 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); 197 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
199 if (!strcmp(rootdir->di_fname, 198 sizeof(struct qnx4_inode_entry),
200 QNX4_BMNAME)) { 199 GFP_KERNEL);
201 found = 1;
202 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
203 sizeof(struct qnx4_inode_entry),
204 GFP_KERNEL);
205 if (!qnx4_sb(sb)->BitMap) {
206 brelse (bh);
207 return "not enough memory for bitmap inode";
208 }/* keep bitmap inode known */
209 break;
210 }
211 }
212 }
213 brelse(bh); 200 brelse(bh);
214 if (found != 0) { 201 if (!qnx4_sb(sb)->BitMap)
215 break; 202 return "not enough memory for bitmap inode";
216 } 203 /* keep bitmap inode known */
217 } 204 return NULL;
218 if (found == 0) {
219 return "bitmap file not found.";
220 } 205 }
206 brelse(bh);
221 } 207 }
222 return NULL; 208 return "bitmap file not found.";
223} 209}
224 210
225static int qnx4_fill_super(struct super_block *s, void *data, int silent) 211static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -270,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
270 if (IS_ERR(root)) { 256 if (IS_ERR(root)) {
271 printk(KERN_ERR "qnx4: get inode failed\n"); 257 printk(KERN_ERR "qnx4: get inode failed\n");
272 ret = PTR_ERR(root); 258 ret = PTR_ERR(root);
273 goto out; 259 goto outb;
274 } 260 }
275 261
276 ret = -ENOMEM; 262 ret = -ENOMEM;
@@ -283,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 269
284 outi: 270 outi:
285 iput(root); 271 iput(root);
272 outb:
273 kfree(qs->BitMap);
286 out: 274 out:
287 brelse(bh); 275 brelse(bh);
288 outnobh: 276 outnobh:
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5a..af0b73802592 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
70 spin_lock(&cache->lock); 70 spin_lock(&cache->lock);
71 71
72 while (1) { 72 while (1) {
73 for (i = 0; i < cache->entries; i++) 73 for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
74 if (cache->entry[i].block == block) 74 if (cache->entry[i].block == block) {
75 cache->curr_blk = i;
75 break; 76 break;
77 }
78 i = (i + 1) % cache->entries;
79 }
76 80
77 if (i == cache->entries) { 81 if (n == cache->entries) {
78 /* 82 /*
79 * Block not in cache, if all cache entries are used 83 * Block not in cache, if all cache entries are used
80 * go to sleep waiting for one to become available. 84 * go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
245 goto cleanup; 249 goto cleanup;
246 } 250 }
247 251
252 cache->curr_blk = 0;
248 cache->next_blk = 0; 253 cache->next_blk = 0;
249 cache->unused = entries; 254 cache->unused = entries;
250 cache->entries = entries; 255 cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
332 u64 *block, int *offset, int length) 337 u64 *block, int *offset, int length)
333{ 338{
334 struct squashfs_sb_info *msblk = sb->s_fs_info; 339 struct squashfs_sb_info *msblk = sb->s_fs_info;
335 int bytes, copied = length; 340 int bytes, res = length;
336 struct squashfs_cache_entry *entry; 341 struct squashfs_cache_entry *entry;
337 342
338 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); 343 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
339 344
340 while (length) { 345 while (length) {
341 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); 346 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
342 if (entry->error) 347 if (entry->error) {
343 return entry->error; 348 res = entry->error;
344 else if (*offset >= entry->length) 349 goto error;
345 return -EIO; 350 } else if (*offset >= entry->length) {
351 res = -EIO;
352 goto error;
353 }
346 354
347 bytes = squashfs_copy_data(buffer, entry, *offset, length); 355 bytes = squashfs_copy_data(buffer, entry, *offset, length);
348 if (buffer) 356 if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
358 squashfs_cache_put(entry); 366 squashfs_cache_put(entry);
359 } 367 }
360 368
361 return copied; 369 return res;
370
371error:
372 squashfs_cache_put(entry);
373 return res;
362} 374}
363 375
364 376
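The cache lookup above now starts scanning at curr_blk, the slot where the previous hit was found, and wraps around the table, so repeated lookups of nearby metadata blocks usually succeed on the first probe instead of rescanning from slot 0. A small self-contained sketch of that circular scan; cache_find and its parameters are invented here purely for illustration:

/* Circular scan over a small table, starting at the last hit. */
static int cache_find(int *blocks, int entries, int *curr_blk, int block)
{
	int i = *curr_blk, n;

	for (n = 0; n < entries; n++) {
		if (blocks[i] == block) {
			*curr_blk = i;		/* remember where we found it */
			return i;
		}
		i = (i + 1) % entries;		/* wrap around the table */
	}
	return -1;				/* not cached */
}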
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda13..81afbccfa843 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
208 inode->i_op = &squashfs_inode_ops; 208 inode->i_op = &squashfs_inode_ops;
209 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
210 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
211 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = (inode->i_size -
212 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; 212 le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
213 213
214 squashfs_i(inode)->fragment_block = frag_blk; 214 squashfs_i(inode)->fragment_block = frag_blk;
215 squashfs_i(inode)->fragment_size = frag_size; 215 squashfs_i(inode)->fragment_size = frag_size;
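The i_blocks change above swaps a subtract-one-then-add-one rounding for a plain round-up to 512-byte sectors, which matters for fully sparse files where i_size minus the sparse count is 0: in unsigned 64-bit arithmetic the old form underflows to an enormous block count, while the new form gives 0. A standalone demonstration of the two expressions (user-space arithmetic only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long long bytes = 0;	/* i_size - sparse for a fully sparse file */
	unsigned long long old_blocks = ((bytes - 1) >> 9) + 1;	/* underflows to 2^55 */
	unsigned long long new_blocks = (bytes + 511) >> 9;		/* 0, as expected */

	printf("old=%llu new=%llu\n", old_blocks, new_blocks);
	return 0;
}

For any byte count from 1 to 512 both forms agree and give 1, so only the sparse corner case changes.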
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d296..52934a22f296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
28struct squashfs_cache { 28struct squashfs_cache {
29 char *name; 29 char *name;
30 int entries; 30 int entries;
31 int curr_blk;
31 int next_blk; 32 int next_blk;
32 int num_waiters; 33 int num_waiters;
33 int unused; 34 int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a47..ecaa2f7bdb8f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
290 290
291check_directory_table: 291check_directory_table:
292 /* Sanity check directory_table */ 292 /* Sanity check directory_table */
293 if (msblk->directory_table >= next_table) { 293 if (msblk->directory_table > next_table) {
294 err = -EINVAL; 294 err = -EINVAL;
295 goto failed_mount; 295 goto failed_mount;
296 } 296 }
diff --git a/fs/super.c b/fs/super.c
index de41e1e46f09..6015c02296b7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1186,6 +1186,8 @@ int freeze_super(struct super_block *sb)
1186 printk(KERN_ERR 1186 printk(KERN_ERR
1187 "VFS:Filesystem freeze failed\n"); 1187 "VFS:Filesystem freeze failed\n");
1188 sb->s_frozen = SB_UNFROZEN; 1188 sb->s_frozen = SB_UNFROZEN;
1189 smp_wmb();
1190 wake_up(&sb->s_wait_unfrozen);
1189 deactivate_locked_super(sb); 1191 deactivate_locked_super(sb);
1190 return ret; 1192 return ret;
1191 } 1193 }
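The two lines added to freeze_super()'s failure path matter because other tasks may already be sleeping on the partially-frozen superblock; resetting s_frozen without a wakeup would leave them blocked. A hedged sketch of the waiter side as it looked in kernels of this era (the vfs_check_frozen() pattern is reconstructed from memory, it is not part of this diff):

/* Writers block until the freeze level drops below what they need. */
#define vfs_check_frozen(sb, level) \
	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

The smp_wmb() before the wake_up() is presumably there so a woken waiter observes s_frozen = SB_UNFROZEN, mirroring the store/barrier/wake ordering used when a filesystem is thawed.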
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b62..f922cbacdb96 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@
38 38
39DEFINE_SPINLOCK(dbg_lock); 39DEFINE_SPINLOCK(dbg_lock);
40 40
41static char dbg_key_buf0[128];
42static char dbg_key_buf1[128];
43
44static const char *get_key_fmt(int fmt) 41static const char *get_key_fmt(int fmt)
45{ 42{
46 switch (fmt) { 43 switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
103 } 100 }
104} 101}
105 102
106static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, 103const char *dbg_snprintf_key(const struct ubifs_info *c,
107 char *buffer) 104 const union ubifs_key *key, char *buffer, int len)
108{ 105{
109 char *p = buffer; 106 char *p = buffer;
110 int type = key_type(c, key); 107 int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
112 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { 109 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
113 switch (type) { 110 switch (type) {
114 case UBIFS_INO_KEY: 111 case UBIFS_INO_KEY:
115 sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), 112 len -= snprintf(p, len, "(%lu, %s)",
116 get_key_type(type)); 113 (unsigned long)key_inum(c, key),
114 get_key_type(type));
117 break; 115 break;
118 case UBIFS_DENT_KEY: 116 case UBIFS_DENT_KEY:
119 case UBIFS_XENT_KEY: 117 case UBIFS_XENT_KEY:
120 sprintf(p, "(%lu, %s, %#08x)", 118 len -= snprintf(p, len, "(%lu, %s, %#08x)",
121 (unsigned long)key_inum(c, key), 119 (unsigned long)key_inum(c, key),
122 get_key_type(type), key_hash(c, key)); 120 get_key_type(type), key_hash(c, key));
123 break; 121 break;
124 case UBIFS_DATA_KEY: 122 case UBIFS_DATA_KEY:
125 sprintf(p, "(%lu, %s, %u)", 123 len -= snprintf(p, len, "(%lu, %s, %u)",
126 (unsigned long)key_inum(c, key), 124 (unsigned long)key_inum(c, key),
127 get_key_type(type), key_block(c, key)); 125 get_key_type(type), key_block(c, key));
128 break; 126 break;
129 case UBIFS_TRUN_KEY: 127 case UBIFS_TRUN_KEY:
130 sprintf(p, "(%lu, %s)", 128 len -= snprintf(p, len, "(%lu, %s)",
131 (unsigned long)key_inum(c, key), 129 (unsigned long)key_inum(c, key),
132 get_key_type(type)); 130 get_key_type(type));
133 break; 131 break;
134 default: 132 default:
135 sprintf(p, "(bad key type: %#08x, %#08x)", 133 len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
136 key->u32[0], key->u32[1]); 134 key->u32[0], key->u32[1]);
137 } 135 }
138 } else 136 } else
139 sprintf(p, "bad key format %d", c->key_fmt); 137 len -= snprintf(p, len, "bad key format %d", c->key_fmt);
140} 138 ubifs_assert(len > 0);
141 139 return p;
142const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
143{
144 /* dbg_lock must be held */
145 sprintf_key(c, key, dbg_key_buf0);
146 return dbg_key_buf0;
147}
148
149const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
150{
151 /* dbg_lock must be held */
152 sprintf_key(c, key, dbg_key_buf1);
153 return dbg_key_buf1;
154} 140}
155 141
156const char *dbg_ntype(int type) 142const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
319 int i, n; 305 int i, n;
320 union ubifs_key key; 306 union ubifs_key key;
321 const struct ubifs_ch *ch = node; 307 const struct ubifs_ch *ch = node;
308 char key_buf[DBG_KEY_BUF_LEN];
322 309
323 if (dbg_is_tst_rcvry(c)) 310 if (dbg_is_tst_rcvry(c))
324 return; 311 return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
474 const struct ubifs_ino_node *ino = node; 461 const struct ubifs_ino_node *ino = node;
475 462
476 key_read(c, &ino->key, &key); 463 key_read(c, &ino->key, &key);
477 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 464 printk(KERN_DEBUG "\tkey %s\n",
465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
478 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", 466 printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
479 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 467 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
480 printk(KERN_DEBUG "\tsize %llu\n", 468 printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
517 int nlen = le16_to_cpu(dent->nlen); 505 int nlen = le16_to_cpu(dent->nlen);
518 506
519 key_read(c, &dent->key, &key); 507 key_read(c, &dent->key, &key);
520 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 508 printk(KERN_DEBUG "\tkey %s\n",
509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
521 printk(KERN_DEBUG "\tinum %llu\n", 510 printk(KERN_DEBUG "\tinum %llu\n",
522 (unsigned long long)le64_to_cpu(dent->inum)); 511 (unsigned long long)le64_to_cpu(dent->inum));
523 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); 512 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
541 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
542 531
543 key_read(c, &dn->key, &key); 532 key_read(c, &dn->key, &key);
544 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 533 printk(KERN_DEBUG "\tkey %s\n",
534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
545 printk(KERN_DEBUG "\tsize %u\n", 535 printk(KERN_DEBUG "\tsize %u\n",
546 le32_to_cpu(dn->size)); 536 le32_to_cpu(dn->size));
547 printk(KERN_DEBUG "\tcompr_typ %d\n", 537 printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
582 key_read(c, &br->key, &key); 572 key_read(c, &br->key, &key);
583 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", 573 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
584 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
585 le32_to_cpu(br->len), DBGKEY(&key)); 575 le32_to_cpu(br->len),
576 dbg_snprintf_key(c, &key, key_buf,
577 DBG_KEY_BUF_LEN));
586 } 578 }
587 break; 579 break;
588 } 580 }
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
934{ 926{
935 int n; 927 int n;
936 const struct ubifs_zbranch *zbr; 928 const struct ubifs_zbranch *zbr;
929 char key_buf[DBG_KEY_BUF_LEN];
937 930
938 spin_lock(&dbg_lock); 931 spin_lock(&dbg_lock);
939 if (znode->parent) 932 if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
958 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " 951 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
959 "%s\n", n, zbr->znode, zbr->lnum, 952 "%s\n", n, zbr->znode, zbr->lnum,
960 zbr->offs, zbr->len, 953 zbr->offs, zbr->len,
961 DBGKEY(&zbr->key)); 954 dbg_snprintf_key(c, &zbr->key,
955 key_buf,
956 DBG_KEY_BUF_LEN));
962 else 957 else
963 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " 958 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
964 "%s\n", n, zbr->znode, zbr->lnum, 959 "%s\n", n, zbr->znode, zbr->lnum,
965 zbr->offs, zbr->len, 960 zbr->offs, zbr->len,
966 DBGKEY(&zbr->key)); 961 dbg_snprintf_key(c, &zbr->key,
962 key_buf,
963 DBG_KEY_BUF_LEN));
967 } 964 }
968 spin_unlock(&dbg_lock); 965 spin_unlock(&dbg_lock);
969} 966}
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1260 int err, nlen1, nlen2, cmp; 1257 int err, nlen1, nlen2, cmp;
1261 struct ubifs_dent_node *dent1, *dent2; 1258 struct ubifs_dent_node *dent1, *dent2;
1262 union ubifs_key key; 1259 union ubifs_key key;
1260 char key_buf[DBG_KEY_BUF_LEN];
1263 1261
1264 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); 1262 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
1265 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); 1263 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1290 key_read(c, &dent1->key, &key); 1288 key_read(c, &dent1->key, &key);
1291 if (keys_cmp(c, &zbr1->key, &key)) { 1289 if (keys_cmp(c, &zbr1->key, &key)) {
1292 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, 1290 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
1293 zbr1->offs, DBGKEY(&key)); 1291 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1292 DBG_KEY_BUF_LEN));
1294 dbg_err("but it should have key %s according to tnc", 1293 dbg_err("but it should have key %s according to tnc",
1295 DBGKEY(&zbr1->key)); 1294 dbg_snprintf_key(c, &zbr1->key, key_buf,
1295 DBG_KEY_BUF_LEN));
1296 dbg_dump_node(c, dent1); 1296 dbg_dump_node(c, dent1);
1297 goto out_free; 1297 goto out_free;
1298 } 1298 }
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1300 key_read(c, &dent2->key, &key); 1300 key_read(c, &dent2->key, &key);
1301 if (keys_cmp(c, &zbr2->key, &key)) { 1301 if (keys_cmp(c, &zbr2->key, &key)) {
1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, 1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
1303 zbr1->offs, DBGKEY(&key)); 1303 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1304 DBG_KEY_BUF_LEN));
1304 dbg_err("but it should have key %s according to tnc", 1305 dbg_err("but it should have key %s according to tnc",
1305 DBGKEY(&zbr2->key)); 1306 dbg_snprintf_key(c, &zbr2->key, key_buf,
1307 DBG_KEY_BUF_LEN));
1306 dbg_dump_node(c, dent2); 1308 dbg_dump_node(c, dent2);
1307 goto out_free; 1309 goto out_free;
1308 } 1310 }
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1319 dbg_err("2 xent/dent nodes with the same name"); 1321 dbg_err("2 xent/dent nodes with the same name");
1320 else 1322 else
1321 dbg_err("bad order of colliding key %s", 1323 dbg_err("bad order of colliding key %s",
1322 DBGKEY(&key)); 1324 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
1323 1325
1324 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1326 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1325 dbg_dump_node(c, dent1); 1327 dbg_dump_node(c, dent1);
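dbg_snprintf_key() above replaces the two static key buffers with a caller-supplied buffer, so key formatting no longer depends on dbg_lock being held and two keys can be printed from the same statement without clobbering each other. Call sites follow the pattern already visible in dbg_dump_node(); a minimal sketch:

char key_buf[DBG_KEY_BUF_LEN];

key_read(c, &ino->key, &key);
printk(KERN_DEBUG "\tkey %s\n",
       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));

The len bookkeeping inside dbg_snprintf_key() (len -= snprintf(...)) together with ubifs_assert(len > 0) catches a buffer that is too small for the formatted key.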
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c46810189..ad1a6fee6010 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
169 spin_unlock(&dbg_lock); \ 169 spin_unlock(&dbg_lock); \
170} while (0) 170} while (0)
171 171
172const char *dbg_key_str0(const struct ubifs_info *c, 172#define ubifs_dbg_msg(type, fmt, ...) \
173 const union ubifs_key *key); 173 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
174const char *dbg_key_str1(const struct ubifs_info *c, 174
175 const union ubifs_key *key); 175#define DBG_KEY_BUF_LEN 32
176 176#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
177/* 177 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
178 * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message 178 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
179 * macros. 179 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
180 */
181#define DBGKEY(key) dbg_key_str0(c, (key))
182#define DBGKEY1(key) dbg_key_str1(c, (key))
183
184extern spinlock_t dbg_lock;
185
186#define ubifs_dbg_msg(type, fmt, ...) do { \
187 spin_lock(&dbg_lock); \
188 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
189 spin_unlock(&dbg_lock); \
190} while (0) 180} while (0)
191 181
192/* Just a debugging messages not related to any specific UBIFS subsystem */ 182/* Just a debugging messages not related to any specific UBIFS subsystem */
193#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) 183#define dbg_msg(fmt, ...) \
184 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
185 __func__, ##__VA_ARGS__)
186
194/* General messages */ 187/* General messages */
195#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) 188#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
196/* Additional journal messages */ 189/* Additional journal messages */
197#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) 190#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
191#define dbg_jnlk(key, fmt, ...) \
192 ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
198/* Additional TNC messages */ 193/* Additional TNC messages */
199#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) 194#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
195#define dbg_tnck(key, fmt, ...) \
196 ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
200/* Additional lprops messages */ 197/* Additional lprops messages */
201#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) 198#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
202/* Additional LEB find messages */ 199/* Additional LEB find messages */
203#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) 200#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
204/* Additional mount messages */ 201/* Additional mount messages */
205#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) 202#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
203#define dbg_mntk(key, fmt, ...) \
204 ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
206/* Additional I/O messages */ 205/* Additional I/O messages */
207#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) 206#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
208/* Additional commit messages */ 207/* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
218/* Additional recovery messages */ 217/* Additional recovery messages */
219#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) 218#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
220 219
220extern spinlock_t dbg_lock;
221extern struct ubifs_global_debug_info ubifs_dbg; 221extern struct ubifs_global_debug_info ubifs_dbg;
222 222
223static inline int dbg_is_chk_gen(const struct ubifs_info *c) 223static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
258const char *dbg_jhead(int jhead); 258const char *dbg_jhead(int jhead);
259const char *dbg_get_key_dump(const struct ubifs_info *c, 259const char *dbg_get_key_dump(const struct ubifs_info *c,
260 const union ubifs_key *key); 260 const union ubifs_key *key);
261const char *dbg_snprintf_key(const struct ubifs_info *c,
262 const union ubifs_key *key, char *buffer, int len);
261void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); 263void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
262void dbg_dump_node(const struct ubifs_info *c, const void *node); 264void dbg_dump_node(const struct ubifs_info *c, const void *node);
263void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, 265void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
345#define dbg_dump_stack() 347#define dbg_dump_stack()
346#define ubifs_assert_cmt_locked(c) 348#define ubifs_assert_cmt_locked(c)
347 349
348#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 350#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
349#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 351#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
350#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 352#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
351#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 353#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
352#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 354#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
353#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 355#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
354#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 356#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
355#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 357#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
356#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 358#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
357#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 359#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
358#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 360#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
359#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 361#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
360#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 362#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
361#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 363#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
364#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
365#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
366#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
362 367
363static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } 368static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
364static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } 369static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
368static inline const char * 373static inline const char *
369dbg_get_key_dump(const struct ubifs_info *c, 374dbg_get_key_dump(const struct ubifs_info *c,
370 const union ubifs_key *key) { return ""; } 375 const union ubifs_key *key) { return ""; }
376static inline const char *
377dbg_snprintf_key(const struct ubifs_info *c,
378 const union ubifs_key *key, char *buffer,
379 int len) { return ""; }
371static inline void dbg_dump_inode(struct ubifs_info *c, 380static inline void dbg_dump_inode(struct ubifs_info *c,
372 const struct inode *inode) { return; } 381 const struct inode *inode) { return; }
373static inline void dbg_dump_node(const struct ubifs_info *c, 382static inline void dbg_dump_node(const struct ubifs_info *c,
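The new key-aware message macros above wrap dbg_snprintf_key() with a per-call stack buffer, which changes the call-site convention: the key is passed as the first argument and is appended after the formatted text, so the format string no longer carries a %s for it. Roughly, converted call sites (as seen in the tnc.c hunks below) go from the old form to the new one like this:

/* old style: key formatted through the shared DBGKEY() buffer */
dbg_tnc("search key %s", DBGKEY(key));

/* new style: key passed first, printed after the message text */
dbg_tnck(key, "search key ");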
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c54..2f438ab2e7a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; 697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
698 struct ubifs_inode *ui = ubifs_inode(inode); 698 struct ubifs_inode *ui = ubifs_inode(inode);
699 699
700 dbg_jnl("ino %lu, blk %u, len %d, key %s", 700 dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
701 (unsigned long)key_inum(c, key), key_block(c, key), len, 701 (unsigned long)key_inum(c, key), key_block(c, key), len);
702 DBGKEY(key));
703 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 702 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
704 703
705 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); 704 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1177 dn = (void *)trun + UBIFS_TRUN_NODE_SZ; 1176 dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
1178 blk = new_size >> UBIFS_BLOCK_SHIFT; 1177 blk = new_size >> UBIFS_BLOCK_SHIFT;
1179 data_key_init(c, &key, inum, blk); 1178 data_key_init(c, &key, inum, blk);
1180 dbg_jnl("last block key %s", DBGKEY(&key)); 1179 dbg_jnlk(&key, "last block key ");
1181 err = ubifs_tnc_lookup(c, &key, dn); 1180 err = ubifs_tnc_lookup(c, &key, dn);
1182 if (err == -ENOENT) 1181 if (err == -ENOENT)
1183 dlen = 0; /* Not found (so it is a hole) */ 1182 dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b3..b007637f0406 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
221{ 221{
222 int err; 222 int err;
223 223
224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, 224 dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); 225 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
226 226
227 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
228 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
361{ 361{
362 struct replay_entry *r; 362 struct replay_entry *r;
363 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 364 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
365 365
366 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
367 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 412 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
413 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
414 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
415 415
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e14ee53159db..16ad84d8402f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -505,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
505{ 505{
506 int ret; 506 int ret;
507 507
508 dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); 508 dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
509 509
510 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, 510 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
511 zbr->offs); 511 zbr->offs);
@@ -519,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
519 ret = 0; 519 ret = 0;
520 } 520 }
521 if (ret == 0 && c->replaying) 521 if (ret == 0 && c->replaying)
522 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 522 dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
523 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 523 zbr->lnum, zbr->offs, zbr->len);
524 return ret; 524 return ret;
525} 525}
526 526
@@ -995,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
995 if (adding || !o_znode) 995 if (adding || !o_znode)
996 return 0; 996 return 0;
997 997
998 dbg_mnt("dangling match LEB %d:%d len %d %s", 998 dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
999 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, 999 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
1000 o_znode->zbranch[o_n].len, DBGKEY(key)); 1000 o_znode->zbranch[o_n].len);
1001 *zn = o_znode; 1001 *zn = o_znode;
1002 *n = o_n; 1002 *n = o_n;
1003 return 1; 1003 return 1;
@@ -1179,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1179 struct ubifs_znode *znode; 1179 struct ubifs_znode *znode;
1180 unsigned long time = get_seconds(); 1180 unsigned long time = get_seconds();
1181 1181
1182 dbg_tnc("search key %s", DBGKEY(key)); 1182 dbg_tnck(key, "search key ");
1183 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); 1183 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1184 1184
1185 znode = c->zroot.znode; 1185 znode = c->zroot.znode;
@@ -1315,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1315 struct ubifs_znode *znode; 1315 struct ubifs_znode *znode;
1316 unsigned long time = get_seconds(); 1316 unsigned long time = get_seconds();
1317 1317
1318 dbg_tnc("search and dirty key %s", DBGKEY(key)); 1318 dbg_tnck(key, "search and dirty key ");
1319 1319
1320 znode = c->zroot.znode; 1320 znode = c->zroot.znode;
1321 if (unlikely(!znode)) { 1321 if (unlikely(!znode)) {
@@ -1722,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
1722 if (!keys_eq(c, &zbr->key, &key1)) { 1722 if (!keys_eq(c, &zbr->key, &key1)) {
1723 ubifs_err("bad key in node at LEB %d:%d", 1723 ubifs_err("bad key in node at LEB %d:%d",
1724 zbr->lnum, zbr->offs); 1724 zbr->lnum, zbr->offs);
1725 dbg_tnc("looked for key %s found node's key %s", 1725 dbg_tnck(&zbr->key, "looked for key ");
1726 DBGKEY(&zbr->key), DBGKEY1(&key1)); 1726 dbg_tnck(&key1, "found node's key ");
1727 goto out_err; 1727 goto out_err;
1728 } 1728 }
1729 1729
@@ -1776,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
1776 ubifs_err("failed to read from LEB %d:%d, error %d", 1776 ubifs_err("failed to read from LEB %d:%d, error %d",
1777 lnum, offs, err); 1777 lnum, offs, err);
1778 dbg_dump_stack(); 1778 dbg_dump_stack();
1779 dbg_tnc("key %s", DBGKEY(&bu->key)); 1779 dbg_tnck(&bu->key, "key ");
1780 return err; 1780 return err;
1781 } 1781 }
1782 1782
@@ -1811,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1811 int found, n, err; 1811 int found, n, err;
1812 struct ubifs_znode *znode; 1812 struct ubifs_znode *znode;
1813 1813
1814 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1814 dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
1815 mutex_lock(&c->tnc_mutex); 1815 mutex_lock(&c->tnc_mutex);
1816 found = ubifs_lookup_level0(c, key, &znode, &n); 1816 found = ubifs_lookup_level0(c, key, &znode, &n);
1817 if (!found) { 1817 if (!found) {
@@ -1985,8 +1985,7 @@ again:
1985 zp = znode->parent; 1985 zp = znode->parent;
1986 if (znode->child_cnt < c->fanout) { 1986 if (znode->child_cnt < c->fanout) {
1987 ubifs_assert(n != c->fanout); 1987 ubifs_assert(n != c->fanout);
1988 dbg_tnc("inserted at %d level %d, key %s", n, znode->level, 1988 dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
1989 DBGKEY(key));
1990 1989
1991 insert_zbranch(znode, zbr, n); 1990 insert_zbranch(znode, zbr, n);
1992 1991
@@ -2001,7 +2000,7 @@ again:
2001 * Unfortunately, @znode does not have more empty slots and we have to 2000 * Unfortunately, @znode does not have more empty slots and we have to
2002 * split it. 2001 * split it.
2003 */ 2002 */
2004 dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); 2003 dbg_tnck(key, "splitting level %d, key ", znode->level);
2005 2004
2006 if (znode->alt) 2005 if (znode->alt)
2007 /* 2006 /*
@@ -2095,7 +2094,7 @@ do_split:
2095 } 2094 }
2096 2095
2097 /* Insert new key and branch */ 2096 /* Insert new key and branch */
2098 dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); 2097 dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
2099 2098
2100 insert_zbranch(zi, zbr, n); 2099 insert_zbranch(zi, zbr, n);
2101 2100
@@ -2171,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
2171 struct ubifs_znode *znode; 2170 struct ubifs_znode *znode;
2172 2171
2173 mutex_lock(&c->tnc_mutex); 2172 mutex_lock(&c->tnc_mutex);
2174 dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); 2173 dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
2175 found = lookup_level0_dirty(c, key, &znode, &n); 2174 found = lookup_level0_dirty(c, key, &znode, &n);
2176 if (!found) { 2175 if (!found) {
2177 struct ubifs_zbranch zbr; 2176 struct ubifs_zbranch zbr;
@@ -2220,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2220 struct ubifs_znode *znode; 2219 struct ubifs_znode *znode;
2221 2220
2222 mutex_lock(&c->tnc_mutex); 2221 mutex_lock(&c->tnc_mutex);
2223 dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, 2222 dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
2224 old_offs, lnum, offs, len, DBGKEY(key)); 2223 old_offs, lnum, offs, len);
2225 found = lookup_level0_dirty(c, key, &znode, &n); 2224 found = lookup_level0_dirty(c, key, &znode, &n);
2226 if (found < 0) { 2225 if (found < 0) {
2227 err = found; 2226 err = found;
@@ -2303,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2303 struct ubifs_znode *znode; 2302 struct ubifs_znode *znode;
2304 2303
2305 mutex_lock(&c->tnc_mutex); 2304 mutex_lock(&c->tnc_mutex);
2306 dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, 2305 dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
2307 DBGKEY(key)); 2306 lnum, offs, nm->len, nm->name);
2308 found = lookup_level0_dirty(c, key, &znode, &n); 2307 found = lookup_level0_dirty(c, key, &znode, &n);
2309 if (found < 0) { 2308 if (found < 0) {
2310 err = found; 2309 err = found;
@@ -2397,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
2397 /* Delete without merge for now */ 2396 /* Delete without merge for now */
2398 ubifs_assert(znode->level == 0); 2397 ubifs_assert(znode->level == 0);
2399 ubifs_assert(n >= 0 && n < c->fanout); 2398 ubifs_assert(n >= 0 && n < c->fanout);
2400 dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); 2399 dbg_tnck(&znode->zbranch[n].key, "deleting key ");
2401 2400
2402 zbr = &znode->zbranch[n]; 2401 zbr = &znode->zbranch[n];
2403 lnc_free(zbr); 2402 lnc_free(zbr);
@@ -2507,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
2507 struct ubifs_znode *znode; 2506 struct ubifs_znode *znode;
2508 2507
2509 mutex_lock(&c->tnc_mutex); 2508 mutex_lock(&c->tnc_mutex);
2510 dbg_tnc("key %s", DBGKEY(key)); 2509 dbg_tnck(key, "key ");
2511 found = lookup_level0_dirty(c, key, &znode, &n); 2510 found = lookup_level0_dirty(c, key, &znode, &n);
2512 if (found < 0) { 2511 if (found < 0) {
2513 err = found; 2512 err = found;
@@ -2538,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2538 struct ubifs_znode *znode; 2537 struct ubifs_znode *znode;
2539 2538
2540 mutex_lock(&c->tnc_mutex); 2539 mutex_lock(&c->tnc_mutex);
2541 dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); 2540 dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
2542 err = lookup_level0_dirty(c, key, &znode, &n); 2541 err = lookup_level0_dirty(c, key, &znode, &n);
2543 if (err < 0) 2542 if (err < 0)
2544 goto out_unlock; 2543 goto out_unlock;
@@ -2653,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2653 dbg_dump_znode(c, znode); 2652 dbg_dump_znode(c, znode);
2654 goto out_unlock; 2653 goto out_unlock;
2655 } 2654 }
2656 dbg_tnc("removing %s", DBGKEY(key)); 2655 dbg_tnck(key, "removing key ");
2657 } 2656 }
2658 if (k) { 2657 if (k) {
2659 for (i = n + 1 + k; i < znode->child_cnt; i++) 2658 for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2773,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
2773 struct ubifs_zbranch *zbr; 2772 struct ubifs_zbranch *zbr;
2774 union ubifs_key *dkey; 2773 union ubifs_key *dkey;
2775 2774
2776 dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); 2775 dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
2777 ubifs_assert(is_hash_key(c, key)); 2776 ubifs_assert(is_hash_key(c, key));
2778 2777
2779 mutex_lock(&c->tnc_mutex); 2778 mutex_lock(&c->tnc_mutex);
@@ -3332,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3332 3331
3333out_dump: 3332out_dump:
3334 block = key_block(c, key); 3333 block = key_block(c, key);
3335 ubifs_err("inode %lu has size %lld, but there are data at offset %lld " 3334 ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
3336 "(data key %s)", (unsigned long)inode->i_ino, size, 3335 (unsigned long)inode->i_ino, size,
3337 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); 3336 ((loff_t)block) << UBIFS_BLOCK_SHIFT);
3338 mutex_unlock(&c->tnc_mutex); 3337 mutex_unlock(&c->tnc_mutex);
3339 dbg_dump_inode(c, inode); 3338 dbg_dump_inode(c, inode);
3340 dbg_dump_stack(); 3339 dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903e..dc28fe6ec07a 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
328 case UBIFS_XENT_KEY: 328 case UBIFS_XENT_KEY:
329 break; 329 break;
330 default: 330 default:
331 dbg_msg("bad key type at slot %d: %s", i, 331 dbg_msg("bad key type at slot %d: %d",
332 DBGKEY(&zbr->key)); 332 i, key_type(c, &zbr->key));
333 err = 3; 333 err = 3;
334 goto out_dump; 334 goto out_dump;
335 } 335 }
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
475 zbr->offs); 475 zbr->offs);
476 476
477 if (err) { 477 if (err) {
478 dbg_tnc("key %s", DBGKEY(key)); 478 dbg_tnck(key, "key ");
479 return err; 479 return err;
480 } 480 }
481 481
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
484 if (!keys_eq(c, key, &key1)) { 484 if (!keys_eq(c, key, &key1)) {
485 ubifs_err("bad key in node at LEB %d:%d", 485 ubifs_err("bad key in node at LEB %d:%d",
486 zbr->lnum, zbr->offs); 486 zbr->lnum, zbr->offs);
487 dbg_tnc("looked for key %s found node's key %s", 487 dbg_tnck(key, "looked for key ");
488 DBGKEY(key), DBGKEY1(&key1)); 488 dbg_tnck(&key1, "but found node's key ");
489 dbg_dump_node(c, node); 489 dbg_dump_node(c, node);
490 return -EINVAL; 490 return -EINVAL;
491 } 491 }
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
111 xfs_fsize_t bsize; 111 xfs_fsize_t bsize;
112 112
113 bsize = ioend->io_offset + ioend->io_size; 113 bsize = ioend->io_offset + ioend->io_size;
114 isize = MAX(ip->i_size, ip->i_new_size); 114 isize = MIN(i_size_read(VFS_I(ip)), bsize);
115 isize = MIN(isize, bsize);
116 return isize > ip->i_d.di_size ? isize : 0; 115 return isize > ip->i_d.di_size ? isize : 0;
117} 116}
118 117
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
126} 125}
127 126
128/* 127/*
129 * Update on-disk file size now that data has been written to disk. The 128 * Update on-disk file size now that data has been written to disk.
130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
131 * will be the intended file size until i_size is updated. If this write does
132 * not extend all the way to the valid file size then restrict this update to
133 * the end of the write.
134 * 129 *
135 * This function does not block as blocking on the inode lock in IO completion 130 * This function does not block as blocking on the inode lock in IO completion
136 * can lead to IO completion order dependency deadlocks.. If it can't get the 131 * can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
1279 struct xfs_ioend *ioend = iocb->private; 1274 struct xfs_ioend *ioend = iocb->private;
1280 1275
1281 /* 1276 /*
1277 * While the generic direct I/O code updates the inode size, it does
1278 * so only after the end_io handler is called, which means our
1279 * end_io handler thinks the on-disk size is outside the in-core
1280 * size. To prevent this just update it a little bit earlier here.
1281 */
1282 if (offset + size > i_size_read(ioend->io_inode))
1283 i_size_write(ioend->io_inode, offset + size);
1284
1285 /*
1282 * blockdev_direct_IO can return an error even after the I/O 1286 * blockdev_direct_IO can return an error even after the I/O
1283 * completion handler was called. Thus we need to protect 1287 * completion handler was called. Thus we need to protect
1284 * against double-freeing. 1288 * against double-freeing.
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
1340 1344
1341 if (to > inode->i_size) { 1345 if (to > inode->i_size) {
1342 /* 1346 /*
1343 * punch out the delalloc blocks we have already allocated. We 1347 * Punch out the delalloc blocks we have already allocated.
1344 * don't call xfs_setattr() to do this as we may be in the 1348 *
1345 * middle of a multi-iovec write and so the vfs inode->i_size 1349 * Don't bother with xfs_setattr given that nothing can have
1346 * will not match the xfs ip->i_size and so it will zero too 1350 * made it to disk yet as the page is still locked at this
1347 * much. Hence we jus truncate the page cache to zero what is 1351 * point.
1348 * necessary and punch the delalloc blocks directly.
1349 */ 1352 */
1350 struct xfs_inode *ip = XFS_I(inode); 1353 struct xfs_inode *ip = XFS_I(inode);
1351 xfs_fileoff_t start_fsb; 1354 xfs_fileoff_t start_fsb;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
827 if (error) 827 if (error)
828 goto out; 828 goto out;
829 829
830 /*
831 * Commit the last in the sequence of transactions.
832 */
833 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
834 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 830 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
835 xfs_iunlock(dp, XFS_ILOCK_EXCL); 831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
836 832
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
271 dp = args->dp; 271 dp = args->dp;
272 mp = dp->i_mount; 272 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 273 dp->i_d.di_forkoff = forkoff;
274 dp->i_df.if_ext_max =
275 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
276 dp->i_afp->if_ext_max =
277 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
278 274
279 ifp = dp->i_afp; 275 ifp = dp->i_afp;
280 ASSERT(ifp->if_flags & XFS_IFINLINE); 276 ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
326 ASSERT(ip->i_d.di_anextents == 0); 322 ASSERT(ip->i_d.di_anextents == 0);
327 ASSERT(ip->i_afp == NULL); 323 ASSERT(ip->i_afp == NULL);
328 324
329 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331} 326}
332 327
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
389 (args->op_flags & XFS_DA_OP_ADDNAME) || 384 (args->op_flags & XFS_DA_OP_ADDNAME) ||
390 !(mp->m_flags & XFS_MOUNT_ATTR2) || 385 !(mp->m_flags & XFS_MOUNT_ATTR2) ||
391 dp->i_d.di_format == XFS_DINODE_FMT_BTREE); 386 dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
392 dp->i_afp->if_ext_max =
393 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
394 dp->i_df.if_ext_max =
395 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
396 xfs_trans_log_inode(args->trans, dp, 387 xfs_trans_log_inode(args->trans, dp,
397 XFS_ILOG_CORE | XFS_ILOG_ADATA); 388 XFS_ILOG_CORE | XFS_ILOG_ADATA);
398 } 389 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
249} 249}
250 250
251/* 251/*
252* Update the record referred to by cur to the value given 252 * Check if the inode needs to be converted to btree format.
253 */
254static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
255{
256 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
257 XFS_IFORK_NEXTENTS(ip, whichfork) >
258 XFS_IFORK_MAXEXT(ip, whichfork);
259}
260
261/*
262 * Check if the inode should be converted to extent format.
263 */
264static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
265{
266 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
267 XFS_IFORK_NEXTENTS(ip, whichfork) <=
268 XFS_IFORK_MAXEXT(ip, whichfork);
269}
270
271/*
272 * Update the record referred to by cur to the value given
253 * by [off, bno, len, state]. 273 * by [off, bno, len, state].
254 * This either works (return 0) or gets an EFSCORRUPTED error. 274 * This either works (return 0) or gets an EFSCORRUPTED error.
255 */ 275 */
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
683 goto done; 703 goto done;
684 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 704 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
685 } 705 }
686 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 706
687 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 707 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
688 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 708 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
689 bma->firstblock, bma->flist, 709 bma->firstblock, bma->flist,
690 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); 710 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
767 goto done; 787 goto done;
768 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 788 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
769 } 789 }
770 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 790
771 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 791 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
772 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 792 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
773 bma->firstblock, bma->flist, &bma->cur, 1, 793 bma->firstblock, bma->flist, &bma->cur, 1,
774 &tmp_rval, XFS_DATA_FORK); 794 &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
836 goto done; 856 goto done;
837 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 857 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
838 } 858 }
839 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 859
840 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 860 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
841 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 861 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
842 bma->firstblock, bma->flist, &bma->cur, 862 bma->firstblock, bma->flist, &bma->cur,
843 1, &tmp_rval, XFS_DATA_FORK); 863 1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
884 } 904 }
885 905
886 /* convert to a btree if necessary */ 906 /* convert to a btree if necessary */
887 if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 907 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
888 XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
889 int tmp_logflags; /* partial log flag return val */ 908 int tmp_logflags; /* partial log flag return val */
890 909
891 ASSERT(bma->cur == NULL); 910 ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
1421 } 1440 }
1422 1441
1423 /* convert to a btree if necessary */ 1442 /* convert to a btree if necessary */
1424 if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 1443 if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
1425 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
1426 int tmp_logflags; /* partial log flag return val */ 1444 int tmp_logflags; /* partial log flag return val */
1427 1445
1428 ASSERT(cur == NULL); 1446 ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
1812 } 1830 }
1813 1831
1814 /* convert to a btree if necessary */ 1832 /* convert to a btree if necessary */
1815 if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 1833 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
1816 XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
1817 int tmp_logflags; /* partial log flag return val */ 1834 int tmp_logflags; /* partial log flag return val */
1818 1835
1819 ASSERT(bma->cur == NULL); 1836 ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
3037 3054
3038 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); 3056 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3040 ASSERT(ifp->if_ext_max == 3057
3041 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3042 /* 3058 /*
3043 * Make space in the inode incore. 3059 * Make space in the inode incore.
3044 */ 3060 */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
3184 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { 3200 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3185 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; 3201 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3186 3202
3187 if (dfl_forkoff > ip->i_d.di_forkoff) { 3203 if (dfl_forkoff > ip->i_d.di_forkoff)
3188 ip->i_d.di_forkoff = dfl_forkoff; 3204 ip->i_d.di_forkoff = dfl_forkoff;
3189 ip->i_df.if_ext_max =
3190 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3191 ip->i_afp->if_ext_max =
3192 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3193 }
3194 } 3205 }
3195} 3206}
3196 3207
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
3430 int error; /* error return value */ 3441 int error; /* error return value */
3431 3442
3432 ASSERT(XFS_IFORK_Q(ip) == 0); 3443 ASSERT(XFS_IFORK_Q(ip) == 0);
3433 ASSERT(ip->i_df.if_ext_max ==
3434 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3435 3444
3436 mp = ip->i_mount; 3445 mp = ip->i_mount;
3437 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 3446 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
3486 error = XFS_ERROR(EINVAL); 3495 error = XFS_ERROR(EINVAL);
3487 goto error1; 3496 goto error1;
3488 } 3497 }
3489 ip->i_df.if_ext_max = 3498
3490 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3491 ASSERT(ip->i_afp == NULL); 3499 ASSERT(ip->i_afp == NULL);
3492 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 3500 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3493 ip->i_afp->if_ext_max =
3494 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3495 ip->i_afp->if_flags = XFS_IFEXTENTS; 3501 ip->i_afp->if_flags = XFS_IFEXTENTS;
3496 logflags = 0; 3502 logflags = 0;
3497 xfs_bmap_init(&flist, &firstblock); 3503 xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
3535 } else 3541 } else
3536 spin_unlock(&mp->m_sb_lock); 3542 spin_unlock(&mp->m_sb_lock);
3537 } 3543 }
3538 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3544
3545 error = xfs_bmap_finish(&tp, &flist, &committed);
3546 if (error)
3539 goto error2; 3547 goto error2;
3540 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3548 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3541 ASSERT(ip->i_df.if_ext_max ==
3542 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3543 return error;
3544error2: 3549error2:
3545 xfs_bmap_cancel(&flist); 3550 xfs_bmap_cancel(&flist);
3546error1: 3551error1:
3547 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3552 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3548error0: 3553error0:
3549 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 3554 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3550 ASSERT(ip->i_df.if_ext_max ==
3551 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3552 return error; 3555 return error;
3553} 3556}
3554 3557
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
3994 xfs_bmbt_irec_t s; /* internal version of extent */ 3997 xfs_bmbt_irec_t s; /* internal version of extent */
3995 3998
3996#ifndef DEBUG 3999#ifndef DEBUG
3997 if (whichfork == XFS_DATA_FORK) { 4000 if (whichfork == XFS_DATA_FORK)
3998 return S_ISREG(ip->i_d.di_mode) ? 4001 return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
3999 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4000 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4001 }
4002#endif /* !DEBUG */ 4002#endif /* !DEBUG */
4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) 4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4004 return 0; 4004 return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
4010 xfs_bmbt_get_all(ep, &s); 4010 xfs_bmbt_get_all(ep, &s);
4011 rval = s.br_startoff == 0 && s.br_blockcount == 1; 4011 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4012 if (rval && whichfork == XFS_DATA_FORK) 4012 if (rval && whichfork == XFS_DATA_FORK)
4013 ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); 4013 ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
4014 return rval; 4014 return rval;
4015} 4015}
4016 4016
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
4379 XFS_STATS_INC(xs_blk_mapr); 4379 XFS_STATS_INC(xs_blk_mapr);
4380 4380
4381 ifp = XFS_IFORK_PTR(ip, whichfork); 4381 ifp = XFS_IFORK_PTR(ip, whichfork);
4382 ASSERT(ifp->if_ext_max ==
4383 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4384 4382
4385 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4386 error = xfs_iread_extents(NULL, ip, whichfork); 4384 error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
4871 return XFS_ERROR(EIO); 4869 return XFS_ERROR(EIO);
4872 4870
4873 ifp = XFS_IFORK_PTR(ip, whichfork); 4871 ifp = XFS_IFORK_PTR(ip, whichfork);
4874 ASSERT(ifp->if_ext_max ==
4875 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4876 4872
4877 XFS_STATS_INC(xs_blk_mapw); 4873 XFS_STATS_INC(xs_blk_mapw);
4878 4874
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
4981 /* 4977 /*
4982 * Transform from btree to extents, give it cur. 4978 * Transform from btree to extents, give it cur.
4983 */ 4979 */
4984 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 4980 if (xfs_bmap_wants_extents(ip, whichfork)) {
4985 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
4986 int tmp_logflags = 0; 4981 int tmp_logflags = 0;
4987 4982
4988 ASSERT(bma.cur); 4983 ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
4992 if (error) 4987 if (error)
4993 goto error0; 4988 goto error0;
4994 } 4989 }
4995 ASSERT(ifp->if_ext_max == 4990
4996 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4997 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4991 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4998 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); 4992 XFS_IFORK_NEXTENTS(ip, whichfork) >
4993 XFS_IFORK_MAXEXT(ip, whichfork));
4999 error = 0; 4994 error = 0;
5000error0: 4995error0:
5001 /* 4996 /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
5095 5090
5096 ASSERT(len > 0); 5091 ASSERT(len > 0);
5097 ASSERT(nexts >= 0); 5092 ASSERT(nexts >= 0);
5098 ASSERT(ifp->if_ext_max == 5093
5099 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5100 if (!(ifp->if_flags & XFS_IFEXTENTS) && 5094 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5101 (error = xfs_iread_extents(tp, ip, whichfork))) 5095 (error = xfs_iread_extents(tp, ip, whichfork)))
5102 return error; 5096 return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
5322 */ 5316 */
5323 if (!wasdel && xfs_trans_get_block_res(tp) == 0 && 5317 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5324 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5318 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5325 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && 5319 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5320 XFS_IFORK_MAXEXT(ip, whichfork) &&
5326 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5327 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5328 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
5353 } 5348 }
5354 } 5349 }
5355 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5350 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5356 ASSERT(ifp->if_ext_max == 5351
5357 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5358 /* 5352 /*
5359 * Convert to a btree if necessary. 5353 * Convert to a btree if necessary.
5360 */ 5354 */
5361 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5355 if (xfs_bmap_needs_btree(ip, whichfork)) {
5362 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5363 ASSERT(cur == NULL); 5356 ASSERT(cur == NULL);
5364 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, 5357 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5365 &cur, 0, &tmp_logflags, whichfork); 5358 &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
5370 /* 5363 /*
5371 * transform from btree to extents, give it cur 5364 * transform from btree to extents, give it cur
5372 */ 5365 */
5373 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 5366 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5374 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5375 ASSERT(cur != NULL); 5367 ASSERT(cur != NULL);
5376 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, 5368 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5377 whichfork); 5369 whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
5382 /* 5374 /*
5383 * transform from extents to local? 5375 * transform from extents to local?
5384 */ 5376 */
5385 ASSERT(ifp->if_ext_max ==
5386 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5387 error = 0; 5377 error = 0;
5388error0: 5378error0:
5389 /* 5379 /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
5434 if (startblock == HOLESTARTBLOCK) { 5424 if (startblock == HOLESTARTBLOCK) {
5435 mp = ip->i_mount; 5425 mp = ip->i_mount;
5436 out->bmv_block = -1; 5426 out->bmv_block = -1;
5437 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); 5427 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5438 fixlen -= out->bmv_offset; 5428 fixlen -= out->bmv_offset;
5439 if (prealloced && out->bmv_offset + out->bmv_length == end) { 5429 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5440 /* Came to hole at EOF. Trim it. */ 5430 /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
5522 fixlen = XFS_MAXIOFFSET(mp); 5512 fixlen = XFS_MAXIOFFSET(mp);
5523 } else { 5513 } else {
5524 prealloced = 0; 5514 prealloced = 0;
5525 fixlen = ip->i_size; 5515 fixlen = XFS_ISIZE(ip);
5526 } 5516 }
5527 } 5517 }
5528 5518
@@ -5551,7 +5541,7 @@ xfs_getbmap(
5551 5541
5552 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5542 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5553 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5543 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5554 if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { 5544 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5555 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5545 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
5556 if (error) 5546 if (error)
5557 goto out_unlock_iolock; 5547 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
163 163
164 /* Check temp in extent form to max in target */ 164 /* Check temp in extent form to max in target */
165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) 166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
167 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
167 return EINVAL; 168 return EINVAL;
168 169
169 /* Check target in extent form to max in temp */ 170 /* Check target in extent form to max in temp */
170 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 171 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
171 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 172 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
173 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
172 return EINVAL; 174 return EINVAL;
173 175
174 /* 176 /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
180 * (a common defrag case) which will occur when the temp inode is in 182 * (a common defrag case) which will occur when the temp inode is in
181 * extent format... 183 * extent format...
182 */ 184 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 185 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 ((XFS_IFORK_BOFF(ip) && 186 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || 187 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) 188 return EINVAL;
187 return EINVAL; 189 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
190 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
191 return EINVAL;
192 }
188 193
189 /* Reciprocal target->temp btree format checks */ 194 /* Reciprocal target->temp btree format checks */
190 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 195 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
191 ((XFS_IFORK_BOFF(tip) && 196 if (XFS_IFORK_BOFF(tip) &&
192 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || 197 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
193 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) 198 return EINVAL;
194 return EINVAL; 199
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
195 204
196 return 0; 205 return 0;
197} 206}
@@ -349,16 +358,6 @@ xfs_swap_extents(
349 *tifp = *tempifp; /* struct copy */ 358 *tifp = *tempifp; /* struct copy */
350 359
351 /* 360 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
362 * Fix the on-disk inode values 361 * Fix the on-disk inode values
363 */ 362 */
364 tmp = (__uint64_t)ip->i_d.di_nblocks; 363 tmp = (__uint64_t)ip->i_d.di_nblocks;
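With if_ext_max gone, xfs_swap_extents() no longer has to patch per-fork maxima after swapping the fork contents: XFS_IFORK_MAXEXT() derives the capacity from the current fork size each time it is evaluated. A sketch of the invariant the format checks above protect (illustrative, not code from the patch):

    /* An extent-format data fork must still fit its records inline. */
    ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS ||
           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));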
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b3..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
327 mp->m_rtdev_targp : mp->m_ddev_targp; 327 mp->m_rtdev_targp : mp->m_ddev_targp;
328 if ((iocb->ki_pos & target->bt_smask) || 328 if ((iocb->ki_pos & target->bt_smask) ||
329 (size & target->bt_smask)) { 329 (size & target->bt_smask)) {
330 if (iocb->ki_pos == ip->i_size) 330 if (iocb->ki_pos == i_size_read(inode))
331 return 0; 331 return 0;
332 return -XFS_ERROR(EINVAL); 332 return -XFS_ERROR(EINVAL);
333 } 333 }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
412 return ret; 412 return ret;
413} 413}
414 414
415STATIC void
416xfs_aio_write_isize_update(
417 struct inode *inode,
418 loff_t *ppos,
419 ssize_t bytes_written)
420{
421 struct xfs_inode *ip = XFS_I(inode);
422 xfs_fsize_t isize = i_size_read(inode);
423
424 if (bytes_written > 0)
425 XFS_STATS_ADD(xs_write_bytes, bytes_written);
426
427 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
428 *ppos > isize))
429 *ppos = isize;
430
431 if (*ppos > ip->i_size) {
432 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
433 if (*ppos > ip->i_size)
434 ip->i_size = *ppos;
435 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
436 }
437}
438
439/*
440 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
441 * part of the I/O may have been written to disk before the error occurred. In
442 * this case the on-disk file size may have been adjusted beyond the in-memory
443 * file size and now needs to be truncated back.
444 */
445STATIC void
446xfs_aio_write_newsize_update(
447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
449{
450 if (new_size == ip->i_new_size) {
451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
454 if (ip->i_d.di_size > ip->i_size)
455 ip->i_d.di_size = ip->i_size;
456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
457 }
458}
459
460/* 415/*
461 * xfs_file_splice_write() does not use xfs_rw_ilock() because 416 * xfs_file_splice_write() does not use xfs_rw_ilock() because
462 * generic_file_splice_write() takes the i_mutex itself. This, in theory, 417 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
475{ 430{
476 struct inode *inode = outfilp->f_mapping->host; 431 struct inode *inode = outfilp->f_mapping->host;
477 struct xfs_inode *ip = XFS_I(inode); 432 struct xfs_inode *ip = XFS_I(inode);
478 xfs_fsize_t new_size;
479 int ioflags = 0; 433 int ioflags = 0;
480 ssize_t ret; 434 ssize_t ret;
481 435
@@ -489,19 +443,12 @@ xfs_file_splice_write(
489 443
490 xfs_ilock(ip, XFS_IOLOCK_EXCL); 444 xfs_ilock(ip, XFS_IOLOCK_EXCL);
491 445
492 new_size = *ppos + count;
493
494 xfs_ilock(ip, XFS_ILOCK_EXCL);
495 if (new_size > ip->i_size)
496 ip->i_new_size = new_size;
497 xfs_iunlock(ip, XFS_ILOCK_EXCL);
498
499 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 446 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
500 447
501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 448 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
449 if (ret > 0)
450 XFS_STATS_ADD(xs_write_bytes, ret);
502 451
503 xfs_aio_write_isize_update(inode, ppos, ret);
504 xfs_aio_write_newsize_update(ip, new_size);
505 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 452 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
506 return ret; 453 return ret;
507} 454}
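The splice path no longer precomputes a speculative i_new_size: the generic write code keeps the VFS inode size current as data is copied in, and XFS reads it back through XFS_ISIZE(). Roughly the idea (a sketch of the generic behaviour, not code added by this patch):

    /* Done for XFS by the generic buffered-write / write_end paths. */
    if (pos + ret > i_size_read(inode))
            i_size_write(inode, pos + ret);
    /* For regular files XFS_ISIZE(ip) now returns i_size_read(VFS_I(ip)),
     * while the on-disk di_size only moves forward at writeback time. */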
@@ -689,28 +636,26 @@ out_lock:
689/* 636/*
690 * Common pre-write limit and setup checks. 637 * Common pre-write limit and setup checks.
691 * 638 *
 692 * Returns with iolock held according to @iolock. 639 * Called with the iolock held either shared or exclusive according to
640 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
641 * if called for a direct write beyond i_size.
693 */ 642 */
694STATIC ssize_t 643STATIC ssize_t
695xfs_file_aio_write_checks( 644xfs_file_aio_write_checks(
696 struct file *file, 645 struct file *file,
697 loff_t *pos, 646 loff_t *pos,
698 size_t *count, 647 size_t *count,
699 xfs_fsize_t *new_sizep,
700 int *iolock) 648 int *iolock)
701{ 649{
702 struct inode *inode = file->f_mapping->host; 650 struct inode *inode = file->f_mapping->host;
703 struct xfs_inode *ip = XFS_I(inode); 651 struct xfs_inode *ip = XFS_I(inode);
704 xfs_fsize_t new_size;
705 int error = 0; 652 int error = 0;
706 653
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 654 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart: 655restart:
710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 656 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
711 if (error) { 657 if (error) {
712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 658 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
713 *iolock = 0;
714 return error; 659 return error;
715 } 660 }
716 661
@@ -720,36 +665,21 @@ restart:
720 /* 665 /*
721 * If the offset is beyond the size of the file, we need to zero any 666 * If the offset is beyond the size of the file, we need to zero any
722 * blocks that fall between the existing EOF and the start of this 667 * blocks that fall between the existing EOF and the start of this
723 * write. There is no need to issue zeroing if another in-flght IO ends 668 * write. If zeroing is needed and we are currently holding the
724 * at or before this one If zeronig is needed and we are currently 669 * iolock shared, we need to update it to exclusive which involves
725 * holding the iolock shared, we need to update it to exclusive which 670 * dropping all locks and relocking to maintain correct locking order.
726 * involves dropping all locks and relocking to maintain correct locking 671 * If we do this, restart the function to ensure all checks and values
727 * order. If we do this, restart the function to ensure all checks and 672 * are still valid.
728 * values are still valid.
729 */ 673 */
730 if ((ip->i_new_size && *pos > ip->i_new_size) || 674 if (*pos > i_size_read(inode)) {
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) { 675 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 676 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL; 677 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 678 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart; 679 goto restart;
737 } 680 }
738 error = -xfs_zero_eof(ip, *pos, ip->i_size); 681 error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
739 } 682 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
752
753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
754 if (error) 684 if (error)
755 return error; 685 return error;
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
794 const struct iovec *iovp, 724 const struct iovec *iovp,
795 unsigned long nr_segs, 725 unsigned long nr_segs,
796 loff_t pos, 726 loff_t pos,
797 size_t ocount, 727 size_t ocount)
798 xfs_fsize_t *new_size,
799 int *iolock)
800{ 728{
801 struct file *file = iocb->ki_filp; 729 struct file *file = iocb->ki_filp;
802 struct address_space *mapping = file->f_mapping; 730 struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
806 ssize_t ret = 0; 734 ssize_t ret = 0;
807 size_t count = ocount; 735 size_t count = ocount;
808 int unaligned_io = 0; 736 int unaligned_io = 0;
737 int iolock;
809 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
810 mp->m_rtdev_targp : mp->m_ddev_targp; 739 mp->m_rtdev_targp : mp->m_ddev_targp;
811 740
812 *iolock = 0;
813 if ((pos & target->bt_smask) || (count & target->bt_smask)) 741 if ((pos & target->bt_smask) || (count & target->bt_smask))
814 return -XFS_ERROR(EINVAL); 742 return -XFS_ERROR(EINVAL);
815 743
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
824 * EOF zeroing cases and fill out the new inode size as appropriate. 752 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */ 753 */
826 if (unaligned_io || mapping->nrpages) 754 if (unaligned_io || mapping->nrpages)
827 *iolock = XFS_IOLOCK_EXCL; 755 iolock = XFS_IOLOCK_EXCL;
828 else 756 else
829 *iolock = XFS_IOLOCK_SHARED; 757 iolock = XFS_IOLOCK_SHARED;
830 xfs_rw_ilock(ip, *iolock); 758 xfs_rw_ilock(ip, iolock);
831 759
832 /* 760 /*
833 * Recheck if there are cached pages that need invalidate after we got 761 * Recheck if there are cached pages that need invalidate after we got
834 * the iolock to protect against other threads adding new pages while 762 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock. 763 * we were waiting for the iolock.
836 */ 764 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { 765 if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock); 766 xfs_rw_iunlock(ip, iolock);
839 *iolock = XFS_IOLOCK_EXCL; 767 iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock); 768 xfs_rw_ilock(ip, iolock);
841 } 769 }
842 770
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 771 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
844 if (ret) 772 if (ret)
845 return ret; 773 goto out;
846 774
847 if (mapping->nrpages) { 775 if (mapping->nrpages) {
848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 776 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
849 FI_REMAPF_LOCKED); 777 FI_REMAPF_LOCKED);
850 if (ret) 778 if (ret)
851 return ret; 779 goto out;
852 } 780 }
853 781
854 /* 782 /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
857 */ 785 */
858 if (unaligned_io) 786 if (unaligned_io)
859 inode_dio_wait(inode); 787 inode_dio_wait(inode);
860 else if (*iolock == XFS_IOLOCK_EXCL) { 788 else if (iolock == XFS_IOLOCK_EXCL) {
861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 789 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
862 *iolock = XFS_IOLOCK_SHARED; 790 iolock = XFS_IOLOCK_SHARED;
863 } 791 }
864 792
865 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 793 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
866 ret = generic_file_direct_write(iocb, iovp, 794 ret = generic_file_direct_write(iocb, iovp,
867 &nr_segs, pos, &iocb->ki_pos, count, ocount); 795 &nr_segs, pos, &iocb->ki_pos, count, ocount);
868 796
797out:
798 xfs_rw_iunlock(ip, iolock);
799
869 /* No fallback to buffered IO on errors for XFS. */ 800 /* No fallback to buffered IO on errors for XFS. */
870 ASSERT(ret < 0 || ret == count); 801 ASSERT(ret < 0 || ret == count);
871 return ret; 802 return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
877 const struct iovec *iovp, 808 const struct iovec *iovp,
878 unsigned long nr_segs, 809 unsigned long nr_segs,
879 loff_t pos, 810 loff_t pos,
880 size_t ocount, 811 size_t ocount)
881 xfs_fsize_t *new_size,
882 int *iolock)
883{ 812{
884 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
885 struct address_space *mapping = file->f_mapping; 814 struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
887 struct xfs_inode *ip = XFS_I(inode); 816 struct xfs_inode *ip = XFS_I(inode);
888 ssize_t ret; 817 ssize_t ret;
889 int enospc = 0; 818 int enospc = 0;
819 int iolock = XFS_IOLOCK_EXCL;
890 size_t count = ocount; 820 size_t count = ocount;
891 821
892 *iolock = XFS_IOLOCK_EXCL; 822 xfs_rw_ilock(ip, iolock);
893 xfs_rw_ilock(ip, *iolock);
894 823
895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 824 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
896 if (ret) 825 if (ret)
897 return ret; 826 goto out;
898 827
899 /* We can write back this queue in page reclaim */ 828 /* We can write back this queue in page reclaim */
900 current->backing_dev_info = mapping->backing_dev_info; 829 current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
908 * page locks and retry *once* 837 * page locks and retry *once*
909 */ 838 */
910 if (ret == -ENOSPC && !enospc) { 839 if (ret == -ENOSPC && !enospc) {
911 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
912 if (ret)
913 return ret;
914 enospc = 1; 840 enospc = 1;
915 goto write_retry; 841 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
842 if (!ret)
843 goto write_retry;
916 } 844 }
845
917 current->backing_dev_info = NULL; 846 current->backing_dev_info = NULL;
847out:
848 xfs_rw_iunlock(ip, iolock);
918 return ret; 849 return ret;
919} 850}
920 851
@@ -930,9 +861,7 @@ xfs_file_aio_write(
930 struct inode *inode = mapping->host; 861 struct inode *inode = mapping->host;
931 struct xfs_inode *ip = XFS_I(inode); 862 struct xfs_inode *ip = XFS_I(inode);
932 ssize_t ret; 863 ssize_t ret;
933 int iolock;
934 size_t ocount = 0; 864 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
936 865
937 XFS_STATS_INC(xs_write_calls); 866 XFS_STATS_INC(xs_write_calls);
938 867
@@ -951,33 +880,22 @@ xfs_file_aio_write(
951 return -EIO; 880 return -EIO;
952 881
953 if (unlikely(file->f_flags & O_DIRECT)) 882 if (unlikely(file->f_flags & O_DIRECT))
954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 883 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
955 ocount, &new_size, &iolock);
956 else 884 else
957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 885 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
958 ocount, &new_size, &iolock); 886 ocount);
959
960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
961 887
962 if (ret <= 0) 888 if (ret > 0) {
963 goto out_unlock; 889 ssize_t err;
964 890
965 /* Handle various SYNC-type writes */ 891 XFS_STATS_ADD(xs_write_bytes, ret);
966 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
967 loff_t end = pos + ret - 1;
968 int error;
969 892
970 xfs_rw_iunlock(ip, iolock); 893 /* Handle various SYNC-type writes */
971 error = xfs_file_fsync(file, pos, end, 894 err = generic_write_sync(file, pos, ret);
972 (file->f_flags & __O_SYNC) ? 0 : 1); 895 if (err < 0)
973 xfs_rw_ilock(ip, iolock); 896 ret = err;
974 if (error)
975 ret = error;
976 } 897 }
977 898
978out_unlock:
979 xfs_aio_write_newsize_update(ip, new_size);
980 xfs_rw_iunlock(ip, iolock);
981 return ret; 899 return ret;
982} 900}
983 901
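The dio and buffered helpers now take and drop the iolock themselves, so the top-level write path only accounts the bytes written and delegates the O_SYNC/O_DSYNC handling to generic_write_sync(). For reference, that common helper of this era looks roughly like the following (paraphrased, not part of this patch):

    static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count)
    {
            if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
                    return 0;
            return vfs_fsync_range(file, pos, pos + count - 1,
                                   (file->f_flags & __O_SYNC) ? 0 : 1);
    }

so the range flush that xfs_file_aio_write used to issue by calling xfs_file_fsync() directly is now expressed through the shared VFS helper.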
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
90 90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first, 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last); 93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 } 94 }
95 return 0; 95 return 0;
96} 96}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7ff..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
77 77
78 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
79 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
80 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(!xfs_isiflocked(ip));
81 ASSERT(ip->i_ino == 0); 81 ASSERT(ip->i_ino == 0);
82 82
83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
94 ip->i_update_core = 0; 94 ip->i_update_core = 0;
95 ip->i_delayed_blks = 0; 95 ip->i_delayed_blks = 0;
96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
97 ip->i_size = 0;
98 ip->i_new_size = 0;
99 97
100 return ip; 98 return ip;
101} 99}
@@ -150,7 +148,7 @@ xfs_inode_free(
150 /* asserts to verify all state is correct here */ 148 /* asserts to verify all state is correct here */
151 ASSERT(atomic_read(&ip->i_pincount) == 0); 149 ASSERT(atomic_read(&ip->i_pincount) == 0);
152 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 150 ASSERT(!spin_is_locked(&ip->i_flags_lock));
153 ASSERT(completion_done(&ip->i_flush)); 151 ASSERT(!xfs_isiflocked(ip));
154 152
155 /* 153 /*
156 * Because we use RCU freeing we need to ensure the inode always 154 * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
450 448
451 *ipp = ip; 449 *ipp = ip;
452 450
453 ASSERT(ip->i_df.if_ext_max ==
454 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
455 /* 451 /*
456 * If we have a real type for an on-disk inode, we can set ops(&unlock) 452 * If we have a real type for an on-disk inode, we can set ops(&unlock)
457 * now. If it's a new inode being created, xfs_ialloc will handle it. 453 * now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
715 return 0; 711 return 0;
716} 712}
717#endif 713#endif
714
715void
716__xfs_iflock(
717 struct xfs_inode *ip)
718{
719 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
720 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
721
722 do {
723 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
724 if (xfs_isiflocked(ip))
725 io_schedule();
726 } while (!xfs_iflock_nowait(ip));
727
728 finish_wait(wq, &wait.wait);
729}
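The flush "lock" is now a bit (__XFS_IFLOCK_BIT) in the widened i_flags word with a bit waitqueue, replacing the struct completion i_flush. The pairing seen by callers is unchanged; schematically (a sketch, assuming the usual convention that the flush lock taken before xfs_iflush() is released by the flush completion path):

    xfs_iflock(ip);              /* test-and-set XFS_IFLOCK, sleep in __xfs_iflock() if busy */
    error = xfs_iflush(ip, SYNC_WAIT);
    /* ... and the flush completion path eventually does ... */
    xfs_ifunlock(ip);            /* clear the bit, wake_up_bit() the next waiter */

Because __xfs_iflock() uses prepare_to_wait_exclusive(), each unlock wakes a single waiter, which then retries the test-and-set.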
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc32848..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
299{ 299{
300 xfs_attr_shortform_t *atp; 300 xfs_attr_shortform_t *atp;
301 int size; 301 int size;
302 int error; 302 int error = 0;
303 xfs_fsize_t di_size; 303 xfs_fsize_t di_size;
304 ip->i_df.if_ext_max =
305 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
306 error = 0;
307 304
308 if (unlikely(be32_to_cpu(dip->di_nextents) + 305 if (unlikely(be32_to_cpu(dip->di_nextents) +
309 be16_to_cpu(dip->di_anextents) > 306 be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
350 return XFS_ERROR(EFSCORRUPTED); 347 return XFS_ERROR(EFSCORRUPTED);
351 } 348 }
352 ip->i_d.di_size = 0; 349 ip->i_d.di_size = 0;
353 ip->i_size = 0;
354 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 350 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
355 break; 351 break;
356 352
@@ -409,10 +405,10 @@ xfs_iformat(
409 } 405 }
410 if (!XFS_DFORK_Q(dip)) 406 if (!XFS_DFORK_Q(dip))
411 return 0; 407 return 0;
408
412 ASSERT(ip->i_afp == NULL); 409 ASSERT(ip->i_afp == NULL);
413 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 410 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
414 ip->i_afp->if_ext_max = 411
415 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
416 switch (dip->di_aformat) { 412 switch (dip->di_aformat) {
417 case XFS_DINODE_FMT_LOCAL: 413 case XFS_DINODE_FMT_LOCAL:
418 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 414 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
604 * or the number of extents is greater than the number of 600 * or the number of extents is greater than the number of
605 * blocks. 601 * blocks.
606 */ 602 */
607 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 603 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
608 || XFS_BMDR_SPACE_CALC(nrecs) > 604 XFS_IFORK_MAXEXT(ip, whichfork) ||
609 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 605 XFS_BMDR_SPACE_CALC(nrecs) >
610 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 606 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
607 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
611 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", 608 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
612 (unsigned long long) ip->i_ino); 609 (unsigned long long) ip->i_ino);
613 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 610 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
835 * with the uninitialized part of it. 832 * with the uninitialized part of it.
836 */ 833 */
837 ip->i_d.di_mode = 0; 834 ip->i_d.di_mode = 0;
838 /*
839 * Initialize the per-fork minima and maxima for a new
840 * inode here. xfs_iformat will do it for old inodes.
841 */
842 ip->i_df.if_ext_max =
843 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
844 } 835 }
845 836
846 /* 837 /*
@@ -861,7 +852,6 @@ xfs_iread(
861 } 852 }
862 853
863 ip->i_delayed_blks = 0; 854 ip->i_delayed_blks = 0;
864 ip->i_size = ip->i_d.di_size;
865 855
866 /* 856 /*
867 * Mark the buffer containing the inode as something to keep 857 * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
1051 } 1041 }
1052 1042
1053 ip->i_d.di_size = 0; 1043 ip->i_d.di_size = 0;
1054 ip->i_size = 0;
1055 ip->i_d.di_nextents = 0; 1044 ip->i_d.di_nextents = 0;
1056 ASSERT(ip->i_d.di_nblocks == 0); 1045 ASSERT(ip->i_d.di_nblocks == 0);
1057 1046
@@ -1166,52 +1155,6 @@ xfs_ialloc(
1166} 1155}
1167 1156
1168/* 1157/*
1169 * Check to make sure that there are no blocks allocated to the
1170 * file beyond the size of the file. We don't check this for
1171 * files with fixed size extents or real time extents, but we
1172 * at least do it for regular files.
1173 */
1174#ifdef DEBUG
1175STATIC void
1176xfs_isize_check(
1177 struct xfs_inode *ip,
1178 xfs_fsize_t isize)
1179{
1180 struct xfs_mount *mp = ip->i_mount;
1181 xfs_fileoff_t map_first;
1182 int nimaps;
1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1185
1186 if (!S_ISREG(ip->i_d.di_mode))
1187 return;
1188
1189 if (XFS_IS_REALTIME_INODE(ip))
1190 return;
1191
1192 if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
1193 return;
1194
1195 nimaps = 2;
1196 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1197 /*
1198 * The filesystem could be shutting down, so bmapi may return
1199 * an error.
1200 */
1201 error = xfs_bmapi_read(ip, map_first,
1202 (XFS_B_TO_FSB(mp,
1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1205 if (error)
1206 return;
1207 ASSERT(nimaps == 1);
1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1209}
1210#else /* DEBUG */
1211#define xfs_isize_check(ip, isize)
1212#endif /* DEBUG */
1213
1214/*
1215 * Free up the underlying blocks past new_size. The new size must be smaller 1158 * Free up the underlying blocks past new_size. The new size must be smaller
1216 * than the current size. This routine can be used both for the attribute and 1159 * than the current size. This routine can be used both for the attribute and
1217 * data fork, and does not modify the inode size, which is left to the caller. 1160 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
1252 int done = 0; 1195 int done = 0;
1253 1196
1254 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1197 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1255 ASSERT(new_size <= ip->i_size); 1198 ASSERT(new_size <= XFS_ISIZE(ip));
1256 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1199 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1257 ASSERT(ip->i_itemp != NULL); 1200 ASSERT(ip->i_itemp != NULL);
1258 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1201 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1259 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1202 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1260 1203
1204 trace_xfs_itruncate_extents_start(ip, new_size);
1205
1261 /* 1206 /*
1262 * Since it is possible for space to become allocated beyond 1207 * Since it is possible for space to become allocated beyond
1263 * the end of the file (in a crash where the space is allocated 1208 * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
1325 goto out; 1270 goto out;
1326 } 1271 }
1327 1272
1273 /*
1274 * Always re-log the inode so that our permanent transaction can keep
1275 * on rolling it forward in the log.
1276 */
1277 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1278
1279 trace_xfs_itruncate_extents_end(ip, new_size);
1280
1328out: 1281out:
1329 *tpp = tp; 1282 *tpp = tp;
1330 return error; 1283 return error;
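xfs_itruncate_extents() now re-logs the inode itself, and, as the next hunk shows, xfs_itruncate_data() goes away entirely, so updating the in-core and on-disk sizes becomes the caller's responsibility before any blocks are freed. A sketch of the resulting caller pattern (an assumption, modelled on xfs_setattr_size after this series):

    /* Make the new size permanent before freeing any blocks past it. */
    ip->i_d.di_size = newsize;
    i_size_write(VFS_I(ip), newsize);
    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

    error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);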
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
1338 goto out; 1291 goto out;
1339} 1292}
1340 1293
1341int
1342xfs_itruncate_data(
1343 struct xfs_trans **tpp,
1344 struct xfs_inode *ip,
1345 xfs_fsize_t new_size)
1346{
1347 int error;
1348
1349 trace_xfs_itruncate_data_start(ip, new_size);
1350
1351 /*
1352 * The first thing we do is set the size to new_size permanently on
1353 * disk. This way we don't have to worry about anyone ever being able
1354 * to look at the data being freed even in the face of a crash.
1355 * What we're getting around here is the case where we free a block, it
1356 * is allocated to another file, it is written to, and then we crash.
1357 * If the new data gets written to the file but the log buffers
1358 * containing the free and reallocation don't, then we'd end up with
1359 * garbage in the blocks being freed. As long as we make the new_size
1360 * permanent before actually freeing any blocks it doesn't matter if
1361 * they get written to.
1362 */
1363 if (ip->i_d.di_nextents > 0) {
1364 /*
1365 * If we are not changing the file size then do not update
1366 * the on-disk file size - we may be called from
1367 * xfs_inactive_free_eofblocks(). If we update the on-disk
1368 * file size and then the system crashes before the contents
1369 * of the file are flushed to disk then the files may be
1370 * full of holes (ie NULL files bug).
1371 */
1372 if (ip->i_size != new_size) {
1373 ip->i_d.di_size = new_size;
1374 ip->i_size = new_size;
1375 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1376 }
1377 }
1378
1379 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1380 if (error)
1381 return error;
1382
1383 /*
1384 * If we are not changing the file size then do not update the on-disk
1385 * file size - we may be called from xfs_inactive_free_eofblocks().
1386 * If we update the on-disk file size and then the system crashes
1387 * before the contents of the file are flushed to disk then the files
1388 * may be full of holes (ie NULL files bug).
1389 */
1390 xfs_isize_check(ip, new_size);
1391 if (ip->i_size != new_size) {
1392 ip->i_d.di_size = new_size;
1393 ip->i_size = new_size;
1394 }
1395
1396 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1397 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1398
1399 /*
1400 * Always re-log the inode so that our permanent transaction can keep
1401 * on rolling it forward in the log.
1402 */
1403 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1404
1405 trace_xfs_itruncate_data_end(ip, new_size);
1406 return 0;
1407}
1408
1409/* 1294/*
1410 * This is called when the inode's link count goes to 0. 1295 * This is called when the inode's link count goes to 0.
1411 * We place the on-disk inode on a list in the AGI. It 1296 * We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
1824 ASSERT(ip->i_d.di_nlink == 0); 1709 ASSERT(ip->i_d.di_nlink == 0);
1825 ASSERT(ip->i_d.di_nextents == 0); 1710 ASSERT(ip->i_d.di_nextents == 0);
1826 ASSERT(ip->i_d.di_anextents == 0); 1711 ASSERT(ip->i_d.di_anextents == 0);
1827 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1712 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1828 (!S_ISREG(ip->i_d.di_mode)));
1829 ASSERT(ip->i_d.di_nblocks == 0); 1713 ASSERT(ip->i_d.di_nblocks == 0);
1830 1714
1831 /* 1715 /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
1844 ip->i_d.di_flags = 0; 1728 ip->i_d.di_flags = 0;
1845 ip->i_d.di_dmevmask = 0; 1729 ip->i_d.di_dmevmask = 0;
1846 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1730 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
1847 ip->i_df.if_ext_max =
1848 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1849 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1731 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1850 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1732 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1851 /* 1733 /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
2151 * once someone is waiting for it to be unpinned. 2033 * once someone is waiting for it to be unpinned.
2152 */ 2034 */
2153static void 2035static void
2154xfs_iunpin_nowait( 2036xfs_iunpin(
2155 struct xfs_inode *ip) 2037 struct xfs_inode *ip)
2156{ 2038{
2157 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2039 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
2163 2045
2164} 2046}
2165 2047
2048static void
2049__xfs_iunpin_wait(
2050 struct xfs_inode *ip)
2051{
2052 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2053 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2054
2055 xfs_iunpin(ip);
2056
2057 do {
2058 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2059 if (xfs_ipincount(ip))
2060 io_schedule();
2061 } while (xfs_ipincount(ip));
2062 finish_wait(wq, &wait.wait);
2063}
2064
2166void 2065void
2167xfs_iunpin_wait( 2066xfs_iunpin_wait(
2168 struct xfs_inode *ip) 2067 struct xfs_inode *ip)
2169{ 2068{
2170 if (xfs_ipincount(ip)) { 2069 if (xfs_ipincount(ip))
2171 xfs_iunpin_nowait(ip); 2070 __xfs_iunpin_wait(ip);
2172 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2173 }
2174} 2071}
2175 2072
2176/* 2073/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
2510 XFS_STATS_INC(xs_iflush_count); 2407 XFS_STATS_INC(xs_iflush_count);
2511 2408
2512 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2409 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2513 ASSERT(!completion_done(&ip->i_flush)); 2410 ASSERT(xfs_isiflocked(ip));
2514 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2411 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2515 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2412 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2516 2413
2517 iip = ip->i_itemp; 2414 iip = ip->i_itemp;
2518 mp = ip->i_mount; 2415 mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
2529 * out for us if they occur after the log force completes. 2426 * out for us if they occur after the log force completes.
2530 */ 2427 */
2531 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { 2428 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2532 xfs_iunpin_nowait(ip); 2429 xfs_iunpin(ip);
2533 xfs_ifunlock(ip); 2430 xfs_ifunlock(ip);
2534 return EAGAIN; 2431 return EAGAIN;
2535 } 2432 }
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
2626#endif 2523#endif
2627 2524
2628 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2525 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2629 ASSERT(!completion_done(&ip->i_flush)); 2526 ASSERT(xfs_isiflocked(ip));
2630 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2527 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2631 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2528 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2632 2529
2633 iip = ip->i_itemp; 2530 iip = ip->i_itemp;
2634 mp = ip->i_mount; 2531 mp = ip->i_mount;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba37..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
66 struct xfs_btree_block *if_broot; /* file's incore btree root */ 66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */
70 union { 69 union {
71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
206 ((w) == XFS_DATA_FORK ? \ 205 ((w) == XFS_DATA_FORK ? \
207 ((ip)->i_d.di_nextents = (n)) : \ 206 ((ip)->i_d.di_nextents = (n)) : \
208 ((ip)->i_d.di_anextents = (n))) 207 ((ip)->i_d.di_anextents = (n)))
209 208#define XFS_IFORK_MAXEXT(ip, w) \
209 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
210 210
211 211
212#ifdef __KERNEL__ 212#ifdef __KERNEL__
213 213
214struct bhv_desc;
215struct xfs_buf; 214struct xfs_buf;
216struct xfs_bmap_free; 215struct xfs_bmap_free;
217struct xfs_bmbt_irec; 216struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
220struct xfs_trans; 219struct xfs_trans;
221struct xfs_dquot; 220struct xfs_dquot;
222 221
223typedef struct dm_attrs_s {
224 __uint32_t da_dmevmask; /* DMIG event mask */
225 __uint16_t da_dmstate; /* DMIG state info */
226 __uint16_t da_pad; /* DMIG extra padding */
227} dm_attrs_t;
228
229typedef struct xfs_inode { 222typedef struct xfs_inode {
230 /* Inode linking and identification information. */ 223 /* Inode linking and identification information. */
231 struct xfs_mount *i_mount; /* fs mount struct ptr */ 224 struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
244 struct xfs_inode_log_item *i_itemp; /* logging information */ 237 struct xfs_inode_log_item *i_itemp; /* logging information */
245 mrlock_t i_lock; /* inode lock */ 238 mrlock_t i_lock; /* inode lock */
246 mrlock_t i_iolock; /* inode IO lock */ 239 mrlock_t i_iolock; /* inode IO lock */
247 struct completion i_flush; /* inode flush completion q */
248 atomic_t i_pincount; /* inode pin count */ 240 atomic_t i_pincount; /* inode pin count */
249 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
250 spinlock_t i_flags_lock; /* inode i_flags lock */ 241 spinlock_t i_flags_lock; /* inode i_flags lock */
251 /* Miscellaneous state. */ 242 /* Miscellaneous state. */
252 unsigned short i_flags; /* see defined flags below */ 243 unsigned long i_flags; /* see defined flags below */
253 unsigned char i_update_core; /* timestamps/size is dirty */ 244 unsigned char i_update_core; /* timestamps/size is dirty */
254 unsigned int i_delayed_blks; /* count of delay alloc blks */ 245 unsigned int i_delayed_blks; /* count of delay alloc blks */
255 246
256 xfs_icdinode_t i_d; /* most of ondisk inode */ 247 xfs_icdinode_t i_d; /* most of ondisk inode */
257 248
258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */
260
261 /* VFS inode */ 249 /* VFS inode */
262 struct inode i_vnode; /* embedded VFS inode */ 250 struct inode i_vnode; /* embedded VFS inode */
263} xfs_inode_t; 251} xfs_inode_t;
264 252
265#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
266 (ip)->i_size : (ip)->i_d.di_size;
267
268/* Convert from vfs inode to xfs inode */ 253/* Convert from vfs inode to xfs inode */
269static inline struct xfs_inode *XFS_I(struct inode *inode) 254static inline struct xfs_inode *XFS_I(struct inode *inode)
270{ 255{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
278} 263}
279 264
280/* 265/*
266 * For regular files we only update the on-disk filesize when actually
267 * writing data back to disk. Until then only the copy in the VFS inode
268 * is uptodate.
269 */
270static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
271{
272 if (S_ISREG(ip->i_d.di_mode))
273 return i_size_read(VFS_I(ip));
274 return ip->i_d.di_size;
275}
276
277/*
281 * i_flags helper functions 278 * i_flags helper functions
282 */ 279 */
283static inline void 280static inline void
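Besides moving the in-core size into the VFS inode, this replaces the old XFS_ISIZE macro removed above, which was fragile: it had no enclosing parentheses and ended in a semicolon, so it only worked in a narrow set of statement contexts. As an inline function it composes safely in expressions, e.g. (illustrative use, mirroring call sites elsewhere in this diff):

    if (offset + count <= XFS_ISIZE(ip))    /* would not even parse with the old macro */
            return 0;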
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
331 return ret; 328 return ret;
332} 329}
333 330
331static inline int
332xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
333{
334 int ret;
335
336 spin_lock(&ip->i_flags_lock);
337 ret = ip->i_flags & flags;
338 if (!ret)
339 ip->i_flags |= flags;
340 spin_unlock(&ip->i_flags_lock);
341 return ret;
342}
343
334/* 344/*
335 * Project quota id helpers (previously projid was 16bit only 345 * Project quota id helpers (previously projid was 16bit only
336 * and using two 16bit values to hold new 32bit projid was chosen 346 * and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
351} 361}
352 362
353/* 363/*
354 * Manage the i_flush queue embedded in the inode. This completion
355 * queue synchronizes processes attempting to flush the in-core
356 * inode back to disk.
357 */
358static inline void xfs_iflock(xfs_inode_t *ip)
359{
360 wait_for_completion(&ip->i_flush);
361}
362
363static inline int xfs_iflock_nowait(xfs_inode_t *ip)
364{
365 return try_wait_for_completion(&ip->i_flush);
366}
367
368static inline void xfs_ifunlock(xfs_inode_t *ip)
369{
370 complete(&ip->i_flush);
371}
372
373/*
374 * In-core inode flags. 364 * In-core inode flags.
375 */ 365 */
376#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ 366#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
377#define XFS_ISTALE 0x0002 /* inode has been staled */ 367#define XFS_ISTALE (1 << 1) /* inode has been staled */
378#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 368#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
379#define XFS_INEW 0x0008 /* inode has just been allocated */ 369#define XFS_INEW (1 << 3) /* inode has just been allocated */
380#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 370#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
381#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 371#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
382#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ 372#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
373#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
374#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
375#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
376#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
383 377
384/* 378/*
385 * Per-lifetime flags need to be reset when re-using a reclaimable inode during 379 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
392 XFS_IFILESTREAM); 386 XFS_IFILESTREAM);
393 387
394/* 388/*
389 * Synchronize processes attempting to flush the in-core inode back to disk.
390 */
391
392extern void __xfs_iflock(struct xfs_inode *ip);
393
394static inline int xfs_iflock_nowait(struct xfs_inode *ip)
395{
396 return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
397}
398
399static inline void xfs_iflock(struct xfs_inode *ip)
400{
401 if (!xfs_iflock_nowait(ip))
402 __xfs_iflock(ip);
403}
404
405static inline void xfs_ifunlock(struct xfs_inode *ip)
406{
407 xfs_iflags_clear(ip, XFS_IFLOCK);
408 wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
409}
410
411static inline int xfs_isiflocked(struct xfs_inode *ip)
412{
413 return xfs_iflags_test(ip, XFS_IFLOCK);
414}
415
416/*
395 * Flags for inode locking. 417 * Flags for inode locking.
396 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) 418 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
397 * 1<<16 - 1<<32-1 -- lockdep annotation (integers) 419 * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
491 struct xfs_bmap_free *); 513 struct xfs_bmap_free *);
492int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 514int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
493 int, xfs_fsize_t); 515 int, xfs_fsize_t);
494int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
495 xfs_fsize_t);
496int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
497 517
498void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3c..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
79 break; 79 break;
80 80
81 case XFS_DINODE_FMT_BTREE: 81 case XFS_DINODE_FMT_BTREE:
82 ASSERT(ip->i_df.if_ext_max ==
83 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
84 iip->ili_format.ilf_fields &= 82 iip->ili_format.ilf_fields &=
85 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 83 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
86 XFS_ILOG_DEV | XFS_ILOG_UUID); 84 XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
557 trace_xfs_inode_unpin(ip, _RET_IP_); 555 trace_xfs_inode_unpin(ip, _RET_IP_);
558 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
559 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
560 wake_up(&ip->i_ipin_wait); 558 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
561} 559}
562 560
563/* 561/*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
719 * If a flush is not in progress anymore, chances are that the 717 * If a flush is not in progress anymore, chances are that the
720 * inode was taken off the AIL. So, just get out. 718 * inode was taken off the AIL. So, just get out.
721 */ 719 */
722 if (completion_done(&ip->i_flush) || 720 if (!xfs_isiflocked(ip) ||
723 !(lip->li_flags & XFS_LI_IN_AIL)) { 721 !(lip->li_flags & XFS_LI_IN_AIL)) {
724 xfs_iunlock(ip, XFS_ILOCK_SHARED); 722 xfs_iunlock(ip, XFS_ILOCK_SHARED);
725 return true; 723 return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
752 struct xfs_inode *ip = iip->ili_inode; 750 struct xfs_inode *ip = iip->ili_inode;
753 751
754 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
755 ASSERT(!completion_done(&ip->i_flush)); 753 ASSERT(xfs_isiflocked(ip));
756 754
757 /* 755 /*
758 * Since we were able to lock the inode's flush lock and 756 * Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
57 xfs_fileoff_t *last_fsb) 57 xfs_fileoff_t *last_fsb)
58{ 58{
59 xfs_fileoff_t new_last_fsb = 0; 59 xfs_fileoff_t new_last_fsb = 0;
60 xfs_extlen_t align; 60 xfs_extlen_t align = 0;
61 int eof, error; 61 int eof, error;
62 62
63 if (XFS_IS_REALTIME_INODE(ip)) 63 if (!XFS_IS_REALTIME_INODE(ip)) {
64 ; 64 /*
65 /* 65 * Round up the allocation request to a stripe unit
66 * If mounted with the "-o swalloc" option, roundup the allocation 66 * (m_dalign) boundary if the file size is >= stripe unit
67 * request to a stripe width boundary if the file size is >= 67 * size, and we are allocating past the allocation eof.
68 * stripe width and we are allocating past the allocation eof. 68 *
69 */ 69 * If mounted with the "-o swalloc" option the alignment is
 70 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 70 * increased from the stripe unit size to the stripe width.
71 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) 71 */
72 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 72 if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
73 /* 73 align = mp->m_swidth;
74 * Roundup the allocation request to a stripe unit (m_dalign) boundary 74 else if (mp->m_dalign)
75 * if the file size is >= stripe unit size, and we are allocating past 75 align = mp->m_dalign;
76 * the allocation eof. 76
77 */ 77 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
78 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) 78 new_last_fsb = roundup_64(*last_fsb, align);
79 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 79 }
80 80
81 /* 81 /*
82 * Always round up the allocation request to an extent boundary 82 * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
154 154
155 offset_fsb = XFS_B_TO_FSBT(mp, offset); 155 offset_fsb = XFS_B_TO_FSBT(mp, offset);
156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
157 if ((offset + count) > ip->i_size) { 157 if ((offset + count) > XFS_ISIZE(ip)) {
158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
159 if (error) 159 if (error)
160 goto error_out; 160 goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
211 xfs_trans_ijoin(tp, ip, 0); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = 0; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < XFS_ISIZE(ip) || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
286 int found_delalloc = 0; 286 int found_delalloc = 0;
287 287
288 *prealloc = 0; 288 *prealloc = 0;
289 if ((offset + count) <= ip->i_size) 289 if (offset + count <= XFS_ISIZE(ip))
290 return 0; 290 return 0;
291 291
292 /* 292 /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to 340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
341 * ensure we always pass in a non-zero value. 341 * ensure we always pass in a non-zero value.
342 */ 342 */
343 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; 343 alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
345 rounddown_pow_of_two(alloc_blocks)); 345 rounddown_pow_of_two(alloc_blocks));
346 346
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
564 * back.... 564 * back....
565 */ 565 */
566 nimaps = 1; 566 nimaps = 1;
567 end_fsb = XFS_B_TO_FSB(mp, ip->i_size); 567 end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
568 error = xfs_bmap_last_offset(NULL, ip, &last_block, 568 error = xfs_bmap_last_offset(NULL, ip, &last_block,
569 XFS_DATA_FORK); 569 XFS_DATA_FORK);
570 if (error) 570 if (error)
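Throughout the xfs_iomap.c hunks above, open-coded reads of ip->i_size are replaced by XFS_ISIZE(ip). The helper's definition is not part of this diff; a minimal sketch, assuming (as the VFS_I(ip)->i_size conversions elsewhere in this series suggest) that regular files are sized by the VFS inode while other inodes fall back to the on-disk size:

    /*
     * Sketch, not a quotation: the in-core file size used by the I/O
     * paths.  Regular files take the VFS inode size; everything else
     * falls back to the on-disk di_size.
     */
    static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
    {
            if (S_ISREG(ip->i_d.di_mode))
                    return i_size_read(VFS_I(ip));
            return ip->i_d.di_size;
    }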
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd179223..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
750 struct xfs_mount *mp = ip->i_mount; 750 struct xfs_mount *mp = ip->i_mount;
751 struct inode *inode = VFS_I(ip); 751 struct inode *inode = VFS_I(ip);
752 int mask = iattr->ia_valid; 752 int mask = iattr->ia_valid;
753 xfs_off_t oldsize, newsize;
753 struct xfs_trans *tp; 754 struct xfs_trans *tp;
754 int error; 755 int error;
755 uint lock_flags; 756 uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
777 lock_flags |= XFS_IOLOCK_EXCL; 778 lock_flags |= XFS_IOLOCK_EXCL;
778 xfs_ilock(ip, lock_flags); 779 xfs_ilock(ip, lock_flags);
779 780
781 oldsize = inode->i_size;
782 newsize = iattr->ia_size;
783
780 /* 784 /*
781 * Short circuit the truncate case for zero length files. 785 * Short circuit the truncate case for zero length files.
782 */ 786 */
783 if (iattr->ia_size == 0 && 787 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
784 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
785 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 788 if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
786 goto out_unlock; 789 goto out_unlock;
787 790
@@ -807,14 +810,14 @@ xfs_setattr_size(
807 * the inode to the transaction, because the inode cannot be unlocked 810 * the inode to the transaction, because the inode cannot be unlocked
808 * once it is a part of the transaction. 811 * once it is a part of the transaction.
809 */ 812 */
810 if (iattr->ia_size > ip->i_size) { 813 if (newsize > oldsize) {
811 /* 814 /*
812 * Do the first part of growing a file: zero any data in the 815 * Do the first part of growing a file: zero any data in the
813 * last block that is beyond the old EOF. We need to do this 816 * last block that is beyond the old EOF. We need to do this
814 * before the inode is joined to the transaction to modify 817 * before the inode is joined to the transaction to modify
815 * i_size. 818 * i_size.
816 */ 819 */
817 error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); 820 error = xfs_zero_eof(ip, newsize, oldsize);
818 if (error) 821 if (error)
819 goto out_unlock; 822 goto out_unlock;
820 } 823 }
@@ -833,8 +836,8 @@ xfs_setattr_size(
833 * here and prevents waiting for other data not within the range we 836 * here and prevents waiting for other data not within the range we
834 * care about here. 837 * care about here.
835 */ 838 */
836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 839 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, 840 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
838 FI_NONE); 841 FI_NONE);
839 if (error) 842 if (error)
840 goto out_unlock; 843 goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
845 */ 848 */
846 inode_dio_wait(inode); 849 inode_dio_wait(inode);
847 850
848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 851 error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
849 xfs_get_blocks);
850 if (error) 852 if (error)
851 goto out_unlock; 853 goto out_unlock;
852 854
@@ -857,7 +859,7 @@ xfs_setattr_size(
857 if (error) 859 if (error)
858 goto out_trans_cancel; 860 goto out_trans_cancel;
859 861
860 truncate_setsize(inode, iattr->ia_size); 862 truncate_setsize(inode, newsize);
861 863
862 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 864 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
863 lock_flags |= XFS_ILOCK_EXCL; 865 lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
876 * these flags set. For all other operations the VFS set these flags 878 * these flags set. For all other operations the VFS set these flags
877 * explicitly if it wants a timestamp update. 879 * explicitly if it wants a timestamp update.
878 */ 880 */
879 if (iattr->ia_size != ip->i_size && 881 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
880 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
881 iattr->ia_ctime = iattr->ia_mtime = 882 iattr->ia_ctime = iattr->ia_mtime =
882 current_fs_time(inode->i_sb); 883 current_fs_time(inode->i_sb);
883 mask |= ATTR_CTIME | ATTR_MTIME; 884 mask |= ATTR_CTIME | ATTR_MTIME;
884 } 885 }
885 886
886 if (iattr->ia_size > ip->i_size) { 887 /*
887 ip->i_d.di_size = iattr->ia_size; 888 * The first thing we do is set the size to new_size permanently on
888 ip->i_size = iattr->ia_size; 889 * disk. This way we don't have to worry about anyone ever being able
889 } else if (iattr->ia_size <= ip->i_size || 890 * to look at the data being freed even in the face of a crash.
890 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 891 * What we're getting around here is the case where we free a block, it
891 error = xfs_itruncate_data(&tp, ip, iattr->ia_size); 892 * is allocated to another file, it is written to, and then we crash.
893 * If the new data gets written to the file but the log buffers
894 * containing the free and reallocation don't, then we'd end up with
895 * garbage in the blocks being freed. As long as we make the new size
896 * permanent before actually freeing any blocks it doesn't matter if
897 * they get written to.
898 */
899 ip->i_d.di_size = newsize;
900 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
901
902 if (newsize <= oldsize) {
903 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
892 if (error) 904 if (error)
893 goto out_trans_abort; 905 goto out_trans_abort;
894 906
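The xfs_setattr_size() hunk above, and the xfs_qm_scall_trunc_qfile() and xfs_inactive() hunks that follow, all replace xfs_itruncate_data() with the same open-coded sequence: record the new size in the on-disk inode and log it, then free any extents beyond it. Condensed from the hunks themselves (transaction setup and error handling elided):

    /* Make the new size permanent in the log first ... */
    ip->i_d.di_size = newsize;
    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

    /* ... so freed blocks can never show up as stale file contents. */
    error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);

This ordering is exactly what the new comment block in xfs_setattr_size() explains: as long as the size update reaches the log before the frees do, a block that is freed, reallocated and rewritten can never appear inside this file after a crash.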
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..eafbcff81f3a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
31#include "xfs_mount.h" 31#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_itable.h" 35#include "xfs_itable.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 264 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip, 0); 265 xfs_trans_ijoin(tp, ip, 0);
265 266
266 error = xfs_itruncate_data(&tp, ip, 0); 267 ip->i_d.di_size = 0;
268 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
269
270 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
267 if (error) { 271 if (error) {
268 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 272 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
269 XFS_TRANS_ABORT); 273 XFS_TRANS_ABORT);
270 goto out_unlock; 274 goto out_unlock;
271 } 275 }
272 276
277 ASSERT(ip->i_d.di_nextents == 0);
278
273 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
274 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
275 281
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81a..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
828 /* xfs inode */ 828 /* xfs inode */
829 atomic_set(&ip->i_pincount, 0); 829 atomic_set(&ip->i_pincount, 0);
830 spin_lock_init(&ip->i_flags_lock); 830 spin_lock_init(&ip->i_flags_lock);
831 init_waitqueue_head(&ip->i_ipin_wait);
832 /*
833 * Because we want to use a counting completion, complete
834 * the flush completion once to allow a single access to
835 * the flush completion without blocking.
836 */
837 init_completion(&ip->i_flush);
838 complete(&ip->i_flush);
839 831
840 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 832 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
841 "xfsino", ip->i_ino); 833 "xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e7..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
707 return 1; 707 return 1;
708 708
709 /* 709 /*
710 * do some unlocked checks first to avoid unnecessary lock traffic. 710 * If we are asked for non-blocking operation, do unlocked checks to
711 * The first is a flush lock check, the second is a already in reclaim 711 * see if the inode already is being flushed or in reclaim to avoid
712 * check. Only do these checks if we are not going to block on locks. 712 * lock traffic.
713 */ 713 */
714 if ((flags & SYNC_TRYLOCK) && 714 if ((flags & SYNC_TRYLOCK) &&
715 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 715 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
716 return 1; 716 return 1;
717 }
718 717
719 /* 718 /*
720 * The radix tree lock here protects a thread in xfs_iget from racing 719 * The radix tree lock here protects a thread in xfs_iget from racing
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06efe..6b6df5802e95 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
891 __field(dev_t, dev) 891 __field(dev_t, dev)
892 __field(xfs_ino_t, ino) 892 __field(xfs_ino_t, ino)
893 __field(xfs_fsize_t, size) 893 __field(xfs_fsize_t, size)
894 __field(xfs_fsize_t, new_size)
895 __field(loff_t, offset) 894 __field(loff_t, offset)
896 __field(size_t, count) 895 __field(size_t, count)
897 __field(int, flags) 896 __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
900 __entry->dev = VFS_I(ip)->i_sb->s_dev; 899 __entry->dev = VFS_I(ip)->i_sb->s_dev;
901 __entry->ino = ip->i_ino; 900 __entry->ino = ip->i_ino;
902 __entry->size = ip->i_d.di_size; 901 __entry->size = ip->i_d.di_size;
903 __entry->new_size = ip->i_new_size;
904 __entry->offset = offset; 902 __entry->offset = offset;
905 __entry->count = count; 903 __entry->count = count;
906 __entry->flags = flags; 904 __entry->flags = flags;
907 ), 905 ),
908 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 906 TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
909 "offset 0x%llx count 0x%zx ioflags %s", 907 "offset 0x%llx count 0x%zx ioflags %s",
910 MAJOR(__entry->dev), MINOR(__entry->dev), 908 MAJOR(__entry->dev), MINOR(__entry->dev),
911 __entry->ino, 909 __entry->ino,
912 __entry->size, 910 __entry->size,
913 __entry->new_size,
914 __entry->offset, 911 __entry->offset,
915 __entry->count, 912 __entry->count,
916 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) 913 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
978 __field(dev_t, dev) 975 __field(dev_t, dev)
979 __field(xfs_ino_t, ino) 976 __field(xfs_ino_t, ino)
980 __field(loff_t, size) 977 __field(loff_t, size)
981 __field(loff_t, new_size)
982 __field(loff_t, offset) 978 __field(loff_t, offset)
983 __field(size_t, count) 979 __field(size_t, count)
984 __field(int, type) 980 __field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
990 __entry->dev = VFS_I(ip)->i_sb->s_dev; 986 __entry->dev = VFS_I(ip)->i_sb->s_dev;
991 __entry->ino = ip->i_ino; 987 __entry->ino = ip->i_ino;
992 __entry->size = ip->i_d.di_size; 988 __entry->size = ip->i_d.di_size;
993 __entry->new_size = ip->i_new_size;
994 __entry->offset = offset; 989 __entry->offset = offset;
995 __entry->count = count; 990 __entry->count = count;
996 __entry->type = type; 991 __entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
998 __entry->startblock = irec ? irec->br_startblock : 0; 993 __entry->startblock = irec ? irec->br_startblock : 0;
999 __entry->blockcount = irec ? irec->br_blockcount : 0; 994 __entry->blockcount = irec ? irec->br_blockcount : 0;
1000 ), 995 ),
1001 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 996 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
1002 "offset 0x%llx count %zd type %s " 997 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
1003 "startoff 0x%llx startblock %lld blockcount 0x%llx",
1004 MAJOR(__entry->dev), MINOR(__entry->dev), 998 MAJOR(__entry->dev), MINOR(__entry->dev),
1005 __entry->ino, 999 __entry->ino,
1006 __entry->size, 1000 __entry->size,
1007 __entry->new_size,
1008 __entry->offset, 1001 __entry->offset,
1009 __entry->count, 1002 __entry->count,
1010 __print_symbolic(__entry->type, XFS_IO_TYPES), 1003 __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
1031 __field(xfs_ino_t, ino) 1024 __field(xfs_ino_t, ino)
1032 __field(loff_t, isize) 1025 __field(loff_t, isize)
1033 __field(loff_t, disize) 1026 __field(loff_t, disize)
1034 __field(loff_t, new_size)
1035 __field(loff_t, offset) 1027 __field(loff_t, offset)
1036 __field(size_t, count) 1028 __field(size_t, count)
1037 ), 1029 ),
1038 TP_fast_assign( 1030 TP_fast_assign(
1039 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1031 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1040 __entry->ino = ip->i_ino; 1032 __entry->ino = ip->i_ino;
1041 __entry->isize = ip->i_size; 1033 __entry->isize = VFS_I(ip)->i_size;
1042 __entry->disize = ip->i_d.di_size; 1034 __entry->disize = ip->i_d.di_size;
1043 __entry->new_size = ip->i_new_size;
1044 __entry->offset = offset; 1035 __entry->offset = offset;
1045 __entry->count = count; 1036 __entry->count = count;
1046 ), 1037 ),
1047 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " 1038 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
1048 "offset 0x%llx count %zd", 1039 "offset 0x%llx count %zd",
1049 MAJOR(__entry->dev), MINOR(__entry->dev), 1040 MAJOR(__entry->dev), MINOR(__entry->dev),
1050 __entry->ino, 1041 __entry->ino,
1051 __entry->isize, 1042 __entry->isize,
1052 __entry->disize, 1043 __entry->disize,
1053 __entry->new_size,
1054 __entry->offset, 1044 __entry->offset,
1055 __entry->count) 1045 __entry->count)
1056); 1046);
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
1090DEFINE_EVENT(xfs_itrunc_class, name, \ 1080DEFINE_EVENT(xfs_itrunc_class, name, \
1091 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ 1081 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
1092 TP_ARGS(ip, new_size)) 1082 TP_ARGS(ip, new_size))
1093DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); 1083DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
1094DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); 1084DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
1095 1085
1096TRACE_EVENT(xfs_pagecache_inval, 1086TRACE_EVENT(xfs_pagecache_inval,
1097 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), 1087 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1568 __field(xfs_ino_t, ino) 1558 __field(xfs_ino_t, ino)
1569 __field(int, format) 1559 __field(int, format)
1570 __field(int, nex) 1560 __field(int, nex)
1571 __field(int, max_nex)
1572 __field(int, broot_size) 1561 __field(int, broot_size)
1573 __field(int, fork_off) 1562 __field(int, fork_off)
1574 ), 1563 ),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1578 __entry->ino = ip->i_ino; 1567 __entry->ino = ip->i_ino;
1579 __entry->format = ip->i_d.di_format; 1568 __entry->format = ip->i_d.di_format;
1580 __entry->nex = ip->i_d.di_nextents; 1569 __entry->nex = ip->i_d.di_nextents;
1581 __entry->max_nex = ip->i_df.if_ext_max;
1582 __entry->broot_size = ip->i_df.if_broot_bytes; 1570 __entry->broot_size = ip->i_df.if_broot_bytes;
1583 __entry->fork_off = XFS_IFORK_BOFF(ip); 1571 __entry->fork_off = XFS_IFORK_BOFF(ip);
1584 ), 1572 ),
1585 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " 1573 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1586 "Max in-fork extents %d, broot size %d, fork offset %d", 1574 "broot size %d, fork offset %d",
1587 MAJOR(__entry->dev), MINOR(__entry->dev), 1575 MAJOR(__entry->dev), MINOR(__entry->dev),
1588 __entry->ino, 1576 __entry->ino,
1589 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), 1577 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1590 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), 1578 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1591 __entry->nex, 1579 __entry->nex,
1592 __entry->max_nex,
1593 __entry->broot_size, 1580 __entry->broot_size,
1594 __entry->fork_off) 1581 __entry->fork_off)
1595) 1582)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4db..0cf52da9d246 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -175,7 +175,7 @@ xfs_free_eofblocks(
175 * Figure out if there are any blocks beyond the end 175 * Figure out if there are any blocks beyond the end
176 * of the file. If not, then there is nothing to do. 176 * of the file. If not, then there is nothing to do.
177 */ 177 */
178 end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); 178 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
179 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 179 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
180 if (last_fsb <= end_fsb) 180 if (last_fsb <= end_fsb)
181 return 0; 181 return 0;
@@ -226,7 +226,14 @@ xfs_free_eofblocks(
226 xfs_ilock(ip, XFS_ILOCK_EXCL); 226 xfs_ilock(ip, XFS_ILOCK_EXCL);
227 xfs_trans_ijoin(tp, ip, 0); 227 xfs_trans_ijoin(tp, ip, 0);
228 228
229 error = xfs_itruncate_data(&tp, ip, ip->i_size); 229 /*
230 * Do not update the on-disk file size. If we update the
231 * on-disk file size and then the system crashes before the
232 * contents of the file are flushed to disk then the files
233 * may be full of holes (ie NULL files bug).
234 */
235 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
236 XFS_ISIZE(ip));
230 if (error) { 237 if (error) {
231 /* 238 /*
232 * If we get an error at this point we simply don't 239 * If we get an error at this point we simply don't
@@ -540,8 +547,8 @@ xfs_release(
540 return 0; 547 return 0;
541 548
542 if ((S_ISREG(ip->i_d.di_mode) && 549 if ((S_ISREG(ip->i_d.di_mode) &&
543 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || 550 (VFS_I(ip)->i_size > 0 ||
544 ip->i_delayed_blks > 0)) && 551 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
545 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 552 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
546 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { 553 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
547 554
@@ -618,7 +625,7 @@ xfs_inactive(
618 * only one with a reference to the inode. 625 * only one with a reference to the inode.
619 */ 626 */
620 truncate = ((ip->i_d.di_nlink == 0) && 627 truncate = ((ip->i_d.di_nlink == 0) &&
621 ((ip->i_d.di_size != 0) || (ip->i_size != 0) || 628 ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
622 (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && 629 (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
623 S_ISREG(ip->i_d.di_mode)); 630 S_ISREG(ip->i_d.di_mode));
624 631
@@ -632,12 +639,12 @@ xfs_inactive(
632 639
633 if (ip->i_d.di_nlink != 0) { 640 if (ip->i_d.di_nlink != 0) {
634 if ((S_ISREG(ip->i_d.di_mode) && 641 if ((S_ISREG(ip->i_d.di_mode) &&
635 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || 642 (VFS_I(ip)->i_size > 0 ||
636 ip->i_delayed_blks > 0)) && 643 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
637 (ip->i_df.if_flags & XFS_IFEXTENTS) && 644 (ip->i_df.if_flags & XFS_IFEXTENTS) &&
638 (!(ip->i_d.di_flags & 645 (!(ip->i_d.di_flags &
639 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 646 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
640 (ip->i_delayed_blks != 0)))) { 647 ip->i_delayed_blks != 0))) {
641 error = xfs_free_eofblocks(mp, ip, 0); 648 error = xfs_free_eofblocks(mp, ip, 0);
642 if (error) 649 if (error)
643 return VN_INACTIVE_CACHE; 650 return VN_INACTIVE_CACHE;
@@ -670,13 +677,18 @@ xfs_inactive(
670 xfs_ilock(ip, XFS_ILOCK_EXCL); 677 xfs_ilock(ip, XFS_ILOCK_EXCL);
671 xfs_trans_ijoin(tp, ip, 0); 678 xfs_trans_ijoin(tp, ip, 0);
672 679
673 error = xfs_itruncate_data(&tp, ip, 0); 680 ip->i_d.di_size = 0;
681 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
682
683 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
674 if (error) { 684 if (error) {
675 xfs_trans_cancel(tp, 685 xfs_trans_cancel(tp,
676 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 686 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
677 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 687 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
678 return VN_INACTIVE_CACHE; 688 return VN_INACTIVE_CACHE;
679 } 689 }
690
691 ASSERT(ip->i_d.di_nextents == 0);
680 } else if (S_ISLNK(ip->i_d.di_mode)) { 692 } else if (S_ISLNK(ip->i_d.di_mode)) {
681 693
682 /* 694 /*
@@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes(
1961 * since nothing can read beyond eof. The space will 1973 * since nothing can read beyond eof. The space will
1962 * be zeroed when the file is extended anyway. 1974 * be zeroed when the file is extended anyway.
1963 */ 1975 */
1964 if (startoff >= ip->i_size) 1976 if (startoff >= XFS_ISIZE(ip))
1965 return 0; 1977 return 0;
1966 1978
1967 if (endoff > ip->i_size) 1979 if (endoff > XFS_ISIZE(ip))
1968 endoff = ip->i_size; 1980 endoff = XFS_ISIZE(ip);
1969 1981
1970 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? 1982 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1971 mp->m_rtdev_targp : mp->m_ddev_targp, 1983 mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2272,7 @@ xfs_change_file_space(
2260 bf->l_start += offset; 2272 bf->l_start += offset;
2261 break; 2273 break;
2262 case 2: /*SEEK_END*/ 2274 case 2: /*SEEK_END*/
2263 bf->l_start += ip->i_size; 2275 bf->l_start += XFS_ISIZE(ip);
2264 break; 2276 break;
2265 default: 2277 default:
2266 return XFS_ERROR(EINVAL); 2278 return XFS_ERROR(EINVAL);
@@ -2277,7 +2289,7 @@ xfs_change_file_space(
2277 bf->l_whence = 0; 2289 bf->l_whence = 0;
2278 2290
2279 startoffset = bf->l_start; 2291 startoffset = bf->l_start;
2280 fsize = ip->i_size; 2292 fsize = XFS_ISIZE(ip);
2281 2293
2282 /* 2294 /*
2283 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve 2295 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve