aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-03-21 21:12:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-03-21 21:12:42 -0400
commit 968f3e374faf41e5e6049399eb7302777a09a1e8 (patch)
tree 613c5aa9a005cfbe3fada77fcb0ab24deda126d9
parent e531cdf50a8a0fb7a4d51c06e52097bd01e9bf7c (diff)
parent 389f239c53420802ad5085e51e88c37e2df5e003 (diff)
Merge branch 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason: "We have a good sized cleanup of our internal read ahead code, and the first series of commits from Chandan to enable PAGE_SIZE > sectorsize. Otherwise, it's a normal series of cleanups and fixes, with many thanks to Dave Sterba for doing most of the patch wrangling this time" * 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (82 commits) btrfs: make sure we stay inside the bvec during __btrfs_lookup_bio_sums btrfs: Fix misspellings in comments. btrfs: Print Warning only if ENOSPC_DEBUG is enabled btrfs: scrub: silence an uninitialized variable warning btrfs: move btrfs_compression_type to compression.h btrfs: rename btrfs_print_info to btrfs_print_mod_info Btrfs: Show a warning message if one of objectid reaches its highest value Documentation: btrfs: remove usage specific information btrfs: use kbasename in btrfsic_mount Btrfs: do not collect ordered extents when logging that inode exists Btrfs: fix race when checking if we can skip fsync'ing an inode Btrfs: fix listxattrs not listing all xattrs packed in the same item Btrfs: fix deadlock between direct IO reads and buffered writes Btrfs: fix extent_same allowing destination offset beyond i_size Btrfs: fix file loss on log replay after renaming a file and fsync Btrfs: fix unreplayable log after snapshot delete + parent dir fsync Btrfs: fix lockdep deadlock warning due to dev_replace btrfs: drop unused argument in btrfs_ioctl_get_supported_features btrfs: add GET_SUPPORTED_FEATURES to the control device ioctls btrfs: change max_inline default to 2048 ...
-rw-r--r-- Documentation/filesystems/btrfs.txt 261
-rw-r--r-- fs/btrfs/backref.c 12
-rw-r--r-- fs/btrfs/check-integrity.c 12
-rw-r--r-- fs/btrfs/compression.h 9
-rw-r--r-- fs/btrfs/ctree.c 36
-rw-r--r-- fs/btrfs/ctree.h 87
-rw-r--r-- fs/btrfs/delayed-inode.c 10
-rw-r--r-- fs/btrfs/delayed-ref.c 12
-rw-r--r-- fs/btrfs/dev-replace.c 134
-rw-r--r-- fs/btrfs/dev-replace.h 7
-rw-r--r-- fs/btrfs/disk-io.c 71
-rw-r--r-- fs/btrfs/extent-tree.c 40
-rw-r--r-- fs/btrfs/extent_io.c 40
-rw-r--r-- fs/btrfs/extent_io.h 5
-rw-r--r-- fs/btrfs/extent_map.c 8
-rw-r--r-- fs/btrfs/file-item.c 103
-rw-r--r-- fs/btrfs/file.c 158
-rw-r--r-- fs/btrfs/inode-map.c 3
-rw-r--r-- fs/btrfs/inode.c 326
-rw-r--r-- fs/btrfs/ioctl.c 35
-rw-r--r-- fs/btrfs/ordered-data.c 6
-rw-r--r-- fs/btrfs/print-tree.c 23
-rw-r--r-- fs/btrfs/props.c 1
-rw-r--r-- fs/btrfs/reada.c 268
-rw-r--r-- fs/btrfs/root-tree.c 2
-rw-r--r-- fs/btrfs/scrub.c 32
-rw-r--r-- fs/btrfs/send.c 37
-rw-r--r-- fs/btrfs/super.c 52
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c 6
-rw-r--r-- fs/btrfs/tests/free-space-tree-tests.c 1
-rw-r--r-- fs/btrfs/tests/inode-tests.c 1
-rw-r--r-- fs/btrfs/transaction.c 13
-rw-r--r-- fs/btrfs/tree-log.c 102
-rw-r--r-- fs/btrfs/tree-log.h 2
-rw-r--r-- fs/btrfs/volumes.c 51
-rw-r--r-- fs/btrfs/xattr.c 67
36 files changed, 1102 insertions, 931 deletions
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index c772b47e7ef0..f9dad22d95ce 100644
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -1,20 +1,10 @@
1
2BTRFS 1BTRFS
3===== 2=====
4 3
5Btrfs is a copy on write filesystem for Linux aimed at 4Btrfs is a copy on write filesystem for Linux aimed at implementing advanced
6implementing advanced features while focusing on fault tolerance, 5features while focusing on fault tolerance, repair and easy administration.
7repair and easy administration. Initially developed by Oracle, Btrfs 6Jointly developed by several companies, licensed under the GPL and open for
8is licensed under the GPL and open for contribution from anyone. 7contribution from anyone.
9
10Linux has a wealth of filesystems to choose from, but we are facing a
11number of challenges with scaling to the large storage subsystems that
12are becoming common in today's data centers. Filesystems need to scale
13in their ability to address and manage large storage, and also in
14their ability to detect, repair and tolerate errors in the data stored
15on disk. Btrfs is under heavy development, and is not suitable for
16any uses other than benchmarking and review. The Btrfs disk format is
17not yet finalized.
18 8
19The main Btrfs features include: 9The main Btrfs features include:
20 10
@@ -28,243 +18,14 @@ The main Btrfs features include:
28 * Checksums on data and metadata (multiple algorithms available) 18 * Checksums on data and metadata (multiple algorithms available)
29 * Compression 19 * Compression
30 * Integrated multiple device support, with several raid algorithms 20 * Integrated multiple device support, with several raid algorithms
31 * Online filesystem check (not yet implemented) 21 * Offline filesystem check
32 * Very fast offline filesystem check 22 * Efficient incremental backup and FS mirroring
33 * Efficient incremental backup and FS mirroring (not yet implemented)
34 * Online filesystem defragmentation 23 * Online filesystem defragmentation
35 24
25For more information please refer to the wiki
36 26
37Mount Options 27 https://btrfs.wiki.kernel.org
38=============
39
40When mounting a btrfs filesystem, the following option are accepted.
41Options with (*) are default options and will not show in the mount options.
42
43 alloc_start=<bytes>
44 Debugging option to force all block allocations above a certain
45 byte threshold on each block device. The value is specified in
46 bytes, optionally with a K, M, or G suffix, case insensitive.
47 Default is 1MB.
48
49 noautodefrag(*)
50 autodefrag
51 Disable/enable auto defragmentation.
52 Auto defragmentation detects small random writes into files and queue
53 them up for the defrag process. Works best for small files;
54 Not well suited for large database workloads.
55
56 check_int
57 check_int_data
58 check_int_print_mask=<value>
59 These debugging options control the behavior of the integrity checking
60 module (the BTRFS_FS_CHECK_INTEGRITY config option required).
61
62 check_int enables the integrity checker module, which examines all
63 block write requests to ensure on-disk consistency, at a large
64 memory and CPU cost.
65
66 check_int_data includes extent data in the integrity checks, and
67 implies the check_int option.
68
69 check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values
70 as defined in fs/btrfs/check-integrity.c, to control the integrity
71 checker module behavior.
72
73 See comments at the top of fs/btrfs/check-integrity.c for more info.
74
75 commit=<seconds>
76 Set the interval of periodic commit, 30 seconds by default. Higher
77 values defer data being synced to permanent storage with obvious
78 consequences when the system crashes. The upper bound is not forced,
79 but a warning is printed if it's more than 300 seconds (5 minutes).
80
81 compress
82 compress=<type>
83 compress-force
84 compress-force=<type>
85 Control BTRFS file data compression. Type may be specified as "zlib"
86 "lzo" or "no" (for no compression, used for remounting). If no type
87 is specified, zlib is used. If compress-force is specified,
88 all files will be compressed, whether or not they compress well.
89 If compression is enabled, nodatacow and nodatasum are disabled.
90
91 degraded
92 Allow mounts to continue with missing devices. A read-write mount may
93 fail with too many devices missing, for example if a stripe member
94 is completely missing.
95
96 device=<devicepath>
97 Specify a device during mount so that ioctls on the control device
98 can be avoided. Especially useful when trying to mount a multi-device
99 setup as root. May be specified multiple times for multiple devices.
100
101 nodiscard(*)
102 discard
103 Disable/enable discard mount option.
104 Discard issues frequent commands to let the block device reclaim space
105 freed by the filesystem.
106 This is useful for SSD devices, thinly provisioned
107 LUNs and virtual machine images, but may have a significant
108 performance impact. (The fstrim command is also available to
109 initiate batch trims from userspace).
110
111 noenospc_debug(*)
112 enospc_debug
113 Disable/enable debugging option to be more verbose in some ENOSPC conditions.
114
115 fatal_errors=<action>
116 Action to take when encountering a fatal error:
117 "bug" - BUG() on a fatal error. This is the default.
118 "panic" - panic() on a fatal error.
119
120 noflushoncommit(*)
121 flushoncommit
122 The 'flushoncommit' mount option forces any data dirtied by a write in a
123 prior transaction to commit as part of the current commit. This makes
124 the committed state a fully consistent view of the file system from the
125 application's perspective (i.e., it includes all completed file system
126 operations). This was previously the behavior only when a snapshot is
127 created.
128
129 inode_cache
130 Enable free inode number caching. Defaults to off due to an overflow
131 problem when the free space crcs don't fit inside a single page.
132
133 max_inline=<bytes>
134 Specify the maximum amount of space, in bytes, that can be inlined in
135 a metadata B-tree leaf. The value is specified in bytes, optionally
136 with a K, M, or G suffix, case insensitive. In practice, this value
137 is limited by the root sector size, with some space unavailable due
138 to leaf headers. For a 4k sector size, max inline data is ~3900 bytes.
139
140 metadata_ratio=<value>
141 Specify that 1 metadata chunk should be allocated after every <value>
142 data chunks. Off by default.
143
144 acl(*)
145 noacl
146 Enable/disable support for Posix Access Control Lists (ACLs). See the
147 acl(5) manual page for more information about ACLs.
148
149 barrier(*)
150 nobarrier
151 Enable/disable the use of block layer write barriers. Write barriers
152 ensure that certain IOs make it through the device cache and are on
153 persistent storage. If disabled on a device with a volatile
154 (non-battery-backed) write-back cache, nobarrier option will lead to
155 filesystem corruption on a system crash or power loss.
156
157 datacow(*)
158 nodatacow
159 Enable/disable data copy-on-write for newly created files.
160 Nodatacow implies nodatasum, and disables all compression.
161
162 datasum(*)
163 nodatasum
164 Enable/disable data checksumming for newly created files.
165 Datasum implies datacow.
166
167 treelog(*)
168 notreelog
169 Enable/disable the tree logging used for fsync and O_SYNC writes.
170
171 recovery
172 Enable autorecovery attempts if a bad tree root is found at mount time.
173 Currently this scans a list of several previous tree roots and tries to
174 use the first readable.
175
176 rescan_uuid_tree
177 Force check and rebuild procedure of the UUID tree. This should not
178 normally be needed.
179
180 skip_balance
181 Skip automatic resume of interrupted balance operation after mount.
182 May be resumed with "btrfs balance resume."
183
184 space_cache (*)
185 Enable the on-disk freespace cache.
186 nospace_cache
187 Disable freespace cache loading without clearing the cache.
188 clear_cache
189 Force clearing and rebuilding of the disk space cache if something
190 has gone wrong.
191
192 ssd
193 nossd
194 ssd_spread
195 Options to control ssd allocation schemes. By default, BTRFS will
196 enable or disable ssd allocation heuristics depending on whether a
197 rotational or non-rotational disk is in use. The ssd and nossd options
198 can override this autodetection.
199
200 The ssd_spread mount option attempts to allocate into big chunks
201 of unused space, and may perform better on low-end ssds. ssd_spread
202 implies ssd, enabling all other ssd heuristics as well.
203
204 subvol=<path>
205 Mount subvolume at <path> rather than the root subvolume. <path> is
206 relative to the top level subvolume.
207
208 subvolid=<ID>
209 Mount subvolume specified by an ID number rather than the root subvolume.
210 This allows mounting of subvolumes which are not in the root of the mounted
211 filesystem.
212 You can use "btrfs subvolume list" to see subvolume ID numbers.
213
214 subvolrootid=<objectid> (deprecated)
215 Mount subvolume specified by <objectid> rather than the root subvolume.
216 This allows mounting of subvolumes which are not in the root of the mounted
217 filesystem.
218 You can use "btrfs subvolume show " to see the object ID for a subvolume.
219
220 thread_pool=<number>
221 The number of worker threads to allocate. The default number is equal
222 to the number of CPUs + 2, or 8, whichever is smaller.
223
224 user_subvol_rm_allowed
225 Allow subvolumes to be deleted by a non-root user. Use with caution.
226
227MAILING LIST
228============
229
230There is a Btrfs mailing list hosted on vger.kernel.org. You can
231find details on how to subscribe here:
232
233http://vger.kernel.org/vger-lists.html#linux-btrfs
234
235Mailing list archives are available from gmane:
236
237http://dir.gmane.org/gmane.comp.file-systems.btrfs
238
239
240
241IRC
242===
243
244Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
245IRC network.
246
247
248
249 UTILITIES
250 =========
251
252Userspace tools for creating and manipulating Btrfs file systems are
253available from the git repository at the following location:
254
255 http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git
256 git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git
257
258These include the following tools:
259
260* mkfs.btrfs: create a filesystem
261
262* btrfs: a single tool to manage the filesystems, refer to the manpage for more details
263
264* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem
265
266Other tools for specific tasks:
267
268* btrfs-convert: in-place conversion from ext2/3/4 filesystems
269 28
270* btrfs-image: dump filesystem metadata for debugging 29that maintains information about administration tasks, frequently asked
30questions, use cases, mount options, comprehensible changelogs, features,
31manual pages, source code repositories, contacts etc.
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f6dac40f87ff..80e8472d618b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void)
148 148
149void btrfs_prelim_ref_exit(void) 149void btrfs_prelim_ref_exit(void)
150{ 150{
151 if (btrfs_prelim_ref_cache) 151 kmem_cache_destroy(btrfs_prelim_ref_cache);
152 kmem_cache_destroy(btrfs_prelim_ref_cache);
153} 152}
154 153
155/* 154/*
@@ -566,17 +565,14 @@ static void __merge_refs(struct list_head *head, int mode)
566 struct __prelim_ref *pos2 = pos1, *tmp; 565 struct __prelim_ref *pos2 = pos1, *tmp;
567 566
568 list_for_each_entry_safe_continue(pos2, tmp, head, list) { 567 list_for_each_entry_safe_continue(pos2, tmp, head, list) {
569 struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; 568 struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
570 struct extent_inode_elem *eie; 569 struct extent_inode_elem *eie;
571 570
572 if (!ref_for_same_block(ref1, ref2)) 571 if (!ref_for_same_block(ref1, ref2))
573 continue; 572 continue;
574 if (mode == 1) { 573 if (mode == 1) {
575 if (!ref1->parent && ref2->parent) { 574 if (!ref1->parent && ref2->parent)
576 xchg = ref1; 575 swap(ref1, ref2);
577 ref1 = ref2;
578 ref2 = xchg;
579 }
580 } else { 576 } else {
581 if (ref1->parent != ref2->parent) 577 if (ref1->parent != ref2->parent)
582 continue; 578 continue;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 861d472564c1..e34a71b3e225 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -95,6 +95,7 @@
95#include <linux/genhd.h> 95#include <linux/genhd.h>
96#include <linux/blkdev.h> 96#include <linux/blkdev.h>
97#include <linux/vmalloc.h> 97#include <linux/vmalloc.h>
98#include <linux/string.h>
98#include "ctree.h" 99#include "ctree.h"
99#include "disk-io.h" 100#include "disk-io.h"
100#include "hash.h" 101#include "hash.h"
@@ -105,6 +106,7 @@
105#include "locking.h" 106#include "locking.h"
106#include "check-integrity.h" 107#include "check-integrity.h"
107#include "rcu-string.h" 108#include "rcu-string.h"
109#include "compression.h"
108 110
109#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 111#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
110#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 112#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -176,7 +178,7 @@ struct btrfsic_block {
176 * Elements of this type are allocated dynamically and required because 178 * Elements of this type are allocated dynamically and required because
177 * each block object can refer to and can be ref from multiple blocks. 179 * each block object can refer to and can be ref from multiple blocks.
178 * The key to lookup them in the hashtable is the dev_bytenr of 180 * The key to lookup them in the hashtable is the dev_bytenr of
179 * the block ref to plus the one from the block refered from. 181 * the block ref to plus the one from the block referred from.
180 * The fact that they are searchable via a hashtable and that a 182 * The fact that they are searchable via a hashtable and that a
181 * ref_cnt is maintained is not required for the btrfs integrity 183 * ref_cnt is maintained is not required for the btrfs integrity
182 * check algorithm itself, it is only used to make the output more 184 * check algorithm itself, it is only used to make the output more
@@ -3076,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root,
3076 3078
3077 list_for_each_entry(device, dev_head, dev_list) { 3079 list_for_each_entry(device, dev_head, dev_list) {
3078 struct btrfsic_dev_state *ds; 3080 struct btrfsic_dev_state *ds;
3079 char *p; 3081 const char *p;
3080 3082
3081 if (!device->bdev || !device->name) 3083 if (!device->bdev || !device->name)
3082 continue; 3084 continue;
@@ -3092,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root,
3092 ds->state = state; 3094 ds->state = state;
3093 bdevname(ds->bdev, ds->name); 3095 bdevname(ds->bdev, ds->name);
3094 ds->name[BDEVNAME_SIZE - 1] = '\0'; 3096 ds->name[BDEVNAME_SIZE - 1] = '\0';
3095 for (p = ds->name; *p != '\0'; p++); 3097 p = kbasename(ds->name);
3096 while (p > ds->name && *p != '/')
3097 p--;
3098 if (*p == '/')
3099 p++;
3100 strlcpy(ds->name, p, sizeof(ds->name)); 3098 strlcpy(ds->name, p, sizeof(ds->name));
3101 btrfsic_dev_state_hashtable_add(ds, 3099 btrfsic_dev_state_hashtable_add(ds,
3102 &btrfsic_dev_state_hashtable); 3100 &btrfsic_dev_state_hashtable);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 13a4dc0436c9..f49d8b8c0f00 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, 48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
49 unsigned long pg_index, 49 unsigned long pg_index,
50 unsigned long pg_offset); 50 unsigned long pg_offset);
51
52enum btrfs_compression_type {
53 BTRFS_COMPRESS_NONE = 0,
54 BTRFS_COMPRESS_ZLIB = 1,
55 BTRFS_COMPRESS_LZO = 2,
56 BTRFS_COMPRESS_TYPES = 2,
57 BTRFS_COMPRESS_LAST = 3,
58};
59
51struct btrfs_compress_op { 60struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void); 61 struct list_head *(*alloc_workspace)(void);
53 62
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 769e0ff1b4ce..77592931ab4f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -311,7 +311,7 @@ struct tree_mod_root {
311 311
312struct tree_mod_elem { 312struct tree_mod_elem {
313 struct rb_node node; 313 struct rb_node node;
314 u64 index; /* shifted logical */ 314 u64 logical;
315 u64 seq; 315 u64 seq;
316 enum mod_log_op op; 316 enum mod_log_op op;
317 317
@@ -435,11 +435,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
435 435
436/* 436/*
437 * key order of the log: 437 * key order of the log:
438 * index -> sequence 438 * node/leaf start address -> sequence
439 * 439 *
440 * the index is the shifted logical of the *new* root node for root replace 440 * The 'start address' is the logical address of the *new* root node
441 * operations, or the shifted logical of the affected block for all other 441 * for root replace operations, or the logical address of the affected
442 * operations. 442 * block for all other operations.
443 * 443 *
444 * Note: must be called with write lock (tree_mod_log_write_lock). 444 * Note: must be called with write lock (tree_mod_log_write_lock).
445 */ 445 */
@@ -460,9 +460,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
460 while (*new) { 460 while (*new) {
461 cur = container_of(*new, struct tree_mod_elem, node); 461 cur = container_of(*new, struct tree_mod_elem, node);
462 parent = *new; 462 parent = *new;
463 if (cur->index < tm->index) 463 if (cur->logical < tm->logical)
464 new = &((*new)->rb_left); 464 new = &((*new)->rb_left);
465 else if (cur->index > tm->index) 465 else if (cur->logical > tm->logical)
466 new = &((*new)->rb_right); 466 new = &((*new)->rb_right);
467 else if (cur->seq < tm->seq) 467 else if (cur->seq < tm->seq)
468 new = &((*new)->rb_left); 468 new = &((*new)->rb_left);
@@ -523,7 +523,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
523 if (!tm) 523 if (!tm)
524 return NULL; 524 return NULL;
525 525
526 tm->index = eb->start >> PAGE_CACHE_SHIFT; 526 tm->logical = eb->start;
527 if (op != MOD_LOG_KEY_ADD) { 527 if (op != MOD_LOG_KEY_ADD) {
528 btrfs_node_key(eb, &tm->key, slot); 528 btrfs_node_key(eb, &tm->key, slot);
529 tm->blockptr = btrfs_node_blockptr(eb, slot); 529 tm->blockptr = btrfs_node_blockptr(eb, slot);
@@ -588,7 +588,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
588 goto free_tms; 588 goto free_tms;
589 } 589 }
590 590
591 tm->index = eb->start >> PAGE_CACHE_SHIFT; 591 tm->logical = eb->start;
592 tm->slot = src_slot; 592 tm->slot = src_slot;
593 tm->move.dst_slot = dst_slot; 593 tm->move.dst_slot = dst_slot;
594 tm->move.nr_items = nr_items; 594 tm->move.nr_items = nr_items;
@@ -699,7 +699,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
699 goto free_tms; 699 goto free_tms;
700 } 700 }
701 701
702 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 702 tm->logical = new_root->start;
703 tm->old_root.logical = old_root->start; 703 tm->old_root.logical = old_root->start;
704 tm->old_root.level = btrfs_header_level(old_root); 704 tm->old_root.level = btrfs_header_level(old_root);
705 tm->generation = btrfs_header_generation(old_root); 705 tm->generation = btrfs_header_generation(old_root);
@@ -739,16 +739,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
739 struct rb_node *node; 739 struct rb_node *node;
740 struct tree_mod_elem *cur = NULL; 740 struct tree_mod_elem *cur = NULL;
741 struct tree_mod_elem *found = NULL; 741 struct tree_mod_elem *found = NULL;
742 u64 index = start >> PAGE_CACHE_SHIFT;
743 742
744 tree_mod_log_read_lock(fs_info); 743 tree_mod_log_read_lock(fs_info);
745 tm_root = &fs_info->tree_mod_log; 744 tm_root = &fs_info->tree_mod_log;
746 node = tm_root->rb_node; 745 node = tm_root->rb_node;
747 while (node) { 746 while (node) {
748 cur = container_of(node, struct tree_mod_elem, node); 747 cur = container_of(node, struct tree_mod_elem, node);
749 if (cur->index < index) { 748 if (cur->logical < start) {
750 node = node->rb_left; 749 node = node->rb_left;
751 } else if (cur->index > index) { 750 } else if (cur->logical > start) {
752 node = node->rb_right; 751 node = node->rb_right;
753 } else if (cur->seq < min_seq) { 752 } else if (cur->seq < min_seq) {
754 node = node->rb_left; 753 node = node->rb_left;
@@ -1230,9 +1229,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1230 return NULL; 1229 return NULL;
1231 1230
1232 /* 1231 /*
1233 * the very last operation that's logged for a root is the replacement 1232 * the very last operation that's logged for a root is the
1234 * operation (if it is replaced at all). this has the index of the *new* 1233 * replacement operation (if it is replaced at all). this has
1235 * root, making it the very first operation that's logged for this root. 1234 * the logical address of the *new* root, making it the very
1235 * first operation that's logged for this root.
1236 */ 1236 */
1237 while (1) { 1237 while (1) {
1238 tm = tree_mod_log_search_oldest(fs_info, root_logical, 1238 tm = tree_mod_log_search_oldest(fs_info, root_logical,
@@ -1336,7 +1336,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1336 if (!next) 1336 if (!next)
1337 break; 1337 break;
1338 tm = container_of(next, struct tree_mod_elem, node); 1338 tm = container_of(next, struct tree_mod_elem, node);
1339 if (tm->index != first_tm->index) 1339 if (tm->logical != first_tm->logical)
1340 break; 1340 break;
1341 } 1341 }
1342 tree_mod_log_read_unlock(fs_info); 1342 tree_mod_log_read_unlock(fs_info);
@@ -5361,7 +5361,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5361 goto out; 5361 goto out;
5362 } 5362 }
5363 5363
5364 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); 5364 tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
5365 if (!tmp_buf) { 5365 if (!tmp_buf) {
5366 ret = -ENOMEM; 5366 ret = -ENOMEM;
5367 goto out; 5367 goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bfe4a337fb4d..84a6a5b3384a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
100/* tracks free space in block groups. */ 100/* tracks free space in block groups. */
101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL 101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
102 102
103/* device stats in the device tree */
104#define BTRFS_DEV_STATS_OBJECTID 0ULL
105
103/* for storing balance parameters in the root tree */ 106/* for storing balance parameters in the root tree */
104#define BTRFS_BALANCE_OBJECTID -4ULL 107#define BTRFS_BALANCE_OBJECTID -4ULL
105 108
@@ -715,14 +718,6 @@ struct btrfs_timespec {
715 __le32 nsec; 718 __le32 nsec;
716} __attribute__ ((__packed__)); 719} __attribute__ ((__packed__));
717 720
718enum btrfs_compression_type {
719 BTRFS_COMPRESS_NONE = 0,
720 BTRFS_COMPRESS_ZLIB = 1,
721 BTRFS_COMPRESS_LZO = 2,
722 BTRFS_COMPRESS_TYPES = 2,
723 BTRFS_COMPRESS_LAST = 3,
724};
725
726struct btrfs_inode_item { 721struct btrfs_inode_item {
727 /* nfs style generation number */ 722 /* nfs style generation number */
728 __le64 generation; 723 __le64 generation;
@@ -793,7 +788,7 @@ struct btrfs_root_item {
793 788
794 /* 789 /*
795 * This generation number is used to test if the new fields are valid 790 * This generation number is used to test if the new fields are valid
796 * and up to date while reading the root item. Everytime the root item 791 * and up to date while reading the root item. Every time the root item
797 * is written out, the "generation" field is copied into this field. If 792 * is written out, the "generation" field is copied into this field. If
798 * anyone ever mounted the fs with an older kernel, we will have 793 * anyone ever mounted the fs with an older kernel, we will have
799 * mismatching generation values here and thus must invalidate the 794 * mismatching generation values here and thus must invalidate the
@@ -1002,8 +997,10 @@ struct btrfs_dev_replace {
1002 pid_t lock_owner; 997 pid_t lock_owner;
1003 atomic_t nesting_level; 998 atomic_t nesting_level;
1004 struct mutex lock_finishing_cancel_unmount; 999 struct mutex lock_finishing_cancel_unmount;
1005 struct mutex lock_management_lock; 1000 rwlock_t lock;
1006 struct mutex lock; 1001 atomic_t read_locks;
1002 atomic_t blocking_readers;
1003 wait_queue_head_t read_lock_wq;
1007 1004
1008 struct btrfs_scrub_progress scrub_progress; 1005 struct btrfs_scrub_progress scrub_progress;
1009}; 1006};
@@ -1222,10 +1219,10 @@ struct btrfs_space_info {
1222 * we've called update_block_group and dropped the bytes_used counter 1219 * we've called update_block_group and dropped the bytes_used counter
1223 * and increased the bytes_pinned counter. However this means that 1220 * and increased the bytes_pinned counter. However this means that
1224 * bytes_pinned does not reflect the bytes that will be pinned once the 1221 * bytes_pinned does not reflect the bytes that will be pinned once the
1225 * delayed refs are flushed, so this counter is inc'ed everytime we call 1222 * delayed refs are flushed, so this counter is inc'ed every time we
1226 * btrfs_free_extent so it is a realtime count of what will be freed 1223 * call btrfs_free_extent so it is a realtime count of what will be
1227 * once the transaction is committed. It will be zero'ed everytime the 1224 * freed once the transaction is committed. It will be zero'ed every
1228 * transaction commits. 1225 * time the transaction commits.
1229 */ 1226 */
1230 struct percpu_counter total_bytes_pinned; 1227 struct percpu_counter total_bytes_pinned;
1231 1228
@@ -1822,6 +1819,9 @@ struct btrfs_fs_info {
1822 spinlock_t reada_lock; 1819 spinlock_t reada_lock;
1823 struct radix_tree_root reada_tree; 1820 struct radix_tree_root reada_tree;
1824 1821
1822 /* readahead works cnt */
1823 atomic_t reada_works_cnt;
1824
1825 /* Extent buffer radix tree */ 1825 /* Extent buffer radix tree */
1826 spinlock_t buffer_lock; 1826 spinlock_t buffer_lock;
1827 struct radix_tree_root buffer_radix; 1827 struct radix_tree_root buffer_radix;
@@ -2185,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args {
2185 */ 2185 */
2186#define BTRFS_QGROUP_RELATION_KEY 246 2186#define BTRFS_QGROUP_RELATION_KEY 246
2187 2187
2188/*
2189 * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
2190 */
2188#define BTRFS_BALANCE_ITEM_KEY 248 2191#define BTRFS_BALANCE_ITEM_KEY 248
2189 2192
2190/* 2193/*
2191 * Persistantly stores the io stats in the device tree. 2194 * The key type for tree items that are stored persistently, but do not need to
2192 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). 2195 * exist for extended period of time. The items can exist in any tree.
2196 *
2197 * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
2198 *
2199 * Existing items:
2200 *
2201 * - balance status item
2202 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
2193 */ 2203 */
2194#define BTRFS_DEV_STATS_KEY 249 2204#define BTRFS_TEMPORARY_ITEM_KEY 248
2205
2206/*
2207 * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
2208 */
2209#define BTRFS_DEV_STATS_KEY 249
2210
2211/*
2212 * The key type for tree items that are stored persistently and usually exist
2213 * for a long period, eg. filesystem lifetime. The item kinds can be status
2214 * information, stats or preference values. The item can exist in any tree.
2215 *
2216 * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
2217 *
2218 * Existing items:
2219 *
2220 * - device statistics, store IO stats in the device tree, one key for all
2221 * stats
2222 * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
2223 */
2224#define BTRFS_PERSISTENT_ITEM_KEY 249
2195 2225
2196/* 2226/*
2197 * Persistantly stores the device replace state in the device tree. 2227 * Persistantly stores the device replace state in the device tree.
@@ -2241,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args {
2241#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 2271#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
2242#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 2272#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
2243#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 2273#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
2244#define BTRFS_MOUNT_RECOVERY (1 << 18) 2274#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
2245#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) 2275#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
2246#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) 2276#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
2247#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2277#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
@@ -2250,9 +2280,10 @@ struct btrfs_ioctl_defrag_range_args {
2250#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) 2280#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
2251#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 2281#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
2252#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 2282#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
2283#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
2253 2284
2254#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2285#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2255#define BTRFS_DEFAULT_MAX_INLINE (8192) 2286#define BTRFS_DEFAULT_MAX_INLINE (2048)
2256 2287
2257#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2288#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2258#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2289#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2353,6 +2384,9 @@ struct btrfs_map_token {
2353 unsigned long offset; 2384 unsigned long offset;
2354}; 2385};
2355 2386
2387#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
2388 ((bytes) >> (fs_info)->sb->s_blocksize_bits)
2389
2356static inline void btrfs_init_map_token (struct btrfs_map_token *token) 2390static inline void btrfs_init_map_token (struct btrfs_map_token *token)
2357{ 2391{
2358 token->kaddr = NULL; 2392 token->kaddr = NULL;
@@ -3448,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
3448static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3482static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3449 unsigned num_items) 3483 unsigned num_items)
3450{ 3484{
3451 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3485 return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
3452 2 * num_items;
3453} 3486}
3454 3487
3455/* 3488/*
@@ -4027,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4027 struct btrfs_root *root, 4060 struct btrfs_root *root,
4028 struct inode *dir, u64 objectid, 4061 struct inode *dir, u64 objectid,
4029 const char *name, int name_len); 4062 const char *name, int name_len);
4030int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4063int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4031 int front); 4064 int front);
4032int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4065int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4033 struct btrfs_root *root, 4066 struct btrfs_root *root,
@@ -4089,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
4089 4122
4090/* ioctl.c */ 4123/* ioctl.c */
4091long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 4124long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
4125int btrfs_ioctl_get_supported_features(void __user *arg);
4092void btrfs_update_iflags(struct inode *inode); 4126void btrfs_update_iflags(struct inode *inode);
4093void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 4127void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
4094int btrfs_is_empty_uuid(u8 *uuid); 4128int btrfs_is_empty_uuid(u8 *uuid);
@@ -4151,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
4151ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 4185ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
4152 4186
4153/* super.c */ 4187/* super.c */
4154int btrfs_parse_options(struct btrfs_root *root, char *options); 4188int btrfs_parse_options(struct btrfs_root *root, char *options,
4189 unsigned long new_flags);
4155int btrfs_sync_fs(struct super_block *sb, int wait); 4190int btrfs_sync_fs(struct super_block *sb, int wait);
4156 4191
4157#ifdef CONFIG_PRINTK 4192#ifdef CONFIG_PRINTK
@@ -4525,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
4525 struct btrfs_key *start, struct btrfs_key *end); 4560 struct btrfs_key *start, struct btrfs_key *end);
4526int btrfs_reada_wait(void *handle); 4561int btrfs_reada_wait(void *handle);
4527void btrfs_reada_detach(void *handle); 4562void btrfs_reada_detach(void *handle);
4528int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 4563int btree_readahead_hook(struct btrfs_fs_info *fs_info,
4529 u64 start, int err); 4564 struct extent_buffer *eb, u64 start, int err);
4530 4565
4531static inline int is_fstree(u64 rootid) 4566static inline int is_fstree(u64 rootid)
4532{ 4567{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b57daa895cea..6cef0062f929 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void)
43 43
44void btrfs_delayed_inode_exit(void) 44void btrfs_delayed_inode_exit(void)
45{ 45{
46 if (delayed_node_cache) 46 kmem_cache_destroy(delayed_node_cache);
47 kmem_cache_destroy(delayed_node_cache);
48} 47}
49 48
50static inline void btrfs_init_delayed_node( 49static inline void btrfs_init_delayed_node(
@@ -651,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata(
651 goto out; 650 goto out;
652 651
653 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 652 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
654 if (!WARN_ON(ret)) 653 if (!ret)
655 goto out; 654 goto out;
656 655
656 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
657 btrfs_debug(root->fs_info,
658 "block rsv migrate returned %d", ret);
659 WARN_ON(1);
660 }
657 /* 661 /*
658 * Ok this is a problem, let's just steal from the global rsv 662 * Ok this is a problem, let's just steal from the global rsv
659 * since this really shouldn't happen that often. 663 * since this really shouldn't happen that often.
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 914ac13bd92f..430b3689b112 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
929 929
930void btrfs_delayed_ref_exit(void) 930void btrfs_delayed_ref_exit(void)
931{ 931{
932 if (btrfs_delayed_ref_head_cachep) 932 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
933 kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 933 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
934 if (btrfs_delayed_tree_ref_cachep) 934 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
935 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 935 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
936 if (btrfs_delayed_data_ref_cachep)
937 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
938 if (btrfs_delayed_extent_op_cachep)
939 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
940} 936}
941 937
942int btrfs_delayed_ref_init(void) 938int btrfs_delayed_ref_init(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cbb7dbfb3fff..a1d6652e0c47 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
202 struct btrfs_dev_replace_item *ptr; 202 struct btrfs_dev_replace_item *ptr;
203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
204 204
205 btrfs_dev_replace_lock(dev_replace); 205 btrfs_dev_replace_lock(dev_replace, 0);
206 if (!dev_replace->is_valid || 206 if (!dev_replace->is_valid ||
207 !dev_replace->item_needs_writeback) { 207 !dev_replace->item_needs_writeback) {
208 btrfs_dev_replace_unlock(dev_replace); 208 btrfs_dev_replace_unlock(dev_replace, 0);
209 return 0; 209 return 0;
210 } 210 }
211 btrfs_dev_replace_unlock(dev_replace); 211 btrfs_dev_replace_unlock(dev_replace, 0);
212 212
213 key.objectid = 0; 213 key.objectid = 0;
214 key.type = BTRFS_DEV_REPLACE_KEY; 214 key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
264 ptr = btrfs_item_ptr(eb, path->slots[0], 264 ptr = btrfs_item_ptr(eb, path->slots[0],
265 struct btrfs_dev_replace_item); 265 struct btrfs_dev_replace_item);
266 266
267 btrfs_dev_replace_lock(dev_replace); 267 btrfs_dev_replace_lock(dev_replace, 1);
268 if (dev_replace->srcdev) 268 if (dev_replace->srcdev)
269 btrfs_set_dev_replace_src_devid(eb, ptr, 269 btrfs_set_dev_replace_src_devid(eb, ptr,
270 dev_replace->srcdev->devid); 270 dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
287 btrfs_set_dev_replace_cursor_right(eb, ptr, 287 btrfs_set_dev_replace_cursor_right(eb, ptr,
288 dev_replace->cursor_right); 288 dev_replace->cursor_right);
289 dev_replace->item_needs_writeback = 0; 289 dev_replace->item_needs_writeback = 0;
290 btrfs_dev_replace_unlock(dev_replace); 290 btrfs_dev_replace_unlock(dev_replace, 1);
291 291
292 btrfs_mark_buffer_dirty(eb); 292 btrfs_mark_buffer_dirty(eb);
293 293
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
356 return PTR_ERR(trans); 356 return PTR_ERR(trans);
357 } 357 }
358 358
359 btrfs_dev_replace_lock(dev_replace); 359 btrfs_dev_replace_lock(dev_replace, 1);
360 switch (dev_replace->replace_state) { 360 switch (dev_replace->replace_state) {
361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
395 dev_replace->is_valid = 1; 395 dev_replace->is_valid = 1;
396 dev_replace->item_needs_writeback = 1; 396 dev_replace->item_needs_writeback = 1;
397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
398 btrfs_dev_replace_unlock(dev_replace); 398 btrfs_dev_replace_unlock(dev_replace, 1);
399 399
400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
401 if (ret) 401 if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
407 trans = btrfs_start_transaction(root, 0); 407 trans = btrfs_start_transaction(root, 0);
408 if (IS_ERR(trans)) { 408 if (IS_ERR(trans)) {
409 ret = PTR_ERR(trans); 409 ret = PTR_ERR(trans);
410 btrfs_dev_replace_lock(dev_replace); 410 btrfs_dev_replace_lock(dev_replace, 1);
411 goto leave; 411 goto leave;
412 } 412 }
413 413
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
433leave: 433leave:
434 dev_replace->srcdev = NULL; 434 dev_replace->srcdev = NULL;
435 dev_replace->tgtdev = NULL; 435 dev_replace->tgtdev = NULL;
436 btrfs_dev_replace_unlock(dev_replace); 436 btrfs_dev_replace_unlock(dev_replace, 1);
437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
438 return ret; 438 return ret;
439} 439}
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
471 /* don't allow cancel or unmount to disturb the finishing procedure */ 471 /* don't allow cancel or unmount to disturb the finishing procedure */
472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
473 473
474 btrfs_dev_replace_lock(dev_replace); 474 btrfs_dev_replace_lock(dev_replace, 0);
475 /* was the operation canceled, or is it finished? */ 475 /* was the operation canceled, or is it finished? */
476 if (dev_replace->replace_state != 476 if (dev_replace->replace_state !=
477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
478 btrfs_dev_replace_unlock(dev_replace); 478 btrfs_dev_replace_unlock(dev_replace, 0);
479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
480 return 0; 480 return 0;
481 } 481 }
482 482
483 tgt_device = dev_replace->tgtdev; 483 tgt_device = dev_replace->tgtdev;
484 src_device = dev_replace->srcdev; 484 src_device = dev_replace->srcdev;
485 btrfs_dev_replace_unlock(dev_replace); 485 btrfs_dev_replace_unlock(dev_replace, 0);
486 486
487 /* 487 /*
488 * flush all outstanding I/O and inode extent mappings before the 488 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 /* keep away write_all_supers() during the finishing procedure */ 507 /* keep away write_all_supers() during the finishing procedure */
508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
509 mutex_lock(&root->fs_info->chunk_mutex); 509 mutex_lock(&root->fs_info->chunk_mutex);
510 btrfs_dev_replace_lock(dev_replace); 510 btrfs_dev_replace_lock(dev_replace, 1);
511 dev_replace->replace_state = 511 dev_replace->replace_state =
512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
528 rcu_str_deref(src_device->name), 528 rcu_str_deref(src_device->name),
529 src_device->devid, 529 src_device->devid,
530 rcu_str_deref(tgt_device->name), scrub_ret); 530 rcu_str_deref(tgt_device->name), scrub_ret);
531 btrfs_dev_replace_unlock(dev_replace); 531 btrfs_dev_replace_unlock(dev_replace, 1);
532 mutex_unlock(&root->fs_info->chunk_mutex); 532 mutex_unlock(&root->fs_info->chunk_mutex);
533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
534 mutex_unlock(&uuid_mutex); 534 mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
566 fs_info->fs_devices->rw_devices++; 566 fs_info->fs_devices->rw_devices++;
567 567
568 btrfs_dev_replace_unlock(dev_replace); 568 btrfs_dev_replace_unlock(dev_replace, 1);
569 569
570 btrfs_rm_dev_replace_blocked(fs_info); 570 btrfs_rm_dev_replace_blocked(fs_info);
571 571
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
650 struct btrfs_device *srcdev; 650 struct btrfs_device *srcdev;
651 651
652 btrfs_dev_replace_lock(dev_replace); 652 btrfs_dev_replace_lock(dev_replace, 0);
653 /* even if !dev_replace_is_valid, the values are good enough for 653 /* even if !dev_replace_is_valid, the values are good enough for
654 * the replace_status ioctl */ 654 * the replace_status ioctl */
655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); 675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
676 break; 676 break;
677 } 677 }
678 btrfs_dev_replace_unlock(dev_replace); 678 btrfs_dev_replace_unlock(dev_replace, 0);
679} 679}
680 680
681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
698 return -EROFS; 698 return -EROFS;
699 699
700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
701 btrfs_dev_replace_lock(dev_replace); 701 btrfs_dev_replace_lock(dev_replace, 1);
702 switch (dev_replace->replace_state) { 702 switch (dev_replace->replace_state) {
703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
707 btrfs_dev_replace_unlock(dev_replace); 707 btrfs_dev_replace_unlock(dev_replace, 1);
708 goto leave; 708 goto leave;
709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
718 dev_replace->time_stopped = get_seconds(); 718 dev_replace->time_stopped = get_seconds();
719 dev_replace->item_needs_writeback = 1; 719 dev_replace->item_needs_writeback = 1;
720 btrfs_dev_replace_unlock(dev_replace); 720 btrfs_dev_replace_unlock(dev_replace, 1);
721 btrfs_scrub_cancel(fs_info); 721 btrfs_scrub_cancel(fs_info);
722 722
723 trans = btrfs_start_transaction(root, 0); 723 trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
741 741
742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
743 btrfs_dev_replace_lock(dev_replace); 743 btrfs_dev_replace_lock(dev_replace, 1);
744 switch (dev_replace->replace_state) { 744 switch (dev_replace->replace_state) {
745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
756 break; 756 break;
757 } 757 }
758 758
759 btrfs_dev_replace_unlock(dev_replace); 759 btrfs_dev_replace_unlock(dev_replace, 1);
760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
761} 761}
762 762
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
766 struct task_struct *task; 766 struct task_struct *task;
767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
768 768
769 btrfs_dev_replace_lock(dev_replace); 769 btrfs_dev_replace_lock(dev_replace, 1);
770 switch (dev_replace->replace_state) { 770 switch (dev_replace->replace_state) {
771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
774 btrfs_dev_replace_unlock(dev_replace); 774 btrfs_dev_replace_unlock(dev_replace, 1);
775 return 0; 775 return 0;
776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
777 break; 777 break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); 784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
785 btrfs_info(fs_info, 785 btrfs_info(fs_info,
786 "you may cancel the operation after 'mount -o degraded'"); 786 "you may cancel the operation after 'mount -o degraded'");
787 btrfs_dev_replace_unlock(dev_replace); 787 btrfs_dev_replace_unlock(dev_replace, 1);
788 return 0; 788 return 0;
789 } 789 }
790 btrfs_dev_replace_unlock(dev_replace); 790 btrfs_dev_replace_unlock(dev_replace, 1);
791 791
792 WARN_ON(atomic_xchg( 792 WARN_ON(atomic_xchg(
793 &fs_info->mutually_exclusive_operation_running, 1)); 793 &fs_info->mutually_exclusive_operation_running, 1));
@@ -802,7 +802,7 @@ static int btrfs_dev_replace_kthread(void *data)
802 struct btrfs_ioctl_dev_replace_args *status_args; 802 struct btrfs_ioctl_dev_replace_args *status_args;
803 u64 progress; 803 u64 progress;
804 804
805 status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 805 status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
806 if (status_args) { 806 if (status_args) {
807 btrfs_dev_replace_status(fs_info, status_args); 807 btrfs_dev_replace_status(fs_info, status_args);
808 progress = status_args->status.progress_1000; 808 progress = status_args->status.progress_1000;
@@ -858,55 +858,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
858 * not called and the the filesystem is remounted 858 * not called and the the filesystem is remounted
859 * in degraded state. This does not stop the 859 * in degraded state. This does not stop the
860 * dev_replace procedure. It needs to be canceled 860 * dev_replace procedure. It needs to be canceled
861 * manually if the cancelation is wanted. 861 * manually if the cancellation is wanted.
862 */ 862 */
863 break; 863 break;
864 } 864 }
865 return 1; 865 return 1;
866} 866}
867 867
868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
869{ 869{
870 /* the beginning is just an optimization for the typical case */ 870 if (rw == 1) {
871 if (atomic_read(&dev_replace->nesting_level) == 0) { 871 /* write */
872acquire_lock: 872again:
873 /* this is not a nested case where the same thread 873 wait_event(dev_replace->read_lock_wq,
874 * is trying to acqurire the same lock twice */ 874 atomic_read(&dev_replace->blocking_readers) == 0);
875 mutex_lock(&dev_replace->lock); 875 write_lock(&dev_replace->lock);
876 mutex_lock(&dev_replace->lock_management_lock); 876 if (atomic_read(&dev_replace->blocking_readers)) {
877 dev_replace->lock_owner = current->pid; 877 write_unlock(&dev_replace->lock);
878 atomic_inc(&dev_replace->nesting_level); 878 goto again;
879 mutex_unlock(&dev_replace->lock_management_lock); 879 }
880 return; 880 } else {
881 read_lock(&dev_replace->lock);
882 atomic_inc(&dev_replace->read_locks);
881 } 883 }
884}
882 885
883 mutex_lock(&dev_replace->lock_management_lock); 886void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
884 if (atomic_read(&dev_replace->nesting_level) > 0 && 887{
885 dev_replace->lock_owner == current->pid) { 888 if (rw == 1) {
886 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 889 /* write */
887 atomic_inc(&dev_replace->nesting_level); 890 ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
888 mutex_unlock(&dev_replace->lock_management_lock); 891 write_unlock(&dev_replace->lock);
889 return; 892 } else {
893 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
894 atomic_dec(&dev_replace->read_locks);
895 read_unlock(&dev_replace->lock);
890 } 896 }
897}
891 898
892 mutex_unlock(&dev_replace->lock_management_lock); 899/* inc blocking cnt and release read lock */
893 goto acquire_lock; 900void btrfs_dev_replace_set_lock_blocking(
901 struct btrfs_dev_replace *dev_replace)
902{
903 /* only set blocking for read lock */
904 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
905 atomic_inc(&dev_replace->blocking_readers);
906 read_unlock(&dev_replace->lock);
894} 907}
895 908
896void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 909/* acquire read lock and dec blocking cnt */
910void btrfs_dev_replace_clear_lock_blocking(
911 struct btrfs_dev_replace *dev_replace)
897{ 912{
898 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 913 /* only set blocking for read lock */
899 mutex_lock(&dev_replace->lock_management_lock); 914 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
900 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 915 ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
901 WARN_ON(dev_replace->lock_owner != current->pid); 916 read_lock(&dev_replace->lock);
902 atomic_dec(&dev_replace->nesting_level); 917 if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
903 if (atomic_read(&dev_replace->nesting_level) == 0) { 918 waitqueue_active(&dev_replace->read_lock_wq))
904 dev_replace->lock_owner = 0; 919 wake_up(&dev_replace->read_lock_wq);
905 mutex_unlock(&dev_replace->lock_management_lock);
906 mutex_unlock(&dev_replace->lock);
907 } else {
908 mutex_unlock(&dev_replace->lock_management_lock);
909 }
910} 920}
911 921
912void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) 922void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 20035cbbf021..29e3ef5f96bd 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
39void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
40void btrfs_dev_replace_clear_lock_blocking(
41 struct btrfs_dev_replace *dev_replace);
39 42
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 43static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{ 44{
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5699bbc23feb..4b02591b0301 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h" 51#include "sysfs.h"
52#include "qgroup.h" 52#include "qgroup.h"
53#include "compression.h"
53 54
54#ifdef CONFIG_X86 55#ifdef CONFIG_X86
55#include <asm/cpufeature.h> 56#include <asm/cpufeature.h>
@@ -110,8 +111,7 @@ int __init btrfs_end_io_wq_init(void)
110 111
111void btrfs_end_io_wq_exit(void) 112void btrfs_end_io_wq_exit(void)
112{ 113{
113 if (btrfs_end_io_wq_cache) 114 kmem_cache_destroy(btrfs_end_io_wq_cache);
114 kmem_cache_destroy(btrfs_end_io_wq_cache);
115} 115}
116 116
117/* 117/*
@@ -612,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
612 int found_level; 612 int found_level;
613 struct extent_buffer *eb; 613 struct extent_buffer *eb;
614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
615 struct btrfs_fs_info *fs_info = root->fs_info;
615 int ret = 0; 616 int ret = 0;
616 int reads_done; 617 int reads_done;
617 618
@@ -637,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
637 638
638 found_start = btrfs_header_bytenr(eb); 639 found_start = btrfs_header_bytenr(eb);
639 if (found_start != eb->start) { 640 if (found_start != eb->start) {
640 btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", 641 btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
641 found_start, eb->start); 642 found_start, eb->start);
642 ret = -EIO; 643 ret = -EIO;
643 goto err; 644 goto err;
644 } 645 }
645 if (check_tree_block_fsid(root->fs_info, eb)) { 646 if (check_tree_block_fsid(fs_info, eb)) {
646 btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", 647 btrfs_err_rl(fs_info, "bad fsid on block %llu",
647 eb->start); 648 eb->start);
648 ret = -EIO; 649 ret = -EIO;
649 goto err; 650 goto err;
650 } 651 }
651 found_level = btrfs_header_level(eb); 652 found_level = btrfs_header_level(eb);
652 if (found_level >= BTRFS_MAX_LEVEL) { 653 if (found_level >= BTRFS_MAX_LEVEL) {
653 btrfs_err(root->fs_info, "bad tree block level %d", 654 btrfs_err(fs_info, "bad tree block level %d",
654 (int)btrfs_header_level(eb)); 655 (int)btrfs_header_level(eb));
655 ret = -EIO; 656 ret = -EIO;
656 goto err; 657 goto err;
657 } 658 }
@@ -659,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
659 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), 660 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
660 eb, found_level); 661 eb, found_level);
661 662
662 ret = csum_tree_block(root->fs_info, eb, 1); 663 ret = csum_tree_block(fs_info, eb, 1);
663 if (ret) { 664 if (ret) {
664 ret = -EIO; 665 ret = -EIO;
665 goto err; 666 goto err;
@@ -680,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
680err: 681err:
681 if (reads_done && 682 if (reads_done &&
682 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 683 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
683 btree_readahead_hook(root, eb, eb->start, ret); 684 btree_readahead_hook(fs_info, eb, eb->start, ret);
684 685
685 if (ret) { 686 if (ret) {
686 /* 687 /*
@@ -699,14 +700,13 @@ out:
699static int btree_io_failed_hook(struct page *page, int failed_mirror) 700static int btree_io_failed_hook(struct page *page, int failed_mirror)
700{ 701{
701 struct extent_buffer *eb; 702 struct extent_buffer *eb;
702 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
703 703
704 eb = (struct extent_buffer *)page->private; 704 eb = (struct extent_buffer *)page->private;
705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
706 eb->read_mirror = failed_mirror; 706 eb->read_mirror = failed_mirror;
707 atomic_dec(&eb->io_pages); 707 atomic_dec(&eb->io_pages);
708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
709 btree_readahead_hook(root, eb, eb->start, -EIO); 709 btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
710 return -EIO; /* we fixed nothing */ 710 return -EIO; /* we fixed nothing */
711} 711}
712 712
@@ -816,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work)
816 waitqueue_active(&fs_info->async_submit_wait)) 816 waitqueue_active(&fs_info->async_submit_wait))
817 wake_up(&fs_info->async_submit_wait); 817 wake_up(&fs_info->async_submit_wait);
818 818
819 /* If an error occured we just want to clean up the bio and move on */ 819 /* If an error occurred we just want to clean up the bio and move on */
820 if (async->error) { 820 if (async->error) {
821 async->bio->bi_error = async->error; 821 async->bio->bi_error = async->error;
822 bio_endio(async->bio); 822 bio_endio(async->bio);
@@ -1296,9 +1296,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1296 spin_lock_init(&root->root_item_lock); 1296 spin_lock_init(&root->root_item_lock);
1297} 1297}
1298 1298
1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1300 gfp_t flags)
1300{ 1301{
1301 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1302 struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1302 if (root) 1303 if (root)
1303 root->fs_info = fs_info; 1304 root->fs_info = fs_info;
1304 return root; 1305 return root;
@@ -1310,7 +1311,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
1310{ 1311{
1311 struct btrfs_root *root; 1312 struct btrfs_root *root;
1312 1313
1313 root = btrfs_alloc_root(NULL); 1314 root = btrfs_alloc_root(NULL, GFP_KERNEL);
1314 if (!root) 1315 if (!root)
1315 return ERR_PTR(-ENOMEM); 1316 return ERR_PTR(-ENOMEM);
1316 __setup_root(4096, 4096, 4096, root, NULL, 1); 1317 __setup_root(4096, 4096, 4096, root, NULL, 1);
@@ -1332,7 +1333,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1332 int ret = 0; 1333 int ret = 0;
1333 uuid_le uuid; 1334 uuid_le uuid;
1334 1335
1335 root = btrfs_alloc_root(fs_info); 1336 root = btrfs_alloc_root(fs_info, GFP_KERNEL);
1336 if (!root) 1337 if (!root)
1337 return ERR_PTR(-ENOMEM); 1338 return ERR_PTR(-ENOMEM);
1338 1339
@@ -1408,7 +1409,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1408 struct btrfs_root *tree_root = fs_info->tree_root; 1409 struct btrfs_root *tree_root = fs_info->tree_root;
1409 struct extent_buffer *leaf; 1410 struct extent_buffer *leaf;
1410 1411
1411 root = btrfs_alloc_root(fs_info); 1412 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1412 if (!root) 1413 if (!root)
1413 return ERR_PTR(-ENOMEM); 1414 return ERR_PTR(-ENOMEM);
1414 1415
@@ -1506,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1506 if (!path) 1507 if (!path)
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 root = btrfs_alloc_root(fs_info); 1510 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1510 if (!root) { 1511 if (!root) {
1511 ret = -ENOMEM; 1512 ret = -ENOMEM;
1512 goto alloc_fail; 1513 goto alloc_fail;
@@ -2272,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2272 fs_info->dev_replace.lock_owner = 0; 2273 fs_info->dev_replace.lock_owner = 0;
2273 atomic_set(&fs_info->dev_replace.nesting_level, 0); 2274 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2274 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2275 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2275 mutex_init(&fs_info->dev_replace.lock_management_lock); 2276 rwlock_init(&fs_info->dev_replace.lock);
2276 mutex_init(&fs_info->dev_replace.lock); 2277 atomic_set(&fs_info->dev_replace.read_locks, 0);
2278 atomic_set(&fs_info->dev_replace.blocking_readers, 0);
2277 init_waitqueue_head(&fs_info->replace_wait); 2279 init_waitqueue_head(&fs_info->replace_wait);
2280 init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
2278} 2281}
2279 2282
2280static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) 2283static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2385,7 +2388,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2385 return -EIO; 2388 return -EIO;
2386 } 2389 }
2387 2390
2388 log_tree_root = btrfs_alloc_root(fs_info); 2391 log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2389 if (!log_tree_root) 2392 if (!log_tree_root)
2390 return -ENOMEM; 2393 return -ENOMEM;
2391 2394
@@ -2510,8 +2513,8 @@ int open_ctree(struct super_block *sb,
2510 int backup_index = 0; 2513 int backup_index = 0;
2511 int max_active; 2514 int max_active;
2512 2515
2513 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2516 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2514 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2517 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2515 if (!tree_root || !chunk_root) { 2518 if (!tree_root || !chunk_root) {
2516 err = -ENOMEM; 2519 err = -ENOMEM;
2517 goto fail; 2520 goto fail;
@@ -2603,6 +2606,7 @@ int open_ctree(struct super_block *sb,
2603 atomic_set(&fs_info->nr_async_bios, 0); 2606 atomic_set(&fs_info->nr_async_bios, 0);
2604 atomic_set(&fs_info->defrag_running, 0); 2607 atomic_set(&fs_info->defrag_running, 0);
2605 atomic_set(&fs_info->qgroup_op_seq, 0); 2608 atomic_set(&fs_info->qgroup_op_seq, 0);
2609 atomic_set(&fs_info->reada_works_cnt, 0);
2606 atomic64_set(&fs_info->tree_mod_seq, 0); 2610 atomic64_set(&fs_info->tree_mod_seq, 0);
2607 fs_info->sb = sb; 2611 fs_info->sb = sb;
2608 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2612 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2622,7 +2626,7 @@ int open_ctree(struct super_block *sb,
2622 INIT_LIST_HEAD(&fs_info->ordered_roots); 2626 INIT_LIST_HEAD(&fs_info->ordered_roots);
2623 spin_lock_init(&fs_info->ordered_root_lock); 2627 spin_lock_init(&fs_info->ordered_root_lock);
2624 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2628 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2625 GFP_NOFS); 2629 GFP_KERNEL);
2626 if (!fs_info->delayed_root) { 2630 if (!fs_info->delayed_root) {
2627 err = -ENOMEM; 2631 err = -ENOMEM;
2628 goto fail_iput; 2632 goto fail_iput;
@@ -2750,7 +2754,7 @@ int open_ctree(struct super_block *sb,
2750 */ 2754 */
2751 fs_info->compress_type = BTRFS_COMPRESS_ZLIB; 2755 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2752 2756
2753 ret = btrfs_parse_options(tree_root, options); 2757 ret = btrfs_parse_options(tree_root, options, sb->s_flags);
2754 if (ret) { 2758 if (ret) {
2755 err = ret; 2759 err = ret;
2756 goto fail_alloc; 2760 goto fail_alloc;
@@ -3029,8 +3033,9 @@ retry_root_backup:
3029 if (ret) 3033 if (ret)
3030 goto fail_trans_kthread; 3034 goto fail_trans_kthread;
3031 3035
3032 /* do not make disk changes in broken FS */ 3036 /* do not make disk changes in broken FS or nologreplay is given */
3033 if (btrfs_super_log_root(disk_super) != 0) { 3037 if (btrfs_super_log_root(disk_super) != 0 &&
3038 !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
3034 ret = btrfs_replay_log(fs_info, fs_devices); 3039 ret = btrfs_replay_log(fs_info, fs_devices);
3035 if (ret) { 3040 if (ret) {
3036 err = ret; 3041 err = ret;
@@ -3146,6 +3151,12 @@ retry_root_backup:
3146 3151
3147 fs_info->open = 1; 3152 fs_info->open = 1;
3148 3153
3154 /*
3155 * backuproot only affect mount behavior, and if open_ctree succeeded,
3156 * no need to keep the flag
3157 */
3158 btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3159
3149 return 0; 3160 return 0;
3150 3161
3151fail_qgroup: 3162fail_qgroup:
@@ -3200,7 +3211,7 @@ fail:
3200 return err; 3211 return err;
3201 3212
3202recovery_tree_root: 3213recovery_tree_root:
3203 if (!btrfs_test_opt(tree_root, RECOVERY)) 3214 if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
3204 goto fail_tree_roots; 3215 goto fail_tree_roots;
3205 3216
3206 free_root_pointers(fs_info, 0); 3217 free_root_pointers(fs_info, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2287c7c10be..53e12977bfd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4838,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4839 4839
4840 /* If we're just plain full then async reclaim just slows us down. */ 4840 /* If we're just plain full then async reclaim just slows us down. */
4841 if (space_info->bytes_used >= thresh) 4841 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4842 return 0; 4842 return 0;
4843 4843
4844 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4844 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5373,27 +5373,33 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5373 5373
5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5375 5375
5376 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5376 if (block_rsv->reserved < block_rsv->size) {
5377 sinfo->bytes_reserved + sinfo->bytes_readonly + 5377 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5378 sinfo->bytes_may_use; 5378 sinfo->bytes_reserved + sinfo->bytes_readonly +
5379 5379 sinfo->bytes_may_use;
5380 if (sinfo->total_bytes > num_bytes) { 5380 if (sinfo->total_bytes > num_bytes) {
5381 num_bytes = sinfo->total_bytes - num_bytes; 5381 num_bytes = sinfo->total_bytes - num_bytes;
5382 block_rsv->reserved += num_bytes; 5382 num_bytes = min(num_bytes,
5383 sinfo->bytes_may_use += num_bytes; 5383 block_rsv->size - block_rsv->reserved);
5384 trace_btrfs_space_reservation(fs_info, "space_info", 5384 block_rsv->reserved += num_bytes;
5385 sinfo->flags, num_bytes, 1); 5385 sinfo->bytes_may_use += num_bytes;
5386 } 5386 trace_btrfs_space_reservation(fs_info, "space_info",
5387 5387 sinfo->flags, num_bytes,
5388 if (block_rsv->reserved >= block_rsv->size) { 5388 1);
5389 }
5390 } else if (block_rsv->reserved > block_rsv->size) {
5389 num_bytes = block_rsv->reserved - block_rsv->size; 5391 num_bytes = block_rsv->reserved - block_rsv->size;
5390 sinfo->bytes_may_use -= num_bytes; 5392 sinfo->bytes_may_use -= num_bytes;
5391 trace_btrfs_space_reservation(fs_info, "space_info", 5393 trace_btrfs_space_reservation(fs_info, "space_info",
5392 sinfo->flags, num_bytes, 0); 5394 sinfo->flags, num_bytes, 0);
5393 block_rsv->reserved = block_rsv->size; 5395 block_rsv->reserved = block_rsv->size;
5394 block_rsv->full = 1;
5395 } 5396 }
5396 5397
5398 if (block_rsv->reserved == block_rsv->size)
5399 block_rsv->full = 1;
5400 else
5401 block_rsv->full = 0;
5402
5397 spin_unlock(&block_rsv->lock); 5403 spin_unlock(&block_rsv->lock);
5398 spin_unlock(&sinfo->lock); 5404 spin_unlock(&sinfo->lock);
5399} 5405}
@@ -5752,7 +5758,7 @@ out_fail:
5752 5758
5753 /* 5759 /*
5754 * This is tricky, but first we need to figure out how much we 5760 * This is tricky, but first we need to figure out how much we
5755 * free'd from any free-ers that occured during this 5761 * free'd from any free-ers that occurred during this
5756 * reservation, so we reset ->csum_bytes to the csum_bytes 5762 * reservation, so we reset ->csum_bytes to the csum_bytes
5757 * before we dropped our lock, and then call the free for the 5763 * before we dropped our lock, and then call the free for the
5758 * number of bytes that were freed while we were trying our 5764 * number of bytes that were freed while we were trying our
@@ -7018,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7018 struct btrfs_free_cluster *cluster, 7024 struct btrfs_free_cluster *cluster,
7019 int delalloc) 7025 int delalloc)
7020{ 7026{
7021 struct btrfs_block_group_cache *used_bg; 7027 struct btrfs_block_group_cache *used_bg = NULL;
7022 bool locked = false; 7028 bool locked = false;
7023again: 7029again:
7024 spin_lock(&cluster->refill_lock); 7030 spin_lock(&cluster->refill_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 392592dc7010..76a0c8597d98 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -206,10 +206,8 @@ void extent_io_exit(void)
206 * destroy caches. 206 * destroy caches.
207 */ 207 */
208 rcu_barrier(); 208 rcu_barrier();
209 if (extent_state_cache) 209 kmem_cache_destroy(extent_state_cache);
210 kmem_cache_destroy(extent_state_cache); 210 kmem_cache_destroy(extent_buffer_cache);
211 if (extent_buffer_cache)
212 kmem_cache_destroy(extent_buffer_cache);
213 if (btrfs_bioset) 211 if (btrfs_bioset)
214 bioset_free(btrfs_bioset); 212 bioset_free(btrfs_bioset);
215} 213}
@@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
232 if (!state) 230 if (!state)
233 return state; 231 return state;
234 state->state = 0; 232 state->state = 0;
235 state->private = 0; 233 state->failrec = NULL;
236 RB_CLEAR_NODE(&state->rb_node); 234 RB_CLEAR_NODE(&state->rb_node);
237 btrfs_leak_debug_add(&state->leak_list, &states); 235 btrfs_leak_debug_add(&state->leak_list, &states);
238 atomic_set(&state->refs, 1); 236 atomic_set(&state->refs, 1);
@@ -1844,7 +1842,8 @@ out:
1844 * set the private field for a given byte offset in the tree. If there isn't 1842 * set the private field for a given byte offset in the tree. If there isn't
1845 * an extent_state there already, this does nothing. 1843 * an extent_state there already, this does nothing.
1846 */ 1844 */
1847static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1845static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
1846 struct io_failure_record *failrec)
1848{ 1847{
1849 struct rb_node *node; 1848 struct rb_node *node;
1850 struct extent_state *state; 1849 struct extent_state *state;
@@ -1865,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private
1865 ret = -ENOENT; 1864 ret = -ENOENT;
1866 goto out; 1865 goto out;
1867 } 1866 }
1868 state->private = private; 1867 state->failrec = failrec;
1869out: 1868out:
1870 spin_unlock(&tree->lock); 1869 spin_unlock(&tree->lock);
1871 return ret; 1870 return ret;
1872} 1871}
1873 1872
1874int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1873static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
1874 struct io_failure_record **failrec)
1875{ 1875{
1876 struct rb_node *node; 1876 struct rb_node *node;
1877 struct extent_state *state; 1877 struct extent_state *state;
@@ -1892,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1892 ret = -ENOENT; 1892 ret = -ENOENT;
1893 goto out; 1893 goto out;
1894 } 1894 }
1895 *private = state->private; 1895 *failrec = state->failrec;
1896out: 1896out:
1897 spin_unlock(&tree->lock); 1897 spin_unlock(&tree->lock);
1898 return ret; 1898 return ret;
@@ -1972,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
1972 int err = 0; 1972 int err = 0;
1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1974 1974
1975 set_state_private(failure_tree, rec->start, 0); 1975 set_state_failrec(failure_tree, rec->start, NULL);
1976 ret = clear_extent_bits(failure_tree, rec->start, 1976 ret = clear_extent_bits(failure_tree, rec->start,
1977 rec->start + rec->len - 1, 1977 rec->start + rec->len - 1,
1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
@@ -2089,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2089 unsigned int pg_offset) 2089 unsigned int pg_offset)
2090{ 2090{
2091 u64 private; 2091 u64 private;
2092 u64 private_failure;
2093 struct io_failure_record *failrec; 2092 struct io_failure_record *failrec;
2094 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2095 struct extent_state *state; 2094 struct extent_state *state;
@@ -2102,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2102 if (!ret) 2101 if (!ret)
2103 return 0; 2102 return 0;
2104 2103
2105 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2104 ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
2106 &private_failure); 2105 &failrec);
2107 if (ret) 2106 if (ret)
2108 return 0; 2107 return 0;
2109 2108
2110 failrec = (struct io_failure_record *)(unsigned long) private_failure;
2111 BUG_ON(!failrec->this_mirror); 2109 BUG_ON(!failrec->this_mirror);
2112 2110
2113 if (failrec->in_validation) { 2111 if (failrec->in_validation) {
@@ -2167,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2167 2165
2168 next = next_state(state); 2166 next = next_state(state);
2169 2167
2170 failrec = (struct io_failure_record *)(unsigned long)state->private; 2168 failrec = state->failrec;
2171 free_extent_state(state); 2169 free_extent_state(state);
2172 kfree(failrec); 2170 kfree(failrec);
2173 2171
@@ -2177,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2177} 2175}
2178 2176
2179int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2177int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2180 struct io_failure_record **failrec_ret) 2178 struct io_failure_record **failrec_ret)
2181{ 2179{
2182 struct io_failure_record *failrec; 2180 struct io_failure_record *failrec;
2183 u64 private;
2184 struct extent_map *em; 2181 struct extent_map *em;
2185 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2182 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2186 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2183 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -2188,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2188 int ret; 2185 int ret;
2189 u64 logical; 2186 u64 logical;
2190 2187
2191 ret = get_state_private(failure_tree, start, &private); 2188 ret = get_state_failrec(failure_tree, start, &failrec);
2192 if (ret) { 2189 if (ret) {
2193 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2190 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2194 if (!failrec) 2191 if (!failrec)
@@ -2237,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2237 ret = set_extent_bits(failure_tree, start, end, 2234 ret = set_extent_bits(failure_tree, start, end,
2238 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2235 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2239 if (ret >= 0) 2236 if (ret >= 0)
2240 ret = set_state_private(failure_tree, start, 2237 ret = set_state_failrec(failure_tree, start, failrec);
2241 (u64)(unsigned long)failrec);
2242 /* set the bits in the inode's tree */ 2238 /* set the bits in the inode's tree */
2243 if (ret >= 0) 2239 if (ret >= 0)
2244 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2240 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
@@ -2248,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2248 return ret; 2244 return ret;
2249 } 2245 }
2250 } else { 2246 } else {
2251 failrec = (struct io_failure_record *)(unsigned long)private;
2252 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", 2247 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2253 failrec->logical, failrec->start, failrec->len, 2248 failrec->logical, failrec->start, failrec->len,
2254 failrec->in_validation); 2249 failrec->in_validation);
@@ -3177,7 +3172,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
3177 3172
3178 while (1) { 3173 while (1) {
3179 lock_extent(tree, start, end); 3174 lock_extent(tree, start, end);
3180 ordered = btrfs_lookup_ordered_extent(inode, start); 3175 ordered = btrfs_lookup_ordered_range(inode, start,
3176 PAGE_CACHE_SIZE);
3181 if (!ordered) 3177 if (!ordered)
3182 break; 3178 break;
3183 unlock_extent(tree, start, end); 3179 unlock_extent(tree, start, end);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 880d5292e972..5dbf92e68fbd 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -61,6 +61,7 @@
61struct extent_state; 61struct extent_state;
62struct btrfs_root; 62struct btrfs_root;
63struct btrfs_io_bio; 63struct btrfs_io_bio;
64struct io_failure_record;
64 65
65typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 66typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
66 struct bio *bio, int mirror_num, 67 struct bio *bio, int mirror_num,
@@ -111,8 +112,7 @@ struct extent_state {
111 atomic_t refs; 112 atomic_t refs;
112 unsigned state; 113 unsigned state;
113 114
114 /* for use by the FS */ 115 struct io_failure_record *failrec;
115 u64 private;
116 116
117#ifdef CONFIG_BTRFS_DEBUG 117#ifdef CONFIG_BTRFS_DEBUG
118 struct list_head leak_list; 118 struct list_head leak_list;
@@ -342,7 +342,6 @@ int extent_readpages(struct extent_io_tree *tree,
342 get_extent_t get_extent); 342 get_extent_t get_extent);
343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
344 __u64 start, __u64 len, get_extent_t *get_extent); 344 __u64 start, __u64 len, get_extent_t *get_extent);
345int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
346void set_page_extent_mapped(struct page *page); 345void set_page_extent_mapped(struct page *page);
347 346
348struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 347struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 84fb56d5c018..318b048eb254 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include "ctree.h" 5#include "ctree.h"
6#include "extent_map.h" 6#include "extent_map.h"
7#include "compression.h"
7 8
8 9
9static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
@@ -20,8 +21,7 @@ int __init extent_map_init(void)
20 21
21void extent_map_exit(void) 22void extent_map_exit(void)
22{ 23{
23 if (extent_map_cache) 24 kmem_cache_destroy(extent_map_cache);
24 kmem_cache_destroy(extent_map_cache);
25} 25}
26 26
27/** 27/**
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
62 62
63/** 63/**
64 * free_extent_map - drop reference count of an extent_map 64 * free_extent_map - drop reference count of an extent_map
65 * @em: extent map beeing releasead 65 * @em: extent map being releasead
66 * 66 *
67 * Drops the reference out on @em by one and free the structure 67 * Drops the reference out on @em by one and free the structure
68 * if the reference count hits zero. 68 * if the reference count hits zero.
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
422/** 422/**
423 * remove_extent_mapping - removes an extent_map from the extent tree 423 * remove_extent_mapping - removes an extent_map from the extent tree
424 * @tree: extent tree to remove from 424 * @tree: extent tree to remove from
425 * @em: extent map beeing removed 425 * @em: extent map being removed
426 * 426 *
427 * Removes @em from @tree. No reference counts are dropped, and no checks 427 * Removes @em from @tree. No reference counts are dropped, and no checks
428 * are done to see if the range is in use 428 * are done to see if the range is in use
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a67e1c828d0f..b5baf5bdc8e1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "volumes.h" 26#include "volumes.h"
27#include "print-tree.h" 27#include "print-tree.h"
28#include "compression.h"
28 29
29#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ 30#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
30 sizeof(struct btrfs_item) * 2) / \ 31 sizeof(struct btrfs_item) * 2) / \
@@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
172 u64 item_start_offset = 0; 173 u64 item_start_offset = 0;
173 u64 item_last_offset = 0; 174 u64 item_last_offset = 0;
174 u64 disk_bytenr; 175 u64 disk_bytenr;
176 u64 page_bytes_left;
175 u32 diff; 177 u32 diff;
176 int nblocks; 178 int nblocks;
177 int bio_index = 0; 179 int bio_index = 0;
@@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
220 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; 222 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
221 if (dio) 223 if (dio)
222 offset = logical_offset; 224 offset = logical_offset;
225
226 page_bytes_left = bvec->bv_len;
223 while (bio_index < bio->bi_vcnt) { 227 while (bio_index < bio->bi_vcnt) {
224 if (!dio) 228 if (!dio)
225 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 229 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
@@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
243 if (BTRFS_I(inode)->root->root_key.objectid == 247 if (BTRFS_I(inode)->root->root_key.objectid ==
244 BTRFS_DATA_RELOC_TREE_OBJECTID) { 248 BTRFS_DATA_RELOC_TREE_OBJECTID) {
245 set_extent_bits(io_tree, offset, 249 set_extent_bits(io_tree, offset,
246 offset + bvec->bv_len - 1, 250 offset + root->sectorsize - 1,
247 EXTENT_NODATASUM, GFP_NOFS); 251 EXTENT_NODATASUM, GFP_NOFS);
248 } else { 252 } else {
249 btrfs_info(BTRFS_I(inode)->root->fs_info, 253 btrfs_info(BTRFS_I(inode)->root->fs_info,
@@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
281found: 285found:
282 csum += count * csum_size; 286 csum += count * csum_size;
283 nblocks -= count; 287 nblocks -= count;
284 bio_index += count; 288
285 while (count--) { 289 while (count--) {
286 disk_bytenr += bvec->bv_len; 290 disk_bytenr += root->sectorsize;
287 offset += bvec->bv_len; 291 offset += root->sectorsize;
288 bvec++; 292 page_bytes_left -= root->sectorsize;
293 if (!page_bytes_left) {
294 bio_index++;
295 /*
296 * make sure we're still inside the
297 * bio before we update page_bytes_left
298 */
299 if (bio_index >= bio->bi_vcnt) {
300 WARN_ON_ONCE(count);
301 goto done;
302 }
303 bvec++;
304 page_bytes_left = bvec->bv_len;
305 }
306
289 } 307 }
290 } 308 }
309
310done:
291 btrfs_free_path(path); 311 btrfs_free_path(path);
292 return 0; 312 return 0;
293} 313}
@@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
432 struct bio_vec *bvec = bio->bi_io_vec; 452 struct bio_vec *bvec = bio->bi_io_vec;
433 int bio_index = 0; 453 int bio_index = 0;
434 int index; 454 int index;
455 int nr_sectors;
456 int i;
435 unsigned long total_bytes = 0; 457 unsigned long total_bytes = 0;
436 unsigned long this_sum_bytes = 0; 458 unsigned long this_sum_bytes = 0;
437 u64 offset; 459 u64 offset;
@@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
459 if (!contig) 481 if (!contig)
460 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 482 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
461 483
462 if (offset >= ordered->file_offset + ordered->len || 484 data = kmap_atomic(bvec->bv_page);
463 offset < ordered->file_offset) {
464 unsigned long bytes_left;
465 sums->len = this_sum_bytes;
466 this_sum_bytes = 0;
467 btrfs_add_ordered_sum(inode, ordered, sums);
468 btrfs_put_ordered_extent(ordered);
469 485
470 bytes_left = bio->bi_iter.bi_size - total_bytes; 486 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
487 bvec->bv_len + root->sectorsize
488 - 1);
489
490 for (i = 0; i < nr_sectors; i++) {
491 if (offset >= ordered->file_offset + ordered->len ||
492 offset < ordered->file_offset) {
493 unsigned long bytes_left;
494
495 kunmap_atomic(data);
496 sums->len = this_sum_bytes;
497 this_sum_bytes = 0;
498 btrfs_add_ordered_sum(inode, ordered, sums);
499 btrfs_put_ordered_extent(ordered);
500
501 bytes_left = bio->bi_iter.bi_size - total_bytes;
502
503 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
504 GFP_NOFS);
505 BUG_ON(!sums); /* -ENOMEM */
506 sums->len = bytes_left;
507 ordered = btrfs_lookup_ordered_extent(inode,
508 offset);
509 ASSERT(ordered); /* Logic error */
510 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
511 + total_bytes;
512 index = 0;
513
514 data = kmap_atomic(bvec->bv_page);
515 }
471 516
472 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 517 sums->sums[index] = ~(u32)0;
473 GFP_NOFS); 518 sums->sums[index]
474 BUG_ON(!sums); /* -ENOMEM */ 519 = btrfs_csum_data(data + bvec->bv_offset
475 sums->len = bytes_left; 520 + (i * root->sectorsize),
476 ordered = btrfs_lookup_ordered_extent(inode, offset); 521 sums->sums[index],
477 BUG_ON(!ordered); /* Logic error */ 522 root->sectorsize);
478 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + 523 btrfs_csum_final(sums->sums[index],
479 total_bytes; 524 (char *)(sums->sums + index));
480 index = 0; 525 index++;
526 offset += root->sectorsize;
527 this_sum_bytes += root->sectorsize;
528 total_bytes += root->sectorsize;
481 } 529 }
482 530
483 data = kmap_atomic(bvec->bv_page);
484 sums->sums[index] = ~(u32)0;
485 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
486 sums->sums[index],
487 bvec->bv_len);
488 kunmap_atomic(data); 531 kunmap_atomic(data);
489 btrfs_csum_final(sums->sums[index],
490 (char *)(sums->sums + index));
491 532
492 bio_index++; 533 bio_index++;
493 index++;
494 total_bytes += bvec->bv_len;
495 this_sum_bytes += bvec->bv_len;
496 offset += bvec->bv_len;
497 bvec++; 534 bvec++;
498 } 535 }
499 this_sum_bytes = 0; 536 this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 098bb8f690c9..15a09cb156ce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "locking.h" 41#include "locking.h"
42#include "volumes.h" 42#include "volumes.h"
43#include "qgroup.h" 43#include "qgroup.h"
44#include "compression.h"
44 45
45static struct kmem_cache *btrfs_inode_defrag_cachep; 46static struct kmem_cache *btrfs_inode_defrag_cachep;
46/* 47/*
@@ -498,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
498 loff_t isize = i_size_read(inode); 499 loff_t isize = i_size_read(inode);
499 500
500 start_pos = pos & ~((u64)root->sectorsize - 1); 501 start_pos = pos & ~((u64)root->sectorsize - 1);
501 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 502 num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
502 503
503 end_of_last_block = start_pos + num_bytes - 1; 504 end_of_last_block = start_pos + num_bytes - 1;
504 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 505 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1379,16 +1380,19 @@ fail:
1379static noinline int 1380static noinline int
1380lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, 1381lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1381 size_t num_pages, loff_t pos, 1382 size_t num_pages, loff_t pos,
1383 size_t write_bytes,
1382 u64 *lockstart, u64 *lockend, 1384 u64 *lockstart, u64 *lockend,
1383 struct extent_state **cached_state) 1385 struct extent_state **cached_state)
1384{ 1386{
1387 struct btrfs_root *root = BTRFS_I(inode)->root;
1385 u64 start_pos; 1388 u64 start_pos;
1386 u64 last_pos; 1389 u64 last_pos;
1387 int i; 1390 int i;
1388 int ret = 0; 1391 int ret = 0;
1389 1392
1390 start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); 1393 start_pos = round_down(pos, root->sectorsize);
1391 last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; 1394 last_pos = start_pos
1395 + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
1392 1396
1393 if (start_pos < inode->i_size) { 1397 if (start_pos < inode->i_size) {
1394 struct btrfs_ordered_extent *ordered; 1398 struct btrfs_ordered_extent *ordered;
@@ -1503,6 +1507,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1503 1507
1504 while (iov_iter_count(i) > 0) { 1508 while (iov_iter_count(i) > 0) {
1505 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1509 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1510 size_t sector_offset;
1506 size_t write_bytes = min(iov_iter_count(i), 1511 size_t write_bytes = min(iov_iter_count(i),
1507 nrptrs * (size_t)PAGE_CACHE_SIZE - 1512 nrptrs * (size_t)PAGE_CACHE_SIZE -
1508 offset); 1513 offset);
@@ -1511,6 +1516,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1511 size_t reserve_bytes; 1516 size_t reserve_bytes;
1512 size_t dirty_pages; 1517 size_t dirty_pages;
1513 size_t copied; 1518 size_t copied;
1519 size_t dirty_sectors;
1520 size_t num_sectors;
1514 1521
1515 WARN_ON(num_pages > nrptrs); 1522 WARN_ON(num_pages > nrptrs);
1516 1523
@@ -1523,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1523 break; 1530 break;
1524 } 1531 }
1525 1532
1526 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1533 sector_offset = pos & (root->sectorsize - 1);
1534 reserve_bytes = round_up(write_bytes + sector_offset,
1535 root->sectorsize);
1527 1536
1528 if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1537 if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1529 BTRFS_INODE_PREALLOC)) { 1538 BTRFS_INODE_PREALLOC)) &&
1530 ret = check_can_nocow(inode, pos, &write_bytes); 1539 check_can_nocow(inode, pos, &write_bytes) > 0) {
1531 if (ret < 0) 1540 /*
1532 break; 1541 * For nodata cow case, no need to reserve
1533 if (ret > 0) { 1542 * data space.
1534 /* 1543 */
1535 * For nodata cow case, no need to reserve 1544 only_release_metadata = true;
1536 * data space. 1545 /*
1537 */ 1546 * our prealloc extent may be smaller than
1538 only_release_metadata = true; 1547 * write_bytes, so scale down.
1539 /* 1548 */
1540 * our prealloc extent may be smaller than 1549 num_pages = DIV_ROUND_UP(write_bytes + offset,
1541 * write_bytes, so scale down. 1550 PAGE_CACHE_SIZE);
1542 */ 1551 reserve_bytes = round_up(write_bytes + sector_offset,
1543 num_pages = DIV_ROUND_UP(write_bytes + offset, 1552 root->sectorsize);
1544 PAGE_CACHE_SIZE); 1553 goto reserve_metadata;
1545 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1546 goto reserve_metadata;
1547 }
1548 } 1554 }
1555
1549 ret = btrfs_check_data_free_space(inode, pos, write_bytes); 1556 ret = btrfs_check_data_free_space(inode, pos, write_bytes);
1550 if (ret < 0) 1557 if (ret < 0)
1551 break; 1558 break;
@@ -1576,8 +1583,8 @@ again:
1576 break; 1583 break;
1577 1584
1578 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, 1585 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1579 pos, &lockstart, &lockend, 1586 pos, write_bytes, &lockstart,
1580 &cached_state); 1587 &lockend, &cached_state);
1581 if (ret < 0) { 1588 if (ret < 0) {
1582 if (ret == -EAGAIN) 1589 if (ret == -EAGAIN)
1583 goto again; 1590 goto again;
@@ -1612,9 +1619,16 @@ again:
1612 * we still have an outstanding extent for the chunk we actually 1619 * we still have an outstanding extent for the chunk we actually
1613 * managed to copy. 1620 * managed to copy.
1614 */ 1621 */
1615 if (num_pages > dirty_pages) { 1622 num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1616 release_bytes = (num_pages - dirty_pages) << 1623 reserve_bytes);
1617 PAGE_CACHE_SHIFT; 1624 dirty_sectors = round_up(copied + sector_offset,
1625 root->sectorsize);
1626 dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1627 dirty_sectors);
1628
1629 if (num_sectors > dirty_sectors) {
1630 release_bytes = (write_bytes - copied)
1631 & ~((u64)root->sectorsize - 1);
1618 if (copied > 0) { 1632 if (copied > 0) {
1619 spin_lock(&BTRFS_I(inode)->lock); 1633 spin_lock(&BTRFS_I(inode)->lock);
1620 BTRFS_I(inode)->outstanding_extents++; 1634 BTRFS_I(inode)->outstanding_extents++;
@@ -1633,7 +1647,8 @@ again:
1633 } 1647 }
1634 } 1648 }
1635 1649
1636 release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1650 release_bytes = round_up(copied + sector_offset,
1651 root->sectorsize);
1637 1652
1638 if (copied > 0) 1653 if (copied > 0)
1639 ret = btrfs_dirty_pages(root, inode, pages, 1654 ret = btrfs_dirty_pages(root, inode, pages,
@@ -1654,8 +1669,7 @@ again:
1654 1669
1655 if (only_release_metadata && copied > 0) { 1670 if (only_release_metadata && copied > 0) {
1656 lockstart = round_down(pos, root->sectorsize); 1671 lockstart = round_down(pos, root->sectorsize);
1657 lockend = lockstart + 1672 lockend = round_up(pos + copied, root->sectorsize) - 1;
1658 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1659 1673
1660 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1674 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1661 lockend, EXTENT_NORESERVE, NULL, 1675 lockend, EXTENT_NORESERVE, NULL,
@@ -1761,6 +1775,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1761 ssize_t err; 1775 ssize_t err;
1762 loff_t pos; 1776 loff_t pos;
1763 size_t count; 1777 size_t count;
1778 loff_t oldsize;
1779 int clean_page = 0;
1764 1780
1765 inode_lock(inode); 1781 inode_lock(inode);
1766 err = generic_write_checks(iocb, from); 1782 err = generic_write_checks(iocb, from);
@@ -1799,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1799 pos = iocb->ki_pos; 1815 pos = iocb->ki_pos;
1800 count = iov_iter_count(from); 1816 count = iov_iter_count(from);
1801 start_pos = round_down(pos, root->sectorsize); 1817 start_pos = round_down(pos, root->sectorsize);
1802 if (start_pos > i_size_read(inode)) { 1818 oldsize = i_size_read(inode);
1819 if (start_pos > oldsize) {
1803 /* Expand hole size to cover write data, preventing empty gap */ 1820 /* Expand hole size to cover write data, preventing empty gap */
1804 end_pos = round_up(pos + count, root->sectorsize); 1821 end_pos = round_up(pos + count, root->sectorsize);
1805 err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); 1822 err = btrfs_cont_expand(inode, oldsize, end_pos);
1806 if (err) { 1823 if (err) {
1807 inode_unlock(inode); 1824 inode_unlock(inode);
1808 goto out; 1825 goto out;
1809 } 1826 }
1827 if (start_pos > round_up(oldsize, root->sectorsize))
1828 clean_page = 1;
1810 } 1829 }
1811 1830
1812 if (sync) 1831 if (sync)
@@ -1818,6 +1837,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1818 num_written = __btrfs_buffered_write(file, from, pos); 1837 num_written = __btrfs_buffered_write(file, from, pos);
1819 if (num_written > 0) 1838 if (num_written > 0)
1820 iocb->ki_pos = pos + num_written; 1839 iocb->ki_pos = pos + num_written;
1840 if (clean_page)
1841 pagecache_isize_extended(inode, oldsize,
1842 i_size_read(inode));
1821 } 1843 }
1822 1844
1823 inode_unlock(inode); 1845 inode_unlock(inode);
@@ -1825,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1825 /* 1847 /*
1826 * We also have to set last_sub_trans to the current log transid, 1848 * We also have to set last_sub_trans to the current log transid,
1827 * otherwise subsequent syncs to a file that's been synced in this 1849 * otherwise subsequent syncs to a file that's been synced in this
1828 * transaction will appear to have already occured. 1850 * transaction will appear to have already occurred.
1829 */ 1851 */
1830 spin_lock(&BTRFS_I(inode)->lock); 1852 spin_lock(&BTRFS_I(inode)->lock);
1831 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1853 BTRFS_I(inode)->last_sub_trans = root->log_transid;
@@ -1996,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1996 */ 2018 */
1997 smp_mb(); 2019 smp_mb();
1998 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 2020 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1999 (BTRFS_I(inode)->last_trans <= 2021 (full_sync && BTRFS_I(inode)->last_trans <=
2000 root->fs_info->last_trans_committed && 2022 root->fs_info->last_trans_committed) ||
2001 (full_sync || 2023 (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
2002 !btrfs_have_ordered_extents_in_range(inode, start, len)))) { 2024 BTRFS_I(inode)->last_trans
2025 <= root->fs_info->last_trans_committed)) {
2003 /* 2026 /*
2004 * We'v had everything committed since the last time we were 2027 * We'v had everything committed since the last time we were
2005 * modified so clear this flag in case it was set for whatever 2028 * modified so clear this flag in case it was set for whatever
@@ -2293,10 +2316,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2293 int ret = 0; 2316 int ret = 0;
2294 int err = 0; 2317 int err = 0;
2295 unsigned int rsv_count; 2318 unsigned int rsv_count;
2296 bool same_page; 2319 bool same_block;
2297 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2320 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2298 u64 ino_size; 2321 u64 ino_size;
2299 bool truncated_page = false; 2322 bool truncated_block = false;
2300 bool updated_inode = false; 2323 bool updated_inode = false;
2301 2324
2302 ret = btrfs_wait_ordered_range(inode, offset, len); 2325 ret = btrfs_wait_ordered_range(inode, offset, len);
@@ -2304,7 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2304 return ret; 2327 return ret;
2305 2328
2306 inode_lock(inode); 2329 inode_lock(inode);
2307 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2330 ino_size = round_up(inode->i_size, root->sectorsize);
2308 ret = find_first_non_hole(inode, &offset, &len); 2331 ret = find_first_non_hole(inode, &offset, &len);
2309 if (ret < 0) 2332 if (ret < 0)
2310 goto out_only_mutex; 2333 goto out_only_mutex;
@@ -2317,31 +2340,30 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2317 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2340 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2318 lockend = round_down(offset + len, 2341 lockend = round_down(offset + len,
2319 BTRFS_I(inode)->root->sectorsize) - 1; 2342 BTRFS_I(inode)->root->sectorsize) - 1;
2320 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2343 same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
2321 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2344 == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
2322
2323 /* 2345 /*
2324 * We needn't truncate any page which is beyond the end of the file 2346 * We needn't truncate any block which is beyond the end of the file
2325 * because we are sure there is no data there. 2347 * because we are sure there is no data there.
2326 */ 2348 */
2327 /* 2349 /*
2328 * Only do this if we are in the same page and we aren't doing the 2350 * Only do this if we are in the same block and we aren't doing the
2329 * entire page. 2351 * entire block.
2330 */ 2352 */
2331 if (same_page && len < PAGE_CACHE_SIZE) { 2353 if (same_block && len < root->sectorsize) {
2332 if (offset < ino_size) { 2354 if (offset < ino_size) {
2333 truncated_page = true; 2355 truncated_block = true;
2334 ret = btrfs_truncate_page(inode, offset, len, 0); 2356 ret = btrfs_truncate_block(inode, offset, len, 0);
2335 } else { 2357 } else {
2336 ret = 0; 2358 ret = 0;
2337 } 2359 }
2338 goto out_only_mutex; 2360 goto out_only_mutex;
2339 } 2361 }
2340 2362
2341 /* zero back part of the first page */ 2363 /* zero back part of the first block */
2342 if (offset < ino_size) { 2364 if (offset < ino_size) {
2343 truncated_page = true; 2365 truncated_block = true;
2344 ret = btrfs_truncate_page(inode, offset, 0, 0); 2366 ret = btrfs_truncate_block(inode, offset, 0, 0);
2345 if (ret) { 2367 if (ret) {
2346 inode_unlock(inode); 2368 inode_unlock(inode);
2347 return ret; 2369 return ret;
@@ -2376,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2376 if (!ret) { 2398 if (!ret) {
2377 /* zero the front end of the last page */ 2399 /* zero the front end of the last page */
2378 if (tail_start + tail_len < ino_size) { 2400 if (tail_start + tail_len < ino_size) {
2379 truncated_page = true; 2401 truncated_block = true;
2380 ret = btrfs_truncate_page(inode, 2402 ret = btrfs_truncate_block(inode,
2381 tail_start + tail_len, 0, 1); 2403 tail_start + tail_len,
2404 0, 1);
2382 if (ret) 2405 if (ret)
2383 goto out_only_mutex; 2406 goto out_only_mutex;
2384 } 2407 }
@@ -2544,7 +2567,7 @@ out_trans:
2544 goto out_free; 2567 goto out_free;
2545 2568
2546 inode_inc_iversion(inode); 2569 inode_inc_iversion(inode);
2547 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2570 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
2548 2571
2549 trans->block_rsv = &root->fs_info->trans_block_rsv; 2572 trans->block_rsv = &root->fs_info->trans_block_rsv;
2550 ret = btrfs_update_inode(trans, root, inode); 2573 ret = btrfs_update_inode(trans, root, inode);
@@ -2558,7 +2581,7 @@ out:
2558 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2581 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2559 &cached_state, GFP_NOFS); 2582 &cached_state, GFP_NOFS);
2560out_only_mutex: 2583out_only_mutex:
2561 if (!updated_inode && truncated_page && !ret && !err) { 2584 if (!updated_inode && truncated_block && !ret && !err) {
2562 /* 2585 /*
2563 * If we only end up zeroing part of a page, we still need to 2586 * If we only end up zeroing part of a page, we still need to
2564 * update the inode item, so that all the time fields are 2587 * update the inode item, so that all the time fields are
@@ -2611,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2611 return 0; 2634 return 0;
2612 } 2635 }
2613insert: 2636insert:
2614 range = kmalloc(sizeof(*range), GFP_NOFS); 2637 range = kmalloc(sizeof(*range), GFP_KERNEL);
2615 if (!range) 2638 if (!range)
2616 return -ENOMEM; 2639 return -ENOMEM;
2617 range->start = start; 2640 range->start = start;
@@ -2678,10 +2701,10 @@ static long btrfs_fallocate(struct file *file, int mode,
2678 } else if (offset + len > inode->i_size) { 2701 } else if (offset + len > inode->i_size) {
2679 /* 2702 /*
2680 * If we are fallocating from the end of the file onward we 2703 * If we are fallocating from the end of the file onward we
2681 * need to zero out the end of the page if i_size lands in the 2704 * need to zero out the end of the block if i_size lands in the
2682 * middle of a page. 2705 * middle of a block.
2683 */ 2706 */
2684 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2707 ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
2685 if (ret) 2708 if (ret)
2686 goto out; 2709 goto out;
2687 } 2710 }
@@ -2712,7 +2735,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2712 btrfs_put_ordered_extent(ordered); 2735 btrfs_put_ordered_extent(ordered);
2713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2736 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2714 alloc_start, locked_end, 2737 alloc_start, locked_end,
2715 &cached_state, GFP_NOFS); 2738 &cached_state, GFP_KERNEL);
2716 /* 2739 /*
2717 * we can't wait on the range with the transaction 2740 * we can't wait on the range with the transaction
2718 * running or with the extent lock held 2741 * running or with the extent lock held
@@ -2794,7 +2817,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2794 if (IS_ERR(trans)) { 2817 if (IS_ERR(trans)) {
2795 ret = PTR_ERR(trans); 2818 ret = PTR_ERR(trans);
2796 } else { 2819 } else {
2797 inode->i_ctime = CURRENT_TIME; 2820 inode->i_ctime = current_fs_time(inode->i_sb);
2798 i_size_write(inode, actual_end); 2821 i_size_write(inode, actual_end);
2799 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2822 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2800 ret = btrfs_update_inode(trans, root, inode); 2823 ret = btrfs_update_inode(trans, root, inode);
@@ -2806,7 +2829,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2806 } 2829 }
2807out_unlock: 2830out_unlock:
2808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2831 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2809 &cached_state, GFP_NOFS); 2832 &cached_state, GFP_KERNEL);
2810out: 2833out:
2811 /* 2834 /*
2812 * As we waited the extent range, the data_rsv_map must be empty 2835 * As we waited the extent range, the data_rsv_map must be empty
@@ -2939,8 +2962,7 @@ const struct file_operations btrfs_file_operations = {
2939 2962
2940void btrfs_auto_defrag_exit(void) 2963void btrfs_auto_defrag_exit(void)
2941{ 2964{
2942 if (btrfs_inode_defrag_cachep) 2965 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2943 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2944} 2966}
2945 2967
2946int btrfs_auto_defrag_init(void) 2968int btrfs_auto_defrag_init(void)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index e50316c4af15..1f0ec19b23f6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -556,6 +556,9 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
556 mutex_lock(&root->objectid_mutex); 556 mutex_lock(&root->objectid_mutex);
557 557
558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { 558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
559 btrfs_warn(root->fs_info,
560 "the objectid of root %llu reaches its highest value",
561 root->root_key.objectid);
559 ret = -ENOSPC; 562 ret = -ENOSPC;
560 goto out; 563 goto out;
561 } 564 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d96f5cf38a2d..41a5688ffdfe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -263,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
263 data_len = compressed_size; 263 data_len = compressed_size;
264 264
265 if (start > 0 || 265 if (start > 0 ||
266 actual_end > PAGE_CACHE_SIZE || 266 actual_end > root->sectorsize ||
267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || 267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
268 (!compressed_size && 268 (!compressed_size &&
269 (actual_end & (root->sectorsize - 1)) == 0) || 269 (actual_end & (root->sectorsize - 1)) == 0) ||
@@ -2002,7 +2002,8 @@ again:
2002 if (PagePrivate2(page)) 2002 if (PagePrivate2(page))
2003 goto out; 2003 goto out;
2004 2004
2005 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2005 ordered = btrfs_lookup_ordered_range(inode, page_start,
2006 PAGE_CACHE_SIZE);
2006 if (ordered) { 2007 if (ordered) {
2007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2008 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2008 page_end, &cached_state, GFP_NOFS); 2009 page_end, &cached_state, GFP_NOFS);
@@ -4013,7 +4014,8 @@ err:
4013 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4014 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4014 inode_inc_iversion(inode); 4015 inode_inc_iversion(inode);
4015 inode_inc_iversion(dir); 4016 inode_inc_iversion(dir);
4016 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4017 inode->i_ctime = dir->i_mtime =
4018 dir->i_ctime = current_fs_time(inode->i_sb);
4017 ret = btrfs_update_inode(trans, root, dir); 4019 ret = btrfs_update_inode(trans, root, dir);
4018out: 4020out:
4019 return ret; 4021 return ret;
@@ -4156,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4156 4158
4157 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4159 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4158 inode_inc_iversion(dir); 4160 inode_inc_iversion(dir);
4159 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4161 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
4160 ret = btrfs_update_inode_fallback(trans, root, dir); 4162 ret = btrfs_update_inode_fallback(trans, root, dir);
4161 if (ret) 4163 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4164 btrfs_abort_transaction(trans, root, ret);
@@ -4211,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
4211{ 4213{
4212 int ret; 4214 int ret;
4213 4215
4216 /*
4217 * This is only used to apply pressure to the enospc system, we don't
4218 * intend to use this reservation at all.
4219 */
4214 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); 4220 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4221 bytes_deleted *= root->nodesize;
4215 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, 4222 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4216 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4223 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4217 if (!ret) 4224 if (!ret) {
4225 trace_btrfs_space_reservation(root->fs_info, "transaction",
4226 trans->transid,
4227 bytes_deleted, 1);
4218 trans->bytes_reserved += bytes_deleted; 4228 trans->bytes_reserved += bytes_deleted;
4229 }
4219 return ret; 4230 return ret;
4220 4231
4221} 4232}
@@ -4248,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode,
4248 * read the extent item from disk (data not in the page cache). 4259 * read the extent item from disk (data not in the page cache).
4249 */ 4260 */
4250 btrfs_release_path(path); 4261 btrfs_release_path(path);
4251 return btrfs_truncate_page(inode, offset, page_end - offset, 0); 4262 return btrfs_truncate_block(inode, offset, page_end - offset,
4263 0);
4252 } 4264 }
4253 4265
4254 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4266 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
@@ -4601,17 +4613,17 @@ error:
4601} 4613}
4602 4614
4603/* 4615/*
4604 * btrfs_truncate_page - read, zero a chunk and write a page 4616 * btrfs_truncate_block - read, zero a chunk and write a block
4605 * @inode - inode that we're zeroing 4617 * @inode - inode that we're zeroing
4606 * @from - the offset to start zeroing 4618 * @from - the offset to start zeroing
4607 * @len - the length to zero, 0 to zero the entire range respective to the 4619 * @len - the length to zero, 0 to zero the entire range respective to the
4608 * offset 4620 * offset
4609 * @front - zero up to the offset instead of from the offset on 4621 * @front - zero up to the offset instead of from the offset on
4610 * 4622 *
4611 * This will find the page for the "from" offset and cow the page and zero the 4623 * This will find the block for the "from" offset and cow the block and zero the
4612 * part we want to zero. This is used with truncate and hole punching. 4624 * part we want to zero. This is used with truncate and hole punching.
4613 */ 4625 */
4614int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4626int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4615 int front) 4627 int front)
4616{ 4628{
4617 struct address_space *mapping = inode->i_mapping; 4629 struct address_space *mapping = inode->i_mapping;
@@ -4622,18 +4634,19 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4622 char *kaddr; 4634 char *kaddr;
4623 u32 blocksize = root->sectorsize; 4635 u32 blocksize = root->sectorsize;
4624 pgoff_t index = from >> PAGE_CACHE_SHIFT; 4636 pgoff_t index = from >> PAGE_CACHE_SHIFT;
4625 unsigned offset = from & (PAGE_CACHE_SIZE-1); 4637 unsigned offset = from & (blocksize - 1);
4626 struct page *page; 4638 struct page *page;
4627 gfp_t mask = btrfs_alloc_write_mask(mapping); 4639 gfp_t mask = btrfs_alloc_write_mask(mapping);
4628 int ret = 0; 4640 int ret = 0;
4629 u64 page_start; 4641 u64 block_start;
4630 u64 page_end; 4642 u64 block_end;
4631 4643
4632 if ((offset & (blocksize - 1)) == 0 && 4644 if ((offset & (blocksize - 1)) == 0 &&
4633 (!len || ((len & (blocksize - 1)) == 0))) 4645 (!len || ((len & (blocksize - 1)) == 0)))
4634 goto out; 4646 goto out;
4647
4635 ret = btrfs_delalloc_reserve_space(inode, 4648 ret = btrfs_delalloc_reserve_space(inode,
4636 round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); 4649 round_down(from, blocksize), blocksize);
4637 if (ret) 4650 if (ret)
4638 goto out; 4651 goto out;
4639 4652
@@ -4641,14 +4654,14 @@ again:
4641 page = find_or_create_page(mapping, index, mask); 4654 page = find_or_create_page(mapping, index, mask);
4642 if (!page) { 4655 if (!page) {
4643 btrfs_delalloc_release_space(inode, 4656 btrfs_delalloc_release_space(inode,
4644 round_down(from, PAGE_CACHE_SIZE), 4657 round_down(from, blocksize),
4645 PAGE_CACHE_SIZE); 4658 blocksize);
4646 ret = -ENOMEM; 4659 ret = -ENOMEM;
4647 goto out; 4660 goto out;
4648 } 4661 }
4649 4662
4650 page_start = page_offset(page); 4663 block_start = round_down(from, blocksize);
4651 page_end = page_start + PAGE_CACHE_SIZE - 1; 4664 block_end = block_start + blocksize - 1;
4652 4665
4653 if (!PageUptodate(page)) { 4666 if (!PageUptodate(page)) {
4654 ret = btrfs_readpage(NULL, page); 4667 ret = btrfs_readpage(NULL, page);
@@ -4665,12 +4678,12 @@ again:
4665 } 4678 }
4666 wait_on_page_writeback(page); 4679 wait_on_page_writeback(page);
4667 4680
4668 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 4681 lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4669 set_page_extent_mapped(page); 4682 set_page_extent_mapped(page);
4670 4683
4671 ordered = btrfs_lookup_ordered_extent(inode, page_start); 4684 ordered = btrfs_lookup_ordered_extent(inode, block_start);
4672 if (ordered) { 4685 if (ordered) {
4673 unlock_extent_cached(io_tree, page_start, page_end, 4686 unlock_extent_cached(io_tree, block_start, block_end,
4674 &cached_state, GFP_NOFS); 4687 &cached_state, GFP_NOFS);
4675 unlock_page(page); 4688 unlock_page(page);
4676 page_cache_release(page); 4689 page_cache_release(page);
@@ -4679,39 +4692,41 @@ again:
4679 goto again; 4692 goto again;
4680 } 4693 }
4681 4694
4682 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 4695 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4683 EXTENT_DIRTY | EXTENT_DELALLOC | 4696 EXTENT_DIRTY | EXTENT_DELALLOC |
4684 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4697 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4685 0, 0, &cached_state, GFP_NOFS); 4698 0, 0, &cached_state, GFP_NOFS);
4686 4699
4687 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 4700 ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4688 &cached_state); 4701 &cached_state);
4689 if (ret) { 4702 if (ret) {
4690 unlock_extent_cached(io_tree, page_start, page_end, 4703 unlock_extent_cached(io_tree, block_start, block_end,
4691 &cached_state, GFP_NOFS); 4704 &cached_state, GFP_NOFS);
4692 goto out_unlock; 4705 goto out_unlock;
4693 } 4706 }
4694 4707
4695 if (offset != PAGE_CACHE_SIZE) { 4708 if (offset != blocksize) {
4696 if (!len) 4709 if (!len)
4697 len = PAGE_CACHE_SIZE - offset; 4710 len = blocksize - offset;
4698 kaddr = kmap(page); 4711 kaddr = kmap(page);
4699 if (front) 4712 if (front)
4700 memset(kaddr, 0, offset); 4713 memset(kaddr + (block_start - page_offset(page)),
4714 0, offset);
4701 else 4715 else
4702 memset(kaddr + offset, 0, len); 4716 memset(kaddr + (block_start - page_offset(page)) + offset,
4717 0, len);
4703 flush_dcache_page(page); 4718 flush_dcache_page(page);
4704 kunmap(page); 4719 kunmap(page);
4705 } 4720 }
4706 ClearPageChecked(page); 4721 ClearPageChecked(page);
4707 set_page_dirty(page); 4722 set_page_dirty(page);
4708 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 4723 unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4709 GFP_NOFS); 4724 GFP_NOFS);
4710 4725
4711out_unlock: 4726out_unlock:
4712 if (ret) 4727 if (ret)
4713 btrfs_delalloc_release_space(inode, page_start, 4728 btrfs_delalloc_release_space(inode, block_start,
4714 PAGE_CACHE_SIZE); 4729 blocksize);
4715 unlock_page(page); 4730 unlock_page(page);
4716 page_cache_release(page); 4731 page_cache_release(page);
4717out: 4732out:
@@ -4782,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4782 int err = 0; 4797 int err = 0;
4783 4798
4784 /* 4799 /*
4785 * If our size started in the middle of a page we need to zero out the 4800 * If our size started in the middle of a block we need to zero out the
4786 * rest of the page before we expand the i_size, otherwise we could 4801 * rest of the block before we expand the i_size, otherwise we could
4787 * expose stale data. 4802 * expose stale data.
4788 */ 4803 */
4789 err = btrfs_truncate_page(inode, oldsize, 0, 0); 4804 err = btrfs_truncate_block(inode, oldsize, 0, 0);
4790 if (err) 4805 if (err)
4791 return err; 4806 return err;
4792 4807
@@ -4895,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4895 } 4910 }
4896 4911
4897 if (newsize > oldsize) { 4912 if (newsize > oldsize) {
4898 truncate_pagecache(inode, newsize);
4899 /* 4913 /*
4900 * Don't do an expanding truncate while snapshoting is ongoing. 4914 * Don't do an expanding truncate while snapshoting is ongoing.
4901 * This is to ensure the snapshot captures a fully consistent 4915 * This is to ensure the snapshot captures a fully consistent
@@ -4918,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4918 4932
4919 i_size_write(inode, newsize); 4933 i_size_write(inode, newsize);
4920 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4934 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4935 pagecache_isize_extended(inode, oldsize, newsize);
4921 ret = btrfs_update_inode(trans, root, inode); 4936 ret = btrfs_update_inode(trans, root, inode);
4922 btrfs_end_write_no_snapshoting(root); 4937 btrfs_end_write_no_snapshoting(root);
4923 btrfs_end_transaction(trans, root); 4938 btrfs_end_transaction(trans, root);
@@ -5588,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s,
5588 inode->i_op = &btrfs_dir_ro_inode_operations; 5603 inode->i_op = &btrfs_dir_ro_inode_operations;
5589 inode->i_fop = &simple_dir_operations; 5604 inode->i_fop = &simple_dir_operations;
5590 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5605 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5591 inode->i_mtime = CURRENT_TIME; 5606 inode->i_mtime = current_fs_time(inode->i_sb);
5592 inode->i_atime = inode->i_mtime; 5607 inode->i_atime = inode->i_mtime;
5593 inode->i_ctime = inode->i_mtime; 5608 inode->i_ctime = inode->i_mtime;
5594 BTRFS_I(inode)->i_otime = inode->i_mtime; 5609 BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -5790,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5790 if (name_len <= sizeof(tmp_name)) { 5805 if (name_len <= sizeof(tmp_name)) {
5791 name_ptr = tmp_name; 5806 name_ptr = tmp_name;
5792 } else { 5807 } else {
5793 name_ptr = kmalloc(name_len, GFP_NOFS); 5808 name_ptr = kmalloc(name_len, GFP_KERNEL);
5794 if (!name_ptr) { 5809 if (!name_ptr) {
5795 ret = -ENOMEM; 5810 ret = -ENOMEM;
5796 goto err; 5811 goto err;
@@ -6172,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6172 inode_init_owner(inode, dir, mode); 6187 inode_init_owner(inode, dir, mode);
6173 inode_set_bytes(inode, 0); 6188 inode_set_bytes(inode, 0);
6174 6189
6175 inode->i_mtime = CURRENT_TIME; 6190 inode->i_mtime = current_fs_time(inode->i_sb);
6176 inode->i_atime = inode->i_mtime; 6191 inode->i_atime = inode->i_mtime;
6177 inode->i_ctime = inode->i_mtime; 6192 inode->i_ctime = inode->i_mtime;
6178 BTRFS_I(inode)->i_otime = inode->i_mtime; 6193 BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6285,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
6285 btrfs_i_size_write(parent_inode, parent_inode->i_size + 6300 btrfs_i_size_write(parent_inode, parent_inode->i_size +
6286 name_len * 2); 6301 name_len * 2);
6287 inode_inc_iversion(parent_inode); 6302 inode_inc_iversion(parent_inode);
6288 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 6303 parent_inode->i_mtime = parent_inode->i_ctime =
6304 current_fs_time(parent_inode->i_sb);
6289 ret = btrfs_update_inode(trans, root, parent_inode); 6305 ret = btrfs_update_inode(trans, root, parent_inode);
6290 if (ret) 6306 if (ret)
6291 btrfs_abort_transaction(trans, root, ret); 6307 btrfs_abort_transaction(trans, root, ret);
@@ -6503,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6503 BTRFS_I(inode)->dir_index = 0ULL; 6519 BTRFS_I(inode)->dir_index = 0ULL;
6504 inc_nlink(inode); 6520 inc_nlink(inode);
6505 inode_inc_iversion(inode); 6521 inode_inc_iversion(inode);
6506 inode->i_ctime = CURRENT_TIME; 6522 inode->i_ctime = current_fs_time(inode->i_sb);
6507 ihold(inode); 6523 ihold(inode);
6508 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6524 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6509 6525
@@ -7414,7 +7430,26 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7414 cached_state, GFP_NOFS); 7430 cached_state, GFP_NOFS);
7415 7431
7416 if (ordered) { 7432 if (ordered) {
7417 btrfs_start_ordered_extent(inode, ordered, 1); 7433 /*
7434 * If we are doing a DIO read and the ordered extent we
7435 * found is for a buffered write, we can not wait for it
7436 * to complete and retry, because if we do so we can
7437 * deadlock with concurrent buffered writes on page
7438 * locks. This happens only if our DIO read covers more
7439 * than one extent map, if at this point has already
7440 * created an ordered extent for a previous extent map
7441 * and locked its range in the inode's io tree, and a
7442 * concurrent write against that previous extent map's
7443 * range and this range started (we unlock the ranges
7444 * in the io tree only when the bios complete and
7445 * buffered writes always lock pages before attempting
7446 * to lock range in the io tree).
7447 */
7448 if (writing ||
7449 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7450 btrfs_start_ordered_extent(inode, ordered, 1);
7451 else
7452 ret = -ENOTBLK;
7418 btrfs_put_ordered_extent(ordered); 7453 btrfs_put_ordered_extent(ordered);
7419 } else { 7454 } else {
7420 /* 7455 /*
@@ -7431,9 +7466,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7431 * that page. 7466 * that page.
7432 */ 7467 */
7433 ret = -ENOTBLK; 7468 ret = -ENOTBLK;
7434 break;
7435 } 7469 }
7436 7470
7471 if (ret)
7472 break;
7473
7437 cond_resched(); 7474 cond_resched();
7438 } 7475 }
7439 7476
@@ -7764,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode,
7764} 7801}
7765 7802
7766static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7803static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7767 struct page *page, u64 start, u64 end, 7804 struct page *page, unsigned int pgoff,
7768 int failed_mirror, bio_end_io_t *repair_endio, 7805 u64 start, u64 end, int failed_mirror,
7769 void *repair_arg) 7806 bio_end_io_t *repair_endio, void *repair_arg)
7770{ 7807{
7771 struct io_failure_record *failrec; 7808 struct io_failure_record *failrec;
7772 struct bio *bio; 7809 struct bio *bio;
@@ -7787,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7787 return -EIO; 7824 return -EIO;
7788 } 7825 }
7789 7826
7790 if (failed_bio->bi_vcnt > 1) 7827 if ((failed_bio->bi_vcnt > 1)
7828 || (failed_bio->bi_io_vec->bv_len
7829 > BTRFS_I(inode)->root->sectorsize))
7791 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7830 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7792 else 7831 else
7793 read_mode = READ_SYNC; 7832 read_mode = READ_SYNC;
@@ -7795,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7795 isector = start - btrfs_io_bio(failed_bio)->logical; 7834 isector = start - btrfs_io_bio(failed_bio)->logical;
7796 isector >>= inode->i_sb->s_blocksize_bits; 7835 isector >>= inode->i_sb->s_blocksize_bits;
7797 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7836 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7798 0, isector, repair_endio, repair_arg); 7837 pgoff, isector, repair_endio, repair_arg);
7799 if (!bio) { 7838 if (!bio) {
7800 free_io_failure(inode, failrec); 7839 free_io_failure(inode, failrec);
7801 return -EIO; 7840 return -EIO;
@@ -7825,12 +7864,17 @@ struct btrfs_retry_complete {
7825static void btrfs_retry_endio_nocsum(struct bio *bio) 7864static void btrfs_retry_endio_nocsum(struct bio *bio)
7826{ 7865{
7827 struct btrfs_retry_complete *done = bio->bi_private; 7866 struct btrfs_retry_complete *done = bio->bi_private;
7867 struct inode *inode;
7828 struct bio_vec *bvec; 7868 struct bio_vec *bvec;
7829 int i; 7869 int i;
7830 7870
7831 if (bio->bi_error) 7871 if (bio->bi_error)
7832 goto end; 7872 goto end;
7833 7873
7874 ASSERT(bio->bi_vcnt == 1);
7875 inode = bio->bi_io_vec->bv_page->mapping->host;
7876 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7877
7834 done->uptodate = 1; 7878 done->uptodate = 1;
7835 bio_for_each_segment_all(bvec, bio, i) 7879 bio_for_each_segment_all(bvec, bio, i)
7836 clean_io_failure(done->inode, done->start, bvec->bv_page, 0); 7880 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
@@ -7842,25 +7886,35 @@ end:
7842static int __btrfs_correct_data_nocsum(struct inode *inode, 7886static int __btrfs_correct_data_nocsum(struct inode *inode,
7843 struct btrfs_io_bio *io_bio) 7887 struct btrfs_io_bio *io_bio)
7844{ 7888{
7889 struct btrfs_fs_info *fs_info;
7845 struct bio_vec *bvec; 7890 struct bio_vec *bvec;
7846 struct btrfs_retry_complete done; 7891 struct btrfs_retry_complete done;
7847 u64 start; 7892 u64 start;
7893 unsigned int pgoff;
7894 u32 sectorsize;
7895 int nr_sectors;
7848 int i; 7896 int i;
7849 int ret; 7897 int ret;
7850 7898
7899 fs_info = BTRFS_I(inode)->root->fs_info;
7900 sectorsize = BTRFS_I(inode)->root->sectorsize;
7901
7851 start = io_bio->logical; 7902 start = io_bio->logical;
7852 done.inode = inode; 7903 done.inode = inode;
7853 7904
7854 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7905 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7855try_again: 7906 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7907 pgoff = bvec->bv_offset;
7908
7909next_block_or_try_again:
7856 done.uptodate = 0; 7910 done.uptodate = 0;
7857 done.start = start; 7911 done.start = start;
7858 init_completion(&done.done); 7912 init_completion(&done.done);
7859 7913
7860 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7914 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7861 start + bvec->bv_len - 1, 7915 pgoff, start, start + sectorsize - 1,
7862 io_bio->mirror_num, 7916 io_bio->mirror_num,
7863 btrfs_retry_endio_nocsum, &done); 7917 btrfs_retry_endio_nocsum, &done);
7864 if (ret) 7918 if (ret)
7865 return ret; 7919 return ret;
7866 7920
@@ -7868,10 +7922,15 @@ try_again:
7868 7922
7869 if (!done.uptodate) { 7923 if (!done.uptodate) {
7870 /* We might have another mirror, so try again */ 7924 /* We might have another mirror, so try again */
7871 goto try_again; 7925 goto next_block_or_try_again;
7872 } 7926 }
7873 7927
7874 start += bvec->bv_len; 7928 start += sectorsize;
7929
7930 if (nr_sectors--) {
7931 pgoff += sectorsize;
7932 goto next_block_or_try_again;
7933 }
7875 } 7934 }
7876 7935
7877 return 0; 7936 return 0;
@@ -7881,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio)
7881{ 7940{
7882 struct btrfs_retry_complete *done = bio->bi_private; 7941 struct btrfs_retry_complete *done = bio->bi_private;
7883 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7942 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7943 struct inode *inode;
7884 struct bio_vec *bvec; 7944 struct bio_vec *bvec;
7945 u64 start;
7885 int uptodate; 7946 int uptodate;
7886 int ret; 7947 int ret;
7887 int i; 7948 int i;
@@ -7890,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio)
7890 goto end; 7951 goto end;
7891 7952
7892 uptodate = 1; 7953 uptodate = 1;
7954
7955 start = done->start;
7956
7957 ASSERT(bio->bi_vcnt == 1);
7958 inode = bio->bi_io_vec->bv_page->mapping->host;
7959 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7960
7893 bio_for_each_segment_all(bvec, bio, i) { 7961 bio_for_each_segment_all(bvec, bio, i) {
7894 ret = __readpage_endio_check(done->inode, io_bio, i, 7962 ret = __readpage_endio_check(done->inode, io_bio, i,
7895 bvec->bv_page, 0, 7963 bvec->bv_page, bvec->bv_offset,
7896 done->start, bvec->bv_len); 7964 done->start, bvec->bv_len);
7897 if (!ret) 7965 if (!ret)
7898 clean_io_failure(done->inode, done->start, 7966 clean_io_failure(done->inode, done->start,
7899 bvec->bv_page, 0); 7967 bvec->bv_page, bvec->bv_offset);
7900 else 7968 else
7901 uptodate = 0; 7969 uptodate = 0;
7902 } 7970 }
@@ -7910,20 +7978,34 @@ end:
7910static int __btrfs_subio_endio_read(struct inode *inode, 7978static int __btrfs_subio_endio_read(struct inode *inode,
7911 struct btrfs_io_bio *io_bio, int err) 7979 struct btrfs_io_bio *io_bio, int err)
7912{ 7980{
7981 struct btrfs_fs_info *fs_info;
7913 struct bio_vec *bvec; 7982 struct bio_vec *bvec;
7914 struct btrfs_retry_complete done; 7983 struct btrfs_retry_complete done;
7915 u64 start; 7984 u64 start;
7916 u64 offset = 0; 7985 u64 offset = 0;
7986 u32 sectorsize;
7987 int nr_sectors;
7988 unsigned int pgoff;
7989 int csum_pos;
7917 int i; 7990 int i;
7918 int ret; 7991 int ret;
7919 7992
7993 fs_info = BTRFS_I(inode)->root->fs_info;
7994 sectorsize = BTRFS_I(inode)->root->sectorsize;
7995
7920 err = 0; 7996 err = 0;
7921 start = io_bio->logical; 7997 start = io_bio->logical;
7922 done.inode = inode; 7998 done.inode = inode;
7923 7999
7924 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8000 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7925 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 8001 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7926 0, start, bvec->bv_len); 8002
8003 pgoff = bvec->bv_offset;
8004next_block:
8005 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8006 ret = __readpage_endio_check(inode, io_bio, csum_pos,
8007 bvec->bv_page, pgoff, start,
8008 sectorsize);
7927 if (likely(!ret)) 8009 if (likely(!ret))
7928 goto next; 8010 goto next;
7929try_again: 8011try_again:
@@ -7931,10 +8013,10 @@ try_again:
7931 done.start = start; 8013 done.start = start;
7932 init_completion(&done.done); 8014 init_completion(&done.done);
7933 8015
7934 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 8016 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7935 start + bvec->bv_len - 1, 8017 pgoff, start, start + sectorsize - 1,
7936 io_bio->mirror_num, 8018 io_bio->mirror_num,
7937 btrfs_retry_endio, &done); 8019 btrfs_retry_endio, &done);
7938 if (ret) { 8020 if (ret) {
7939 err = ret; 8021 err = ret;
7940 goto next; 8022 goto next;
@@ -7947,8 +8029,15 @@ try_again:
7947 goto try_again; 8029 goto try_again;
7948 } 8030 }
7949next: 8031next:
7950 offset += bvec->bv_len; 8032 offset += sectorsize;
7951 start += bvec->bv_len; 8033 start += sectorsize;
8034
8035 ASSERT(nr_sectors);
8036
8037 if (--nr_sectors) {
8038 pgoff += sectorsize;
8039 goto next_block;
8040 }
7952 } 8041 }
7953 8042
7954 return err; 8043 return err;
@@ -8202,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8202 u64 file_offset = dip->logical_offset; 8291 u64 file_offset = dip->logical_offset;
8203 u64 submit_len = 0; 8292 u64 submit_len = 0;
8204 u64 map_length; 8293 u64 map_length;
8205 int nr_pages = 0; 8294 u32 blocksize = root->sectorsize;
8206 int ret;
8207 int async_submit = 0; 8295 int async_submit = 0;
8296 int nr_sectors;
8297 int ret;
8298 int i;
8208 8299
8209 map_length = orig_bio->bi_iter.bi_size; 8300 map_length = orig_bio->bi_iter.bi_size;
8210 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 8301 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
@@ -8234,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8234 atomic_inc(&dip->pending_bios); 8325 atomic_inc(&dip->pending_bios);
8235 8326
8236 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 8327 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8237 if (map_length < submit_len + bvec->bv_len || 8328 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
8238 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 8329 i = 0;
8239 bvec->bv_offset) < bvec->bv_len) { 8330next_block:
8331 if (unlikely(map_length < submit_len + blocksize ||
8332 bio_add_page(bio, bvec->bv_page, blocksize,
8333 bvec->bv_offset + (i * blocksize)) < blocksize)) {
8240 /* 8334 /*
8241 * inc the count before we submit the bio so 8335 * inc the count before we submit the bio so
8242 * we know the end IO handler won't happen before 8336 * we know the end IO handler won't happen before
@@ -8257,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8257 file_offset += submit_len; 8351 file_offset += submit_len;
8258 8352
8259 submit_len = 0; 8353 submit_len = 0;
8260 nr_pages = 0;
8261 8354
8262 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8355 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8263 start_sector, GFP_NOFS); 8356 start_sector, GFP_NOFS);
@@ -8275,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8275 bio_put(bio); 8368 bio_put(bio);
8276 goto out_err; 8369 goto out_err;
8277 } 8370 }
8371
8372 goto next_block;
8278 } else { 8373 } else {
8279 submit_len += bvec->bv_len; 8374 submit_len += blocksize;
8280 nr_pages++; 8375 if (--nr_sectors) {
8376 i++;
8377 goto next_block;
8378 }
8281 bvec++; 8379 bvec++;
8282 } 8380 }
8283 } 8381 }
@@ -8642,6 +8740,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8642 struct extent_state *cached_state = NULL; 8740 struct extent_state *cached_state = NULL;
8643 u64 page_start = page_offset(page); 8741 u64 page_start = page_offset(page);
8644 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 8742 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
8743 u64 start;
8744 u64 end;
8645 int inode_evicting = inode->i_state & I_FREEING; 8745 int inode_evicting = inode->i_state & I_FREEING;
8646 8746
8647 /* 8747 /*
@@ -8661,14 +8761,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8661 8761
8662 if (!inode_evicting) 8762 if (!inode_evicting)
8663 lock_extent_bits(tree, page_start, page_end, &cached_state); 8763 lock_extent_bits(tree, page_start, page_end, &cached_state);
8664 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8764again:
8765 start = page_start;
8766 ordered = btrfs_lookup_ordered_range(inode, start,
8767 page_end - start + 1);
8665 if (ordered) { 8768 if (ordered) {
8769 end = min(page_end, ordered->file_offset + ordered->len - 1);
8666 /* 8770 /*
8667 * IO on this page will never be started, so we need 8771 * IO on this page will never be started, so we need
8668 * to account for any ordered extents now 8772 * to account for any ordered extents now
8669 */ 8773 */
8670 if (!inode_evicting) 8774 if (!inode_evicting)
8671 clear_extent_bit(tree, page_start, page_end, 8775 clear_extent_bit(tree, start, end,
8672 EXTENT_DIRTY | EXTENT_DELALLOC | 8776 EXTENT_DIRTY | EXTENT_DELALLOC |
8673 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8777 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8674 EXTENT_DEFRAG, 1, 0, &cached_state, 8778 EXTENT_DEFRAG, 1, 0, &cached_state,
@@ -8685,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8685 8789
8686 spin_lock_irq(&tree->lock); 8790 spin_lock_irq(&tree->lock);
8687 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8791 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8688 new_len = page_start - ordered->file_offset; 8792 new_len = start - ordered->file_offset;
8689 if (new_len < ordered->truncated_len) 8793 if (new_len < ordered->truncated_len)
8690 ordered->truncated_len = new_len; 8794 ordered->truncated_len = new_len;
8691 spin_unlock_irq(&tree->lock); 8795 spin_unlock_irq(&tree->lock);
8692 8796
8693 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8797 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8694 page_start, 8798 start,
8695 PAGE_CACHE_SIZE, 1)) 8799 end - start + 1, 1))
8696 btrfs_finish_ordered_io(ordered); 8800 btrfs_finish_ordered_io(ordered);
8697 } 8801 }
8698 btrfs_put_ordered_extent(ordered); 8802 btrfs_put_ordered_extent(ordered);
8699 if (!inode_evicting) { 8803 if (!inode_evicting) {
8700 cached_state = NULL; 8804 cached_state = NULL;
8701 lock_extent_bits(tree, page_start, page_end, 8805 lock_extent_bits(tree, start, end,
8702 &cached_state); 8806 &cached_state);
8703 } 8807 }
8808
8809 start = end + 1;
8810 if (start < page_end)
8811 goto again;
8704 } 8812 }
8705 8813
8706 /* 8814 /*
@@ -8761,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8761 loff_t size; 8869 loff_t size;
8762 int ret; 8870 int ret;
8763 int reserved = 0; 8871 int reserved = 0;
8872 u64 reserved_space;
8764 u64 page_start; 8873 u64 page_start;
8765 u64 page_end; 8874 u64 page_end;
8875 u64 end;
8876
8877 reserved_space = PAGE_CACHE_SIZE;
8766 8878
8767 sb_start_pagefault(inode->i_sb); 8879 sb_start_pagefault(inode->i_sb);
8768 page_start = page_offset(page); 8880 page_start = page_offset(page);
8769 page_end = page_start + PAGE_CACHE_SIZE - 1; 8881 page_end = page_start + PAGE_CACHE_SIZE - 1;
8882 end = page_end;
8770 8883
8884 /*
8885 * Reserving delalloc space after obtaining the page lock can lead to
8886 * deadlock. For example, if a dirty page is locked by this function
8887 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8888 * dirty page write out, then the btrfs_writepage() function could
8889 * end up waiting indefinitely to get a lock on the page currently
8890 * being processed by btrfs_page_mkwrite() function.
8891 */
8771 ret = btrfs_delalloc_reserve_space(inode, page_start, 8892 ret = btrfs_delalloc_reserve_space(inode, page_start,
8772 PAGE_CACHE_SIZE); 8893 reserved_space);
8773 if (!ret) { 8894 if (!ret) {
8774 ret = file_update_time(vma->vm_file); 8895 ret = file_update_time(vma->vm_file);
8775 reserved = 1; 8896 reserved = 1;
@@ -8803,7 +8924,7 @@ again:
8803 * we can't set the delalloc bits if there are pending ordered 8924 * we can't set the delalloc bits if there are pending ordered
8804 * extents. Drop our locks and wait for them to finish 8925 * extents. Drop our locks and wait for them to finish
8805 */ 8926 */
8806 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8927 ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
8807 if (ordered) { 8928 if (ordered) {
8808 unlock_extent_cached(io_tree, page_start, page_end, 8929 unlock_extent_cached(io_tree, page_start, page_end,
8809 &cached_state, GFP_NOFS); 8930 &cached_state, GFP_NOFS);
@@ -8813,6 +8934,18 @@ again:
8813 goto again; 8934 goto again;
8814 } 8935 }
8815 8936
8937 if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
8938 reserved_space = round_up(size - page_start, root->sectorsize);
8939 if (reserved_space < PAGE_CACHE_SIZE) {
8940 end = page_start + reserved_space - 1;
8941 spin_lock(&BTRFS_I(inode)->lock);
8942 BTRFS_I(inode)->outstanding_extents++;
8943 spin_unlock(&BTRFS_I(inode)->lock);
8944 btrfs_delalloc_release_space(inode, page_start,
8945 PAGE_CACHE_SIZE - reserved_space);
8946 }
8947 }
8948
8816 /* 8949 /*
8817 * XXX - page_mkwrite gets called every time the page is dirtied, even 8950 * XXX - page_mkwrite gets called every time the page is dirtied, even
8818 * if it was already dirty, so for space accounting reasons we need to 8951 * if it was already dirty, so for space accounting reasons we need to
@@ -8820,12 +8953,12 @@ again:
8820 * is probably a better way to do this, but for now keep consistent with 8953 * is probably a better way to do this, but for now keep consistent with
8821 * prepare_pages in the normal write path. 8954 * prepare_pages in the normal write path.
8822 */ 8955 */
8823 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 8956 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8824 EXTENT_DIRTY | EXTENT_DELALLOC | 8957 EXTENT_DIRTY | EXTENT_DELALLOC |
8825 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 8958 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
8826 0, 0, &cached_state, GFP_NOFS); 8959 0, 0, &cached_state, GFP_NOFS);
8827 8960
8828 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 8961 ret = btrfs_set_extent_delalloc(inode, page_start, end,
8829 &cached_state); 8962 &cached_state);
8830 if (ret) { 8963 if (ret) {
8831 unlock_extent_cached(io_tree, page_start, page_end, 8964 unlock_extent_cached(io_tree, page_start, page_end,
@@ -8864,7 +8997,7 @@ out_unlock:
8864 } 8997 }
8865 unlock_page(page); 8998 unlock_page(page);
8866out: 8999out:
8867 btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); 9000 btrfs_delalloc_release_space(inode, page_start, reserved_space);
8868out_noreserve: 9001out_noreserve:
8869 sb_end_pagefault(inode->i_sb); 9002 sb_end_pagefault(inode->i_sb);
8870 return ret; 9003 return ret;
@@ -9190,16 +9323,11 @@ void btrfs_destroy_cachep(void)
9190 * destroy cache. 9323 * destroy cache.
9191 */ 9324 */
9192 rcu_barrier(); 9325 rcu_barrier();
9193 if (btrfs_inode_cachep) 9326 kmem_cache_destroy(btrfs_inode_cachep);
9194 kmem_cache_destroy(btrfs_inode_cachep); 9327 kmem_cache_destroy(btrfs_trans_handle_cachep);
9195 if (btrfs_trans_handle_cachep) 9328 kmem_cache_destroy(btrfs_transaction_cachep);
9196 kmem_cache_destroy(btrfs_trans_handle_cachep); 9329 kmem_cache_destroy(btrfs_path_cachep);
9197 if (btrfs_transaction_cachep) 9330 kmem_cache_destroy(btrfs_free_space_cachep);
9198 kmem_cache_destroy(btrfs_transaction_cachep);
9199 if (btrfs_path_cachep)
9200 kmem_cache_destroy(btrfs_path_cachep);
9201 if (btrfs_free_space_cachep)
9202 kmem_cache_destroy(btrfs_free_space_cachep);
9203} 9331}
9204 9332
9205int btrfs_init_cachep(void) 9333int btrfs_init_cachep(void)
@@ -9250,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt,
9250 9378
9251 generic_fillattr(inode, stat); 9379 generic_fillattr(inode, stat);
9252 stat->dev = BTRFS_I(inode)->root->anon_dev; 9380 stat->dev = BTRFS_I(inode)->root->anon_dev;
9253 stat->blksize = PAGE_CACHE_SIZE;
9254 9381
9255 spin_lock(&BTRFS_I(inode)->lock); 9382 spin_lock(&BTRFS_I(inode)->lock);
9256 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 9383 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
@@ -9268,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9268 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9395 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9269 struct inode *new_inode = d_inode(new_dentry); 9396 struct inode *new_inode = d_inode(new_dentry);
9270 struct inode *old_inode = d_inode(old_dentry); 9397 struct inode *old_inode = d_inode(old_dentry);
9271 struct timespec ctime = CURRENT_TIME;
9272 u64 index = 0; 9398 u64 index = 0;
9273 u64 root_objectid; 9399 u64 root_objectid;
9274 int ret; 9400 int ret;
@@ -9365,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9365 inode_inc_iversion(old_dir); 9491 inode_inc_iversion(old_dir);
9366 inode_inc_iversion(new_dir); 9492 inode_inc_iversion(new_dir);
9367 inode_inc_iversion(old_inode); 9493 inode_inc_iversion(old_inode);
9368 old_dir->i_ctime = old_dir->i_mtime = ctime; 9494 old_dir->i_ctime = old_dir->i_mtime =
9369 new_dir->i_ctime = new_dir->i_mtime = ctime; 9495 new_dir->i_ctime = new_dir->i_mtime =
9370 old_inode->i_ctime = ctime; 9496 old_inode->i_ctime = current_fs_time(old_dir->i_sb);
9371 9497
9372 if (old_dentry->d_parent != new_dentry->d_parent) 9498 if (old_dentry->d_parent != new_dentry->d_parent)
9373 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 9499 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
@@ -9392,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9392 9518
9393 if (new_inode) { 9519 if (new_inode) {
9394 inode_inc_iversion(new_inode); 9520 inode_inc_iversion(new_inode);
9395 new_inode->i_ctime = CURRENT_TIME; 9521 new_inode->i_ctime = current_fs_time(new_inode->i_sb);
9396 if (unlikely(btrfs_ino(new_inode) == 9522 if (unlikely(btrfs_ino(new_inode) ==
9397 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9523 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9398 root_objectid = BTRFS_I(new_inode)->location.objectid; 9524 root_objectid = BTRFS_I(new_inode)->location.objectid;
@@ -9870,7 +9996,7 @@ next:
9870 *alloc_hint = ins.objectid + ins.offset; 9996 *alloc_hint = ins.objectid + ins.offset;
9871 9997
9872 inode_inc_iversion(inode); 9998 inode_inc_iversion(inode);
9873 inode->i_ctime = CURRENT_TIME; 9999 inode->i_ctime = current_fs_time(inode->i_sb);
9874 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10000 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9875 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10001 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9876 (actual_len > inode->i_size) && 10002 (actual_len > inode->i_size) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48aee9846329..053e677839fe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,8 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61#include "qgroup.h" 61#include "qgroup.h"
62#include "tree-log.h"
63#include "compression.h"
62 64
63#ifdef CONFIG_64BIT 65#ifdef CONFIG_64BIT
64/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 66/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
347 349
348 btrfs_update_iflags(inode); 350 btrfs_update_iflags(inode);
349 inode_inc_iversion(inode); 351 inode_inc_iversion(inode);
350 inode->i_ctime = CURRENT_TIME; 352 inode->i_ctime = current_fs_time(inode->i_sb);
351 ret = btrfs_update_inode(trans, root, inode); 353 ret = btrfs_update_inode(trans, root, inode);
352 354
353 btrfs_end_transaction(trans, root); 355 btrfs_end_transaction(trans, root);
@@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
443 struct btrfs_root *root = BTRFS_I(dir)->root; 445 struct btrfs_root *root = BTRFS_I(dir)->root;
444 struct btrfs_root *new_root; 446 struct btrfs_root *new_root;
445 struct btrfs_block_rsv block_rsv; 447 struct btrfs_block_rsv block_rsv;
446 struct timespec cur_time = CURRENT_TIME; 448 struct timespec cur_time = current_fs_time(dir->i_sb);
447 struct inode *inode; 449 struct inode *inode;
448 int ret; 450 int ret;
449 int err; 451 int err;
@@ -844,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
844 if (IS_ERR(dentry)) 846 if (IS_ERR(dentry))
845 goto out_unlock; 847 goto out_unlock;
846 848
847 error = -EEXIST;
848 if (d_really_is_positive(dentry))
849 goto out_dput;
850
851 error = btrfs_may_create(dir, dentry); 849 error = btrfs_may_create(dir, dentry);
852 if (error) 850 if (error)
853 goto out_dput; 851 goto out_dput;
@@ -2097,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode,
2097 key.offset = (u64)-1; 2095 key.offset = (u64)-1;
2098 root = btrfs_read_fs_root_no_name(info, &key); 2096 root = btrfs_read_fs_root_no_name(info, &key);
2099 if (IS_ERR(root)) { 2097 if (IS_ERR(root)) {
2100 btrfs_err(info, "could not find root %llu",
2101 sk->tree_id);
2102 btrfs_free_path(path); 2098 btrfs_free_path(path);
2103 return -ENOENT; 2099 return -ENOENT;
2104 } 2100 }
@@ -2476,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2476 trans->block_rsv = &block_rsv; 2472 trans->block_rsv = &block_rsv;
2477 trans->bytes_reserved = block_rsv.size; 2473 trans->bytes_reserved = block_rsv.size;
2478 2474
2475 btrfs_record_snapshot_destroy(trans, dir);
2476
2479 ret = btrfs_unlink_subvol(trans, root, dir, 2477 ret = btrfs_unlink_subvol(trans, root, dir,
2480 dest->root_key.objectid, 2478 dest->root_key.objectid,
2481 dentry->d_name.name, 2479 dentry->d_name.name,
@@ -2960,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
2960 * of the array is bounded by len, which is in turn bounded by 2958 * of the array is bounded by len, which is in turn bounded by
2961 * BTRFS_MAX_DEDUPE_LEN. 2959 * BTRFS_MAX_DEDUPE_LEN.
2962 */ 2960 */
2963 src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2961 src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2964 dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2962 dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2965 if (!src_pgarr || !dst_pgarr) { 2963 if (!src_pgarr || !dst_pgarr) {
2966 kfree(src_pgarr); 2964 kfree(src_pgarr);
2967 kfree(dst_pgarr); 2965 kfree(dst_pgarr);
@@ -3068,6 +3066,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3068 ret = extent_same_check_offsets(src, loff, &len, olen); 3066 ret = extent_same_check_offsets(src, loff, &len, olen);
3069 if (ret) 3067 if (ret)
3070 goto out_unlock; 3068 goto out_unlock;
3069 ret = extent_same_check_offsets(src, dst_loff, &len, olen);
3070 if (ret)
3071 goto out_unlock;
3071 3072
3072 /* 3073 /*
3073 * Single inode case wants the same checks, except we 3074 * Single inode case wants the same checks, except we
@@ -3217,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3217 3218
3218 inode_inc_iversion(inode); 3219 inode_inc_iversion(inode);
3219 if (!no_time_update) 3220 if (!no_time_update)
3220 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3221 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
3221 /* 3222 /*
3222 * We round up to the block size at eof when determining which 3223 * We round up to the block size at eof when determining which
3223 * extents to clone above, but shouldn't round up the file size. 3224 * extents to clone above, but shouldn't round up the file size.
@@ -3889,8 +3890,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
3889 * Truncate page cache pages so that future reads will see the cloned 3890 * Truncate page cache pages so that future reads will see the cloned
3890 * data immediately and not the previous data. 3891 * data immediately and not the previous data.
3891 */ 3892 */
3892 truncate_inode_pages_range(&inode->i_data, destoff, 3893 truncate_inode_pages_range(&inode->i_data,
3893 PAGE_CACHE_ALIGN(destoff + len) - 1); 3894 round_down(destoff, PAGE_CACHE_SIZE),
3895 round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
3894out_unlock: 3896out_unlock:
3895 if (!same_inode) 3897 if (!same_inode)
3896 btrfs_double_inode_unlock(src, inode); 3898 btrfs_double_inode_unlock(src, inode);
@@ -5031,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
5031 struct btrfs_root *root = BTRFS_I(inode)->root; 5033 struct btrfs_root *root = BTRFS_I(inode)->root;
5032 struct btrfs_root_item *root_item = &root->root_item; 5034 struct btrfs_root_item *root_item = &root->root_item;
5033 struct btrfs_trans_handle *trans; 5035 struct btrfs_trans_handle *trans;
5034 struct timespec ct = CURRENT_TIME; 5036 struct timespec ct = current_fs_time(inode->i_sb);
5035 int ret = 0; 5037 int ret = 0;
5036 int received_uuid_changed; 5038 int received_uuid_changed;
5037 5039
@@ -5262,8 +5264,7 @@ out_unlock:
5262 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5264 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
5263 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5265 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
5264 5266
5265static int btrfs_ioctl_get_supported_features(struct file *file, 5267int btrfs_ioctl_get_supported_features(void __user *arg)
5266 void __user *arg)
5267{ 5268{
5268 static const struct btrfs_ioctl_feature_flags features[3] = { 5269 static const struct btrfs_ioctl_feature_flags features[3] = {
5269 INIT_FEATURE_FLAGS(SUPP), 5270 INIT_FEATURE_FLAGS(SUPP),
@@ -5542,7 +5543,7 @@ long btrfs_ioctl(struct file *file, unsigned int
5542 case BTRFS_IOC_SET_FSLABEL: 5543 case BTRFS_IOC_SET_FSLABEL:
5543 return btrfs_ioctl_set_fslabel(file, argp); 5544 return btrfs_ioctl_set_fslabel(file, argp);
5544 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5545 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5545 return btrfs_ioctl_get_supported_features(file, argp); 5546 return btrfs_ioctl_get_supported_features(argp);
5546 case BTRFS_IOC_GET_FEATURES: 5547 case BTRFS_IOC_GET_FEATURES:
5547 return btrfs_ioctl_get_features(file, argp); 5548 return btrfs_ioctl_get_features(file, argp);
5548 case BTRFS_IOC_SET_FEATURES: 5549 case BTRFS_IOC_SET_FEATURES:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8c27292ea9ea..0de7da5a610d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h" 27#include "disk-io.h"
28#include "compression.h"
28 29
29static struct kmem_cache *btrfs_ordered_extent_cache; 30static struct kmem_cache *btrfs_ordered_extent_cache;
30 31
@@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
1009 for (; node; node = rb_prev(node)) { 1010 for (; node; node = rb_prev(node)) {
1010 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1011 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1011 1012
1012 /* We treat this entry as if it doesnt exist */ 1013 /* We treat this entry as if it doesn't exist */
1013 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) 1014 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
1014 continue; 1015 continue;
1015 if (test->file_offset + test->len <= disk_i_size) 1016 if (test->file_offset + test->len <= disk_i_size)
@@ -1114,6 +1115,5 @@ int __init ordered_data_init(void)
1114 1115
1115void ordered_data_exit(void) 1116void ordered_data_exit(void)
1116{ 1117{
1117 if (btrfs_ordered_extent_cache) 1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1119} 1119}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 647ab12fdf5d..147dc6ca5de1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
295 btrfs_dev_extent_chunk_offset(l, dev_extent), 295 btrfs_dev_extent_chunk_offset(l, dev_extent),
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break; 297 break;
298 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_PERSISTENT_ITEM_KEY:
299 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n",
300 key.objectid, key.offset);
301 switch (key.objectid) {
302 case BTRFS_DEV_STATS_OBJECTID:
303 printk(KERN_INFO "\t\tdevice stats\n");
304 break;
305 default:
306 printk(KERN_INFO "\t\tunknown persistent item\n");
307 }
308 break;
309 case BTRFS_TEMPORARY_ITEM_KEY:
310 printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n",
311 key.objectid, key.offset);
312 switch (key.objectid) {
313 case BTRFS_BALANCE_OBJECTID:
314 printk(KERN_INFO "\t\tbalance status\n");
315 break;
316 default:
317 printk(KERN_INFO "\t\tunknown temporary item\n");
318 }
300 break; 319 break;
301 case BTRFS_DEV_REPLACE_KEY: 320 case BTRFS_DEV_REPLACE_KEY:
302 printk(KERN_INFO "\t\tdev replace\n"); 321 printk(KERN_INFO "\t\tdev replace\n");
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9e60231f685..36992128c746 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
22#include "hash.h" 22#include "hash.h"
23#include "transaction.h" 23#include "transaction.h"
24#include "xattr.h" 24#include "xattr.h"
25#include "compression.h"
25 26
26#define BTRFS_PROP_HANDLERS_HT_BITS 8 27#define BTRFS_PROP_HANDLERS_HT_BITS 8
27static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); 28static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 619f92963e27..b892914968c1 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -72,7 +72,7 @@ struct reada_extent {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct reada_zone *zones[BTRFS_MAX_MIRRORS]; 73 struct reada_zone *zones[BTRFS_MAX_MIRRORS];
74 int nzones; 74 int nzones;
75 struct btrfs_device *scheduled_for; 75 int scheduled;
76}; 76};
77 77
78struct reada_zone { 78struct reada_zone {
@@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info); 101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102 102
103static int reada_add_block(struct reada_control *rc, u64 logical, 103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation); 104 struct btrfs_key *top, u64 generation);
105 105
106/* recurses */ 106/* recurses */
107/* in case of err, eb might be NULL */ 107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 108static void __readahead_hook(struct btrfs_fs_info *fs_info,
109 u64 start, int err) 109 struct reada_extent *re, struct extent_buffer *eb,
110 u64 start, int err)
110{ 111{
111 int level = 0; 112 int level = 0;
112 int nritems; 113 int nritems;
113 int i; 114 int i;
114 u64 bytenr; 115 u64 bytenr;
115 u64 generation; 116 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list; 117 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121 118
122 if (eb) 119 if (eb)
123 level = btrfs_header_level(eb); 120 level = btrfs_header_level(eb);
124 121
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 re->refcnt++;
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock); 122 spin_lock(&re->lock);
136 /* 123 /*
137 * just take the full list from the extent. afterwards we 124 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore 125 * don't need the lock anymore
139 */ 126 */
140 list_replace_init(&re->extctl, &list); 127 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for; 128 re->scheduled = 0;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock); 129 spin_unlock(&re->lock);
144 130
145 if (err == 0) { 131 /*
146 nritems = level ? btrfs_header_nritems(eb) : 0; 132 * this is the error case, the extent buffer has not been
147 generation = btrfs_header_generation(eb); 133 * read correctly. We won't access anything from it and
148 /* 134 * just cleanup our data structures. Effectively this will
149 * FIXME: currently we just set nritems to 0 if this is a leaf, 135 * cut the branch below this node from read ahead.
150 * effectively ignoring the content. In a next step we could 136 */
151 * trigger more readahead depending from the content, e.g. 137 if (err)
152 * fetch the checksums for the extents in the leaf. 138 goto cleanup;
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164 139
140 /*
141 * FIXME: currently we just set nritems to 0 if this is a leaf,
142 * effectively ignoring the content. In a next step we could
143 * trigger more readahead depending from the content, e.g.
144 * fetch the checksums for the extents in the leaf.
145 */
146 if (!level)
147 goto cleanup;
148
149 nritems = btrfs_header_nritems(eb);
150 generation = btrfs_header_generation(eb);
165 for (i = 0; i < nritems; i++) { 151 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec; 152 struct reada_extctl *rec;
167 u64 n_gen; 153 u64 n_gen;
@@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
188 */ 174 */
189#ifdef DEBUG 175#ifdef DEBUG
190 if (rec->generation != generation) { 176 if (rec->generation != generation) {
191 btrfs_debug(root->fs_info, 177 btrfs_debug(fs_info,
192 "generation mismatch for (%llu,%d,%llu) %llu != %llu", 178 "generation mismatch for (%llu,%d,%llu) %llu != %llu",
193 key.objectid, key.type, key.offset, 179 key.objectid, key.type, key.offset,
194 rec->generation, generation); 180 rec->generation, generation);
195 } 181 }
196#endif 182#endif
197 if (rec->generation == generation && 183 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && 184 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) 185 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key, 186 reada_add_block(rc, bytenr, &next_key, n_gen);
201 level - 1, n_gen);
202 } 187 }
203 } 188 }
189
190cleanup:
204 /* 191 /*
205 * free extctl records 192 * free extctl records
206 */ 193 */
@@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
222 209
223 reada_extent_put(fs_info, re); /* one ref for each entry */ 210 reada_extent_put(fs_info, re); /* one ref for each entry */
224 } 211 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228 212
229 return 0; 213 return;
230} 214}
231 215
232/* 216/*
233 * start is passed separately in case eb in NULL, which may be the case with 217 * start is passed separately in case eb in NULL, which may be the case with
234 * failed I/O 218 * failed I/O
235 */ 219 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 220int btree_readahead_hook(struct btrfs_fs_info *fs_info,
237 u64 start, int err) 221 struct extent_buffer *eb, u64 start, int err)
238{ 222{
239 int ret; 223 int ret = 0;
224 struct reada_extent *re;
240 225
241 ret = __readahead_hook(root, eb, start, err); 226 /* find extent */
227 spin_lock(&fs_info->reada_lock);
228 re = radix_tree_lookup(&fs_info->reada_tree,
229 start >> PAGE_CACHE_SHIFT);
230 if (re)
231 re->refcnt++;
232 spin_unlock(&fs_info->reada_lock);
233 if (!re) {
234 ret = -1;
235 goto start_machine;
236 }
242 237
243 reada_start_machine(root->fs_info); 238 __readahead_hook(fs_info, re, eb, start, err);
239 reada_extent_put(fs_info, re); /* our ref */
244 240
241start_machine:
242 reada_start_machine(fs_info);
245 return ret; 243 return ret;
246} 244}
247 245
@@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
260 spin_lock(&fs_info->reada_lock); 258 spin_lock(&fs_info->reada_lock);
261 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 259 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
262 logical >> PAGE_CACHE_SHIFT, 1); 260 logical >> PAGE_CACHE_SHIFT, 1);
263 if (ret == 1) 261 if (ret == 1 && logical >= zone->start && logical <= zone->end) {
264 kref_get(&zone->refcnt); 262 kref_get(&zone->refcnt);
265 spin_unlock(&fs_info->reada_lock);
266
267 if (ret == 1) {
268 if (logical >= zone->start && logical < zone->end)
269 return zone;
270 spin_lock(&fs_info->reada_lock);
271 kref_put(&zone->refcnt, reada_zone_release);
272 spin_unlock(&fs_info->reada_lock); 263 spin_unlock(&fs_info->reada_lock);
264 return zone;
273 } 265 }
274 266
267 spin_unlock(&fs_info->reada_lock);
268
275 cache = btrfs_lookup_block_group(fs_info, logical); 269 cache = btrfs_lookup_block_group(fs_info, logical);
276 if (!cache) 270 if (!cache)
277 return NULL; 271 return NULL;
@@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
280 end = start + cache->key.offset - 1; 274 end = start + cache->key.offset - 1;
281 btrfs_put_block_group(cache); 275 btrfs_put_block_group(cache);
282 276
283 zone = kzalloc(sizeof(*zone), GFP_NOFS); 277 zone = kzalloc(sizeof(*zone), GFP_KERNEL);
284 if (!zone) 278 if (!zone)
285 return NULL; 279 return NULL;
286 280
@@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
307 kfree(zone); 301 kfree(zone);
308 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 302 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
309 logical >> PAGE_CACHE_SHIFT, 1); 303 logical >> PAGE_CACHE_SHIFT, 1);
310 if (ret == 1) 304 if (ret == 1 && logical >= zone->start && logical <= zone->end)
311 kref_get(&zone->refcnt); 305 kref_get(&zone->refcnt);
306 else
307 zone = NULL;
312 } 308 }
313 spin_unlock(&fs_info->reada_lock); 309 spin_unlock(&fs_info->reada_lock);
314 310
@@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
317 313
318static struct reada_extent *reada_find_extent(struct btrfs_root *root, 314static struct reada_extent *reada_find_extent(struct btrfs_root *root,
319 u64 logical, 315 u64 logical,
320 struct btrfs_key *top, int level) 316 struct btrfs_key *top)
321{ 317{
322 int ret; 318 int ret;
323 struct reada_extent *re = NULL; 319 struct reada_extent *re = NULL;
@@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
330 u64 length; 326 u64 length;
331 int real_stripes; 327 int real_stripes;
332 int nzones = 0; 328 int nzones = 0;
333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 329 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing; 330 int dev_replace_is_ongoing;
331 int have_zone = 0;
336 332
337 spin_lock(&fs_info->reada_lock); 333 spin_lock(&fs_info->reada_lock);
338 re = radix_tree_lookup(&fs_info->reada_tree, index); 334 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
343 if (re) 339 if (re)
344 return re; 340 return re;
345 341
346 re = kzalloc(sizeof(*re), GFP_NOFS); 342 re = kzalloc(sizeof(*re), GFP_KERNEL);
347 if (!re) 343 if (!re)
348 return NULL; 344 return NULL;
349 345
@@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
375 struct reada_zone *zone; 371 struct reada_zone *zone;
376 372
377 dev = bbio->stripes[nzones].dev; 373 dev = bbio->stripes[nzones].dev;
374
375 /* cannot read ahead on missing device. */
376 if (!dev->bdev)
377 continue;
378
378 zone = reada_find_zone(fs_info, dev, logical, bbio); 379 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone) 380 if (!zone)
380 break; 381 continue;
381 382
382 re->zones[nzones] = zone; 383 re->zones[re->nzones++] = zone;
383 spin_lock(&zone->lock); 384 spin_lock(&zone->lock);
384 if (!zone->elems) 385 if (!zone->elems)
385 kref_get(&zone->refcnt); 386 kref_get(&zone->refcnt);
@@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
389 kref_put(&zone->refcnt, reada_zone_release); 390 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock); 391 spin_unlock(&fs_info->reada_lock);
391 } 392 }
392 re->nzones = nzones; 393 if (re->nzones == 0) {
393 if (nzones == 0) {
394 /* not a single zone found, error and out */ 394 /* not a single zone found, error and out */
395 goto error; 395 goto error;
396 } 396 }
397 397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */ 398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 btrfs_dev_replace_lock(&fs_info->dev_replace); 399 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
400 spin_lock(&fs_info->reada_lock); 400 spin_lock(&fs_info->reada_lock);
401 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 401 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
402 if (ret == -EEXIST) { 402 if (ret == -EEXIST) {
@@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
404 BUG_ON(!re_exist); 404 BUG_ON(!re_exist);
405 re_exist->refcnt++; 405 re_exist->refcnt++;
406 spin_unlock(&fs_info->reada_lock); 406 spin_unlock(&fs_info->reada_lock);
407 btrfs_dev_replace_unlock(&fs_info->dev_replace); 407 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
408 goto error; 408 goto error;
409 } 409 }
410 if (ret) { 410 if (ret) {
411 spin_unlock(&fs_info->reada_lock); 411 spin_unlock(&fs_info->reada_lock);
412 btrfs_dev_replace_unlock(&fs_info->dev_replace); 412 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
413 goto error; 413 goto error;
414 } 414 }
415 prev_dev = NULL; 415 prev_dev = NULL;
416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
417 &fs_info->dev_replace); 417 &fs_info->dev_replace);
418 for (i = 0; i < nzones; ++i) { 418 for (nzones = 0; nzones < re->nzones; ++nzones) {
419 dev = bbio->stripes[i].dev; 419 dev = re->zones[nzones]->device;
420
420 if (dev == prev_dev) { 421 if (dev == prev_dev) {
421 /* 422 /*
422 * in case of DUP, just add the first zone. As both 423 * in case of DUP, just add the first zone. As both
@@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
427 */ 428 */
428 continue; 429 continue;
429 } 430 }
430 if (!dev->bdev) { 431 if (!dev->bdev)
431 /* 432 continue;
432 * cannot read ahead on missing device, but for RAID5/6, 433
433 * REQ_GET_READ_MIRRORS return 1. So don't skip missing
434 * device for such case.
435 */
436 if (nzones > 1)
437 continue;
438 }
439 if (dev_replace_is_ongoing && 434 if (dev_replace_is_ongoing &&
440 dev == fs_info->dev_replace.tgtdev) { 435 dev == fs_info->dev_replace.tgtdev) {
441 /* 436 /*
@@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
447 prev_dev = dev; 442 prev_dev = dev;
448 ret = radix_tree_insert(&dev->reada_extents, index, re); 443 ret = radix_tree_insert(&dev->reada_extents, index, re);
449 if (ret) { 444 if (ret) {
450 while (--i >= 0) { 445 while (--nzones >= 0) {
451 dev = bbio->stripes[i].dev; 446 dev = re->zones[nzones]->device;
452 BUG_ON(dev == NULL); 447 BUG_ON(dev == NULL);
453 /* ignore whether the entry was inserted */ 448 /* ignore whether the entry was inserted */
454 radix_tree_delete(&dev->reada_extents, index); 449 radix_tree_delete(&dev->reada_extents, index);
@@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
456 BUG_ON(fs_info == NULL); 451 BUG_ON(fs_info == NULL);
457 radix_tree_delete(&fs_info->reada_tree, index); 452 radix_tree_delete(&fs_info->reada_tree, index);
458 spin_unlock(&fs_info->reada_lock); 453 spin_unlock(&fs_info->reada_lock);
459 btrfs_dev_replace_unlock(&fs_info->dev_replace); 454 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
460 goto error; 455 goto error;
461 } 456 }
457 have_zone = 1;
462 } 458 }
463 spin_unlock(&fs_info->reada_lock); 459 spin_unlock(&fs_info->reada_lock);
464 btrfs_dev_replace_unlock(&fs_info->dev_replace); 460 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
461
462 if (!have_zone)
463 goto error;
465 464
466 btrfs_put_bbio(bbio); 465 btrfs_put_bbio(bbio);
467 return re; 466 return re;
468 467
469error: 468error:
470 while (nzones) { 469 for (nzones = 0; nzones < re->nzones; ++nzones) {
471 struct reada_zone *zone; 470 struct reada_zone *zone;
472 471
473 --nzones;
474 zone = re->zones[nzones]; 472 zone = re->zones[nzones];
475 kref_get(&zone->refcnt); 473 kref_get(&zone->refcnt);
476 spin_lock(&zone->lock); 474 spin_lock(&zone->lock);
@@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
531 kref_put(&zone->refcnt, reada_zone_release); 529 kref_put(&zone->refcnt, reada_zone_release);
532 spin_unlock(&fs_info->reada_lock); 530 spin_unlock(&fs_info->reada_lock);
533 } 531 }
534 if (re->scheduled_for)
535 atomic_dec(&re->scheduled_for->reada_in_flight);
536 532
537 kfree(re); 533 kfree(re);
538} 534}
@@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref)
556} 552}
557 553
558static int reada_add_block(struct reada_control *rc, u64 logical, 554static int reada_add_block(struct reada_control *rc, u64 logical,
559 struct btrfs_key *top, int level, u64 generation) 555 struct btrfs_key *top, u64 generation)
560{ 556{
561 struct btrfs_root *root = rc->root; 557 struct btrfs_root *root = rc->root;
562 struct reada_extent *re; 558 struct reada_extent *re;
563 struct reada_extctl *rec; 559 struct reada_extctl *rec;
564 560
565 re = reada_find_extent(root, logical, top, level); /* takes one ref */ 561 re = reada_find_extent(root, logical, top); /* takes one ref */
566 if (!re) 562 if (!re)
567 return -1; 563 return -1;
568 564
569 rec = kzalloc(sizeof(*rec), GFP_NOFS); 565 rec = kzalloc(sizeof(*rec), GFP_KERNEL);
570 if (!rec) { 566 if (!rec) {
571 reada_extent_put(root->fs_info, re); 567 reada_extent_put(root->fs_info, re);
572 return -ENOMEM; 568 return -ENOMEM;
@@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
662 u64 logical; 658 u64 logical;
663 int ret; 659 int ret;
664 int i; 660 int i;
665 int need_kick = 0;
666 661
667 spin_lock(&fs_info->reada_lock); 662 spin_lock(&fs_info->reada_lock);
668 if (dev->reada_curr_zone == NULL) { 663 if (dev->reada_curr_zone == NULL) {
@@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
679 */ 674 */
680 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 675 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
681 dev->reada_next >> PAGE_CACHE_SHIFT, 1); 676 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
682 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { 677 if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
683 ret = reada_pick_zone(dev); 678 ret = reada_pick_zone(dev);
684 if (!ret) { 679 if (!ret) {
685 spin_unlock(&fs_info->reada_lock); 680 spin_unlock(&fs_info->reada_lock);
@@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
698 693
699 spin_unlock(&fs_info->reada_lock); 694 spin_unlock(&fs_info->reada_lock);
700 695
696 spin_lock(&re->lock);
697 if (re->scheduled || list_empty(&re->extctl)) {
698 spin_unlock(&re->lock);
699 reada_extent_put(fs_info, re);
700 return 0;
701 }
702 re->scheduled = 1;
703 spin_unlock(&re->lock);
704
701 /* 705 /*
702 * find mirror num 706 * find mirror num
703 */ 707 */
@@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
709 } 713 }
710 logical = re->logical; 714 logical = re->logical;
711 715
712 spin_lock(&re->lock);
713 if (re->scheduled_for == NULL) {
714 re->scheduled_for = dev;
715 need_kick = 1;
716 }
717 spin_unlock(&re->lock);
718
719 reada_extent_put(fs_info, re);
720
721 if (!need_kick)
722 return 0;
723
724 atomic_inc(&dev->reada_in_flight); 716 atomic_inc(&dev->reada_in_flight);
725 ret = reada_tree_block_flagged(fs_info->extent_root, logical, 717 ret = reada_tree_block_flagged(fs_info->extent_root, logical,
726 mirror_num, &eb); 718 mirror_num, &eb);
727 if (ret) 719 if (ret)
728 __readahead_hook(fs_info->extent_root, NULL, logical, ret); 720 __readahead_hook(fs_info, re, NULL, logical, ret);
729 else if (eb) 721 else if (eb)
730 __readahead_hook(fs_info->extent_root, eb, eb->start, ret); 722 __readahead_hook(fs_info, re, eb, eb->start, ret);
731 723
732 if (eb) 724 if (eb)
733 free_extent_buffer(eb); 725 free_extent_buffer(eb);
734 726
727 atomic_dec(&dev->reada_in_flight);
728 reada_extent_put(fs_info, re);
729
735 return 1; 730 return 1;
736 731
737} 732}
@@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work)
752 set_task_ioprio(current, BTRFS_IOPRIO_READA); 747 set_task_ioprio(current, BTRFS_IOPRIO_READA);
753 __reada_start_machine(fs_info); 748 __reada_start_machine(fs_info);
754 set_task_ioprio(current, old_ioprio); 749 set_task_ioprio(current, old_ioprio);
750
751 atomic_dec(&fs_info->reada_works_cnt);
755} 752}
756 753
757static void __reada_start_machine(struct btrfs_fs_info *fs_info) 754static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
783 * enqueue to workers to finish it. This will distribute the load to 780 * enqueue to workers to finish it. This will distribute the load to
784 * the cores. 781 * the cores.
785 */ 782 */
786 for (i = 0; i < 2; ++i) 783 for (i = 0; i < 2; ++i) {
787 reada_start_machine(fs_info); 784 reada_start_machine(fs_info);
785 if (atomic_read(&fs_info->reada_works_cnt) >
786 BTRFS_MAX_MIRRORS * 2)
787 break;
788 }
788} 789}
789 790
790static void reada_start_machine(struct btrfs_fs_info *fs_info) 791static void reada_start_machine(struct btrfs_fs_info *fs_info)
791{ 792{
792 struct reada_machine_work *rmw; 793 struct reada_machine_work *rmw;
793 794
794 rmw = kzalloc(sizeof(*rmw), GFP_NOFS); 795 rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
795 if (!rmw) { 796 if (!rmw) {
796 /* FIXME we cannot handle this properly right now */ 797 /* FIXME we cannot handle this properly right now */
797 BUG(); 798 BUG();
@@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
801 rmw->fs_info = fs_info; 802 rmw->fs_info = fs_info;
802 803
803 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
805 atomic_inc(&fs_info->reada_works_cnt);
804} 806}
805 807
806#ifdef DEBUG 808#ifdef DEBUG
@@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
848 if (ret == 0) 850 if (ret == 0)
849 break; 851 break;
850 printk(KERN_DEBUG 852 printk(KERN_DEBUG
851 " re: logical %llu size %u empty %d for %lld", 853 " re: logical %llu size %u empty %d scheduled %d",
852 re->logical, fs_info->tree_root->nodesize, 854 re->logical, fs_info->tree_root->nodesize,
853 list_empty(&re->extctl), re->scheduled_for ? 855 list_empty(&re->extctl), re->scheduled);
854 re->scheduled_for->devid : -1);
855 856
856 for (i = 0; i < re->nzones; ++i) { 857 for (i = 0; i < re->nzones; ++i) {
857 printk(KERN_CONT " zone %llu-%llu devs", 858 printk(KERN_CONT " zone %llu-%llu devs",
@@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
878 index, 1); 879 index, 1);
879 if (ret == 0) 880 if (ret == 0)
880 break; 881 break;
881 if (!re->scheduled_for) { 882 if (!re->scheduled) {
882 index = (re->logical >> PAGE_CACHE_SHIFT) + 1; 883 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
883 continue; 884 continue;
884 } 885 }
885 printk(KERN_DEBUG 886 printk(KERN_DEBUG
886 "re: logical %llu size %u list empty %d for %lld", 887 "re: logical %llu size %u list empty %d scheduled %d",
887 re->logical, fs_info->tree_root->nodesize, 888 re->logical, fs_info->tree_root->nodesize,
888 list_empty(&re->extctl), 889 list_empty(&re->extctl), re->scheduled);
889 re->scheduled_for ? re->scheduled_for->devid : -1);
890 for (i = 0; i < re->nzones; ++i) { 890 for (i = 0; i < re->nzones; ++i) {
891 printk(KERN_CONT " zone %llu-%llu devs", 891 printk(KERN_CONT " zone %llu-%llu devs",
892 re->zones[i]->start, 892 re->zones[i]->start,
893 re->zones[i]->end); 893 re->zones[i]->end);
894 for (i = 0; i < re->nzones; ++i) { 894 for (j = 0; j < re->zones[i]->ndevs; ++j) {
895 printk(KERN_CONT " zone %llu-%llu devs", 895 printk(KERN_CONT " %lld",
896 re->zones[i]->start, 896 re->zones[i]->devs[j]->devid);
897 re->zones[i]->end);
898 for (j = 0; j < re->zones[i]->ndevs; ++j) {
899 printk(KERN_CONT " %lld",
900 re->zones[i]->devs[j]->devid);
901 }
902 } 897 }
903 } 898 }
904 printk(KERN_CONT "\n"); 899 printk(KERN_CONT "\n");
@@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
917 struct reada_control *rc; 912 struct reada_control *rc;
918 u64 start; 913 u64 start;
919 u64 generation; 914 u64 generation;
920 int level;
921 int ret; 915 int ret;
922 struct extent_buffer *node; 916 struct extent_buffer *node;
923 static struct btrfs_key max_key = { 917 static struct btrfs_key max_key = {
@@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
926 .offset = (u64)-1 920 .offset = (u64)-1
927 }; 921 };
928 922
929 rc = kzalloc(sizeof(*rc), GFP_NOFS); 923 rc = kzalloc(sizeof(*rc), GFP_KERNEL);
930 if (!rc) 924 if (!rc)
931 return ERR_PTR(-ENOMEM); 925 return ERR_PTR(-ENOMEM);
932 926
@@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
940 934
941 node = btrfs_root_node(root); 935 node = btrfs_root_node(root);
942 start = node->start; 936 start = node->start;
943 level = btrfs_header_level(node);
944 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
945 free_extent_buffer(node); 938 free_extent_buffer(node);
946 939
947 ret = reada_add_block(rc, start, &max_key, level, generation); 940 ret = reada_add_block(rc, start, &max_key, generation);
948 if (ret) { 941 if (ret) {
949 kfree(rc); 942 kfree(rc);
950 return ERR_PTR(ret); 943 return ERR_PTR(ret);
@@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
959int btrfs_reada_wait(void *handle) 952int btrfs_reada_wait(void *handle)
960{ 953{
961 struct reada_control *rc = handle; 954 struct reada_control *rc = handle;
955 struct btrfs_fs_info *fs_info = rc->root->fs_info;
962 956
963 while (atomic_read(&rc->elems)) { 957 while (atomic_read(&rc->elems)) {
958 if (!atomic_read(&fs_info->reada_works_cnt))
959 reada_start_machine(fs_info);
964 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 960 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
965 5 * HZ); 961 5 * HZ);
966 dump_devs(rc->root->fs_info, 962 dump_devs(rc->root->fs_info,
@@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle)
977int btrfs_reada_wait(void *handle) 973int btrfs_reada_wait(void *handle)
978{ 974{
979 struct reada_control *rc = handle; 975 struct reada_control *rc = handle;
976 struct btrfs_fs_info *fs_info = rc->root->fs_info;
980 977
981 while (atomic_read(&rc->elems)) { 978 while (atomic_read(&rc->elems)) {
982 wait_event(rc->wait, atomic_read(&rc->elems) == 0); 979 if (!atomic_read(&fs_info->reada_works_cnt))
980 reada_start_machine(fs_info);
981 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
982 (HZ + 9) / 10);
983 } 983 }
984 984
985 kref_put(&rc->refcnt, reada_control_release); 985 kref_put(&rc->refcnt, reada_control_release);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2c849b08a91b..9fcd6dfc3266 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -496,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
496 struct btrfs_root *root) 496 struct btrfs_root *root)
497{ 497{
498 struct btrfs_root_item *item = &root->root_item; 498 struct btrfs_root_item *item = &root->root_item;
499 struct timespec ct = CURRENT_TIME; 499 struct timespec ct = current_fs_time(root->fs_info->sb);
500 500
501 spin_lock(&root->root_item_lock); 501 spin_lock(&root->root_item_lock);
502 btrfs_set_root_ctransid(item, trans->transid); 502 btrfs_set_root_ctransid(item, trans->transid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 92bf5ee732fb..39dbdcbf4d13 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -461,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
462 int ret; 462 int ret;
463 463
464 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 464 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
465 if (!sctx) 465 if (!sctx)
466 goto nomem; 466 goto nomem;
467 atomic_set(&sctx->refs, 1); 467 atomic_set(&sctx->refs, 1);
@@ -472,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
473 struct scrub_bio *sbio; 473 struct scrub_bio *sbio;
474 474
475 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 475 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
476 if (!sbio) 476 if (!sbio)
477 goto nomem; 477 goto nomem;
478 sctx->bios[i] = sbio; 478 sctx->bios[i] = sbio;
@@ -611,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
611 u64 flags = 0; 611 u64 flags = 0;
612 u64 ref_root; 612 u64 ref_root;
613 u32 item_size; 613 u32 item_size;
614 u8 ref_level; 614 u8 ref_level = 0;
615 int ret; 615 int ret;
616 616
617 WARN_ON(sblock->page_count < 1); 617 WARN_ON(sblock->page_count < 1);
@@ -1654,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1654again: 1654again:
1655 if (!wr_ctx->wr_curr_bio) { 1655 if (!wr_ctx->wr_curr_bio) {
1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1657 GFP_NOFS); 1657 GFP_KERNEL);
1658 if (!wr_ctx->wr_curr_bio) { 1658 if (!wr_ctx->wr_curr_bio) {
1659 mutex_unlock(&wr_ctx->wr_lock); 1659 mutex_unlock(&wr_ctx->wr_lock);
1660 return -ENOMEM; 1660 return -ENOMEM;
@@ -1671,7 +1671,8 @@ again:
1671 sbio->dev = wr_ctx->tgtdev; 1671 sbio->dev = wr_ctx->tgtdev;
1672 bio = sbio->bio; 1672 bio = sbio->bio;
1673 if (!bio) { 1673 if (!bio) {
1674 bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1674 bio = btrfs_io_bio_alloc(GFP_KERNEL,
1675 wr_ctx->pages_per_wr_bio);
1675 if (!bio) { 1676 if (!bio) {
1676 mutex_unlock(&wr_ctx->wr_lock); 1677 mutex_unlock(&wr_ctx->wr_lock);
1677 return -ENOMEM; 1678 return -ENOMEM;
@@ -2076,7 +2077,8 @@ again:
2076 sbio->dev = spage->dev; 2077 sbio->dev = spage->dev;
2077 bio = sbio->bio; 2078 bio = sbio->bio;
2078 if (!bio) { 2079 if (!bio) {
2079 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 2080 bio = btrfs_io_bio_alloc(GFP_KERNEL,
2081 sctx->pages_per_rd_bio);
2080 if (!bio) 2082 if (!bio)
2081 return -ENOMEM; 2083 return -ENOMEM;
2082 sbio->bio = bio; 2084 sbio->bio = bio;
@@ -2241,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2241 struct scrub_block *sblock; 2243 struct scrub_block *sblock;
2242 int index; 2244 int index;
2243 2245
2244 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2246 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2245 if (!sblock) { 2247 if (!sblock) {
2246 spin_lock(&sctx->stat_lock); 2248 spin_lock(&sctx->stat_lock);
2247 sctx->stat.malloc_errors++; 2249 sctx->stat.malloc_errors++;
@@ -2259,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2259 struct scrub_page *spage; 2261 struct scrub_page *spage;
2260 u64 l = min_t(u64, len, PAGE_SIZE); 2262 u64 l = min_t(u64, len, PAGE_SIZE);
2261 2263
2262 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2264 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2263 if (!spage) { 2265 if (!spage) {
2264leave_nomem: 2266leave_nomem:
2265 spin_lock(&sctx->stat_lock); 2267 spin_lock(&sctx->stat_lock);
@@ -2286,7 +2288,7 @@ leave_nomem:
2286 spage->have_csum = 0; 2288 spage->have_csum = 0;
2287 } 2289 }
2288 sblock->page_count++; 2290 sblock->page_count++;
2289 spage->page = alloc_page(GFP_NOFS); 2291 spage->page = alloc_page(GFP_KERNEL);
2290 if (!spage->page) 2292 if (!spage->page)
2291 goto leave_nomem; 2293 goto leave_nomem;
2292 len -= l; 2294 len -= l;
@@ -2541,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2541 struct scrub_block *sblock; 2543 struct scrub_block *sblock;
2542 int index; 2544 int index;
2543 2545
2544 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2546 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2545 if (!sblock) { 2547 if (!sblock) {
2546 spin_lock(&sctx->stat_lock); 2548 spin_lock(&sctx->stat_lock);
2547 sctx->stat.malloc_errors++; 2549 sctx->stat.malloc_errors++;
@@ -2561,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2561 struct scrub_page *spage; 2563 struct scrub_page *spage;
2562 u64 l = min_t(u64, len, PAGE_SIZE); 2564 u64 l = min_t(u64, len, PAGE_SIZE);
2563 2565
2564 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2566 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2565 if (!spage) { 2567 if (!spage) {
2566leave_nomem: 2568leave_nomem:
2567 spin_lock(&sctx->stat_lock); 2569 spin_lock(&sctx->stat_lock);
@@ -2591,7 +2593,7 @@ leave_nomem:
2591 spage->have_csum = 0; 2593 spage->have_csum = 0;
2592 } 2594 }
2593 sblock->page_count++; 2595 sblock->page_count++;
2594 spage->page = alloc_page(GFP_NOFS); 2596 spage->page = alloc_page(GFP_KERNEL);
2595 if (!spage->page) 2597 if (!spage->page)
2596 goto leave_nomem; 2598 goto leave_nomem;
2597 len -= l; 2599 len -= l;
@@ -3857,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3857 return -EIO; 3859 return -EIO;
3858 } 3860 }
3859 3861
3860 btrfs_dev_replace_lock(&fs_info->dev_replace); 3862 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3861 if (dev->scrub_device || 3863 if (dev->scrub_device ||
3862 (!is_dev_replace && 3864 (!is_dev_replace &&
3863 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3865 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3864 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3866 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3865 mutex_unlock(&fs_info->scrub_lock); 3867 mutex_unlock(&fs_info->scrub_lock);
3866 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3868 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3867 return -EINPROGRESS; 3869 return -EINPROGRESS;
3868 } 3870 }
3869 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3871 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3870 3872
3871 ret = scrub_workers_get(fs_info, is_dev_replace); 3873 ret = scrub_workers_get(fs_info, is_dev_replace);
3872 if (ret) { 3874 if (ret) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 63a6152be04b..19b7bf4284ee 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
34#include "disk-io.h" 34#include "disk-io.h"
35#include "btrfs_inode.h" 35#include "btrfs_inode.h"
36#include "transaction.h" 36#include "transaction.h"
37#include "compression.h"
37 38
38static int g_verbose = 0; 39static int g_verbose = 0;
39 40
@@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void)
304{ 305{
305 struct fs_path *p; 306 struct fs_path *p;
306 307
307 p = kmalloc(sizeof(*p), GFP_NOFS); 308 p = kmalloc(sizeof(*p), GFP_KERNEL);
308 if (!p) 309 if (!p)
309 return NULL; 310 return NULL;
310 p->reversed = 0; 311 p->reversed = 0;
@@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
363 * First time the inline_buf does not suffice 364 * First time the inline_buf does not suffice
364 */ 365 */
365 if (p->buf == p->inline_buf) { 366 if (p->buf == p->inline_buf) {
366 tmp_buf = kmalloc(len, GFP_NOFS); 367 tmp_buf = kmalloc(len, GFP_KERNEL);
367 if (tmp_buf) 368 if (tmp_buf)
368 memcpy(tmp_buf, p->buf, old_buf_len); 369 memcpy(tmp_buf, p->buf, old_buf_len);
369 } else { 370 } else {
370 tmp_buf = krealloc(p->buf, len, GFP_NOFS); 371 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
371 } 372 }
372 if (!tmp_buf) 373 if (!tmp_buf)
373 return -ENOMEM; 374 return -ENOMEM;
@@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 * values are small. 996 * values are small.
996 */ 997 */
997 buf_len = PATH_MAX; 998 buf_len = PATH_MAX;
998 buf = kmalloc(buf_len, GFP_NOFS); 999 buf = kmalloc(buf_len, GFP_KERNEL);
999 if (!buf) { 1000 if (!buf) {
1000 ret = -ENOMEM; 1001 ret = -ENOMEM;
1001 goto out; 1002 goto out;
@@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1042 buf = NULL; 1043 buf = NULL;
1043 } else { 1044 } else {
1044 char *tmp = krealloc(buf, buf_len, 1045 char *tmp = krealloc(buf, buf_len,
1045 GFP_NOFS | __GFP_NOWARN); 1046 GFP_KERNEL | __GFP_NOWARN);
1046 1047
1047 if (!tmp) 1048 if (!tmp)
1048 kfree(buf); 1049 kfree(buf);
@@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1303 /* We only use this path under the commit sem */ 1304 /* We only use this path under the commit sem */
1304 tmp_path->need_commit_sem = 0; 1305 tmp_path->need_commit_sem = 0;
1305 1306
1306 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1307 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
1307 if (!backref_ctx) { 1308 if (!backref_ctx) {
1308 ret = -ENOMEM; 1309 ret = -ENOMEM;
1309 goto out; 1310 goto out;
@@ -1984,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx,
1984 nce_head = radix_tree_lookup(&sctx->name_cache, 1985 nce_head = radix_tree_lookup(&sctx->name_cache,
1985 (unsigned long)nce->ino); 1986 (unsigned long)nce->ino);
1986 if (!nce_head) { 1987 if (!nce_head) {
1987 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1988 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
1988 if (!nce_head) { 1989 if (!nce_head) {
1989 kfree(nce); 1990 kfree(nce);
1990 return -ENOMEM; 1991 return -ENOMEM;
@@ -2179,7 +2180,7 @@ out_cache:
2179 /* 2180 /*
2180 * Store the result of the lookup in the name cache. 2181 * Store the result of the lookup in the name cache.
2181 */ 2182 */
2182 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 2183 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2183 if (!nce) { 2184 if (!nce) {
2184 ret = -ENOMEM; 2185 ret = -ENOMEM;
2185 goto out; 2186 goto out;
@@ -2315,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
2315 if (!path) 2316 if (!path)
2316 return -ENOMEM; 2317 return -ENOMEM;
2317 2318
2318 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); 2319 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2319 if (!name) { 2320 if (!name) {
2320 btrfs_free_path(path); 2321 btrfs_free_path(path);
2321 return -ENOMEM; 2322 return -ENOMEM;
@@ -2730,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir,
2730{ 2731{
2731 struct recorded_ref *ref; 2732 struct recorded_ref *ref;
2732 2733
2733 ref = kmalloc(sizeof(*ref), GFP_NOFS); 2734 ref = kmalloc(sizeof(*ref), GFP_KERNEL);
2734 if (!ref) 2735 if (!ref)
2735 return -ENOMEM; 2736 return -ENOMEM;
2736 2737
@@ -2755,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2755{ 2756{
2756 struct recorded_ref *new; 2757 struct recorded_ref *new;
2757 2758
2758 new = kmalloc(sizeof(*ref), GFP_NOFS); 2759 new = kmalloc(sizeof(*ref), GFP_KERNEL);
2759 if (!new) 2760 if (!new)
2760 return -ENOMEM; 2761 return -ENOMEM;
2761 2762
@@ -2818,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2818 struct rb_node *parent = NULL; 2819 struct rb_node *parent = NULL;
2819 struct orphan_dir_info *entry, *odi; 2820 struct orphan_dir_info *entry, *odi;
2820 2821
2821 odi = kmalloc(sizeof(*odi), GFP_NOFS); 2822 odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2822 if (!odi) 2823 if (!odi)
2823 return ERR_PTR(-ENOMEM); 2824 return ERR_PTR(-ENOMEM);
2824 odi->ino = dir_ino; 2825 odi->ino = dir_ino;
@@ -2973,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
2973 struct rb_node *parent = NULL; 2974 struct rb_node *parent = NULL;
2974 struct waiting_dir_move *entry, *dm; 2975 struct waiting_dir_move *entry, *dm;
2975 2976
2976 dm = kmalloc(sizeof(*dm), GFP_NOFS); 2977 dm = kmalloc(sizeof(*dm), GFP_KERNEL);
2977 if (!dm) 2978 if (!dm)
2978 return -ENOMEM; 2979 return -ENOMEM;
2979 dm->ino = ino; 2980 dm->ino = ino;
@@ -3040,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
3040 int exists = 0; 3041 int exists = 0;
3041 int ret; 3042 int ret;
3042 3043
3043 pm = kmalloc(sizeof(*pm), GFP_NOFS); 3044 pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3044 if (!pm) 3045 if (!pm)
3045 return -ENOMEM; 3046 return -ENOMEM;
3046 pm->parent_ino = parent_ino; 3047 pm->parent_ino = parent_ino;
@@ -4280,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
4280 strncmp(name, ctx->name, name_len) == 0) { 4281 strncmp(name, ctx->name, name_len) == 0) {
4281 ctx->found_idx = num; 4282 ctx->found_idx = num;
4282 ctx->found_data_len = data_len; 4283 ctx->found_data_len = data_len;
4283 ctx->found_data = kmemdup(data, data_len, GFP_NOFS); 4284 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4284 if (!ctx->found_data) 4285 if (!ctx->found_data)
4285 return -ENOMEM; 4286 return -ENOMEM;
4286 return 1; 4287 return 1;
@@ -4481,7 +4482,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4481 while (index <= last_index) { 4482 while (index <= last_index) {
4482 unsigned cur_len = min_t(unsigned, len, 4483 unsigned cur_len = min_t(unsigned, len,
4483 PAGE_CACHE_SIZE - pg_offset); 4484 PAGE_CACHE_SIZE - pg_offset);
4484 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4485 page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
4485 if (!page) { 4486 if (!page) {
4486 ret = -ENOMEM; 4487 ret = -ENOMEM;
4487 break; 4488 break;
@@ -5989,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5989 goto out; 5990 goto out;
5990 } 5991 }
5991 5992
5992 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 5993 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
5993 if (!sctx) { 5994 if (!sctx) {
5994 ret = -ENOMEM; 5995 ret = -ENOMEM;
5995 goto out; 5996 goto out;
@@ -5997,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5997 5998
5998 INIT_LIST_HEAD(&sctx->new_refs); 5999 INIT_LIST_HEAD(&sctx->new_refs);
5999 INIT_LIST_HEAD(&sctx->deleted_refs); 6000 INIT_LIST_HEAD(&sctx->deleted_refs);
6000 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 6001 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
6001 INIT_LIST_HEAD(&sctx->name_cache_list); 6002 INIT_LIST_HEAD(&sctx->name_cache_list);
6002 6003
6003 sctx->flags = arg->flags; 6004 sctx->flags = arg->flags;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d41e09fe8e38..00b8f37cc306 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,7 +303,8 @@ enum {
303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, 304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, 305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
306 Opt_datasum, Opt_treelog, Opt_noinode_cache, 306 Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
307 Opt_nologreplay, Opt_norecovery,
307#ifdef CONFIG_BTRFS_DEBUG 308#ifdef CONFIG_BTRFS_DEBUG
308 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 309 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
309#endif 310#endif
@@ -335,6 +336,8 @@ static const match_table_t tokens = {
335 {Opt_noacl, "noacl"}, 336 {Opt_noacl, "noacl"},
336 {Opt_notreelog, "notreelog"}, 337 {Opt_notreelog, "notreelog"},
337 {Opt_treelog, "treelog"}, 338 {Opt_treelog, "treelog"},
339 {Opt_nologreplay, "nologreplay"},
340 {Opt_norecovery, "norecovery"},
338 {Opt_flushoncommit, "flushoncommit"}, 341 {Opt_flushoncommit, "flushoncommit"},
339 {Opt_noflushoncommit, "noflushoncommit"}, 342 {Opt_noflushoncommit, "noflushoncommit"},
340 {Opt_ratio, "metadata_ratio=%d"}, 343 {Opt_ratio, "metadata_ratio=%d"},
@@ -352,7 +355,8 @@ static const match_table_t tokens = {
352 {Opt_inode_cache, "inode_cache"}, 355 {Opt_inode_cache, "inode_cache"},
353 {Opt_noinode_cache, "noinode_cache"}, 356 {Opt_noinode_cache, "noinode_cache"},
354 {Opt_no_space_cache, "nospace_cache"}, 357 {Opt_no_space_cache, "nospace_cache"},
355 {Opt_recovery, "recovery"}, 358 {Opt_recovery, "recovery"}, /* deprecated */
359 {Opt_usebackuproot, "usebackuproot"},
356 {Opt_skip_balance, "skip_balance"}, 360 {Opt_skip_balance, "skip_balance"},
357 {Opt_check_integrity, "check_int"}, 361 {Opt_check_integrity, "check_int"},
358 {Opt_check_integrity_including_extent_data, "check_int_data"}, 362 {Opt_check_integrity_including_extent_data, "check_int_data"},
@@ -373,7 +377,8 @@ static const match_table_t tokens = {
373 * reading in a new superblock is parsed here. 377 * reading in a new superblock is parsed here.
374 * XXX JDM: This needs to be cleaned up for remount. 378 * XXX JDM: This needs to be cleaned up for remount.
375 */ 379 */
376int btrfs_parse_options(struct btrfs_root *root, char *options) 380int btrfs_parse_options(struct btrfs_root *root, char *options,
381 unsigned long new_flags)
377{ 382{
378 struct btrfs_fs_info *info = root->fs_info; 383 struct btrfs_fs_info *info = root->fs_info;
379 substring_t args[MAX_OPT_ARGS]; 384 substring_t args[MAX_OPT_ARGS];
@@ -393,8 +398,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
393 else if (cache_gen) 398 else if (cache_gen)
394 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 399 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
395 400
401 /*
402 * Even if the options are empty, we still need to do an extra check
403 * against new flags
404 */
396 if (!options) 405 if (!options)
397 goto out; 406 goto check;
398 407
399 /* 408 /*
400 * strsep changes the string, duplicate it because parse_options 409 * strsep changes the string, duplicate it because parse_options
@@ -606,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
606 btrfs_clear_and_info(root, NOTREELOG, 615 btrfs_clear_and_info(root, NOTREELOG,
607 "enabling tree log"); 616 "enabling tree log");
608 break; 617 break;
618 case Opt_norecovery:
619 case Opt_nologreplay:
620 btrfs_set_and_info(root, NOLOGREPLAY,
621 "disabling log replay at mount time");
622 break;
609 case Opt_flushoncommit: 623 case Opt_flushoncommit:
610 btrfs_set_and_info(root, FLUSHONCOMMIT, 624 btrfs_set_and_info(root, FLUSHONCOMMIT,
611 "turning on flush-on-commit"); 625 "turning on flush-on-commit");
@@ -696,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
696 "disabling auto defrag"); 710 "disabling auto defrag");
697 break; 711 break;
698 case Opt_recovery: 712 case Opt_recovery:
699 btrfs_info(root->fs_info, "enabling auto recovery"); 713 btrfs_warn(root->fs_info,
700 btrfs_set_opt(info->mount_opt, RECOVERY); 714 "'recovery' is deprecated, use 'usebackuproot' instead");
715 case Opt_usebackuproot:
716 btrfs_info(root->fs_info,
717 "trying to use backup root at mount time");
718 btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
701 break; 719 break;
702 case Opt_skip_balance: 720 case Opt_skip_balance:
703 btrfs_set_opt(info->mount_opt, SKIP_BALANCE); 721 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
@@ -792,6 +810,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
792 break; 810 break;
793 } 811 }
794 } 812 }
813check:
814 /*
815 * Extra check for current option against current flag
816 */
817 if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
818 btrfs_err(root->fs_info,
819 "nologreplay must be used with ro mount option");
820 ret = -EINVAL;
821 }
795out: 822out:
796 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && 823 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
797 !btrfs_test_opt(root, FREE_SPACE_TREE) && 824 !btrfs_test_opt(root, FREE_SPACE_TREE) &&
@@ -1202,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1202 seq_puts(seq, ",ssd"); 1229 seq_puts(seq, ",ssd");
1203 if (btrfs_test_opt(root, NOTREELOG)) 1230 if (btrfs_test_opt(root, NOTREELOG))
1204 seq_puts(seq, ",notreelog"); 1231 seq_puts(seq, ",notreelog");
1232 if (btrfs_test_opt(root, NOLOGREPLAY))
1233 seq_puts(seq, ",nologreplay");
1205 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 1234 if (btrfs_test_opt(root, FLUSHONCOMMIT))
1206 seq_puts(seq, ",flushoncommit"); 1235 seq_puts(seq, ",flushoncommit");
1207 if (btrfs_test_opt(root, DISCARD)) 1236 if (btrfs_test_opt(root, DISCARD))
@@ -1228,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1228 seq_puts(seq, ",inode_cache"); 1257 seq_puts(seq, ",inode_cache");
1229 if (btrfs_test_opt(root, SKIP_BALANCE)) 1258 if (btrfs_test_opt(root, SKIP_BALANCE))
1230 seq_puts(seq, ",skip_balance"); 1259 seq_puts(seq, ",skip_balance");
1231 if (btrfs_test_opt(root, RECOVERY))
1232 seq_puts(seq, ",recovery");
1233#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1260#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1234 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) 1261 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
1235 seq_puts(seq, ",check_int_data"); 1262 seq_puts(seq, ",check_int_data");
@@ -1685,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1685 } 1712 }
1686 } 1713 }
1687 1714
1688 ret = btrfs_parse_options(root, data); 1715 ret = btrfs_parse_options(root, data, *flags);
1689 if (ret) { 1716 if (ret) {
1690 ret = -EINVAL; 1717 ret = -EINVAL;
1691 goto restore; 1718 goto restore;
@@ -2163,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
2163 break; 2190 break;
2164 ret = !(fs_devices->num_devices == fs_devices->total_devices); 2191 ret = !(fs_devices->num_devices == fs_devices->total_devices);
2165 break; 2192 break;
2193 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
2194 ret = btrfs_ioctl_get_supported_features((void __user*)arg);
2195 break;
2166 } 2196 }
2167 2197
2168 kfree(vol); 2198 kfree(vol);
@@ -2261,7 +2291,7 @@ static void btrfs_interface_exit(void)
2261 misc_deregister(&btrfs_misc); 2291 misc_deregister(&btrfs_misc);
2262} 2292}
2263 2293
2264static void btrfs_print_info(void) 2294static void btrfs_print_mod_info(void)
2265{ 2295{
2266 printk(KERN_INFO "Btrfs loaded" 2296 printk(KERN_INFO "Btrfs loaded"
2267#ifdef CONFIG_BTRFS_DEBUG 2297#ifdef CONFIG_BTRFS_DEBUG
@@ -2363,7 +2393,7 @@ static int __init init_btrfs_fs(void)
2363 2393
2364 btrfs_init_lockdep(); 2394 btrfs_init_lockdep();
2365 2395
2366 btrfs_print_info(); 2396 btrfs_print_mod_info();
2367 2397
2368 err = btrfs_run_sanity_tests(); 2398 err = btrfs_run_sanity_tests();
2369 if (err) 2399 if (err)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 1c76d73e06dc..f54bf450bad3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -188,12 +188,6 @@ btrfs_alloc_dummy_block_group(unsigned long length)
188 kfree(cache); 188 kfree(cache);
189 return NULL; 189 return NULL;
190 } 190 }
191 cache->fs_info = btrfs_alloc_dummy_fs_info();
192 if (!cache->fs_info) {
193 kfree(cache->free_space_ctl);
194 kfree(cache);
195 return NULL;
196 }
197 191
198 cache->key.objectid = 0; 192 cache->key.objectid = 0;
199 cache->key.offset = length; 193 cache->key.offset = length;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index d05fe1ab4808..7cea4462acd5 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -485,6 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps)
485 cache->bitmap_low_thresh = 0; 485 cache->bitmap_low_thresh = 0;
486 cache->bitmap_high_thresh = (u32)-1; 486 cache->bitmap_high_thresh = (u32)-1;
487 cache->needs_free_space = 1; 487 cache->needs_free_space = 1;
488 cache->fs_info = root->fs_info;
488 489
489 btrfs_init_dummy_trans(&trans); 490 btrfs_init_dummy_trans(&trans);
490 491
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e2d3da02deee..863a6a3af1f8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
22#include "../disk-io.h" 22#include "../disk-io.h"
23#include "../extent_io.h" 23#include "../extent_io.h"
24#include "../volumes.h" 24#include "../volumes.h"
25#include "../compression.h"
25 26
26static void insert_extent(struct btrfs_root *root, u64 start, u64 len, 27static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
27 u64 ram_bytes, u64 offset, u64 disk_bytenr, 28 u64 ram_bytes, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b6031ce474f7..43885e51b882 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -637,6 +637,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
637 637
638 trans->block_rsv = &root->fs_info->trans_block_rsv; 638 trans->block_rsv = &root->fs_info->trans_block_rsv;
639 trans->bytes_reserved = num_bytes; 639 trans->bytes_reserved = num_bytes;
640 trace_btrfs_space_reservation(root->fs_info, "transaction",
641 trans->transid, num_bytes, 1);
640 642
641 return trans; 643 return trans;
642} 644}
@@ -1333,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1333 struct dentry *dentry; 1335 struct dentry *dentry;
1334 struct extent_buffer *tmp; 1336 struct extent_buffer *tmp;
1335 struct extent_buffer *old; 1337 struct extent_buffer *old;
1336 struct timespec cur_time = CURRENT_TIME; 1338 struct timespec cur_time;
1337 int ret = 0; 1339 int ret = 0;
1338 u64 to_reserve = 0; 1340 u64 to_reserve = 0;
1339 u64 index = 0; 1341 u64 index = 0;
@@ -1375,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1375 rsv = trans->block_rsv; 1377 rsv = trans->block_rsv;
1376 trans->block_rsv = &pending->block_rsv; 1378 trans->block_rsv = &pending->block_rsv;
1377 trans->bytes_reserved = trans->block_rsv->reserved; 1379 trans->bytes_reserved = trans->block_rsv->reserved;
1378 1380 trace_btrfs_space_reservation(root->fs_info, "transaction",
1381 trans->transid,
1382 trans->bytes_reserved, 1);
1379 dentry = pending->dentry; 1383 dentry = pending->dentry;
1380 parent_inode = pending->dir; 1384 parent_inode = pending->dir;
1381 parent_root = BTRFS_I(parent_inode)->root; 1385 parent_root = BTRFS_I(parent_inode)->root;
1382 record_root_in_trans(trans, parent_root); 1386 record_root_in_trans(trans, parent_root);
1383 1387
1388 cur_time = current_fs_time(parent_inode->i_sb);
1389
1384 /* 1390 /*
1385 * insert the directory item 1391 * insert the directory item
1386 */ 1392 */
@@ -1523,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1523 1529
1524 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1530 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1525 dentry->d_name.len * 2); 1531 dentry->d_name.len * 2);
1526 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 1532 parent_inode->i_mtime = parent_inode->i_ctime =
1533 current_fs_time(parent_inode->i_sb);
1527 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); 1534 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
1528 if (ret) { 1535 if (ret) {
1529 btrfs_abort_transaction(trans, root, ret); 1536 btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 978c3a810893..24d03c751149 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
26#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h" 27#include "backref.h"
28#include "hash.h" 28#include "hash.h"
29#include "compression.h"
29 30
30/* magic values for the inode_only field in btrfs_log_inode: 31/* magic values for the inode_only field in btrfs_log_inode:
31 * 32 *
@@ -1045,7 +1046,7 @@ again:
1045 1046
1046 /* 1047 /*
1047 * NOTE: we have searched root tree and checked the 1048 * NOTE: we have searched root tree and checked the
1048 * coresponding ref, it does not need to check again. 1049 * corresponding ref, it does not need to check again.
1049 */ 1050 */
1050 *search_done = 1; 1051 *search_done = 1;
1051 } 1052 }
@@ -4500,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4500 4501
4501 mutex_lock(&BTRFS_I(inode)->log_mutex); 4502 mutex_lock(&BTRFS_I(inode)->log_mutex);
4502 4503
4503 btrfs_get_logged_extents(inode, &logged_list, start, end); 4504 /*
4505 * Collect ordered extents only if we are logging data. This is to
4506 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4507 * will process the ordered extents if they still exists at the time,
4508 * because when we collect them we test and set for the flag
4509 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
4510 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4511 * not processing the ordered extents is that we end up logging the
4512 * corresponding file extent items, based on the extent maps in the
4513 * inode's extent_map_tree's modified_list, without logging the
4514 * respective checksums (since they may still be only attached to the
4515 * ordered extents and have not been inserted in the csum tree by
4516 * btrfs_finish_ordered_io() yet).
4517 */
4518 if (inode_only == LOG_INODE_ALL)
4519 btrfs_get_logged_extents(inode, &logged_list, start, end);
4504 4520
4505 /* 4521 /*
4506 * a brute force approach to making sure we get the most uptodate 4522 * a brute force approach to making sure we get the most uptodate
@@ -4772,6 +4788,42 @@ out_unlock:
4772} 4788}
4773 4789
4774/* 4790/*
4791 * Check if we must fallback to a transaction commit when logging an inode.
4792 * This must be called after logging the inode and is used only in the context
4793 * when fsyncing an inode requires the need to log some other inode - in which
4794 * case we can't lock the i_mutex of each other inode we need to log as that
4795 * can lead to deadlocks with concurrent fsync against other inodes (as we can
4796 * log inodes up or down in the hierarchy) or rename operations for example. So
4797 * we take the log_mutex of the inode after we have logged it and then check for
4798 * its last_unlink_trans value - this is safe because any task setting
4799 * last_unlink_trans must take the log_mutex and it must do this before it does
4800 * the actual unlink operation, so if we do this check before a concurrent task
4801 * sets last_unlink_trans it means we've logged a consistent version/state of
4802 * all the inode items, otherwise we are not sure and must do a transaction
4803 * commit (the concurrent task might have only updated last_unlink_trans before
4804 * we logged the inode or it might have also done the unlink).
4805 */
4806static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
4807 struct inode *inode)
4808{
4809 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4810 bool ret = false;
4811
4812 mutex_lock(&BTRFS_I(inode)->log_mutex);
4813 if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
4814 /*
4815 * Make sure any commits to the log are forced to be full
4816 * commits.
4817 */
4818 btrfs_set_log_full_commit(fs_info, trans);
4819 ret = true;
4820 }
4821 mutex_unlock(&BTRFS_I(inode)->log_mutex);
4822
4823 return ret;
4824}
4825
4826/*
4775 * follow the dentry parent pointers up the chain and see if any 4827 * follow the dentry parent pointers up the chain and see if any
4776 * of the directories in it require a full commit before they can 4828 * of the directories in it require a full commit before they can
4777 * be logged. Returns zero if nothing special needs to be done or 1 if 4829 * be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4784,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4784 u64 last_committed) 4836 u64 last_committed)
4785{ 4837{
4786 int ret = 0; 4838 int ret = 0;
4787 struct btrfs_root *root;
4788 struct dentry *old_parent = NULL; 4839 struct dentry *old_parent = NULL;
4789 struct inode *orig_inode = inode; 4840 struct inode *orig_inode = inode;
4790 4841
@@ -4816,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4816 BTRFS_I(inode)->logged_trans = trans->transid; 4867 BTRFS_I(inode)->logged_trans = trans->transid;
4817 smp_mb(); 4868 smp_mb();
4818 4869
4819 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4870 if (btrfs_must_commit_transaction(trans, inode)) {
4820 root = BTRFS_I(inode)->root;
4821
4822 /*
4823 * make sure any commits to the log are forced
4824 * to be full commits
4825 */
4826 btrfs_set_log_full_commit(root->fs_info, trans);
4827 ret = 1; 4871 ret = 1;
4828 break; 4872 break;
4829 } 4873 }
@@ -4982,6 +5026,9 @@ process_leaf:
4982 btrfs_release_path(path); 5026 btrfs_release_path(path);
4983 ret = btrfs_log_inode(trans, root, di_inode, 5027 ret = btrfs_log_inode(trans, root, di_inode,
4984 log_mode, 0, LLONG_MAX, ctx); 5028 log_mode, 0, LLONG_MAX, ctx);
5029 if (!ret &&
5030 btrfs_must_commit_transaction(trans, di_inode))
5031 ret = 1;
4985 iput(di_inode); 5032 iput(di_inode);
4986 if (ret) 5033 if (ret)
4987 goto next_dir_inode; 5034 goto next_dir_inode;
@@ -5096,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5096 5143
5097 ret = btrfs_log_inode(trans, root, dir_inode, 5144 ret = btrfs_log_inode(trans, root, dir_inode,
5098 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5145 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5146 if (!ret &&
5147 btrfs_must_commit_transaction(trans, dir_inode))
5148 ret = 1;
5099 iput(dir_inode); 5149 iput(dir_inode);
5100 if (ret) 5150 if (ret)
5101 goto out; 5151 goto out;
@@ -5447,6 +5497,9 @@ error:
5447 * They revolve around files there were unlinked from the directory, and 5497 * They revolve around files there were unlinked from the directory, and
5448 * this function updates the parent directory so that a full commit is 5498 * this function updates the parent directory so that a full commit is
5449 * properly done if it is fsync'd later after the unlinks are done. 5499 * properly done if it is fsync'd later after the unlinks are done.
5500 *
5501 * Must be called before the unlink operations (updates to the subvolume tree,
5502 * inodes, etc) are done.
5450 */ 5503 */
5451void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5504void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5452 struct inode *dir, struct inode *inode, 5505 struct inode *dir, struct inode *inode,
@@ -5462,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5462 * into the file. When the file is logged we check it and 5515 * into the file. When the file is logged we check it and
5463 * don't log the parents if the file is fully on disk. 5516 * don't log the parents if the file is fully on disk.
5464 */ 5517 */
5465 if (S_ISREG(inode->i_mode)) 5518 if (S_ISREG(inode->i_mode)) {
5519 mutex_lock(&BTRFS_I(inode)->log_mutex);
5466 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5520 BTRFS_I(inode)->last_unlink_trans = trans->transid;
5521 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5522 }
5467 5523
5468 /* 5524 /*
5469 * if this directory was already logged any new 5525 * if this directory was already logged any new
@@ -5494,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5494 return; 5550 return;
5495 5551
5496record: 5552record:
5553 mutex_lock(&BTRFS_I(dir)->log_mutex);
5554 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5555 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5556}
5557
5558/*
5559 * Make sure that if someone attempts to fsync the parent directory of a deleted
5560 * snapshot, it ends up triggering a transaction commit. This is to guarantee
5561 * that after replaying the log tree of the parent directory's root we will not
5562 * see the snapshot anymore and at log replay time we will not see any log tree
5563 * corresponding to the deleted snapshot's root, which could lead to replaying
5564 * it after replaying the log tree of the parent directory (which would replay
5565 * the snapshot delete operation).
5566 *
5567 * Must be called before the actual snapshot destroy operation (updates to the
5568 * parent root and tree of tree roots trees, etc) are done.
5569 */
5570void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5571 struct inode *dir)
5572{
5573 mutex_lock(&BTRFS_I(dir)->log_mutex);
5497 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5574 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5575 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5498} 5576}
5499 5577
5500/* 5578/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 6916a781ea02..a9f1b75d080d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
80 struct inode *dir, struct inode *inode, 80 struct inode *dir, struct inode *inode,
81 int for_rename); 81 int for_rename);
82void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
83 struct inode *dir);
82int btrfs_log_new_name(struct btrfs_trans_handle *trans, 84int btrfs_log_new_name(struct btrfs_trans_handle *trans,
83 struct inode *inode, struct inode *old_dir, 85 struct inode *inode, struct inode *old_dir,
84 struct dentry *parent); 86 struct dentry *parent);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 366b335946fa..e2b54d546b7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
138{ 138{
139 struct btrfs_fs_devices *fs_devs; 139 struct btrfs_fs_devices *fs_devs;
140 140
141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
142 if (!fs_devs) 142 if (!fs_devs)
143 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
144 144
@@ -220,7 +220,7 @@ static struct btrfs_device *__alloc_device(void)
220{ 220{
221 struct btrfs_device *dev; 221 struct btrfs_device *dev;
222 222
223 dev = kzalloc(sizeof(*dev), GFP_NOFS); 223 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
224 if (!dev) 224 if (!dev)
225 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
226 226
@@ -733,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
733 * uuid mutex so nothing we touch in here is going to disappear. 733 * uuid mutex so nothing we touch in here is going to disappear.
734 */ 734 */
735 if (orig_dev->name) { 735 if (orig_dev->name) {
736 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 736 name = rcu_string_strdup(orig_dev->name->str,
737 GFP_KERNEL);
737 if (!name) { 738 if (!name) {
738 kfree(device); 739 kfree(device);
739 goto error; 740 goto error;
@@ -1714,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1714 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1715 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1715 1716
1716 num_devices = root->fs_info->fs_devices->num_devices; 1717 num_devices = root->fs_info->fs_devices->num_devices;
1717 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1718 btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
1718 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1719 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1719 WARN_ON(num_devices < 1); 1720 WARN_ON(num_devices < 1);
1720 num_devices--; 1721 num_devices--;
1721 } 1722 }
1722 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1723 btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
1723 1724
1724 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1725 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1725 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1726 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
@@ -2287,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2287 goto error; 2288 goto error;
2288 } 2289 }
2289 2290
2290 name = rcu_string_strdup(device_path, GFP_NOFS); 2291 name = rcu_string_strdup(device_path, GFP_KERNEL);
2291 if (!name) { 2292 if (!name) {
2292 kfree(device); 2293 kfree(device);
2293 ret = -ENOMEM; 2294 ret = -ENOMEM;
@@ -2748,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2748 em->start + em->len < chunk_offset) { 2749 em->start + em->len < chunk_offset) {
2749 /* 2750 /*
2750 * This is a logic error, but we don't want to just rely on the 2751 * This is a logic error, but we don't want to just rely on the
2751 * user having built with ASSERT enabled, so if ASSERT doens't 2752 * user having built with ASSERT enabled, so if ASSERT doesn't
2752 * do anything we still error out. 2753 * do anything we still error out.
2753 */ 2754 */
2754 ASSERT(0); 2755 ASSERT(0);
@@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root,
2966 } 2967 }
2967 2968
2968 key.objectid = BTRFS_BALANCE_OBJECTID; 2969 key.objectid = BTRFS_BALANCE_OBJECTID;
2969 key.type = BTRFS_BALANCE_ITEM_KEY; 2970 key.type = BTRFS_TEMPORARY_ITEM_KEY;
2970 key.offset = 0; 2971 key.offset = 0;
2971 2972
2972 ret = btrfs_insert_empty_item(trans, root, path, &key, 2973 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root)
3015 } 3016 }
3016 3017
3017 key.objectid = BTRFS_BALANCE_OBJECTID; 3018 key.objectid = BTRFS_BALANCE_OBJECTID;
3018 key.type = BTRFS_BALANCE_ITEM_KEY; 3019 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3019 key.offset = 0; 3020 key.offset = 0;
3020 3021
3021 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3022 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3686,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3686 } 3687 }
3687 3688
3688 num_devices = fs_info->fs_devices->num_devices; 3689 num_devices = fs_info->fs_devices->num_devices;
3689 btrfs_dev_replace_lock(&fs_info->dev_replace); 3690 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3690 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3691 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3691 BUG_ON(num_devices < 1); 3692 BUG_ON(num_devices < 1);
3692 num_devices--; 3693 num_devices--;
3693 } 3694 }
3694 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3695 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3695 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3696 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3696 if (num_devices == 1) 3697 if (num_devices == 1)
3697 allowed |= BTRFS_BLOCK_GROUP_DUP; 3698 allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -3867,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3867 return -ENOMEM; 3868 return -ENOMEM;
3868 3869
3869 key.objectid = BTRFS_BALANCE_OBJECTID; 3870 key.objectid = BTRFS_BALANCE_OBJECTID;
3870 key.type = BTRFS_BALANCE_ITEM_KEY; 3871 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3871 key.offset = 0; 3872 key.offset = 0;
3872 3873
3873 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3874 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
@@ -4118,7 +4119,7 @@ out:
4118 * Callback for btrfs_uuid_tree_iterate(). 4119 * Callback for btrfs_uuid_tree_iterate().
4119 * returns: 4120 * returns:
4120 * 0 check succeeded, the entry is not outdated. 4121 * 0 check succeeded, the entry is not outdated.
4121 * < 0 if an error occured. 4122 * < 0 if an error occurred.
4122 * > 0 if the check failed, which means the caller shall remove the entry. 4123 * > 0 if the check failed, which means the caller shall remove the entry.
4123 */ 4124 */
4124static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4125static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
@@ -5062,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5062 ret = 1; 5063 ret = 1;
5063 free_extent_map(em); 5064 free_extent_map(em);
5064 5065
5065 btrfs_dev_replace_lock(&fs_info->dev_replace); 5066 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
5066 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 5067 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
5067 ret++; 5068 ret++;
5068 btrfs_dev_replace_unlock(&fs_info->dev_replace); 5069 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
5069 5070
5070 return ret; 5071 return ret;
5071} 5072}
@@ -5325,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5325 if (!bbio_ret) 5326 if (!bbio_ret)
5326 goto out; 5327 goto out;
5327 5328
5328 btrfs_dev_replace_lock(dev_replace); 5329 btrfs_dev_replace_lock(dev_replace, 0);
5329 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5330 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5330 if (!dev_replace_is_ongoing) 5331 if (!dev_replace_is_ongoing)
5331 btrfs_dev_replace_unlock(dev_replace); 5332 btrfs_dev_replace_unlock(dev_replace, 0);
5333 else
5334 btrfs_dev_replace_set_lock_blocking(dev_replace);
5332 5335
5333 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5336 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5334 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 5337 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
@@ -5751,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5751 bbio->mirror_num = map->num_stripes + 1; 5754 bbio->mirror_num = map->num_stripes + 1;
5752 } 5755 }
5753out: 5756out:
5754 if (dev_replace_is_ongoing) 5757 if (dev_replace_is_ongoing) {
5755 btrfs_dev_replace_unlock(dev_replace); 5758 btrfs_dev_replace_clear_lock_blocking(dev_replace);
5759 btrfs_dev_replace_unlock(dev_replace, 0);
5760 }
5756 free_extent_map(em); 5761 free_extent_map(em);
5757 return ret; 5762 return ret;
5758} 5763}
@@ -6705,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6705 int item_size; 6710 int item_size;
6706 struct btrfs_dev_stats_item *ptr; 6711 struct btrfs_dev_stats_item *ptr;
6707 6712
6708 key.objectid = 0; 6713 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6709 key.type = BTRFS_DEV_STATS_KEY; 6714 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6710 key.offset = device->devid; 6715 key.offset = device->devid;
6711 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6716 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6712 if (ret) { 6717 if (ret) {
@@ -6753,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6753 int ret; 6758 int ret;
6754 int i; 6759 int i;
6755 6760
6756 key.objectid = 0; 6761 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6757 key.type = BTRFS_DEV_STATS_KEY; 6762 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6758 key.offset = device->devid; 6763 key.offset = device->devid;
6759 6764
6760 path = btrfs_alloc_path(); 6765 path = btrfs_alloc_path();
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6c68d6356197..145d2b89e62d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
249 goto out; 249 goto out;
250 250
251 inode_inc_iversion(inode); 251 inode_inc_iversion(inode);
252 inode->i_ctime = CURRENT_TIME; 252 inode->i_ctime = current_fs_time(inode->i_sb);
253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
254 ret = btrfs_update_inode(trans, root, inode); 254 ret = btrfs_update_inode(trans, root, inode);
255 BUG_ON(ret); 255 BUG_ON(ret);
@@ -260,16 +260,12 @@ out:
260 260
261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) 261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
262{ 262{
263 struct btrfs_key key, found_key; 263 struct btrfs_key key;
264 struct inode *inode = d_inode(dentry); 264 struct inode *inode = d_inode(dentry);
265 struct btrfs_root *root = BTRFS_I(inode)->root; 265 struct btrfs_root *root = BTRFS_I(inode)->root;
266 struct btrfs_path *path; 266 struct btrfs_path *path;
267 struct extent_buffer *leaf; 267 int ret = 0;
268 struct btrfs_dir_item *di;
269 int ret = 0, slot;
270 size_t total_size = 0, size_left = size; 268 size_t total_size = 0, size_left = size;
271 unsigned long name_ptr;
272 size_t name_len;
273 269
274 /* 270 /*
275 * ok we want all objects associated with this id. 271 * ok we want all objects associated with this id.
@@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
291 goto err; 287 goto err;
292 288
293 while (1) { 289 while (1) {
290 struct extent_buffer *leaf;
291 int slot;
292 struct btrfs_dir_item *di;
293 struct btrfs_key found_key;
294 u32 item_size;
295 u32 cur;
296
294 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
295 slot = path->slots[0]; 298 slot = path->slots[0];
296 299
@@ -316,31 +319,45 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
316 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 319 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
317 break; 320 break;
318 if (found_key.type < BTRFS_XATTR_ITEM_KEY) 321 if (found_key.type < BTRFS_XATTR_ITEM_KEY)
319 goto next; 322 goto next_item;
320 323
321 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 324 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
322 if (verify_dir_item(root, leaf, di)) 325 item_size = btrfs_item_size_nr(leaf, slot);
323 goto next; 326 cur = 0;
324 327 while (cur < item_size) {
325 name_len = btrfs_dir_name_len(leaf, di); 328 u16 name_len = btrfs_dir_name_len(leaf, di);
326 total_size += name_len + 1; 329 u16 data_len = btrfs_dir_data_len(leaf, di);
330 u32 this_len = sizeof(*di) + name_len + data_len;
331 unsigned long name_ptr = (unsigned long)(di + 1);
332
333 if (verify_dir_item(root, leaf, di)) {
334 ret = -EIO;
335 goto err;
336 }
327 337
328 /* we are just looking for how big our buffer needs to be */ 338 total_size += name_len + 1;
329 if (!size) 339 /*
330 goto next; 340 * We are just looking for how big our buffer needs to
341 * be.
342 */
343 if (!size)
344 goto next;
331 345
332 if (!buffer || (name_len + 1) > size_left) { 346 if (!buffer || (name_len + 1) > size_left) {
333 ret = -ERANGE; 347 ret = -ERANGE;
334 goto err; 348 goto err;
335 } 349 }
336 350
337 name_ptr = (unsigned long)(di + 1); 351 read_extent_buffer(leaf, buffer, name_ptr, name_len);
338 read_extent_buffer(leaf, buffer, name_ptr, name_len); 352 buffer[name_len] = '\0';
339 buffer[name_len] = '\0';
340 353
341 size_left -= name_len + 1; 354 size_left -= name_len + 1;
342 buffer += name_len + 1; 355 buffer += name_len + 1;
343next: 356next:
357 cur += this_len;
358 di = (struct btrfs_dir_item *)((char *)di + this_len);
359 }
360next_item:
344 path->slots[0]++; 361 path->slots[0]++;
345 } 362 }
346 ret = total_size; 363 ret = total_size;