author     Linus Torvalds <torvalds@linux-foundation.org>    2012-12-18 12:42:05 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-12-18 12:42:05 -0500
commit     a22180d2666c018f4fef6818074d78bb76ff2bda (patch)
tree       a633aaf423ff39f94d00502d03dbbd99dab4b2ee /fs/btrfs/file.c
parent     2d4dce0070448bcb5ccd04553a4be4635417f565 (diff)
parent     213490b301773ea9c6fb89a86424a6901fcdd069 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
"A big set of fixes and features.
In terms of line count, most of the code comes from Stefan, who added
the ability to replace a single drive in place. This is different
from how btrfs normally replaces drives, and is much much much faster.
Josef is plowing through our synchronous write performance. This pull
request does not include the DIO_OWN_WAITING patch that was discussed
on the list, but it has a number of other improvements to cut down our
latencies and CPU time during fsync/O_DIRECT writes.
Miao Xie has a big series of fixes and is spreading out ordered
operations over more CPUs. This improves performance and reduces
contention.
I've put in fixes for error handling around hash collisions. These
are going back to individual stable kernels as I test against them.
Otherwise we have a lot of fixes and cleanups, thanks everyone!
raid5/6 is being rebased against the device replacement code. I'll
have it posted this Friday along with a nice series of benchmarks."
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
Btrfs: fix a bug of per-file nocow
Btrfs: fix hash overflow handling
Btrfs: don't take inode delalloc mutex if we're a free space inode
Btrfs: fix autodefrag and umount lockup
Btrfs: fix permissions of empty files not affected by umask
Btrfs: put raid properties into global table
Btrfs: fix BUG() in scrub when first superblock reading gives EIO
Btrfs: do not call file_update_time in aio_write
Btrfs: only unlock and relock if we have to
Btrfs: use tokens where we can in the tree log
Btrfs: optimize leaf_space_used
Btrfs: don't memset new tokens
Btrfs: only clear dirty on the buffer if it is marked as dirty
Btrfs: move checks in set_page_dirty under DEBUG
Btrfs: log changed inodes based on the extent map tree
Btrfs: add path->really_keep_locks
Btrfs: do not mark ems as prealloc if we are writing to them
Btrfs: keep track of the extents original block length
Btrfs: inline csums if we're fsyncing
Btrfs: don't bother copying if we're only logging the inode
...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--  fs/btrfs/file.c | 406
1 file changed, 264 insertions(+), 142 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c6673a9231f..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
 #include "compat.h"
 #include "volumes.h"
 
+static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
  * when auto defrag is enabled we
  * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
                                     struct inode_defrag *defrag)
 {
     struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
                entry->transid = defrag->transid;
            if (defrag->last_offset > entry->last_offset)
                entry->last_offset = defrag->last_offset;
-           goto exists;
+           return -EEXIST;
        }
    }
    set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
    rb_link_node(&defrag->rb_node, parent, p);
    rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-   return;
+   return 0;
+}
 
-exists:
-   kfree(defrag);
-   return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+   if (!btrfs_test_opt(root, AUTO_DEFRAG))
+       return 0;
+
+   if (btrfs_fs_closing(root->fs_info))
+       return 0;
 
+   return 1;
 }
 
 /*
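The rbtree walk above is ordered by __compare_inode_defrag(), which sorts defrag records by (root objectid, inode number); the comparator itself is untouched by this series. A minimal sketch consistent with that ordering:

    static int __compare_inode_defrag(struct inode_defrag *defrag1,
                                      struct inode_defrag *defrag2)
    {
        /* sort by root objectid first, then by inode number */
        if (defrag1->root > defrag2->root)
            return 1;
        else if (defrag1->root < defrag2->root)
            return -1;
        else if (defrag1->ino > defrag2->ino)
            return 1;
        else if (defrag1->ino < defrag2->ino)
            return -1;
        else
            return 0;
    }

Because equal keys now return -EEXIST instead of jumping to a label, the caller owns the decision to free the duplicate record.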
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
    struct btrfs_root *root = BTRFS_I(inode)->root;
    struct inode_defrag *defrag;
    u64 transid;
+   int ret;
 
-   if (!btrfs_test_opt(root, AUTO_DEFRAG))
-       return 0;
-
-   if (btrfs_fs_closing(root->fs_info))
+   if (!__need_auto_defrag(root))
        return 0;
 
    if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
    else
        transid = BTRFS_I(inode)->root->last_trans;
 
-   defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+   defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
    if (!defrag)
        return -ENOMEM;
 
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
    defrag->root = root->root_key.objectid;
 
    spin_lock(&root->fs_info->defrag_inodes_lock);
-   if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-       __btrfs_add_inode_defrag(inode, defrag);
-   else
-       kfree(defrag);
+   if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+       /*
+        * If we set IN_DEFRAG flag and evict the inode from memory,
+        * and then re-read this inode, this new inode doesn't have
+        * IN_DEFRAG flag. At the case, we may find the existed defrag.
+        */
+       ret = __btrfs_add_inode_defrag(inode, defrag);
+       if (ret)
+           kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+   } else {
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+   }
    spin_unlock(&root->fs_info->defrag_inodes_lock);
    return 0;
 }
 
 /*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
  */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-                                            u64 root, u64 ino,
-                                            struct rb_node **next)
+void btrfs_requeue_inode_defrag(struct inode *inode,
+                                struct inode_defrag *defrag)
+{
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   int ret;
+
+   if (!__need_auto_defrag(root))
+       goto out;
+
+   /*
+    * Here we don't check the IN_DEFRAG flag, because we need merge
+    * them together.
+    */
+   spin_lock(&root->fs_info->defrag_inodes_lock);
+   ret = __btrfs_add_inode_defrag(inode, defrag);
+   spin_unlock(&root->fs_info->defrag_inodes_lock);
+   if (ret)
+       goto out;
+   return;
+out:
+   kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 {
    struct inode_defrag *entry = NULL;
    struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
    tmp.ino = ino;
    tmp.root = root;
 
-   p = info->defrag_inodes.rb_node;
+   spin_lock(&fs_info->defrag_inodes_lock);
+   p = fs_info->defrag_inodes.rb_node;
    while (p) {
        parent = p;
        entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
        else if (ret > 0)
            p = parent->rb_right;
        else
-           return entry;
+           goto out;
    }
 
-   if (next) {
-       while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-           parent = rb_next(parent);
-           entry = rb_entry(parent, struct inode_defrag, rb_node);
-       }
-       *next = parent;
-   }
-   return NULL;
+   if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+       parent = rb_next(parent);
+       if (parent)
+           entry = rb_entry(parent, struct inode_defrag, rb_node);
+       else
+           entry = NULL;
+   }
+out:
+   if (entry)
+       rb_erase(parent, &fs_info->defrag_inodes);
+   spin_unlock(&fs_info->defrag_inodes_lock);
+   return entry;
 }
 
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
    struct inode_defrag *defrag;
+   struct rb_node *node;
+
+   spin_lock(&fs_info->defrag_inodes_lock);
+   node = rb_first(&fs_info->defrag_inodes);
+   while (node) {
+       rb_erase(node, &fs_info->defrag_inodes);
+       defrag = rb_entry(node, struct inode_defrag, rb_node);
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+       if (need_resched()) {
+           spin_unlock(&fs_info->defrag_inodes_lock);
+           cond_resched();
+           spin_lock(&fs_info->defrag_inodes_lock);
+       }
+
+       node = rb_first(&fs_info->defrag_inodes);
+   }
+   spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH     1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+                                    struct inode_defrag *defrag)
+{
    struct btrfs_root *inode_root;
    struct inode *inode;
-   struct rb_node *n;
    struct btrfs_key key;
    struct btrfs_ioctl_defrag_range_args range;
-   u64 first_ino = 0;
-   u64 root_objectid = 0;
    int num_defrag;
-   int defrag_batch = 1024;
 
+   /* get the inode */
+   key.objectid = defrag->root;
+   btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+   key.offset = (u64)-1;
+   inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+   if (IS_ERR(inode_root)) {
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       return PTR_ERR(inode_root);
+   }
+
+   key.objectid = defrag->ino;
+   btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+   key.offset = 0;
+   inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+   if (IS_ERR(inode)) {
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       return PTR_ERR(inode);
+   }
+
+   /* do a chunk of defrag */
+   clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
    memset(&range, 0, sizeof(range));
    range.len = (u64)-1;
+   range.start = defrag->last_offset;
+
+   sb_start_write(fs_info->sb);
+   num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                  BTRFS_DEFRAG_BATCH);
+   sb_end_write(fs_info->sb);
+   /*
+    * if we filled the whole defrag batch, there
+    * must be more work to do. Queue this defrag
+    * again
+    */
+   if (num_defrag == BTRFS_DEFRAG_BATCH) {
+       defrag->last_offset = range.start;
+       btrfs_requeue_inode_defrag(inode, defrag);
+   } else if (defrag->last_offset && !defrag->cycled) {
+       /*
+        * we didn't fill our defrag batch, but
+        * we didn't start at zero. Make sure we loop
+        * around to the start of the file.
+        */
+       defrag->last_offset = 0;
+       defrag->cycled = 1;
+       btrfs_requeue_inode_defrag(inode, defrag);
+   } else {
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+   }
+
+   iput(inode);
+   return 0;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+   struct inode_defrag *defrag;
+   u64 first_ino = 0;
+   u64 root_objectid = 0;
 
    atomic_inc(&fs_info->defrag_running);
-   spin_lock(&fs_info->defrag_inodes_lock);
    while(1) {
-       n = NULL;
+       if (!__need_auto_defrag(fs_info->tree_root))
+           break;
 
        /* find an inode to defrag */
-       defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-                                        first_ino, &n);
+       defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+                                        first_ino);
        if (!defrag) {
-           if (n) {
-               defrag = rb_entry(n, struct inode_defrag,
-                                 rb_node);
-           } else if (root_objectid || first_ino) {
+           if (root_objectid || first_ino) {
                root_objectid = 0;
                first_ino = 0;
                continue;
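btrfs_cleanup_defrag_inodes() above uses a recurring kernel idiom: drain a spinlock-protected structure, but drop the lock and call cond_resched() whenever need_resched() fires, then re-read the first element because the tree may have changed while the lock was released. The same pattern in isolation (a generic sketch, not code from this patch; erase_and_free stands in for the caller-specific teardown):

    /*
     * Sketch: free every node of a spinlock-protected rbtree without
     * monopolizing the CPU. 'erase_and_free' must not sleep.
     */
    static void drain_tree(spinlock_t *lock, struct rb_root *root,
                           void (*erase_and_free)(struct rb_node *))
    {
        struct rb_node *node;

        spin_lock(lock);
        while ((node = rb_first(root))) {
            rb_erase(node, root);
            erase_and_free(node);

            if (need_resched()) {
                /* let other tasks run; the tree may change meanwhile */
                spin_unlock(lock);
                cond_resched();
                spin_lock(lock);
            }
            /* rb_first() is re-evaluated on the next iteration */
        }
        spin_unlock(lock);
    }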
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
            }
        }
 
-       /* remove it from the rbtree */
        first_ino = defrag->ino + 1;
        root_objectid = defrag->root;
-       rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
-       if (btrfs_fs_closing(fs_info))
-           goto next_free;
-
-       spin_unlock(&fs_info->defrag_inodes_lock);
-
-       /* get the inode */
-       key.objectid = defrag->root;
-       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-       key.offset = (u64)-1;
-       inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-       if (IS_ERR(inode_root))
-           goto next;
-
-       key.objectid = defrag->ino;
-       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-       key.offset = 0;
-
-       inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-       if (IS_ERR(inode))
-           goto next;
 
-       /* do a chunk of defrag */
-       clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-       range.start = defrag->last_offset;
-       num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-                                      defrag_batch);
-       /*
-        * if we filled the whole defrag batch, there
-        * must be more work to do. Queue this defrag
-        * again
-        */
-       if (num_defrag == defrag_batch) {
-           defrag->last_offset = range.start;
-           __btrfs_add_inode_defrag(inode, defrag);
-           /*
-            * we don't want to kfree defrag, we added it back to
-            * the rbtree
-            */
-           defrag = NULL;
-       } else if (defrag->last_offset && !defrag->cycled) {
-           /*
-            * we didn't fill our defrag batch, but
-            * we didn't start at zero. Make sure we loop
-            * around to the start of the file.
-            */
-           defrag->last_offset = 0;
-           defrag->cycled = 1;
-           __btrfs_add_inode_defrag(inode, defrag);
-           defrag = NULL;
-       }
-
-       iput(inode);
-next:
-       spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
-       kfree(defrag);
+       __btrfs_run_defrag_inode(fs_info, defrag);
    }
-   spin_unlock(&fs_info->defrag_inodes_lock);
-
    atomic_dec(&fs_info->defrag_running);
 
    /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
            split->block_len = em->block_len;
        else
            split->block_len = split->len;
+       split->orig_block_len = max(split->block_len,
+                                   em->orig_block_len);
        split->generation = gen;
        split->bdev = em->bdev;
        split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
        split->flags = flags;
        split->compress_type = em->compress_type;
        split->generation = gen;
+       split->orig_block_len = max(em->block_len,
+                                   em->orig_block_len);
 
        if (compressed) {
            split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
        } else {
            split->block_len = split->len;
            split->block_start = em->block_start + diff;
-           split->orig_start = split->start;
+           split->orig_start = em->orig_start;
        }
 
        ret = add_extent_mapping(em_tree, split);
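Taken together, the three changes above keep split extent maps pointing at the geometry of the original allocation. A worked example with assumed numbers: take an uncompressed em covering file bytes [0, 128K) at block_start 1M, with orig_start 0 and orig_block_len 128K, from which [0, 64K) is dropped. The surviving tail split gets start = 64K and block_start = 1M + 64K, but now orig_start = em->orig_start = 0 (the old code set it to split->start = 64K) and orig_block_len = 128K. That preserved geometry is presumably what the extent-map-based fsync logging elsewhere in this pull ("Btrfs: log changed inodes based on the extent map tree") relies on.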
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
        balance_dirty_pages_ratelimited(inode->i_mapping);
        if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-           btrfs_btree_balance_dirty(root, 1);
+           btrfs_btree_balance_dirty(root);
 
        pos += copied;
        num_written += copied;
@@ -1397,6 +1463,24 @@ out:
    return written ? written : err;
 }
 
+static void update_time_for_write(struct inode *inode)
+{
+   struct timespec now;
+
+   if (IS_NOCMTIME(inode))
+       return;
+
+   now = current_fs_time(inode->i_sb);
+   if (!timespec_equal(&inode->i_mtime, &now))
+       inode->i_mtime = now;
+
+   if (!timespec_equal(&inode->i_ctime, &now))
+       inode->i_ctime = now;
+
+   if (IS_I_VERSION(inode))
+       inode_inc_iversion(inode);
+}
+
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                     const struct iovec *iov,
                                     unsigned long nr_segs, loff_t pos)
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
    ssize_t num_written = 0;
    ssize_t err = 0;
    size_t count, ocount;
+   bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
    sb_start_write(inode->i_sb);
 
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        goto out;
    }
 
-   err = file_update_time(file);
-   if (err) {
-       mutex_unlock(&inode->i_mutex);
-       goto out;
-   }
+   /*
+    * We reserve space for updating the inode when we reserve space for the
+    * extent we are going to write, so we will enospc out there. We don't
+    * need to start yet another transaction to update the inode as we will
+    * update the inode when we finish writing whatever data we write.
+    */
+   update_time_for_write(inode);
 
    start_pos = round_down(pos, root->sectorsize);
    if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        }
    }
 
+   if (sync)
+       atomic_inc(&BTRFS_I(inode)->sync_writers);
+
    if (unlikely(file->f_flags & O_DIRECT)) {
        num_written = __btrfs_direct_write(iocb, iov, nr_segs,
                                           pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
     * this will either be one more than the running transaction
     * or the generation used for the next transaction if there isn't
     * one running right now.
+    *
+    * We also have to set last_sub_trans to the current log transid,
+    * otherwise subsequent syncs to a file that's been synced in this
+    * transaction will appear to have already occured.
     */
    BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+   BTRFS_I(inode)->last_sub_trans = root->log_transid;
    if (num_written > 0 || num_written == -EIOCBQUEUED) {
        err = generic_write_sync(file, pos, num_written);
        if (err < 0 && num_written > 0)
            num_written = err;
    }
 out:
+   if (sync)
+       atomic_dec(&BTRFS_I(inode)->sync_writers);
    sb_end_write(inode->i_sb);
    current->backing_dev_info = NULL;
    return num_written ? num_written : err;
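The sync_writers counter raised around synchronous writes here (and around the flush in btrfs_sync_file() below) lets the data-write submission path notice that someone is waiting on the I/O and compute checksums inline rather than bouncing the bio to the csum worker threads; this is the "inline csums if we're fsyncing" change from the commit list. A sketch of the consumer side, with hypothetical helper names (the real check presumably lives in the data bio submission hook in inode.c):

    /* Sketch: pick inline vs. offloaded checksumming at submit time. */
    static int submit_data_write_bio(struct inode *inode, struct bio *bio)
    {
        if (atomic_read(&BTRFS_I(inode)->sync_writers)) {
            /*
             * An fsync/O_DSYNC writer is waiting on this bio: checksum
             * inline and skip the worker-thread round trip.
             */
            return csum_and_submit_inline(inode, bio);
        }

        /* No synchronous waiter: keep csum work off the submit path. */
        return queue_for_async_csum(inode, bio);
    }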
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
     * out of the ->i_mutex. If so, we can flush the dirty pages by
     * multi-task, and make the performance up.
     */
+   atomic_inc(&BTRFS_I(inode)->sync_writers);
    ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+   atomic_dec(&BTRFS_I(inode)->sync_writers);
    if (ret)
        return ret;
 
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
     * range being left.
     */
    atomic_inc(&root->log_batch);
-   btrfs_wait_ordered_range(inode, start, end);
+   btrfs_wait_ordered_range(inode, start, end - start + 1);
    atomic_inc(&root->log_batch);
 
    /*
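The one-liner above matters because btrfs_wait_ordered_range() takes a (start, length) pair, not (start, end): syncing the first 4K page gives start = 0 and end = 4095, so the length must be end - start + 1 = 4096, while passing end itself would wait on one byte too few.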
@@ -1767,6 +1866,7 @@ out:
 
    hole_em->block_start = EXTENT_MAP_HOLE;
    hole_em->block_len = 0;
+   hole_em->orig_block_len = 0;
    hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
    hole_em->compress_type = BTRFS_COMPRESS_NONE;
    hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
    struct btrfs_path *path;
    struct btrfs_block_rsv *rsv;
    struct btrfs_trans_handle *trans;
-   u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-   u64 lockstart = (offset + mask) & ~mask;
-   u64 lockend = ((offset + len) & ~mask) - 1;
+   u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+   u64 lockend = round_down(offset + len,
+                            BTRFS_I(inode)->root->sectorsize) - 1;
    u64 cur_offset = lockstart;
    u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
    u64 drop_end;
-   unsigned long nr;
    int ret = 0;
    int err = 0;
-   bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-       ((offset + len) >> PAGE_CACHE_SHIFT);
+   bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+                     ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
    btrfs_wait_ordered_range(inode, offset, len);
 
    mutex_lock(&inode->i_mutex);
-   if (offset >= inode->i_size) {
-       mutex_unlock(&inode->i_mutex);
-       return 0;
-   }
-
+   /*
+    * We needn't truncate any page which is beyond the end of the file
+    * because we are sure there is no data there.
+    */
    /*
     * Only do this if we are in the same page and we aren't doing the
     * entire page.
     */
    if (same_page && len < PAGE_CACHE_SIZE) {
-       ret = btrfs_truncate_page(inode, offset, len, 0);
+       if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+           ret = btrfs_truncate_page(inode, offset, len, 0);
        mutex_unlock(&inode->i_mutex);
        return ret;
    }
 
    /* zero back part of the first page */
-   ret = btrfs_truncate_page(inode, offset, 0, 0);
-   if (ret) {
-       mutex_unlock(&inode->i_mutex);
-       return ret;
+   if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+       ret = btrfs_truncate_page(inode, offset, 0, 0);
+       if (ret) {
+           mutex_unlock(&inode->i_mutex);
+           return ret;
+       }
    }
 
    /* zero the front end of the last page */
-   ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-   if (ret) {
-       mutex_unlock(&inode->i_mutex);
-       return ret;
+   if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+       if (ret) {
+           mutex_unlock(&inode->i_mutex);
+           return ret;
+       }
    }
 
    if (lockend < lockstart) {
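With the new rounding, only whole sectors inside the hole are dropped and the sub-sector edges are zeroed in place. For example, with a 4096-byte sectorsize, punching offset = 3000, len = 9000 (bytes 3000..11999) yields lockstart = round_up(3000, 4096) = 4096 and lockend = round_down(12000, 4096) - 1 = 8191: the sector range [4096, 8191] is removed, the partial ranges [3000, 4095] and [8192, 11999] are zeroed through btrfs_truncate_page(), and any page lying wholly beyond i_size is now skipped instead of being truncated for no reason.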
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
            break;
        }
 
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 
        trans = btrfs_start_transaction(root, 3);
        if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
    if (!trans)
        goto out_free;
 
+   inode_inc_iversion(inode);
+   inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
    trans->block_rsv = &root->fs_info->trans_block_rsv;
    ret = btrfs_update_inode(trans, root, inode);
-   nr = trans->blocks_used;
    btrfs_end_transaction(trans, root);
-   btrfs_btree_balance_dirty(root, nr);
+   btrfs_btree_balance_dirty(root);
 out_free:
    btrfs_free_path(path);
    btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
    u64 alloc_end;
    u64 alloc_hint = 0;
    u64 locked_end;
-   u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
    struct extent_map *em;
+   int blocksize = BTRFS_I(inode)->root->sectorsize;
    int ret;
 
-   alloc_start = offset & ~mask;
-   alloc_end = (offset + len + mask) & ~mask;
+   alloc_start = round_down(offset, blocksize);
+   alloc_end = round_up(offset + len, blocksize);
 
    /* Make sure we aren't being give some crap mode */
    if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
     * Make sure we have enough space before we do the
     * allocation.
     */
-   ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+   ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
    if (ret)
        return ret;
 
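alloc_end is now an exclusive bound, so the reservation length is alloc_end - alloc_start with no +1. For example, with blocksize = 4096, offset = 1000 and len = 3000: alloc_start = round_down(1000, 4096) = 0 and alloc_end = round_up(4000, 4096) = 4096, i.e. exactly one block is reserved, whereas the old mask arithmetic with its trailing +1 would have requested 4097 bytes. The matching btrfs_free_reserved_data_space() call below gets the same fix, keeping the reserve and release amounts equal.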
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        }
        last_byte = min(extent_map_end(em), alloc_end);
        actual_end = min_t(u64, extent_map_end(em), offset + len);
-       last_byte = (last_byte + mask) & ~mask;
+       last_byte = ALIGN(last_byte, blocksize);
 
        if (em->block_start == EXTENT_MAP_HOLE ||
            (cur_offset >= inode->i_size &&
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
    mutex_unlock(&inode->i_mutex);
    /* Let go of our reservation. */
-   btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+   btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
    return ret;
 }
 
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
    .compat_ioctl = btrfs_ioctl,
 #endif
 };
+
+void btrfs_auto_defrag_exit(void)
+{
+   if (btrfs_inode_defrag_cachep)
+       kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+   btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+                                                 sizeof(struct inode_defrag), 0,
+                                                 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                                 NULL);
+   if (!btrfs_inode_defrag_cachep)
+       return -ENOMEM;
+
+   return 0;
+}
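The new cache has to be created at module load and destroyed at unload, with failures unwinding whatever was initialized earlier. A sketch of that wiring (its placement in init_btrfs_fs()/exit_btrfs_fs() in super.c is an assumption of this sketch, as is the unwind label):

    static int __init init_btrfs_fs(void)
    {
        int err;

        /* ... earlier kmem_cache and extent-map initialization ... */

        err = btrfs_auto_defrag_init();
        if (err)
            goto free_prior_caches;

        err = register_filesystem(&btrfs_fs_type);
        if (err)
            goto free_auto_defrag;

        return 0;

    free_auto_defrag:
        btrfs_auto_defrag_exit();
    free_prior_caches:
        /* ... tear down everything set up before the failure ... */
        return err;
    }

    static void __exit exit_btrfs_fs(void)
    {
        unregister_filesystem(&btrfs_fs_type);
        btrfs_auto_defrag_exit();   /* NULL-checked, safe after partial init */
    }

Note that btrfs_auto_defrag_exit() guards the kmem_cache_destroy() call, so it is safe to run even when init failed before the cache was created.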